In [None]:
!pip install transformers sentencepiece datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow as tf
import transformers
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, TFRobertaForMaskedLM

In [None]:
# One seed value is handed to each library-level seeding call used by this
# notebook (transformers, TensorFlow, NumPy), in that order.
RANDOM_SEED = 1337

for seed_library in (transformers.set_seed, tf.random.set_seed, np.random.seed):
  seed_library(RANDOM_SEED)

In [None]:
# Stop immediately unless TensorFlow reports the first GPU; otherwise print
# the device name that was found.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print(f'Found GPU at: {device_name}')
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
def count_mask(input_sentences):
    """Return a list with the number of '[MASK]' occurrences in each sentence.

    The result has one integer per element of ``input_sentences``, in the
    same order.
    """
    return [text.count('[MASK]') for text in input_sentences]

In [None]:
# Select scenario to use for prediction
scenario_mode = ['narration_mode', 'picture_description','open_ended']
scenario_mode = scenario_mode[0]

In [None]:
# Narration and picture-description runs read the cosine-similarity file and
# keep only rows whose similarity score is strictly between 0.500 and 1.
# Any other mode reads the preprocessed anomic file without filtering.
if scenario_mode in ('narration_mode', 'picture_description'):
  df = pd.read_csv("protocol_aphasia_cosine.csv", index_col=0)
  df = df.loc[
    df['similar_sentence_score'].lt(1) & df['similar_sentence_score'].gt(0.500)
  ]
else:
  df = pd.read_csv("protocol_aphasia_anomic_preprocessed.csv", index_col=0)

In [None]:
# Pick the scenarios to keep, the checkpoint to load and the label used in the
# output file name, depending on the selected mode.
if scenario_mode == 'narration_mode':
  # Using trained model on Narration scenario:
  selected_scenario, MODEL_NAME, MODEL_MODE = (
    ["Cinderella", "Sandwich"],
    "Middelz2/roberta-large-aphasia-narration-10e",
    "narration",
  )
elif scenario_mode == 'picture_description':
  # Using trained model on Picture description scenario:
  selected_scenario, MODEL_NAME, MODEL_MODE = (
    ["Window", "Cat", "Umbrella", "Flood"],
    "Middelz2/roberta-large-aphasia-picture-description-10e",
    "picture_description",
  )
else:
  # Using standard (non-pretrained model) on open ended scenario:
  selected_scenario, MODEL_NAME, MODEL_MODE = (
    ["Important_Event", "Speech", "Stroke"],
    "roberta-large",
    "open_ended",
  )

# Every branch applies the same row filter, so it is done once here.
df = df.loc[df['scenario'].isin(selected_scenario)]


# Manual override, disabled by default (uncomment to replace the selection above).
# selected_scenario = ["Cinderella", "Sandwich"]
# MODEL_NAME = "roberta-large"
# MODEL_MODE = "picture_description"
# df = df.loc[df['scenario'].isin(selected_scenario)]

In [None]:
def make_single_mask_df_cosine(input_dataframe):
  """
  Converts sentences into sentences containing only 1 single mask.
  Only used for picture description and narration.

  Every row of ``input_dataframe`` is processed, including the last one.
  A sentence is split on single spaces; for a sentence with k words equal to
  ``[MASK]`` the output holds k rows, where row j keeps only the j-th
  ``[MASK]`` and drops the other ``[MASK]`` words. Sentences without any
  ``[MASK]`` word contribute no rows.

  Args:
    input_dataframe: DataFrame providing the columns ``scenario``,
      ``preprocessed_text``, ``similar_sentences`` and
      ``similar_sentence_score``. It is not modified.

  Returns:
    A new DataFrame with the columns ``scenario``, ``original_sentences``,
    ``sentence_variations``, ``similar_sentences`` and
    ``similar_sentence_score`` (in that order), one row per mask occurrence.
  """
  scenario = []
  original_sentences = []
  sentence_variations = []  # The list that contains all sentences containing a single [MASK]
  similar_sentences = []
  similarity_score = []

  # Walk the needed columns in lockstep; zip covers all rows, so the final
  # row is no longer skipped.
  rows = zip(
    input_dataframe['scenario'].tolist(),
    input_dataframe['preprocessed_text'].tolist(),
    input_dataframe['similar_sentences'].tolist(),
    input_dataframe['similar_sentence_score'].tolist(),
  )
  for row_scenario, mask_sentence, similar_sentence, score in rows:
    words = mask_sentence.split(" ")
    mask_positions = [pos for pos, word in enumerate(words) if word == '[MASK]']

    for kept_position in mask_positions:
      # Keep every ordinary word plus the one mask at `kept_position`.
      kept_words = [
        word for pos, word in enumerate(words)
        if word != '[MASK]' or pos == kept_position
      ]
      scenario.append(row_scenario)
      original_sentences.append(mask_sentence)
      sentence_variations.append(" ".join(kept_words))
      similar_sentences.append(similar_sentence)
      similarity_score.append(score)

  return pd.DataFrame({
    'scenario': scenario,
    'original_sentences': original_sentences,
    'sentence_variations': sentence_variations,
    'similar_sentences': similar_sentences,
    'similar_sentence_score': similarity_score,
  })


In [None]:
def make_single_mask_df_open_ended(input_dataframe):
  """
  Converts sentences into sentences containing only 1 single mask.
  Only used for open ended questions.

  Every row of ``input_dataframe`` is processed, including the last one.
  A sentence is split on single spaces; for a sentence with k words equal to
  ``[MASK]`` the output holds k rows, where row j keeps only the j-th
  ``[MASK]`` and drops the other ``[MASK]`` words. Sentences without any
  ``[MASK]`` word contribute no rows.

  Args:
    input_dataframe: DataFrame providing the columns ``scenario`` and
      ``preprocessed_text``. It is not modified.

  Returns:
    A new DataFrame with the columns ``scenario``, ``original_sentences`` and
    ``sentence_variations`` (in that order), one row per mask occurrence.
  """
  scenario = []
  original_sentences = []
  sentence_variations = []  # The list that contains all sentences containing a single [MASK]

  # Walk both columns in lockstep; zip covers all rows, so the final row is
  # no longer skipped.
  rows = zip(
    input_dataframe['scenario'].tolist(),
    input_dataframe['preprocessed_text'].tolist(),
  )
  for row_scenario, mask_sentence in rows:
    words = mask_sentence.split(" ")
    mask_positions = [pos for pos, word in enumerate(words) if word == '[MASK]']

    for kept_position in mask_positions:
      # Keep every ordinary word plus the one mask at `kept_position`.
      kept_words = [
        word for pos, word in enumerate(words)
        if word != '[MASK]' or pos == kept_position
      ]
      scenario.append(row_scenario)
      original_sentences.append(mask_sentence)
      sentence_variations.append(" ".join(kept_words))

  return pd.DataFrame({
    'scenario': scenario,
    'original_sentences': original_sentences,
    'sentence_variations': sentence_variations,
  })

In [None]:
# Expand each multi-mask sentence into single-mask variations, using the
# builder that matches the columns of the file loaded for this mode.
df = (
  make_single_mask_df_cosine(df)
  if scenario_mode in ('narration_mode', 'picture_description')
  else make_single_mask_df_open_ended(df)
)

df.head()

Unnamed: 0,scenario,original_sentences,sentence_variations,similar_sentences,similar_sentence_score
0,Cinderella,well [MASK] Cinderella she had [MASK] a stepmo...,well [MASK] Cinderella she had a stepmother an...,Cinderella was had two stepsisters .,0.843088
1,Cinderella,well [MASK] Cinderella she had [MASK] a stepmo...,well Cinderella she had [MASK] a stepmother an...,Cinderella was had two stepsisters .,0.843088
2,Cinderella,well [MASK] Cinderella she had [MASK] a stepmo...,well Cinderella she had a stepmother and two [...,Cinderella was had two stepsisters .,0.843088
3,Cinderella,and [MASK] and a fairy godmother comes appears...,and [MASK] and a fairy godmother comes appears...,and her fairy godmother appears to her .,0.805084
4,Cinderella,and [MASK] all of the mean mad mouses .,and [MASK] all of the mean mad mouses .,and the four mice they are just sitting .,0.599064


In [None]:
# Prepare the model inputs. These steps are identical for every mode:
#   1. count whitespace-separated tokens per variation,
#   2. keep only variations with more than two tokens,
#   3. swap the "[MASK]" placeholder for the "<mask>" token.
df['word_count'] = df['sentence_variations'].str.split().str.len()
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice (avoids SettingWithCopyWarning).
df = df.loc[df['word_count'] > 2].copy()
df['sentence_variations'] = df['sentence_variations'].str.replace("[MASK]", "<mask>", regex=False)

# Only the cosine-similarity modes have a similar sentence to prepend; it is
# joined to the masked sentence with a " </s> " separator.
if scenario_mode in ('narration_mode', 'picture_description'):
  df['combined_sentences_w_sep'] = df['similar_sentences'] + " </s> " + df['sentence_variations']

# Kept as the last top-level expression so the notebook actually displays it.
df.head()


In [None]:
from transformers import BertTokenizer, TFBertModel, pipeline

# Number of ranked candidates requested per masked sentence.
TOP_K = 5

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# predicted_words[r] / predicted_scores[r] collect the rank-r candidate of
# every sentence, in input order.
predicted_words = [[] for _ in range(TOP_K)]
predicted_scores = [[] for _ in range(TOP_K)]

# The fill-mask pipeline loads the checkpoint by name itself, so the model is
# no longer loaded a second time into a variable that was overwritten at once.
model = pipeline('fill-mask', model=MODEL_NAME, device=0)

if scenario_mode in ('narration_mode', 'picture_description'):
  input_sentences = df['combined_sentences_w_sep'].to_list()
else:
  input_sentences = df['sentence_variations'].to_list()

# Skip the model call when nothing survived the earlier filtering.
predictions = model(input_sentences, top_k=TOP_K) if input_sentences else []

# The pipeline may hand back the candidates of a single sentence as a flat
# list of dicts instead of a list of lists; wrap it so that the loop below
# always sees one entry per input sentence.
if predictions and isinstance(predictions[0], dict):
  predictions = [predictions]


def _candidate_field(prediction, rank, field, default):
  """Return prediction[rank][field], or `default` when that entry is absent or not a dict."""
  try:
    return prediction[rank][field]
  except (IndexError, KeyError, TypeError):
    # Fewer than `rank + 1` candidates, a missing key, or a nested list:
    # fall back to the placeholder so all columns stay the same length.
    return default


for prediction in predictions:
  for i in range(TOP_K):
    predicted_words[i].append(_candidate_field(prediction, i, 'token_str', ''))
    predicted_scores[i].append(_candidate_field(prediction, i, 'score', 0.0))

All model checkpoint layers were used when initializing TFRobertaForMaskedLM.

All the layers of TFRobertaForMaskedLM were initialized from the model checkpoint at roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForMaskedLM for predictions without further training.


In [None]:
# Attach the five ranked predictions as column pairs:
# word_pred_0, score_pred_0, ..., word_pred_4, score_pred_4.
for rank in range(5):
  df[f'word_pred_{rank}'] = predicted_words[rank]
  df[f'score_pred_{rank}'] = predicted_scores[rank]

In [None]:
# Write the predictions next to the notebook; the file name embeds MODEL_MODE.
# NOTE(review): the "_non_pretrained" suffix is hard-coded, so it is also used
# when one of the fine-tuned checkpoints is selected — confirm this is intended.
df.to_csv(f"maskandsep_preds_{MODEL_MODE}_non_pretrained.csv")