<center><h1>Masked Language Modeling Using Transformers</h1></center>

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import transformers
from transformers import pipeline
import warnings
from transformers import logging
# Keep the notebook output quiet: ignore every Python warning and restrict the
# transformers logger to error-level messages.
warnings.filterwarnings(action='ignore')
logging.set_verbosity_error()

## Import Dataset

In [1]:
import pandas as pd

In [2]:
# Read the evaluation spreadsheet and preview its first five rows.
eval_df = pd.read_excel(
    io="../../Datasets/text_mining/text_mining_academic_results.xlsx"
)
eval_df.head(5)

Unnamed: 0,target sentence,noisy sentence1,noisy sentence2,noisy sentence3,common_sentence,uncommon_target_sentence,uncommon_sentence1,uncommon_sentence2,uncommon_sentence3,Is correct
0,A man with a hard hat is dancing.,A CAN WHET a GERARD hat is dancing.,A man with a hard hat is dancing.,/,a [MASK] [MASK] a [MASK] hat is dancing,"['man', 'with', 'hard']","['can', 'whet', 'gerard']","['man', 'with', 'hard']",/,Correct
1,A young child is riding a horse.,A young child is REDDING a horse.,A young TILED WIZ riding IO horse.,/,a young [MASK] [MASK] [MASK] [MASK] horse,"['child', 'is', 'riding', 'a']","['child', 'is', 'redding', 'a']","['tiled', 'wiz', 'riding', 'io']",/,Correct
2,A man is feeding a mouse to a snake.,A man is feeding IO mouse to IO snake.,A CAN is feeding a mouse to a snake.,/,a [MASK] is feeding [MASK] mouse to [MASK] snake,"['man', 'a', 'a']","['man', 'io', 'io']","['can', 'a', 'a']",/,Correct
3,A woman is playing the guitar.,A woman is playing the guitar.,A WAYMAN WIZ SWAYING TU guitar.,/,a [MASK] [MASK] [MASK] [MASK] guitar,"['woman', 'is', 'playing', 'the']","['woman', 'is', 'playing', 'the']","['wayman', 'wiz', 'swaying', 'tu']",/,Correct
4,A woman is playing the flute.,A WAYMAN WIZ SWAYING TU flute.,A woman is playing the flute.,/,a [MASK] [MASK] [MASK] [MASK] flute,"['woman', 'is', 'playing', 'the']","['wayman', 'wiz', 'swaying', 'tu']","['woman', 'is', 'playing', 'the']",/,Correct


In [3]:
# Sentences containing the [MASK] placeholders, as a plain Python list.
masked_sentences = eval_df.loc[:, 'common_sentence'].to_list()
# Ground-truth words for those placeholders, one entry per sentence.
real_pred = eval_df.loc[:, 'uncommon_target_sentence'].to_list()

## BERT

In [204]:
# Fill-mask pipeline backed by the bert-base-uncased checkpoint.
fill_mask = pipeline(task="fill-mask", model="bert-base-uncased")

In [211]:
# Recorded runtime of this cell: 34 min 11.3 s
#
# For every evaluation sentence, reveal the ground-truth words one at a time
# (left to right) and record the model's candidates for the left-most
# remaining mask. Each recorded row is [sentence as queried, true word, candidates].
MASK_TOKEN = "[MASK]"
word_tokenizer = nltk.RegexpTokenizer(r'\w+')  # built once, not once per sentence

bert_rows = []
n_sentences = len(masked_sentences)
for position, (ms, targets) in enumerate(zip(masked_sentences, real_pred), start=1):
    print(f"{position}/{n_sentences}")
    # The target column holds a stringified list such as "['man', 'with', 'hard']";
    # the \w+ pattern extracts the bare words from it.
    r_pred = word_tokenizer.tokenize(targets)
    for real_word in r_pred:
        if MASK_TOKEN not in ms:
            # More target words than masks: the fill-mask pipeline rejects input
            # that has no mask token, so stop instead of aborting the whole run.
            break
        pred = fill_mask(ms)
        # With several masks the result holds one candidate list per mask (the
        # first belongs to the left-most mask); with one mask it is a flat list.
        first_mask_pred = pred[0] if isinstance(pred[0], list) else pred
        word_list = [candidate["token_str"] for candidate in first_mask_pred]

        bert_rows.append([ms, real_word, word_list])
        # Reveal the true word so the next query targets the following mask.
        ms = ms.replace(MASK_TOKEN, real_word, 1)

df_bert = pd.DataFrame(bert_rows, columns=["Masked sentence","real", "pred"])
df_bert.head()

Unnamed: 0,Masked sentence,real,pred
0,a [MASK] [MASK] a [MASK] hat is dancing,man,"[woman, man, girl, boy, lady]"
1,a man [MASK] a [MASK] hat is dancing,with,"[in, wearing, with, without, under]"
2,a man with a [MASK] hat is dancing,hard,"[straw, cowboy, bowler, top, red]"
3,a young [MASK] [MASK] [MASK] [MASK] horse,child,"[woman, man, girl, boy, horse]"
4,a young child [MASK] [MASK] [MASK] horse,is,"[., with, riding, ,, and]"


In [210]:
# Persist the BERT predictions (row index omitted).
df_bert.to_excel(
    excel_writer="../../Datasets/MLM/bert-base-uncased.xlsx",
    index=False,
)

## Roberta

In [15]:
# Fill-mask pipeline backed by the roberta-base checkpoint.
fill_mask = pipeline(task="fill-mask", model="roberta-base")

In [5]:
# Recorded runtime of this cell: 36 min 9.9 s
#
# Same procedure as for the other models: reveal the ground-truth words one at
# a time and record the candidates for the left-most remaining mask. The
# dataset uses "[MASK]" placeholders, which are rewritten to "<mask>" here.
# NOTE(review): the recorded output shows candidates with a leading space
# (' man') while the "real" words have none - strip before comparing downstream.
DATASET_MASK = "[MASK]"
MASK_TOKEN = "<mask>"
word_tokenizer = nltk.RegexpTokenizer(r'\w+')  # built once, not once per sentence

roberta_rows = []
n_sentences = len(masked_sentences)
for position, (sentence, targets) in enumerate(zip(masked_sentences, real_pred), start=1):
    print(f"{position}/{n_sentences}")
    ms = sentence.replace(DATASET_MASK, MASK_TOKEN)
    # The target column holds a stringified list such as "['man', 'with', 'hard']";
    # the \w+ pattern extracts the bare words from it.
    r_pred = word_tokenizer.tokenize(targets)
    for real_word in r_pred:
        if MASK_TOKEN not in ms:
            # More target words than masks: the fill-mask pipeline rejects input
            # that has no mask token, so stop instead of aborting the whole run.
            break
        pred = fill_mask(ms)
        # With several masks the result holds one candidate list per mask (the
        # first belongs to the left-most mask); with one mask it is a flat list.
        first_mask_pred = pred[0] if isinstance(pred[0], list) else pred
        word_list = [candidate["token_str"] for candidate in first_mask_pred]

        roberta_rows.append([ms, real_word, word_list])
        # Reveal the true word so the next query targets the following mask.
        ms = ms.replace(MASK_TOKEN, real_word, 1)

df_roberta = pd.DataFrame(roberta_rows, columns=["Masked sentence", "real", "pred"])
df_roberta.head()

Unnamed: 0,Masked sentence,real,pred
0,a <mask> <mask> a <mask> hat is dancing,man,"[' man', ' guy', ' woman', ' girl', ' boy']"
1,a man <mask> a <mask> hat is dancing,with,"[' in', ' wearing', ' with', ' sporting', ' wi..."
2,a man with a <mask> hat is dancing,hard,"[' cowboy', ' top', ' straw', ' Santa', ' red']"
3,a young <mask> <mask> <mask> <mask> horse,child,"[' boy', ' man', ' girl', ' male', ' horse']"
4,a young child <mask> <mask> <mask> horse,is,"[' riding', ' on', ' with', ' rides', ' sitting']"


In [17]:
# Persist the RoBERTa predictions (row index omitted).
df_roberta.to_excel(
    excel_writer="../../Datasets/MLM/roberta-base_pred.xlsx",
    index=False,
)

## Albert

In [5]:
# Fill-mask pipeline backed by the albert-base-v2 checkpoint.
fill_mask = pipeline(task="fill-mask", model="albert-base-v2")

In [8]:
# Recorded runtime of this cell: 29 min 5.7 s
#
# Same procedure as for the other models: reveal the ground-truth words one at
# a time and record the candidates for the left-most remaining mask.
# NOTE(review): the recorded output lists the same five candidates
# ('them', 'capitul', 'his', 'rou', 'oku') for every sentence that still has
# several masks - verify how this checkpoint handles multi-mask input.
MASK_TOKEN = "[MASK]"
word_tokenizer = nltk.RegexpTokenizer(r'\w+')  # built once, not once per sentence

albert_rows = []
n_sentences = len(masked_sentences)
for position, (ms, targets) in enumerate(zip(masked_sentences, real_pred), start=1):
    print(f"{position}/{n_sentences}")
    # The target column holds a stringified list such as "['man', 'with', 'hard']";
    # the \w+ pattern extracts the bare words from it.
    r_pred = word_tokenizer.tokenize(targets)
    for real_word in r_pred:
        if MASK_TOKEN not in ms:
            # More target words than masks: the fill-mask pipeline rejects input
            # that has no mask token, so stop instead of aborting the whole run.
            break
        pred = fill_mask(ms)
        # With several masks the result holds one candidate list per mask (the
        # first belongs to the left-most mask); with one mask it is a flat list.
        first_mask_pred = pred[0] if isinstance(pred[0], list) else pred
        word_list = [candidate["token_str"] for candidate in first_mask_pred]

        albert_rows.append([ms, real_word, word_list])
        # Reveal the true word so the next query targets the following mask.
        ms = ms.replace(MASK_TOKEN, real_word, 1)

df_albert = pd.DataFrame(albert_rows, columns=["Masked sentence", "real", "pred"])
df_albert.head()

Unnamed: 0,Masked Sentence,real,pred
0,a [MASK] [MASK] a [MASK] hat is dancing,man,"['them', 'capitul', 'his', 'rou', 'oku']"
1,a man [MASK] a [MASK] hat is dancing,with,"['them', 'capitul', 'his', 'rou', 'oku']"
2,a man with a [MASK] hat is dancing,hard,"['cowboy', 'clown', 'purple', 'fedor', 'baseba..."
3,a young [MASK] [MASK] [MASK] [MASK] horse,child,"['them', 'capitul', 'his', 'rou', 'oku']"
4,a young child [MASK] [MASK] [MASK] horse,is,"['them', 'capitul', 'his', 'rou', 'oku']"


In [13]:
# Persist the ALBERT predictions (row index omitted).
df_albert.to_excel(
    excel_writer="../../Datasets/MLM/albert-base_pred.xlsx",
    index=False,
)

## DistilRoberta

In [72]:
# Fill-mask pipeline backed by the distilroberta-base checkpoint.
fill_mask = pipeline(task="fill-mask", model="distilroberta-base")

In [10]:
# Recorded runtime of this cell: 23 min 20.5 s
#
# Same procedure as for the other models: reveal the ground-truth words one at
# a time and record the candidates for the left-most remaining mask. The
# dataset uses "[MASK]" placeholders, which are rewritten to "<mask>" here.
# NOTE(review): the recorded output shows candidates with a leading space
# (' woman') while the "real" words have none - strip before comparing downstream.
DATASET_MASK = "[MASK]"
MASK_TOKEN = "<mask>"
word_tokenizer = nltk.RegexpTokenizer(r'\w+')  # built once, not once per sentence

distilroberta_rows = []
n_sentences = len(masked_sentences)
for position, (sentence, targets) in enumerate(zip(masked_sentences, real_pred), start=1):
    print(f"{position}/{n_sentences}")
    ms = sentence.replace(DATASET_MASK, MASK_TOKEN)
    # The target column holds a stringified list such as "['man', 'with', 'hard']";
    # the \w+ pattern extracts the bare words from it.
    r_pred = word_tokenizer.tokenize(targets)
    for real_word in r_pred:
        if MASK_TOKEN not in ms:
            # More target words than masks: the fill-mask pipeline rejects input
            # that has no mask token, so stop instead of aborting the whole run.
            break
        pred = fill_mask(ms)
        # With several masks the result holds one candidate list per mask (the
        # first belongs to the left-most mask); with one mask it is a flat list.
        first_mask_pred = pred[0] if isinstance(pred[0], list) else pred
        word_list = [candidate["token_str"] for candidate in first_mask_pred]

        distilroberta_rows.append([ms, real_word, word_list])
        # Reveal the true word so the next query targets the following mask.
        ms = ms.replace(MASK_TOKEN, real_word, 1)

df_distilroberta = pd.DataFrame(distilroberta_rows, columns=["Masked sentence", "real", "pred"])
df_distilroberta.head()

Unnamed: 0,Masked sentence,real,pred
0,a <mask> <mask> a <mask> hat is dancing,man,"[' woman', ' girl', ' dancer', ' man', ' boy']"
1,a man <mask> a <mask> hat is dancing,with,"[' wearing', ' with', ' in', ' sporting', ' wi..."
2,a man with a <mask> hat is dancing,hard,"[' cowboy', ' black', ' baseball', ' straw', '..."
3,a young <mask> <mask> <mask> <mask> horse,child,"[' female', ' horse', ' fo', ' male', ' African']"
4,a young child <mask> <mask> <mask> horse,is,"[' riding', ' wearing', ' chasing', ' who', ' ..."


In [76]:
# Persist the DistilRoBERTa predictions (row index omitted).
df_distilroberta.to_excel(
    excel_writer="../../Datasets/MLM/distilroberta-base_pred.xlsx",
    index=False,
)

## DistilBert

In [5]:
# Fill-mask pipeline backed by the distilbert-base-uncased checkpoint.
fill_mask = pipeline(task="fill-mask", model="distilbert-base-uncased")

In [20]:
# Recorded runtime of this cell: 20 min 58.2 s
#
# For every evaluation sentence, reveal the ground-truth words one at a time
# (left to right) and record the model's candidates for the left-most
# remaining mask. Each recorded row is [sentence as queried, true word, candidates].
MASK_TOKEN = "[MASK]"
# NOTE(review): this sentence index is excluded from the run, but the reason is
# not recorded anywhere in the notebook - confirm it is still needed.
SKIPPED_SENTENCE_INDEX = 801
word_tokenizer = nltk.RegexpTokenizer(r'\w+')  # built once, not once per sentence

distilbert_rows = []
n_sentences = len(masked_sentences)
for i in range(n_sentences):
    print(f"{i + 1}/{n_sentences}")
    if i == SKIPPED_SENTENCE_INDEX:
        continue
    ms = masked_sentences[i]
    # The target column holds a stringified list such as "['man', 'with', 'hard']";
    # the \w+ pattern extracts the bare words from it.
    r_pred = word_tokenizer.tokenize(real_pred[i])
    for real_word in r_pred:
        if MASK_TOKEN not in ms:
            # No mask left to predict; nothing further can change for this sentence.
            break
        pred = fill_mask(ms)
        # With several masks the result holds one candidate list per mask (the
        # first belongs to the left-most mask); with one mask it is a flat list.
        first_mask_pred = pred[0] if isinstance(pred[0], list) else pred
        word_list = [candidate["token_str"] for candidate in first_mask_pred]

        distilbert_rows.append([ms, real_word, word_list])
        # Reveal the true word so the next query targets the following mask.
        ms = ms.replace(MASK_TOKEN, real_word, 1)

df_distilbert = pd.DataFrame(distilbert_rows, columns=["Masked sentence","real", "pred"])
df_distilbert.head()

Unnamed: 0,Masked sentence,real,pred
0,a [MASK] [MASK] a [MASK] hat is dancing,man,"['woman', 'clown', 'man', 'dog', 'cowboy']"
1,a man [MASK] a [MASK] hat is dancing,with,"['wearing', 'in', 'with', 'wears', 'holding']"
2,a man with a [MASK] hat is dancing,hard,"['straw', 'cowboy', 'bowler', 'conical', 'red']"
3,a young [MASK] [MASK] [MASK] [MASK] horse,child,"['man', 'gr', 'lady', 'buck', 'bu']"
4,a young child [MASK] [MASK] [MASK] horse,is,"['riding', 'named', 'called', 'rode', 'rides']"


In [21]:
# Persist the DistilBERT predictions (row index omitted).
df_distilbert.to_excel(
    excel_writer="../../Datasets/MLM/distilbert-base-uncased_pred.xlsx",
    index=False,
)