In [312]:
from transformers import RobertaForMaskedLM, RobertaTokenizer, RobertaModel
from tokenizers import ByteLevelBPETokenizer
import torch
import numpy as np
import pandas as pd

In [326]:
tokenizer = RobertaTokenizerFast("../ScilitBERT/ScilitBERT_tokenizer/scilitBERT_tok-vocab.json","../ScilitBERT/ScilitBERT_tokenizer/scilitBERT_tok-merges.txt")

ScilitBERT = RobertaForMaskedLM.from_pretrained("../ScilitBERT/ScilitBERT_cased")

In [327]:
# There is a problem with the tokenizer it does not tokenize <mask> as the special mask token, ...

def replace_masks(sentance):
    inputs = ScilitBERT_tokenizer(sentance)
    new_inputs=[]
    new_mask=[]
    mask_index=-1
    for i,tok in enumerate(inputs["input_ids"]):
        if tok==34 and inputs["input_ids"][i-1]==44174 and inputs["input_ids"][i-2]==1388:
            new_inputs=new_inputs[:-2]
            new_mask=new_mask[:-2]
            new_inputs.append(4)
            new_mask.append(0)
            mask_index=len(new_inputs)-1
        else:  
            new_inputs.append(tok)
            new_mask.append(1)
    return {"input_ids":torch.IntTensor(new_inputs).view(-1,len(new_inputs)), "attention_mask":torch.IntTensor(new_mask).view(-1,len(new_mask))}, mask_index

def softmax(logits: list)-> list:
    proba = [np.exp(i) for i in logits]
    total = sum(proba)
    proba = [i / total for i in proba]
    return proba

def top_predict_masked_token(sentance, nb_pred):
    inputs, mask_index=replace_masks(sentance)
    outputs = ScilitBERT(**inputs)
    logits = outputs.logits.detach().numpy()
    logits=softmax(logits[0][mask_index])
    max_indexes=sorted(range(len(logits)), key=lambda i: logits[i])[-nb_pred:]
    max_indexes.reverse()
    proba=[logits[i] for i in max_indexes]
    res=ScilitBERT_tokenizer.convert_ids_to_tokens(max_indexes)
    return res , proba

def display_output(sentance, nb_pred):
    words,proba=top_predict_masked_token(sentance,nb_pred)
    df=pd.DataFrame({"word":words,"proba":proba})
    print(f"completions for sentance:{sentance}")
    display(df.round(2))

## Some examples

In [328]:
display_output("a language model can be pretrained then <mask>-tuned on a downstream task",3)

completions for sentance:a language model can be pretrained then <mask>-tuned on a downstream task


Unnamed: 0,word,proba
0,Ġfine,0.96
1,Ġre,0.01
2,Ġhand,0.01


In [329]:
display_output("The <mask> are located on the valence layer",3)
# True answer: electrons

completions for sentance:The <mask> are located on the valence layer


Unnamed: 0,word,proba
0,Ġelectrons,0.31
1,Ġparticles,0.05
2,Ġelectrodes,0.05


In [330]:
display_output(" Unlike recent language representation models, BERT is designed to pre-train deep <mask> representations from unlabeled text",3)
# True answer: bidirectional

completions for sentance: Unlike recent language representation models, BERT is designed to pre-train deep <mask> representations from unlabeled text


Unnamed: 0,word,proba
0,Ġlanguage,0.29
1,Ġtext,0.16
2,Ġsemantic,0.12


In [331]:
display_output("The Masked Language Modelling <mask> enables the representation to fuse the left and the right context.",3)
# True answer: objective

completions for sentance:The Masked Language Modelling <mask> enables the representation to fuse the left and the right context.


Unnamed: 0,word,proba
0,Ġframework,0.24
1,Ġapproach,0.11
2,Ġtechnique,0.1


In [332]:
display_output("vaccine was reported to have an efficacy of 94.1% at preventing symptomatic COVID-19 due to infection with ‘wild-type’ variants in a randomized <mask> trial.",3)
# True Answer: clinical

completions for sentance:vaccine was reported to have an efficacy of 94.1% at preventing symptomatic COVID-19 due to infection with ‘wild-type’ variants in a randomized <mask> trial.


Unnamed: 0,word,proba
0,Ġcontrolled,0.58
1,Ġclinical,0.35
2,Ġcontrol,0.03


In [333]:
display_output("vaccine was <mask> to have an efficacy of 94.1% at preventing symptomatic COVID-19 due to infection with ‘wild-type’ variants in a randomized clinical trial.",3)
#True answer: reported

completions for sentance:vaccine was <mask> to have an efficacy of 94.1% at preventing symptomatic COVID-19 due to infection with ‘wild-type’ variants in a randomized clinical trial.


Unnamed: 0,word,proba
0,Ġfound,0.25
1,Ġdetermined,0.19
2,Ġshown,0.11
