In [62]:
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
import torch
import numpy as np
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from string import punctuation
import transformers
import random

In [63]:
model = transformers.AutoModelWithLMHead.from_pretrained('lordtt13/COVID-SciBERT')
tokenizer = transformers.AutoTokenizer.from_pretrained('lordtt13/COVID-SciBERT')

In [64]:
tokenizer.mask_token, tokenizer.mask_token_id

('[MASK]', 104)

In [65]:
text = "Based on the Inception-v3 architecture, our system performs better in terms of processing complexity and accuracy than many existing models for imitation learning."
text

'Based on the Inception-v3 architecture, our system performs better in terms of processing complexity and accuracy than many existing models for imitation learning.'

In [66]:
sentence_split = text.split(" ")
sentence_length = len(sentence_split)
sentence_length

23

In [67]:
mask_id = random.randint(0, sentence_length)
mask_id

12

In [68]:
word2mask = sentence_split[1]
word2mask

'on'

In [69]:
word2mask = word2mask.replace(",","")
word2mask = word2mask.replace(".","")
word2mask

'on'

In [70]:
sentence_split[mask_id] = "[MASK]"

In [71]:
sentence_final = " ".join(sentence_split)
sentence_final

'Based on the Inception-v3 architecture, our system performs better in terms of [MASK] complexity and accuracy than many existing models for imitation learning.'

In [72]:
inputs = tokenizer(sentence_final, return_tensors="pt")
inputs

{'input_ids': tensor([[  102,   791,   191,   111,   306,  2613,   110,   579,   171, 30138,
          3652,   422,   580,   429,  8629,  1883,   121,  1615,   131,   104,
          3480,   137,  2683,   506,  1164,   199, 30109, 31862,  1262,   168,
         27248,  1904,   205,   103]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [73]:
sentence = ""
for word in inputs['input_ids'][0]:
    sentence += tokenizer.decode([word]) + " "
print(sentence)

[CLS] based on the inc ##epti ##on - v ##3 architecture , our system performs better in terms of [MASK] complexity and accuracy than many ex ##i sting models for imitation learning . [SEP] 


In [74]:
token_logits = model(**inputs).logits

In [75]:
token_logits.shape

torch.Size([1, 34, 31941])

In [76]:
mask_token_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
mask_token_index

tensor([19])

In [77]:
mask_token_logits = token_logits[0,mask_token_index,:]

In [78]:
top_5_tokens = torch.topk(mask_token_logits,20,dim=1).indices[0].tolist()
top_5_tokens


[3989,
 532,
 2208,
 2307,
 11055,
 2411,
 437,
 2848,
 1904,
 2318,
 12185,
 3937,
 3480,
 2188,
 453,
 655,
 3838,
 4197,
 6402,
 1150]

In [79]:
for token in top_5_tokens:
    print(tokenizer.decode([token]))

computational
time
training
processing
decoding
memory
model
implementation
learning
parameter
runtime
computation
complexity
task
data
both
prediction
storage
compression
performance


In [81]:
for token in top_5_tokens:
    print(f"'>>> {sentence_final.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> Based on the Inception-v3 architecture, our system performs better in terms of computational complexity and accuracy than many existing models for imitation learning.'
'>>> Based on the Inception-v3 architecture, our system performs better in terms of time complexity and accuracy than many existing models for imitation learning.'
'>>> Based on the Inception-v3 architecture, our system performs better in terms of training complexity and accuracy than many existing models for imitation learning.'
'>>> Based on the Inception-v3 architecture, our system performs better in terms of processing complexity and accuracy than many existing models for imitation learning.'
'>>> Based on the Inception-v3 architecture, our system performs better in terms of decoding complexity and accuracy than many existing models for imitation learning.'
'>>> Based on the Inception-v3 architecture, our system performs better in terms of memory complexity and accuracy than many existing models for imitation le