In [143]:
import torch
import transformers
import random

In [144]:
# Load the COVID-SciBERT checkpoint with a masked-language-modelling head, plus
# its matching tokenizer. AutoModelWithLMHead is deprecated in transformers;
# AutoModelForMaskedLM is the dedicated auto class for fill-mask models.
MODEL_NAME = 'lordtt13/COVID-SciBERT'
model = transformers.AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

In [145]:
# Show the tokenizer's mask placeholder string together with its token id.
tokenizer.mask_token, tokenizer.mask_token_id

('[MASK]', 104)

In [146]:
# Example sentence in which one word will be masked and then predicted.
text = (
    "Based on the Inception-v3 architecture, our system performs better "
    "in terms of processing complexity and accuracy than many existing "
    "models for imitation learning."
)
text

'Based on the Inception-v3 architecture, our system performs better in terms of processing complexity and accuracy than many existing models for imitation learning.'

In [147]:
# Split the raw sentence into words and punctuation with the tokenizer's
# pre-tokenizer. Each result is a (piece, offsets) pair; only the piece is kept.
pre_tokenized = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
sentence_tokens = [piece for piece, _offsets in pre_tokenized]
sentence_tokens

['Based',
 'on',
 'the',
 'Inception',
 '-',
 'v3',
 'architecture',
 ',',
 'our',
 'system',
 'performs',
 'better',
 'in',
 'terms',
 'of',
 'processing',
 'complexity',
 'and',
 'accuracy',
 'than',
 'many',
 'existing',
 'models',
 'for',
 'imitation',
 'learning',
 '.']

In [148]:
# Number of pre-tokenized pieces (words and punctuation), i.e. the number of
# positions that are candidates for masking.
sentence_length = len(sentence_tokens)
sentence_length

27

In [149]:
# Pick a random position to mask. random.randint includes BOTH endpoints, so
# randint(0, sentence_length) could return sentence_length and index past the
# end of sentence_tokens. randrange(sentence_length) draws from
# 0 .. sentence_length - 1 only.
mask_id = random.randrange(sentence_length)
mask_id

17

In [150]:
# Remember the word at the chosen position before it is overwritten by the
# mask, so the model's predictions can be compared against it later.
word2mask = sentence_tokens[mask_id]
word2mask

'and'

In [151]:
# Replace the chosen word with the tokenizer's own mask token instead of a
# hard-coded "[MASK]", so the cell keeps working with tokenizers that use a
# different placeholder.
# NOTE: this overwrites sentence_tokens in place. Re-create sentence_tokens
# before re-running the masking cells, otherwise word2mask would pick up the
# mask token itself.
sentence_tokens[mask_id] = tokenizer.mask_token

In [152]:
# Rebuild the masked sentence by joining the pieces with single spaces.
# Punctuation and hyphens therefore end up space-separated
# (e.g. "Inception - v3"), unlike in the original text.
sentence_final = " ".join(sentence_tokens)
sentence_final

'Based on the Inception - v3 architecture , our system performs better in terms of processing complexity [MASK] accuracy than many existing models for imitation learning .'

In [153]:
# Encode the masked sentence as PyTorch tensors (return_tensors="pt").
# The result holds input_ids, token_type_ids and attention_mask.
inputs = tokenizer(sentence_final, return_tensors="pt")
inputs

{'input_ids': tensor([[  102,   791,   191,   111,   306,  2613,   110,   579,   171, 30138,
          3652,   422,   580,   429,  8629,  1883,   121,  1615,   131,  2307,
          3480,   104,  2683,   506,  1164,   199, 30109, 31862,  1262,   168,
         27248,  1904,   205,   103]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [154]:
# Decode each input id on its own so the individual wordpieces (including the
# special tokens) are visible. Every piece is followed by a single space.
sentence = "".join(
    tokenizer.decode([token_id]) + " " for token_id in inputs['input_ids'][0]
)
print(sentence)

[CLS] based on the inc ##epti ##on - v ##3 architecture , our system performs better in terms of processing complexity [MASK] accuracy than many ex ##i sting models for imitation learning . [SEP] 


In [155]:
# Forward pass for inference only. Disabling autograd avoids building and
# keeping a computation graph that is never used for back-propagation.
with torch.no_grad():
    token_logits = model(**inputs).logits

In [156]:
# Inspect the logits' dimensions: one score vector per input token of the
# single encoded sentence (last dimension is presumably the vocabulary size).
token_logits.shape

torch.Size([1, 34, 31941])

In [157]:
# Locate the mask token inside the encoded sequence. torch.where on the 2-D
# comparison returns (row indices, column indices); the column indices are the
# token positions within the sentence.
_batch_index, mask_token_index = torch.where(
    inputs["input_ids"] == tokenizer.mask_token_id
)
mask_token_index

tensor([21])

In [158]:
# Keep only the score vector(s) at the masked position(s) of the first (and
# only) sentence in the batch.
mask_token_logits = token_logits[0, mask_token_index]

In [165]:
# Take the ids of the 20 highest-scoring vocabulary entries for the masked
# position and decode each id into its token string.
top_n_tokens = mask_token_logits.topk(20, dim=1).indices[0].tolist()
predicted_tokens = list(map(lambda token_id: tokenizer.decode([token_id]), top_n_tokens))
predicted_tokens

['and',
 'prediction',
 'computational',
 'classification',
 'retrieval',
 'recognition',
 'estimation',
 'decoding',
 'or',
 'search',
 'learning',
 '/',
 'coding',
 'overall',
 'reconstruction',
 'extraction',
 'performance',
 'processing',
 'predictive',
 '(']

In [166]:
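# Optional: uncomment to print the sentence once per candidate, with the mask
# replaced by that candidate.
#for token in top_n_tokens:
#    print(f"'>>> {sentence_final.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")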

In [167]:
# Check whether the original word is among the predicted candidates.
# word2mask keeps the casing of the raw text, while the tokenizer normalizes
# its input before lookup (the decoded wordpieces show "Based" -> "based").
# Apply the same normalization first so capitalised words can match too.
# Words that split into several wordpieces still cannot match a single
# predicted token.
normalized_word = tokenizer.backend_tokenizer.normalizer.normalize_str(word2mask)
normalized_word in predicted_tokens

True