In [19]:
import random
import pandas as pd
import pickle
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [83]:
def unmask(tokens, checkpoint, ids=False):
    """ Currently useless """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForMaskedLM.from_pretrained(checkpoint).to(device)

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    if ids:
        input_ids = torch.tensor(tokens).to(device)
    else:
        input_ids = tokenizer(tokens, add_special_tokens=False, return_tensors='pt')['input_ids'].to(device)
    
    masked_idx = torch.where(input_ids == tokenizer.mask_token_id)
    print(masked_idx)
    masked_idx = masked_idx[1].item()
    
    model.eval()
    with torch.no_grad():
        logits = model(input_ids).logits # shape: [1, seq_len, vocab_size]
    
    probs = F.softmax(logits[0, masked_idx, :], dim=-1) # shape: [vocab_size]
    top_ids = torch.argsort(probs, axis=-1, descending=True)[:5]
    top_tokens = [tokenizer.convert_ids_to_tokens(id.item()) for id in top_ids]
    top_token_probs = probs[top_ids]

    return pd.DataFrame({'Prediction': top_tokens, 'Probability': top_token_probs})

In [84]:
simple_agrmt_results = pd.read_csv('../results/syntax_results/simple_agrmt_results.csv', sep='\t')
sentence = random.choice(simple_agrmt_results['masked_sent'])
print('Masked sentence:')
print(sentence + '\n')

print('Training step: 0 (random initialization)')
print('Top 5 BERT predictions:')
unmask(sentence, 'google/multiberts-seed_0-step_0k')

Masked sentence:
the teacher [MASK] tall

Training step: 0 (random initialization)
Top 5 BERT predictions:
(tensor([0]), tensor([2]))


Unnamed: 0,Prediction,Probability
0,upgrades,0.00019
1,minorities,0.000187
2,misconduct,0.000182
3,occult,0.000177
4,gabriel,0.000175


In [85]:
print('Training step: 2,000,000 (fully trained model)')
print('Top 5 BERT predictions:')
unmask(sentence, 'google/multiberts-seed_0-step_2000k')

Training step: 2,000,000 (fully trained model)
Top 5 BERT predictions:
(tensor([0]), tensor([2]))


Unnamed: 0,Prediction,Probability
0,tall,0.179006
1,was,0.032808
2,short,0.026078
3,high,0.025274
4,-,0.024847


In [20]:
with open('../data/wikitext/sample_sents.pickle', 'rb') as f:
    sample_sents = pickle.load(f)

sample_sents = pd.DataFrame(sample_sents)
sample_sents

Unnamed: 0,0,1,2
0,100th,16919,"[[101, 2002, 2680, 2010, 1015, 1010, 3156, 270..."
1,10th,6049,"[[101, 2119, 2122, 4898, 3662, 1996, 3747, 199..."
2,11th,6252,"[[101, 2009, 2001, 1999, 1996, 103, 2301, 1010..."
3,12th,5940,"[[101, 2006, 2258, 2676, 1010, 2002, 2001, 258..."
4,13th,6122,"[[101, 2127, 1996, 103, 2301, 1010, 1996, 2887..."
...,...,...,...
9128,zone,4224,"[[101, 2006, 1996, 20198, 3483, 1005, 1055, 21..."
9129,zones,10019,"[[101, 6059, 3387, 2435, 1996, 3036, 6987, 315..."
9130,zoo,9201,"[[101, 2350, 14345, 2421, 1996, 17692, 2103, 1..."
9131,zoom,24095,"[[101, 27916, 1005, 7444, 2806, 2001, 15063, 1..."


In [87]:
tokenizer = AutoTokenizer.from_pretrained('google/multiberts-seed_0')
sample_sentences = sample_sents.sample(1)[2].tolist()[0]
sent_ids = random.choice(sample_sentences)
sent_ids

[101,
 1999,
 6356,
 2575,
 1010,
 2002,
 2333,
 2000,
 1996,
 3007,
 1999,
 2019,
 3535,
 2000,
 24501,
 3126,
 2890,
 6593,
 2010,
 2880,
 2476,
 1012,
 102,
 2002,
 2165,
 1996,
 2942,
 2326,
 11360,
 1037,
 2117,
 2051,
 2076,
 1996,
 2206,
 2095,
 1010,
 2021,
 2035,
 1996,
 5347,
 2020,
 3478,
 2011,
 1996,
 3539,
 2704,
 1006,
 4593,
 1999,
 2344,
 2000,
 4652,
 1996,
 103,
 1997,
 2825,
 9169,
 1007,
 1012,
 102]