In [1]:
import sys
import random
from collections import Counter

from transformers import BertForMaskedLM, BertConfig, BertTokenizer
import torch
import torch.optim as optim

In [2]:
dataset_path = "/Users/Gianni/semiolog/models/en_bnc_berttest/chains_BNC_101-150_1078011_voc30K.txt"

with open(dataset_path, "r") as f:
    lines = [l[:-1].split(" ") for l in f.readlines()[:10]]


In [4]:
freq_counter = Counter()
for l in lines:
    freq_counter.update(l)
id_to_token_and_freq = dict(enumerate(freq_counter.most_common()))
id_to_token_and_freq[len(id_to_token_and_freq)] = ("<mask>", 0)
id_to_token_and_freq[len(id_to_token_and_freq)] = ("<pad>", 0)
id_to_token_and_freq[len(id_to_token_and_freq)] = ("<unk>", 0)
id_to_token_and_freq[len(id_to_token_and_freq)] = ("<s>", 0)
id_to_token_and_freq[len(id_to_token_and_freq)] = ("</s>", 0)

token_to_id = {token[0]:id for id, token in id_to_token_and_freq.items()}


In [5]:
# turn dataset into torch Tensor
data = []
max_len = max([len(line) for line in lines])
for line in lines:
    line_ids = [token_to_id[token] for token in line]  # have to do case for unknowns
    data.append(line_ids)
    #data.append(torch.tensor(line_ids + [token_to_id["<pad>"]] * (max_len - len(line_ids)))

if torch.cuda.is_available():
    device = "gpu"
    print("Using gpu")
else:
    device = "cpu"

model = BertForMaskedLM(BertConfig(vocab_size=len(token_to_id))).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

EPOCHS = 20
BATCH_SIZE = 8


In [6]:

def prep_input(input_lines, mask=True):
    input_ids = []
    label_ids = []
    att_masks = []
    for line in input_lines:
        pad_len = max_len - len(line)
        masked_pos = random.randint(0, len(line) - 1)
        inp_line = line[:]
        if mask:
            inp_line[masked_pos] = token_to_id["<mask>"]
        input_ids.append(inp_line + [token_to_id["<pad>"] for _ in range(pad_len)])

        # This line can't be simply uncommented but need to be adapted a bit
        # labels = torch.tensor([input_line + [token_to_id["<pad>"] for _ in range(pad_len)]])
        # TODO: Decide whether to ignore other parts
        if mask:
            labels = [-100 for _ in range(max_len)]
            labels[masked_pos] = line[masked_pos]
        else:
            pass
        labels = line[:] + [token_to_id["<pad>"] for _ in range(pad_len)]
        label_ids.append(labels)
        att_mask = [1 for _ in range(len(line))] + [0 for _ in range(pad_len)]
        att_masks.append(att_mask)
    input_tensor = torch.tensor(input_ids).to(device)
    labels = torch.tensor(label_ids).to(device)
    token_type_ids = torch.zeros_like(labels).to(device)
    attention_mask = torch.tensor(att_masks).to(device)  # set the padding ids to 0
    return input_tensor, labels, token_type_ids, attention_mask

In [7]:
test_sentence = [data[0]]
input_tensor, labels, token_type_ids, attention_mask = prep_input(test_sentence, mask=False)
outputs = model(input_ids=input_tensor,
                labels=labels,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask)
print(outputs)

MaskedLMOutput(loss=tensor(5.2992, grad_fn=<NllLossBackward>), logits=tensor([[[ 0.0000,  0.0078,  0.0547,  ..., -0.1755, -0.4897,  0.4224],
         [ 0.0000, -0.5832,  0.5144,  ..., -0.2148, -0.5365,  0.0297],
         [ 0.0000,  0.2059,  0.4069,  ..., -0.3505, -1.2915, -0.4742],
         ...,
         [ 0.0000,  0.0685, -0.1134,  ..., -0.5047, -0.8725,  0.6245],
         [ 0.0000,  0.2854, -0.2177,  ...,  0.1600, -0.6393, -0.1853],
         [ 0.0000, -0.0200,  0.6925,  ..., -1.4898, -0.1954,  0.1009]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)


In [8]:
for epoch in range(EPOCHS):
    print("Training epoch:", epoch)
    epoch_loss = 0
    for idx in range(0, len(data), BATCH_SIZE):
        input_lines = data[idx:idx+BATCH_SIZE]
        input_tensor, labels, token_type_ids, attention_mask = prep_input(input_lines)
        outputs = model(input_ids=input_tensor,
                        labels=labels,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask)
        epoch_loss += outputs.loss
        outputs.loss.backward()
        optimizer.step()
    print(epoch_loss)

Training epoch: 0
tensor(10.2336, grad_fn=<AddBackward0>)
Training epoch: 1
tensor(9.5558, grad_fn=<AddBackward0>)
Training epoch: 2
tensor(8.8982, grad_fn=<AddBackward0>)
Training epoch: 3
tensor(8.5236, grad_fn=<AddBackward0>)
Training epoch: 4
tensor(7.0252, grad_fn=<AddBackward0>)
Training epoch: 5
tensor(8.6173, grad_fn=<AddBackward0>)
Training epoch: 6
tensor(8.7849, grad_fn=<AddBackward0>)
Training epoch: 7
tensor(8.5119, grad_fn=<AddBackward0>)
Training epoch: 8
tensor(9.7094, grad_fn=<AddBackward0>)
Training epoch: 9
tensor(10.5023, grad_fn=<AddBackward0>)
Training epoch: 10
tensor(8.4807, grad_fn=<AddBackward0>)
Training epoch: 11
tensor(10.0132, grad_fn=<AddBackward0>)
Training epoch: 12
tensor(9.5135, grad_fn=<AddBackward0>)
Training epoch: 13
tensor(9.0205, grad_fn=<AddBackward0>)
Training epoch: 14
tensor(8.9648, grad_fn=<AddBackward0>)
Training epoch: 15
tensor(9.2731, grad_fn=<AddBackward0>)
Training epoch: 16
tensor(9.3327, grad_fn=<AddBackward0>)
Training epoch: 17
te

In [43]:

input_tensor, labels, token_type_ids, attention_mask = prep_input(test_sentence, mask=True)
outputs = model(input_ids=input_tensor,
                    labels=labels,
                    token_type_ids=token_type_ids,
                    attention_mask=attention_mask)

In [44]:
out_tensor = outputs[1]

In [45]:
len(out_tensor[0])

21

In [46]:
[[id_to_token_and_freq[k][0] for v,k in sorted([(v,i) for i,v in enumerate(out_tensor[0,i])], reverse=True)[:10]] for i in range(len(out_tensor[0]))]

[['<pad>',
  'youare',
  'and',
  'askedto',
  'to',
  'an',
  'acet',
  'asked',
  'adequately',
  'theycan'],
 ['<pad>',
  'youare',
  'and',
  'to',
  'theycan',
  'an',
  'askedto',
  'acet',
  'ensure',
  'asked'],
 ['<pad>',
  'youare',
  'and',
  'to',
  'an',
  'ensure',
  'askedto',
  'asked',
  'theycan',
  'acet'],
 ['<pad>',
  'youare',
  'and',
  'to',
  'acet',
  'askedto',
  'an',
  'theycan',
  'asked',
  'one'],
 ['<pad>',
  'youare',
  'and',
  'askedto',
  'theycan',
  'an',
  'to',
  'acet',
  'asked',
  'ensure'],
 ['<pad>',
  'youare',
  'and',
  'to',
  'an',
  'theycan',
  'asked',
  'askedto',
  'ensure',
  'acet'],
 ['<pad>',
  'youare',
  'and',
  'to',
  'acet',
  'theycan',
  'adequately',
  'askedto',
  'an',
  'one'],
 ['<pad>',
  'youare',
  'and',
  'to',
  'acet',
  'askedto',
  'theycan',
  'an',
  'asked',
  'adequately'],
 ['<pad>',
  'youare',
  'and',
  'acet',
  'to',
  'askedto',
  'an',
  'ensure',
  'theycan',
  'your'],
 ['<pad>',
  'youare',

In [25]:
[id_to_token_and_freq[id][0] for id in test_sentence[0]]

['aids',
 'acquired',
 'immun',
 'ede',
 'ficiency',
 's',
 'yndrome',
 'is',
 'a',
 'condition',
 'causedby',
 'a',
 'virus',
 'called',
 'hiv',
 'human',
 'immuno',
 'de',
 'ficiency',
 'virus']

In [22]:
id_to_token_and_freq[0]

('and', 6)

In [23]:
test_sentence

[[2, 15, 16, 17, 3, 18, 19, 20, 4, 5, 21, 4, 6, 22, 23, 24, 25, 26, 3, 6]]

In [2]:
from transformers import BertTokenizer, TFBertForMaskedLM
import tensorflow as tf

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForMaskedLM.from_pretrained('bert-base-cased')

inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]

outputs = model(inputs)
loss = outputs.loss
logits = outputs.logits

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
