In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [15]:
train_dataset_path='news_commentary_v15.en'
batch_size=32
num_epochs=2
model_save_path='finetuned_bert_on_mlm.pt'

In [16]:
class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx], self.labels[idx]

In [17]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Set up the optimizer and loss function
optimizer = optim.Adam(bert_model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss(ignore_index=-1)

# Load training dataset
with open(train_dataset_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()
input_ids = []
attention_masks = []
labels = []
for line in tqdm(lines):
    line = line.strip()
    encoded = tokenizer.encode_plus(line, add_special_tokens=True, max_length=512,
                                     pad_to_max_length=True, return_attention_mask=True)
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])
    labels.append(encoded['input_ids'])

train_dataset = CustomDataset(input_ids, attention_masks, labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/608912 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by

KeyboardInterrupt: 

In [None]:
bert_model.train()
for epoch in range(num_epochs):
    for batch in tqdm(train_dataloader):
        input_ids, attention_masks, labels = batch
        input_ids = input_ids
        attention_masks = attention_masks
        labels = labels
        optimizer.zero_grad()
        outputs = bert_model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = criterion(outputs.logits.view(-1, tokenizer.vocab_size), labels.view(-1))
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1} completed. Saving checkpoint...')
    torch.save(bert_model.state_dict(), model_save_path)

# Save the final finetuned model
torch.save(bert_model.state_dict(), model_save_path)