In [1]:
import pandas as pd
import torch
import torch.optim as optim
torch.cuda.empty_cache()

torch.cuda.is_available()

True

# BERT next sentence

In [2]:
# https://huggingface.co/docs/transformers/training
# https://towardsdatascience.com/how-to-train-bert-aaad00533168
# https://medium.com/@prakashakshay/fine-tuning-bert-model-using-pytorch-f34148d58a37

In [3]:
# https://wandb.ai/cayush/bert-finetuning/reports/Sentence-classification-with-Huggingface-BERT-and-W&B--Vmlldzo4MDMwNA
# https://colab.research.google.com/drive/1SQ-FOgji8AiyrQ08sIVfDiA8OUw4bC12?usp=sharing

Load the data

In [4]:
df = pd.read_csv('poems/szymborska.csv', sep=';')

In [5]:
df.head()

Unnamed: 0,poem
0,Historia nierychliwa\nna trąbkach mi przygrywa...
1,"Jestem za blisko, żeby mu się śnić.\nNie fruwa..."
2,"Z trapezu na\nna trapez, w ciszy po\npo nagle ..."
3,Nikt w rodzinie nie umarł z miłości.\nCo tam b...
4,Jesteś piękne - mówią życiu -\nbujniej już nie...


Using HerBERT by allegro team

In [8]:
from transformers import BertTokenizer, BertForMaskedLM, BertModel, BertForPreTraining
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model = BertForPreTraining.from_pretrained("dkleczek/bert-base-polish-uncased-v1").to("cuda")
#tokenizer = BertTokenizer.from_pretrained("allegro/herbert-base-cased")

Create bag of sentences from poems

In [9]:
bag = []
for poem in df['poem'].values.tolist():
    for vers in poem.split('\n'):
        if vers.strip() != '':
            bag.append(vers)
        
bag_size = len(bag)

Create dataset with true or false predeceasing and following sentences 

In [10]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in df['poem'].values.tolist():
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [11]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')

Ignored unknown kwarg option direction


In [12]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

In [14]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [15]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [16]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [17]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [18]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [19]:
dataset = OurDataset(inputs)

In [20]:
loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=1)

In [21]:
from torch.optim import AdamW 
from transformers import get_linear_schedule_with_warmup

lr = 2e-5
adam_epsilon = 1e-8

epochs = 3

num_warmup_steps = 0
num_training_steps = len(loader)*epochs

device = torch.device('cuda') 

### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, eps=adam_epsilon)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

In [22]:
import gc

In [23]:
from tqdm import tqdm  # for our progress bar

epochs = 3

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:      
        torch.cuda.empty_cache()
        # initialize calculated gradients (from prev step)
        optimizer.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)

        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        del input_ids
        del token_type_ids
        del attention_mask
        del next_sentence_label
        del labels
        gc.collect()
        torch.cuda.empty_cache()
        
        # extract loss
        loss = outputs.loss
        
        del outputs
        gc.collect()
        torch.cuda.empty_cache()

        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optimizer.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 0: 100%|█████████████████████| 219/219 [01:08<00:00,  3.19it/s, loss=1.05]
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 1: 100%|█████████████████████| 219/219 [01:04<00:00,  3.38it/s, loss=1.05]
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 2: 100%|████████████████████| 219/219 [01:05<00:00,  3.34it/s, loss=0.182]


In [24]:
model.save_pretrained('models/bert-base-polish-uncased-v1-szymborska')

In [25]:
model.eval()

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(60000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine