In [2]:
import pandas as pd
import torch
import torch.optim as optim
torch.cuda.empty_cache()

torch.cuda.is_available()

True

In [3]:
# https://huggingface.co/docs/transformers/training
# https://towardsdatascience.com/how-to-train-bert-aaad00533168
# https://medium.com/@prakashakshay/fine-tuning-bert-model-using-pytorch-f34148d58a37

In [4]:
df = pd.read_csv('poems/szymborska.csv', sep=';')

In [5]:
df.head()

Unnamed: 0,poem
0,Historia nierychliwa\nna trąbkach mi przygrywa...
1,"Jestem za blisko, żeby mu się śnić.\nNie fruwa..."
2,"Z trapezu na\nna trapez, w ciszy po\npo nagle ..."
3,Nikt w rodzinie nie umarł z miłości.\nCo tam b...
4,Jesteś piękne - mówią życiu -\nbujniej już nie...


In [6]:
from transformers import BertTokenizer, BertForMaskedLM, BertModel, BertForPreTraining
from transformers import AutoTokenizer

model = BertForPreTraining.from_pretrained("dkleczek/bert-base-polish-uncased-v1").to("cuda")
tokenizer = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-uncased-v1")

In [7]:
bag = []
for poem in df['poem'].values.tolist():
    for vers in poem.split('\n'):
        if vers.strip() != '':
            bag.append(vers)
        
bag_size = len(bag)

In [8]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in df['poem'].values.tolist():
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [9]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')

In [10]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [11]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

In [12]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [13]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [14]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [15]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [50]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [51]:
dataset = OurDataset(inputs)

In [52]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True, num_workers=4)

In [53]:
from torch.optim import AdamW 
from transformers import get_linear_schedule_with_warmup

lr = 2e-5
adam_epsilon = 1e-8

epochs = 3

num_warmup_steps = 0
num_training_steps = len(loader)*epochs

device = torch.device('cuda') 

### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, eps=adam_epsilon)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

In [56]:
from tqdm import tqdm, trange

## Store our loss and accuracy for plotting
train_loss_set = []
learning_rate = []

# Gradients gets accumulated by default
model.zero_grad()

# trange is a tqdm wrapper around the normal python range
for _ in trange(1,epochs+1,desc='Epoch'):
    print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
    # Calculate total loss for this epoch
    batch_loss = 0

    for step, batch in enumerate(loader):
        # Set our model to training mode (as opposed to evaluation mode)
        model.train()
        
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        
        loss = outputs[0]
        
        # Backward pass
        loss.backward()
        
        # Clip the norm of the gradients to 1.0
        # Gradient clipping is not in AdamW anymore
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        
        # Update learning rate schedule
        scheduler.step()
        
        # Clear the previous accumulated gradients
        optimizer.zero_grad()
        # Update tracking variables
        batch_loss += loss.item()
        
        # Calculate the average loss over the training data.
        avg_train_loss = batch_loss / len(train_dataloader)

    #store the current learning rate
    for param_group in optimizer.param_groups:
        print("\n\tCurrent Learning rate: ",param_group['lr'])
        learning_rate.append(param_group['lr'])
        
    train_loss_set.append(avg_train_loss)
    print(F'\n\tAverage Training loss: {avg_train_loss}')

    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Tracking variables 
    eval_accuracy,eval_mcc_accuracy,nb_eval_steps = 0, 0, 0

Epoch:   0%|                                              | 0/3 [00:00<?, ?it/s]



  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch:   0%|                                              | 0/3 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 5.81 GiB total capacity; 3.94 GiB already allocated; 20.12 MiB free; 3.99 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [49]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:      
        # initialize calculated gradients (from prev step)
        optimizer.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optimizer.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return [torch.tensor(val[idx]) for key, val in self.encodings.items()]
  return [torch.tensor(val[idx]) for key, val in self.encodings.items()]
  return [torch.tensor(val[idx]) for key, val in self.encodings.items()]
  0%|                                                    | 0/14 [00:00<?, ?it/s]


TypeError: list indices must be integers or slices, not str