Necessary installations and imports.

In [None]:
!pip install transformers==3

import torch
from transformers import BertTokenizer
from transformers import AutoModel
import pandas as pd
import numpy as np
from sklearn.metrics import *
import torch.nn as nn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,RegexpTokenizer

# Download the NLTK resources used by the cleaning step: the 'stopwords' corpus
# (read through stopwords.words) and the 'punkt' tokenizer data (presumably what
# word_tokenize relies on; note it is tokenizer data, not a punctuation list).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Loading the datasets and cleaning them in the same way I did for the previous 2 projects.

In [None]:
# Load only the columns we need from the train and validation CSV files.
fields = ['tweet', 'label']
trainset = pd.read_csv("/content/data/vaccine_train_set.csv", usecols=fields)
testset = pd.read_csv("/content/data/vaccine_validation_set.csv", usecols=fields)

# Raw tweets (inputs) and their labels (targets) for each split.
X_train, Y_train = trainset.tweet, trainset.label
X_test, Y_test = testset.tweet, testset.label

#data cleaning
def clean_data(dataset):
    """Remove stop words, stray symbols and links from every text in `dataset`.

    Each row is split with `word_tokenize`; a token is dropped when it is an
    English stop word or one of the extra symbols below, when it contains
    "http", or when it contains "/" (link fragments).

    Args:
        dataset: iterable of strings (e.g. a pandas Series of tweets).

    Returns:
        list[str]: one cleaned string per row. Every kept token is prefixed
        with a single space, so non-empty results start with a space (the
        format callers already received before this revision).
    """
    stop_words = set(stopwords.words('english'))
    # Symbols the tokenizer emits as standalone tokens and that we do not need.
    stop_words.update({":", "@", "#", ",", ".", "''"})

    new_dataset = []
    for row in dataset:
      kept = []
      for w in word_tokenize(row):
        # The stop-word list is lowercase, so compare on the lowercased token;
        # otherwise capitalised stop words ("The", "And", ...) slip through.
        lowered = w.lower()
        if lowered in stop_words or "http" in lowered or "/" in w:
          continue
        kept.append(w)
      new_dataset.append("".join(" " + w for w in kept))
    return new_dataset
# Clean both splits with the same routine.
X_train, X_test = clean_data(X_train), clean_data(X_test)

# Short aliases for the training inputs/labels used by the tokenization cell.
x, y = X_train, Y_train

# Pre-trained uncased BERT encoder and its matching WordPiece tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
pretrained_bert = AutoModel.from_pretrained('bert-base-uncased')

Splitting the datasets into tokens and encoding them so they are ready to be fed into our BERT model later. We also wrap the inputs, the attention masks and the labels together so they stay aligned when the data is shuffled and split into batches later.

In [None]:
seq_len = 25

# Shared encoding settings: every sequence is padded/truncated to exactly
# `seq_len` tokens. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True` and belongs to the same API as `truncation=True`.
encode_kwargs = dict(
    max_length = seq_len,
    padding = 'max_length',
    truncation = True
)

# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(x, **encode_kwargs)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(X_test, **encode_kwargs)

# Training tensors, bundled in one dataset so ids, masks and labels stay
# aligned when the DataLoader shuffles them.
train_mask = torch.tensor(tokens_train['attention_mask'])
train_seq = torch.tensor(tokens_train['input_ids'])
train_y = torch.tensor(y.tolist())
dataset = torch.utils.data.TensorDataset(train_seq, train_mask, train_y)

# Test tensors are kept separate; they are evaluated in a single pass.
test_mask = torch.tensor(tokens_test['attention_mask'])
test_seq = torch.tensor(tokens_test['input_ids'])
test_y = torch.tensor(Y_test.tolist())

The first function computes the model's scores with the scikit-learn metric functions and prints them, both overall and for each class individually. The second function computes the loss and the F1 score and returns them, so they can be used later to determine which epoch produced the best fine-tuned model.

In [None]:
def print_scores(ts, mask, y_testset, my_model,LossFunc):
  """Evaluate `my_model` on a labelled set and print overall and per-class scores.

  Args:
    ts: tensor of input token ids.
    mask: attention-mask tensor matching `ts`.
    y_testset: tensor of true labels (0, 1 or 2).
    my_model: model called as `my_model(ids, mask)`, returning one row of
      3 class scores per sample.
    LossFunc: unused; kept so existing call sites keep working.

  Returns:
    None. Results are printed only.

  Note:
    Relies on the notebook-level `device` and runs the whole set in a single
    forward pass.
  """
  my_model.eval()
  with torch.no_grad():
    y_pred_t = my_model(ts.to(device), mask.to(device))

  # Predicted label = index of the largest of the 3 outputs.
  y_final = torch.argmax(y_pred_t, dim=1).cpu().tolist()
  y_true = y_testset.cpu().tolist()
  # Fixed label set so the per-class scores always have three entries,
  # even when a class is missing from the labels or the predictions.
  all_labels = [0, 1, 2]

  #calculate whole scores (each metric now prints its own value)
  score = accuracy_score(y_true, y_final)
  print("Testset accuracy score: ", score)

  f1 = f1_score(y_true, y_final, average='weighted')
  print("Testset f1 score: ", f1)

  rec = recall_score(y_true, y_final, average='weighted')
  print("Testset recall score: ", rec)

  prec = precision_score(y_true, y_final, average='weighted',zero_division=0)
  print("Testset precision score: ", prec)

  #calculate individual scores
  print("-----------------------------------------------------")
  zero, one, two = f1_score(y_true, y_final, labels=all_labels, average=None)
  print("Label 0 f1 score: ", zero)
  print("Label 1 f1 score: ", one)
  print("Label 2 f1 score: ", two)
  print("\n")

  zero, one, two = recall_score(y_true, y_final, labels=all_labels, average=None)
  print("Label 0 recall score: ", zero)
  print("Label 1 recall score: ", one)
  print("Label 2 recall score: ", two)
  print("\n")

  zero, one, two = precision_score(y_true, y_final, labels=all_labels, average=None,zero_division=0)
  print("Label 0 precision score: ", zero)
  print("Label 1 precision score: ", one)
  print("Label 2 precision score: ", two)
  return

#This function is made to calculate the loss and score of the model in the testset so it prints that info in each epoch to watch the progress
def calc_loss_and_score(data, mask, labels, my_model,LossFunc):
    """Compute the loss and the weighted F1 score of `my_model` on a labelled set.

    Args:
      data: tensor of input token ids.
      mask: attention-mask tensor matching `data`.
      labels: tensor of true labels (0, 1 or 2).
      my_model: model called as `my_model(ids, mask)`, returning one row of
        3 class scores per sample.
      LossFunc: callable `LossFunc(output, labels)` returning the loss.

    Returns:
      (loss, f1): the loss exactly as returned by `LossFunc`, and the
      weighted F1 score as computed by scikit-learn.

    Note:
      Relies on the notebook-level `device`; leaves the model in eval mode.
    """
    my_model.eval()
    with torch.no_grad():
      labels = labels.to(device)

      # Get predictions and calculate loss
      output = my_model(data.to(device), mask.to(device))
      loss = LossFunc(output, labels)

      # Predicted label = index of the largest of the 3 outputs.
      y_final = torch.argmax(output, dim=1).cpu().tolist()

    #calculate score: scikit-learn expects (y_true, y_pred); with
    #average='weighted' the classes are weighted by their support in y_true,
    #so the argument order changes the result.
    y_true = labels.cpu().tolist()
    f1 = f1_score(y_true, y_final, average='weighted')

    return loss, f1

This is the BERT model with the hyperparameters suggested in the official BERT paper: https://aclanthology.org/N19-1423.pdf. Those are batch size: 32, LR: 5e-5, number of epochs: 3, CrossEntropyLoss, Adam optimizer, sequence length: 25. After experimenting I concluded that the best gradient clipping value is 4.

In [None]:
class MyBERT(nn.Module):
    """BERT encoder followed by a small two-layer classification head.

    The head maps the encoder's 768-dimensional pooled output to 3 class
    scores: Linear(768, 383) -> PReLU -> Linear(383, 3).
    """

    def __init__(self, BERT):
        """
        Args:
            BERT: encoder module called as `BERT(ids, attention_mask=mask)`;
                element 1 of its output is used as the pooled representation.
        """
        super(MyBERT, self).__init__()
        self.BERT = BERT

        #input layer
        self.l1 = nn.Linear(768,383)
        # output layer
        self.l2 = nn.Linear(383,3)

        #Activation function and softmax for the output
        self.af = nn.PReLU()
        # Not applied in forward(): the head returns raw scores (the notebook
        # trains with nn.CrossEntropyLoss, which takes unnormalised scores).
        # Kept so code that accesses `model.softmax` keeps working.
        self.softmax = nn.Softmax(dim=1)

    #forward pass
    def forward(self, data, mask):
        """Return the raw (pre-softmax) class scores, one row of 3 per sample.

        Args:
            data: tensor of input token ids.
            mask: attention-mask tensor matching `data`.
        """
        encoder_out = self.BERT(data, attention_mask=mask)
        # Index instead of tuple-unpacking: this works when the encoder returns
        # a plain tuple and also when it returns a dict-like output object
        # (unpacking such an object would yield its keys, not its tensors).
        pooled = encoder_out[1]

        hidden = self.af(self.l1(pooled))
        return self.l2(hidden)


#--------------HYPERPARAMETERS---------------
LR = 5e-5
grad_clip = 4
batch_sz = 32
N_EPOCHS = 3
#-------------------------------------------------

# Use the GPU when one is available, otherwise fall back to the CPU,
# and report which one was picked.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device.type)

# Wrap the pre-trained BERT in our architecture and move it to the device.
model = MyBERT(pretrained_bert).to(device)

# Adam over all parameters (encoder and head) with cross-entropy loss.
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_func = nn.CrossEntropyLoss()

# Shuffled mini-batches over the (ids, mask, label) training dataset.
dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=batch_sz)

cuda


This is the fine-tuning cell. We unpack each batch into inputs, masks and labels and feed those to our model. Gradient clipping is used against the exploding-gradients problem, as we saw with RNNs. Finally, after each epoch, if the model achieved a lower loss than in every previous epoch, we save its current state to be used in testing.

In [None]:
# Fine-tuning loop: train for N_EPOCHS, evaluate after every epoch, and keep
# the weights of the epoch with the lowest evaluation loss in the file 'min_loss'.
# NOTE(review): the checkpoint is selected on the same set that is later used
# for the final scores, so those scores are somewhat optimistic.
losses = []            # loss of every training batch, across all epochs
min_loss = float('inf')
torch.cuda.empty_cache()
for epoch_i in range(N_EPOCHS):
    # Ensure net in training mode (the evaluation helper switches it to eval)
    model.train()
    batch_losses = []
    for batch in dataloader:
      data, mask, y_batch = [row.to(device) for row in batch]
      # The loss needs integer class labels; cast in place on the device
      # instead of bouncing through a CPU LongTensor.
      y_batch = y_batch.long()

      # Zero out gradients left over from the previous step
      optimizer.zero_grad()

      # Get model output and calculate loss
      output = model(data, mask)
      loss = loss_func(output, y_batch)

      # Generate gradients via autodiff
      loss.backward()

      # Clip every gradient element to [-grad_clip, grad_clip] IN PLACE.
      # (torch.clamp returns a new tensor; assigning it to a local variable
      # leaves the real gradients untouched.)
      nn.utils.clip_grad_value_(model.parameters(), grad_clip)

      # Update the weights
      optimizer.step()

      # Track loss
      batch_losses.append(loss.item())
      losses.append(loss.item())

    #Calculating score and loss in each epoch just to gather more info about the progress
    temp_loss, temp_score = calc_loss_and_score(test_seq, test_mask, test_y, model, loss_func)
    # Keep a plain float so the best loss does not hold on to a tensor.
    temp_loss = float(temp_loss)

    #If the current state of BERT is the best so far we save it in order to roll back to it later
    if(temp_loss < min_loss):
      min_loss = temp_loss
      torch.save(model.state_dict(), 'min_loss')

    str_ = f'\rEpoch {epoch_i+1}/{N_EPOCHS} -- Train Loss: {sum(batch_losses)/len(dataloader):.5f} -- Testset f1 Score: {temp_score}'
    print(str_)

Epoch 1/3 -- Train Loss: 0.20849 -- Testset f1 Score: 0.7351637851023628
Epoch 2/3 -- Train Loss: 0.11687 -- Testset f1 Score: 0.731839630028322
Epoch 3/3 -- Train Loss: 0.09165 -- Testset f1 Score: 0.7281005042385814


Loading the best model from all the epochs and printing the scores. The first scores are the model's overall scores and below are the scores for each class individually.

In [None]:
  # Restore the weights saved by the training loop (the epoch with the lowest
  # evaluation loss) and print the final scores.
  path = 'min_loss'
  # map_location lets a checkpoint saved on the GPU load on whatever `device`
  # the current session is using (e.g. a CPU-only runtime).
  model.load_state_dict(torch.load(path, map_location=device))
  print("BERT Model scores:")
  print_scores(test_seq, test_mask, test_y, model, loss_func)

BERT Model scores:
Testset accuracy score:  0.7357581069237511
Testset f1 score:  0.7357581069237511
Testset recall score:  0.7357581069237511
Testset accuracy score:  0.7357581069237511
-----------------------------------------------------
Label 0 f1 score:  0.7940038684719536
Label 1 f1 score:  0.5426621160409556
Label 2 f1 score:  0.7319371727748691


Label 0 recall score:  0.7708920187793428
Label 1 recall score:  0.5371621621621622
Label 2 recall score:  0.758957654723127


Label 0 precision score:  0.8185443668993021
Label 1 precision score:  0.5482758620689655
Label 2 precision score:  0.7067745197168858
