# BERT Encoder/Decoder Classifier
This notebook contains the code to train an encoder/decoder model for the Inbox Guardian classification task.  We use DistilBERT as our encoder and two linear layers as our decoder.  We use a dataset of the 500 most recent email chains we received.\
Note: This code requires a GPU to run in a reasonable amount of time.

In [None]:
import pandas as pd
import transformers
import numpy as np
import torch
from torch import tensor
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
import sys
from matplotlib import pyplot as plt

In [None]:
RANDOM_STATE = 42 # random seed to ensure results are reproducible
META = False # True if we only want to show the model senders and subjects, false if we want to pass in email body as well
THREE_CLASSES = False # True for trinary classification task, false for binary

# seed PyTorch as well: the decoder weight initialisation and dropout draw from torch's RNG,
# which the random_state arguments passed to pandas/sklearn do not control
torch.manual_seed(RANDOM_STATE)

In [None]:
# Read the dataset and shuffle its rows in one pass; the csv was ordered by label during construction
df = (
  pd.read_csv('fullDataset.csv')
  .sample(frac=1, random_state=RANDOM_STATE)
  .reset_index(drop=True)
)
df.head()

In [None]:
# sanity check: number of rows loaded from the csv
print(len(df))

In [None]:
# histogram of how many documents carry each label
labelAxes = df['Label'].plot.hist(bins=20, title='Label')
# hide the top and right borders of the plot
labelAxes.spines[['top', 'right']].set_visible(False)

In [None]:
# load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint (the same checkpoint the encoder is loaded from)
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased') # load the tokenizer

In [None]:
# Maps the "urgent" label (2) onto the "relevant" label (1) and leaves every other label untouched
# This collapses the trinary classification task into a relevant/irrelevant binary classification task
def changeLabels(x):
  return 1 if x == 2 else x

In [None]:
# pick the text column based on the META flag: "Meta" when META is set, otherwise "Full"
docs = df["Meta" if META else "Full"].tolist()

# keep the labels as-is for the trinary task, otherwise fold label 2 into label 1
labelColumn = df["Label"]
if not THREE_CLASSES:
  labelColumn = labelColumn.apply(changeLabels)
labels = labelColumn.tolist()

# both counts should match
print(len(docs))
print(len(labels))

In [None]:
# first hold out 30% of the data, then cut that held-out part in half:
# 70% train, 15% validation, 15% test (train_test_split shuffles before splitting)
train, validAndTest, trainLabels, validAndTestLabels = train_test_split(
  docs,
  labels,
  test_size=0.3,
  random_state=RANDOM_STATE,
)
valid, test, validLabels, testLabels = train_test_split(
  validAndTest,
  validAndTestLabels,
  test_size=0.5,
  random_state=RANDOM_STATE,
)

In [None]:
# tokenize the emails - turns them into a format the BERT model can understand
# every split uses the same settings: pad/truncate to 512 tokens and return PyTorch tensors
tokenizerArgs = dict(padding='max_length', max_length=512, truncation=True, return_tensors='pt', return_attention_mask=True)
trainTokenized = tokenizer(train, **tokenizerArgs)
validTokenized = tokenizer(valid, **tokenizerArgs)
testTokenized = tokenizer(test, **tokenizerArgs)

# keep the attention masks next to the token ids so our model ignores padding tokens properly
trainTokens, trainMask = trainTokenized["input_ids"], trainTokenized["attention_mask"]
print(trainTokens[0].size())
print(trainMask[0].size())

validTokens, validMask = validTokenized["input_ids"], validTokenized["attention_mask"]

testTokens, testMask = testTokenized["input_ids"], testTokenized["attention_mask"]

In [None]:
# print the document count and label count of each split, with a blank line between splits
splitPairs = [(train, trainLabels), (valid, validLabels), (test, testLabels)]
for position, (splitDocs, splitLabels) in enumerate(splitPairs):
  if position > 0:
    print()
  print(len(splitDocs))
  print(len(splitLabels))

In [None]:
# wraps one split (train, validation, or test) so a DataLoader can index it
# the DataLoader built on top of this class is what groups the items into minibatches
class emailDataset(Dataset):
  """Index-aligned container of documents, labels, and attention masks."""

  def __init__(self, data, labels, mask):
    # the three sequences are expected to be aligned by index
    self.data, self.labels, self.mask = data, labels, mask

  def __len__(self):
    # the number of labels defines the dataset size
    return len(self.labels)

  def __getitem__(self, idx):
    # one item is the (document, label, mask) triple at position idx
    document = self.data[idx]
    label = self.labels[idx]
    attention = self.mask[idx]
    return document, label, attention

In [None]:
# wrap each split's token ids, labels, and attention masks in a dataset object
trainData = emailDataset(data=trainTokens, labels=trainLabels, mask=trainMask)
validData = emailDataset(data=validTokens, labels=validLabels, mask=validMask)
testData = emailDataset(data=testTokens, labels=testLabels, mask=testMask)

In [None]:
# dataloader object for the test set - yields one document per batch
testLoader = DataLoader(dataset=testData, batch_size=1)

In [None]:
# class that defines model architecture for us
class BertEncoderDecoder(nn.Module):
    """
    Encoder passed in as `dbert`, followed by a small feed-forward classification head.

    Args:
      DROPOUT: dropout probability applied to the encoder output before the head
      dbert: encoder module; called with `input_ids`/`attention_mask` keywords and
        expected to return a mapping with a "last_hidden_state" entry whose last
        dimension is 768
      num_classes: number of output classes. When omitted, falls back to the
        notebook-level THREE_CLASSES flag (3 classes if set, otherwise 2), which
        keeps existing two-argument calls working unchanged.
    """
    def __init__(self, DROPOUT, dbert, num_classes=None):
        super(BertEncoderDecoder, self).__init__()
        # resolve the head size once; the global flag is only consulted when the caller did not choose
        if num_classes is None:
            num_classes = 3 if THREE_CLASSES else 2
        # BERT encoder
        self.dbert = dbert
        # Decoder (attribute names are kept stable so saved state_dicts keep loading)
        self.dropout = nn.Dropout(p=DROPOUT)
        self.linear1 = nn.Linear(768,64)
        self.ReLu = nn.ReLU()
        # classification head: one logit per class
        self.linear2 = nn.Linear(64,num_classes)

    def forward(self, tokens, mask):
        """Return unnormalised class logits, one row per document in the batch."""
        x = self.dbert(input_ids=tokens, attention_mask=mask)
        # take the vector at sequence position 0 of the final layer as the context vector for the whole document
        x = x["last_hidden_state"][:,0,:]
        x = self.dropout(x) # dropout on BERT output, prevents overfitting
        x = self.linear1(x)
        x = self.ReLu(x)
        logits = self.linear2(x)
        return logits

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU - THIS CODE WORKS BEST ON A GPU
# (in Colab the runtime type has to be changed to get a GPU)
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
print(f"Using {device} device")

In [None]:
# hyperparameter grid - the grid search tries every combination of the list-valued settings below

# dropout probability
DROPOUT = [0.25, 0.2, 0.15]
# dropout probability for the attention equation (a single fixed value, not searched over)
ATTN_DROPOUT = 0.2
# number of epochs to train for
EPOCHS = [4, 8, 16]
# learning rate
LEARNING_RATE = [0.002, 0.001, 0.005, 0.0001, 0.0005]
# batch size
BATCH_SIZE = [8, 4, 16]

# softmax and loss for classification layer
criterion = nn.CrossEntropyLoss()

In [None]:
def setup(batchSize, dropout, atnDropout, lr):
    """
    This function returns the necessary objects we need to train a model with a given set of hyperparameters

    Args:
      batchSize: The batch size to use when training
      dropout: The dropout probability to use when training
      atnDropout: The attention dropout probability to use when training
      lr: The learning rate to use when training
    Returns:
      trainLoader: A dataloader object for the train dataset constructed using the given batch size
      validLoader: A dataloader object for the validation dataset constructed using the given batch size
      classifier: A model object constructed using the given hyperparameters, with the DistilBERT encoder frozen
      optimizer: An Adam optimizer object over the trainable (decoder) parameters only
    """

    # build the dataloader objects for train, validation sets
    # NOTE(review): the train loader yields batches in the same order every epoch; consider shuffle=True
    trainLoader = DataLoader(trainData, batch_size=batchSize)
    validLoader = DataLoader(validData, batch_size=batchSize)

    # build DistilBERT object
    dbert = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased', dropout=dropout, attention_dropout=atnDropout)

    # freeze DistilBERT parameters to avoid overfitting
    # this happens before the optimizer is built so the optimizer never has to track the encoder weights
    for param in dbert.parameters():
        param.requires_grad = False

    # build model object and send it to the GPU
    classifier = BertEncoderDecoder(dropout, dbert).to(device)

    # build the optimizer object over the parameters that are actually trained
    trainableParams = [param for param in classifier.parameters() if param.requires_grad]
    optimizer = torch.optim.Adam(trainableParams, lr = lr)

    return trainLoader, validLoader, classifier, optimizer

def train(trainLoader):
    """
    Runs one pass over the training set, updating the model after every minibatch

    Uses the notebook-level classifier, optimizer, criterion, and device objects

    Args:
      trainLoader: A dataloader object for the training set
    Returns:
      The macro-averaged f1 of the predictions made during this epoch
    """

    epochPredictions, epochTruth = [], []

    classifier.train() # activate dropout

    for batchTokens, batchLabels, batchMask in tqdm(trainLoader):
        # move the minibatch onto the training device
        batchTokens = batchTokens.to(device)
        batchLabels = batchLabels.to(device)
        batchMask = batchMask.to(device)

        # forward pass and loss
        logits = classifier(batchTokens, batchMask)
        batchLoss = criterion(logits, batchLabels)

        # backward pass, parameter update, then clear the gradients for the next minibatch
        batchLoss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # record the predicted class of every document next to its true label
        epochPredictions.extend(logits.argmax(axis=1).tolist())
        epochTruth.extend(batchLabels.tolist())

    return f1_score(epochTruth, epochPredictions, average="macro")


def evaluate(validLoader):
  """
  Scores the current model on the validation set - we use this to compare sets of hyperparameters

  Uses the notebook-level classifier and device objects

  Args:
    validLoader: A dataloader object for the validation set
  Returns:
    The macro-averaged f1 of the model on the validation set
  """
  scoredPredictions, scoredTruth = [], []

  classifier.eval() # turn off dropout for evaluation
  with torch.no_grad(): # no gradients are needed, nothing is trained here
    for batchTokens, batchLabels, batchMask in validLoader:
      # move the minibatch onto the evaluation device
      batchTokens = batchTokens.to(device)
      batchLabels = batchLabels.to(device)
      batchMask = batchMask.to(device)

      # predicted class = index of the largest logit
      logits = classifier(batchTokens, batchMask)
      scoredPredictions.extend(logits.argmax(axis=1).tolist())
      scoredTruth.extend(batchLabels.tolist())

  return f1_score(scoredTruth, scoredPredictions, average="macro")

In [None]:
# this cell performs a grid search across all combinations of the hyperparameter values we define

# best f1 and best hyperparameter values we have seen so far
# bestf1 starts below any reachable f1 so the first configuration is always recorded;
# starting at 0 with a strict ">" would leave bestParams at [0, 0, 0, 0] (batch size 0, zero epochs)
# if every configuration scored a validation f1 of exactly 0
bestf1 = float("-inf")
bestParams = [0, 0, 0, 0]

for dropout in DROPOUT:
  for epoch in EPOCHS:
    for lr in LEARNING_RATE:
      for batch in BATCH_SIZE:
        print("Dropout probability: " + str(dropout))
        print("Epochs: " + str(epoch))
        print("Learning rate: " + str(lr))
        print("Batch size: " + str(batch))

        # fresh model, optimizer, and loaders for this combination
        # train() and evaluate() read classifier/optimizer from the notebook namespace, so these names must stay as they are
        trainLoader, validLoader, classifier, optimizer = setup(batch, dropout, ATTN_DROPOUT, lr)

        # training loop
        for e in range(epoch):
          trainf1 = train(trainLoader)
          print(f'Epoch {e+1} Macro-Averaged F1: {trainf1}')

        # score the fully trained model on the validation set
        evalf1 = evaluate(validLoader)
        print(f'Validation Macro-Averaged F1: {evalf1}')

        # update if we see a better f1 (ties keep the earlier configuration)
        if evalf1 > bestf1:
          bestParams = [dropout, epoch, lr, batch]
          bestf1 = evalf1
        print("----------------------")

print(f'Best f1: {bestf1}')
print(f'Best dropout: {bestParams[0]}')
print(f'Best num epochs: {bestParams[1]}')
print(f'Best learning rate: {bestParams[2]}')
print(f'Best batch size: {bestParams[3]}')


In [None]:
# per-epoch training record for the best performing model - will be useful to generate graphs
# every key starts out with its own empty list
history = {
  metric: []
  for metric in ("epoch", "train_loss", "valid_loss", "train_accuracy", "valid_accuracy")
}

In [None]:
# this cell trains a model using the best performing hyperparameters we found
# we keep track of more information in each epoch and evaluate on the validation set at the end of each epoch

trainLoader, validLoader, classifier, optimizer = setup(bestParams[3], bestParams[0], ATTN_DROPOUT, bestParams[2])

# the model above is rebuilt from scratch, so drop any record left over from a previous run of this cell
for recorded in history.values():
  recorded.clear()

for e in range(bestParams[1]):
  classifier.train() # activated dropout

  train_loss = 0.0
  train_correct = 0 # number of training documents classified correctly this epoch
  train_seen = 0 # number of training documents processed this epoch

  # loop over each minibatch
  for description, labels, mask in tqdm(trainLoader):

    # send minibatch to gpu for efficient training
    description = description.to(device)
    labels = labels.to(device)
    mask = mask.to(device)

    # Get prediction & loss
    prediction = classifier(description, mask)
    loss = criterion(prediction, labels)

    train_loss += loss.item()

    # determine the optimal direction to increment parameters
    loss.backward()

    # update parameters
    optimizer.step()

    # zero the gradient so gradients do not accumulate across minibatches
    optimizer.zero_grad()

    # get model predictions
    prediction_index = prediction.argmax(axis=1)

    # count the correctly classified documents as plain Python ints
    # (avoids building a Python list of one-element tensors and summing them one by one)
    train_correct += (prediction_index == labels).sum().item()
    train_seen += labels.size(0)

  # calculate train accuracy
  train_accuracy = train_correct / train_seen

  classifier.eval() # turn off dropout for evaluation
  valid_loss = 0.0
  valid_correct = 0
  valid_seen = 0

  # evaluate on the validation set
  with torch.no_grad(): # turn off gradient calculation so we don't train on the validation set
    for description, labels, mask in validLoader:
      # send minibatch to gpu for efficient calculation
      description = description.to(device)
      labels = labels.to(device)
      mask = mask.to(device)

      # get model predictions and loss
      prediction = classifier(description, mask)
      loss = criterion(prediction, labels)

      valid_loss += loss.item()

      # get model prediction categories
      prediction_index = prediction.argmax(axis=1)

      # count the correctly classified documents
      valid_correct += (prediction_index == labels).sum().item()
      valid_seen += labels.size(0)

  valid_accuracy = valid_correct / valid_seen

  # keep a record of our training results (losses are averaged over the number of minibatches)
  history["epoch"].append(e+1)
  history["train_loss"].append(train_loss / len(trainLoader))
  history["valid_loss"].append(valid_loss / len(validLoader))
  history["train_accuracy"].append(train_accuracy)
  history["valid_accuracy"].append(valid_accuracy)

  # output results
  print(f'Epoch {e+1}')
  print(f'\t\t Training Loss: {train_loss / len(trainLoader) :10.3f} \t\t Validation Loss: {valid_loss / len(validLoader) :10.3f}')
  print(f'\t\t Training Accuracy: {train_accuracy :10.3%} \t\t Validation Accuracy: {valid_accuracy :10.3%}')

In [None]:
# graphs of train/validation loss over each epoch and train/validation accuracy over each epoch
# the recorded epoch numbers are used as x values so the axis reads 1..N instead of the list index 0..N-1
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax[0].set(title='Loss', xlabel='Epoch', ylabel='Loss')
ax[0].plot(history['epoch'], history['train_loss'], label='Training')
ax[0].plot(history['epoch'], history['valid_loss'], label='Validation')
ax[0].legend(loc="upper right")

ax[1].set(title='Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax[1].plot(history['epoch'], history['train_accuracy'], label='Training')
ax[1].plot(history['epoch'], history['valid_accuracy'], label='Validation')
ax[1].legend(loc="lower right"); # trailing ';' keeps the Legend repr out of the cell output

In [None]:
# evaluate the model with the best-performing hyperparameters on the test set

classifier.eval() # turn off dropout for evaluation

# collected per document, so this works for any test batch size (not only batch_size=1)
testTruth = [] # labels for the test set
testPredictions = [] # the model's predictions on the test set

with torch.no_grad(): # turn off gradient calculation so we don't train on the test set
  for description, label, mask in tqdm(testLoader):
    # send minibatch to gpu for efficient calculation
    description = description.to(device)
    mask = mask.to(device)

    # predicted class = index of the largest logit for each document in the batch
    prediction = classifier(description, mask)
    testPredictions += prediction.argmax(axis=1).tolist()

    # ground truth labels for the same documents (only read back, so they can stay on the cpu)
    testTruth += label.tolist()

# integer arrays so the reports below show class ids as 0/1(/2) rather than 0.0/1.0
groundTruth = np.array(testTruth, dtype=int)
predictions = np.array(testPredictions, dtype=int)

In [None]:
# per-class and averaged metrics for the test-set predictions collected above
print(classification_report(groundTruth, predictions)) # print precision, recall, and f1 for each class and overall

In [None]:
# sklearn's confusion_matrix puts true labels on the rows and predicted labels on the columns
print(confusion_matrix(groundTruth, predictions)) # print confusion matrix

In [None]:
# This cell is only necessary if you are running this notebook in Colab
# NOTE(review): google.colab is only importable inside a Colab runtime, so this cell fails elsewhere - skip it when running locally
from google.colab import drive
import os

# mount Google Drive under /content/gdrive; the save path in the next cell points into this mount
drive.mount('/content/gdrive')

In [None]:
# save the model to be used later for inference
# NOTE(review): the file name says "Binary" even when THREE_CLASSES is True - rename when saving a trinary model
path = "/content/gdrive/MyDrive/ColabOutput/BERTBinaryEmails.pth"
# torch.save does not create missing folders, so make sure the output folder exists first
# (os is imported in the Colab cell above)
os.makedirs(os.path.dirname(path), exist_ok=True)
# only the weights (state_dict) are stored; the model class has to be rebuilt before loading them
torch.save(classifier.state_dict(), path)