# **Spring 2023 NLP Final Project**

- Mathelide Hou
- Thanh Dang
- Ryan Ruan

# DistilBERT Model

In [None]:
# Standard library
import glob
import json
import pickle
import random
from multiprocessing import cpu_count

# Third-party
import nltk
import numpy as np
import pandas as pd
import torch
from accelerate import Accelerator
from accelerate.utils import find_executable_batch_size
from datasets import Dataset
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer, logging

nltk.download('punkt')
#Store data directory in a variable and only use this variable in your code
dat_dir = './'

In [None]:
## Set "device" value depending on whether or not you have access to GPUs
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Created here but not referenced by the other visible cells.
accelerator = Accelerator()

# The labels counted in this notebook are 0 and 1, so the classification head
# needs two outputs. This matches the model that is re-created before
# fine-tuning further down (num_labels=2).
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-cased",
                                                            num_labels=2).to(device)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased",
                                                      truncation=True,
                                                      do_lower_case=True)

### Code to generate train, dev, and test sets

In [None]:
# import random
# def load_data(dat_dir):                                 
#     fnames = 'sarcasm.json'
#     data = []
#     train = []
#     dev = []
#     test = []
#     df = pd.read_json(fnames, lines=True)
#     df = df.drop(['article_link'], axis=1)
    
#     for index, row in df.iterrows():
#         data.append({'label': row['is_sarcastic'], 'sent':row['headline']})
#     random.shuffle(data)
#     train = data[:int(len(data)*0.8)]
#     dev = data[int(len(data)*0.8):int(len(data)*0.9)]
#     test = data[int(len(data)*0.9):]
      
#     return train, dev, test

In [None]:
# train_dat, dev_dat, test_dat = load_data(dat_dir)

# # Sanity check on the train, dev, and test sets
# print('Number of sentences in Train')
# count = {}
# count[0] = 0
# count[1] = 0
# for d in train_dat:
#     count[d['label']] += 1
# for key,val in count.items():
#     print(key, val)
# print('Total: ', len(train_dat))

# print()
# print('Number of sentences in Dev')
# count = {}
# count[0] = 0
# count[1] = 0
# for d in dev_dat:
#     count[d['label']] += 1
# for key,val in count.items():
#     print(key, val)
# print('Total: ', len(dev_dat))

# print()
# print('Number of sentences in Test')
# count = {}
# count[0] = 0
# count[1] = 0
# for d in test_dat:
#     count[d['label']] += 1
# for key,val in count.items():
#     print(key, val)
# print('Total: ', len(test_dat))

### Code to tokenize, train, and generate predictions

In [None]:
# def tokenize_function(example):
#   #the tokenizer is cached in memory, so will not re-download for every function call. 
#   tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased",
#                                                       truncation=True,
#                                                       do_lower_case=True)
#   tokenized = tokenizer(example['sent'],
#                         padding = 'max_length',
#                         return_tensors='pt') #returns dict
#   # convert label to a tensor and add it to the tokenized.
#   lab = example['label']
#   tokenized['labels'] = torch.tensor(int(lab)).to(device)

#   return tokenized

In [None]:
def train(model, tokenized_data, args):
  """Fine-tune `model` in place with AdamW and a linear learning-rate schedule.

  Args:
    model: model called as `model(input_ids, attention_mask=..., labels=...)`;
      the first element of its output is used as the loss.
    tokenized_data: dataset whose items are dicts with 'input_ids',
      'attention_mask' and 'labels' tensors (batched by a DataLoader).
    args: dict with the keys 'num_epochs', 'batch_size' and 'lr'.

  Returns:
    None. The model's weights are updated in place and its train/eval mode is
    restored to what it was on entry.
  """
  num_epochs = args['num_epochs']
  batch_size = args['batch_size']
  lr = args['lr']
  # Set up the optimizer. The learning rate must be passed explicitly;
  # otherwise the optimizer's own default is used and args['lr'] has no effect.
  optimizer = AdamW(model.parameters(), lr=lr)

  # Set up a dataloader, which will divide the data into batches
  train_dataloader = DataLoader(
      tokenized_data, shuffle=True, batch_size=batch_size
      )

  num_training_steps = num_epochs * len(train_dataloader)
  lr_scheduler = get_scheduler("linear",
                               optimizer=optimizer,
                               num_warmup_steps=0,
                               num_training_steps=num_training_steps,
                               )

  # Enable training behaviour (e.g. dropout) for the duration of the loop and
  # put the model back into its previous mode afterwards.
  was_training = model.training
  model.train()
  try:
    for epoch in range(num_epochs):
      print("Epoch",epoch)
      for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Flatten any extra singleton dimension so both tensors are
        # (batch, seq_len). A bare .squeeze() would also drop the batch
        # dimension whenever a batch holds a single example.
        input_ids = batch['input_ids']
        input_ids = input_ids.reshape(-1, input_ids.size(-1))
        attention_mask = batch['attention_mask']
        attention_mask = attention_mask.reshape(-1, attention_mask.size(-1))
        labels = batch['labels']
        #forward pass
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        #compute loss and update weights
        loss = outputs[0]
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
  finally:
    model.train(was_training)

In [None]:
def get_predictions(model, tokenized_dataset, tokenizer, n):
  """Run `model` on the first `n` examples and collect one record per example.

  Args:
    model: model called as `model(input_ids, attention_mask=...)` whose output
      exposes `.logits`.
    tokenized_dataset: sliceable collection of dicts with 'input_ids',
      'attention_mask' and 'labels' tensors.
    tokenizer: tokenizer used to decode the input ids back into text.
    n: number of examples from the dataset you want predictions for

  Returns:
    A list of dicts with the keys 'sent' (decoded input), 'pred' (argmax of
    the logits), 'gold' (the stored label) and 'logits' (CPU tensor).
  """
  preds = []
  eval_dataset = DataLoader(tokenized_dataset[:n], batch_size=1, shuffle=False)

  # Evaluate without dropout and without building autograd graphs; the
  # model's previous train/eval mode is restored afterwards.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for batch in eval_dataset:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Keep the tensors 2-D, (1, seq_len). With batch_size=1 a bare
        # .squeeze() would strip the batch dimension as well.
        input_ids = batch['input_ids']
        input_ids = input_ids.reshape(-1, input_ids.size(-1))
        attention_mask = batch['attention_mask']
        attention_mask = attention_mask.reshape(-1, attention_mask.size(-1))
        # Labels are not passed to the model: the loss is not used here.
        outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        pred = torch.argmax(logits[0]).item()

        preds.append({'sent': tokenizer.decode(input_ids[0]),
                      'pred': pred,
                      'gold': batch["labels"].reshape(-1)[0].item(),
                      # Stored on the CPU so that a long evaluation does not
                      # accumulate one device tensor per example.
                      'logits': logits.cpu()})
  finally:
    model.train(was_training)
  return preds

## Evaluation metrics


In [None]:
from sklearn.metrics import confusion_matrix
def make_confusion_matrix(predictions):
    """Build a confusion matrix from prediction records.

    Each record is a dict with a 'pred' and a 'gold' entry. Gold labels are
    passed to sklearn as the true labels and predictions as the predicted
    labels; the label list handed to sklearn is the set of gold labels only.

    NOTE(review): a predicted label that never occurs as a gold label is not
    in that list - check how sklearn treats such examples before relying on
    the totals.
    """
    pairs = [(record['pred'], record['gold']) for record in predictions]
    predicted = [pred for pred, _ in pairs]
    gold = [truth for _, truth in pairs]
    label_order = list(set(gold))
    matrix = confusion_matrix(gold, predicted, labels=label_order)
    return np.array(matrix)
# Accuracy derived from the confusion matrix
def calc_accuracy(predictions, average_type='macro'):
  """Return the accuracy of `predictions`.

  With average_type='macro' the per-class ratios (diagonal entry divided by
  the row total) are averaged; with any other value the diagonal sum is
  divided by the sum of all row totals.
  """
  matrix = make_confusion_matrix(predictions)
  correct = np.diag(matrix)
  row_totals = matrix.sum(axis=1)
  per_class = correct / row_totals

  if average_type != 'macro':
    return correct.sum() / row_totals.sum()
  return np.mean(per_class)
# Precision derived from the confusion matrix
def calc_precision(predictions, average_type='macro'):
  """Return the precision of `predictions`.

  Per-class precision is the diagonal entry divided by the column total, or 0
  when the column total is 0. 'macro' averages the per-class values; any other
  value divides the diagonal sum by the sum of all column totals.
  """
  matrix = make_confusion_matrix(predictions)
  correct = np.diag(matrix)
  column_totals = matrix.sum(axis=0)
  per_class = [
      0 if total == 0 else hits / total
      for hits, total in zip(correct, column_totals)
  ]

  if average_type != 'macro':
    return correct.sum() / column_totals.sum()
  return np.mean(per_class)
# Recall derived from the confusion matrix
def calc_recall(predictions, average_type='macro'):
  """Return the recall of `predictions`.

  Per-class recall is the diagonal entry divided by the row total. 'macro'
  averages the per-class values; any other value divides the diagonal sum by
  the sum of all row totals.
  """
  matrix = make_confusion_matrix(predictions)
  correct = np.diag(matrix)
  row_totals = np.sum(matrix, axis=1)
  per_class = correct / row_totals

  if average_type != 'macro':
    return correct.sum() / row_totals.sum()
  return np.mean(per_class)
# F-beta score from precision and recall
def calc_fscore(precision, recall, beta):
  """Return the F-beta score, (1 + beta^2) * P * R / (beta^2 * P + R).

  Args:
    precision: precision value.
    recall: recall value.
    beta: weight of recall relative to precision (1 gives F1, 2 gives F2).

  Returns:
    The F-beta score, or 0.0 when the denominator is zero (e.g. precision and
    recall are both 0) instead of dividing by zero.
  """
  beta_sq = beta**2
  denominator = beta_sq*precision + recall
  if denominator == 0:
    return 0.0
  return ((beta_sq + 1)*precision*recall)/denominator
def print_scores(model_type, preds):
  """Print a small report for `preds` under the heading `model_type`.

  Precision and recall are macro-averaged, accuracy is computed with the
  'micro' setting, and F1/F2 are derived from the macro precision and recall.
  All values are rounded to three decimals.
  """
  print(model_type)
  print('-------------------------')
  macro_precision = calc_precision(preds, "macro")
  macro_recall = calc_recall(preds,  "macro")
  micro_accuracy = calc_accuracy(preds, "micro")
  f1 = calc_fscore(macro_precision, macro_recall, 1)
  f2 = calc_fscore(macro_precision, macro_recall, 2)

  # Rows are printed in this fixed order (F2 before F1).
  report_rows = (
      ('Precision\t', macro_precision),
      ('Recall\t\t', macro_recall),
      ('Accuracy\t', micro_accuracy),
      ('F2\t\t', f2),
      ('F1\t\t', f1),
  )
  for heading, score in report_rows:
    print(heading, round(score, 3))
  print()

In [None]:
# import random

# # Write your code here to load train, dev and test data. 
# train_dat, dev_dat, test_dat = load_data(dat_dir)

# # Shuffle training, dev and test
# random.shuffle(train_dat)
# random.shuffle(dev_dat)
# random.shuffle(test_dat)

# # Create tokenized train, dev and test. 
# ## You might want to look at only a small subset of train, dev and test to avoid RAM issues. 


## Fine-tuning the model

Fine tune the model to the training dataset (or subsets of the dataset) and save it using `torch.save()`. Set the number of epochs to three, and the batch_size to 5. 

- Run on dev sets
- With different training parameters
- Choose argmax for hyperparameters combinations


In [None]:
# Fresh pretrained checkpoint with a two-way classification head, moved to the
# selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=2,
)
model = model.to(device)

# Matching tokenizer for the same checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-cased",
    do_lower_case=True,
    truncation=True,
)

Evaluate the model on the test set prior to fine-tuning. If you run into RAM issues, evaluate it on a smaller set using the n parameter of get_predictions(). Make sure to print precision, accuracy, recall and f1 in an easy to read format. 

In [None]:
# # Write your code here
# tokenized_train = [tokenize_function(e) for e in train_dat]
# tokenized_test = [tokenize_function(t) for t in test_dat]
# tokenized_dev = [tokenize_function(d) for d in dev_dat]

In [None]:
# # Load the tokenized data into pickle files
# with open('train_tokens.pickle', 'wb') as train:
#     pickle.dump(tokenized_train, train, protocol=pickle.HIGHEST_PROTOCOL)
# train.close()
# with open('dev_tokens.pickle', 'wb') as dev:
#     pickle.dump(tokenized_dev, dev, protocol=pickle.HIGHEST_PROTOCOL)
# dev.close()
# with open('test_tokens.pickle', 'wb') as test:
#     pickle.dump(tokenized_test, test, protocol=pickle.HIGHEST_PROTOCOL)
# test.close()

In [None]:
# RUN THIS TO LOAD TOKENIZED DATA
# NOTE: pickle.load can execute code embedded in the file; only load pickles
# that were produced by this notebook.
# The file handles deliberately use a neutral name: binding them to `train`
# would overwrite the train() function defined above and break the later
# train(...) calls. The `with` block already closes each file.
with open('train_tokens.pickle', 'rb') as fh:
    tokenized_train = pickle.load(fh)
with open('dev_tokens.pickle', 'rb') as fh:
    tokenized_dev = pickle.load(fh)
with open('test_tokens.pickle', 'rb') as fh:
    tokenized_test = pickle.load(fh)

In [None]:
# MODIFY THIS!
# Hyperparameters for this run.
args = dict(
    num_epochs=4,
    batch_size=16,
    lr=3e-5,
)

# Fine-tune on the training split, then save the whole model object.
train(model, tokenized_train, args)
torch.save(model, 'model8.pt')


 65%|██████▍   | 927/1431 [1:42:00<42:15,  5.03s/it][A
 65%|██████▍   | 928/1431 [1:42:05<42:28,  5.07s/it][A
 65%|██████▍   | 929/1431 [1:42:10<42:56,  5.13s/it][A
 65%|██████▍   | 930/1431 [1:42:16<45:49,  5.49s/it][A
 65%|██████▌   | 931/1431 [1:42:21<44:09,  5.30s/it][A
 65%|██████▌   | 932/1431 [1:42:26<42:48,  5.15s/it][A
 65%|██████▌   | 933/1431 [1:42:31<41:43,  5.03s/it][A
 65%|██████▌   | 934/1431 [1:42:36<41:18,  4.99s/it][A
 65%|██████▌   | 935/1431 [1:42:40<40:59,  4.96s/it][A
 65%|██████▌   | 936/1431 [1:42:46<41:05,  4.98s/it][A
 65%|██████▌   | 937/1431 [1:42:50<40:59,  4.98s/it][A
 66%|██████▌   | 938/1431 [1:42:56<42:09,  5.13s/it][A
 66%|██████▌   | 939/1431 [1:43:02<43:26,  5.30s/it][A
 66%|██████▌   | 940/1431 [1:43:07<42:23,  5.18s/it][A
 66%|██████▌   | 941/1431 [1:43:11<41:26,  5.07s/it][A
 66%|██████▌   | 942/1431 [1:43:16<40:47,  5.00s/it][A
 66%|██████▌   | 943/1431 [1:43:21<40:46,  5.01s/it][A
 66%|██████▌   | 944/1431 [1:43:26<41:11,  5.08

In [None]:
args = {
    'num_epochs': 4,
    'batch_size': 16,
    'lr' : 3e-4
}

# Restart from the pretrained checkpoint. `model` was already fine-tuned by the
# previous run (saved as model8.pt); training that same object again would make
# model9 a continuation of model8 rather than an independent run with a
# different learning rate.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-cased",
                                                            num_labels=2).to(device)

train(model, tokenized_train, args)
torch.save(model, 'model9.pt')

## Evaluating the model

Evaluate the saved model on the test set. Make sure to display the evaluation metrics in an easy-to-view format. 

In [None]:
# Map the checkpoint onto the active device so that a model saved on a GPU can
# also be evaluated on a CPU-only machine (the batches are moved to `device`).
# NOTE(review): newer PyTorch releases may require weights_only=False to load
# a fully pickled model - confirm against the installed version.
trained_model8 = torch.load('model8.pt', map_location=device)
preds8 = get_predictions(trained_model8, tokenized_dev, tokenizer, len(tokenized_dev))
print_scores("model after fine-tuning yields these scores", preds8)

In [None]:
# Map the checkpoint onto the active device so that a model saved on a GPU can
# also be evaluated on a CPU-only machine (the batches are moved to `device`).
trained_model9 = torch.load('model9.pt', map_location=device)
preds9 = get_predictions(trained_model9, tokenized_dev, tokenizer, len(tokenized_dev))
print_scores("model after fine-tuning yields these scores", preds9)

# BERT Model for Sequence Classification

- This class is defined to accept the `tokenizer`, `dataframe` and `max_length` as input and generate tokenized output and tags that is used by the BERT model for training. 
- We are using the BERT tokenizer to tokenize the data in the `comment_text` column of the dataframe.
- The tokenizer uses the `encode_plus` method to perform tokenization and generate the necessary outputs, namely: `ids`, `attention_mask`, `token_type_ids`
---
- *This is the first difference between DistilBERT and BERT: the BERT tokenizer additionally generates `token_type_ids`.*
---
- To read further into the tokenizer, [refer to this document](https://huggingface.co/transformers/model_doc/bert.html#berttokenizer)
- *Note that the overall mechanisms for multiclass and multilabel problems are similar, except for a few differences:*
	- *The loss function is designed to evaluate the probability of each category individually rather than relative to the other categories; hence the use of `BCE` rather than `Cross Entropy` when defining the loss.*
	- *A sigmoid of the outputs is calculated rather than a softmax, for the reason given in the previous point.*

In [None]:
def _get_bert_tokenizer():
  """Return the shared 'bert-base-uncased' tokenizer, building it on first use."""
  cached = getattr(_get_bert_tokenizer, '_cached', None)
  if cached is None:
    cached = BertTokenizerFast.from_pretrained('bert-base-uncased',
                                               do_lower_case=True)
    _get_bert_tokenizer._cached = cached
  return cached


def tokenize_bert_function(example):
  """Tokenize one example for BERT and attach its label.

  Args:
    example: dict with the text under 'sent' and the class under 'label'.

  Returns:
    The tokenizer output (PyTorch tensors, padded to the maximum length and
    truncated if longer) with an extra 'labels' entry holding the label as an
    integer tensor.
  """
  # The tokenizer object is built once and reused; constructing it on every
  # call repeats the loading work for each example.
  tokenizer_bert = _get_bert_tokenizer()
  # Truncation is requested on the encode call itself so that over-long inputs
  # are cut to the maximum length.
  tokenized = tokenizer_bert(example['sent'],
                        padding = 'max_length',
                        truncation=True,
                        return_tensors='pt') #returns dict
  # convert label to a tensor and add it to the tokenized.
  # It stays on the CPU like the other tensors; the training and prediction
  # loops move every tensor of a batch to the device themselves.
  lab = example['label']
  tokenized['labels'] = torch.tensor(int(lab))

  return tokenized

In [None]:
# Module-level BERT tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer_bert = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
    truncation=True,
)

In [None]:
def bert_train(model, tokenized_data, args):
  """Fine-tune a BERT-style `model` in place, passing token_type_ids as well.

  Args:
    model: model called as `model(input_ids, attention_mask=...,
      token_type_ids=..., labels=...)`; the first element of its output is
      used as the loss.
    tokenized_data: dataset whose items are dicts with 'input_ids',
      'attention_mask', 'token_type_ids' and 'labels' tensors.
    args: dict with the keys 'num_epochs', 'batch_size' and 'lr'.

  Returns:
    None. The model's weights are updated in place and its train/eval mode is
    restored to what it was on entry.
  """
  num_epochs = args['num_epochs']
  batch_size = args['batch_size']
  lr = args['lr']
  # Set up the optimizer. The learning rate must be passed explicitly;
  # otherwise the optimizer's own default is used and args['lr'] has no effect.
  optimizer = AdamW(model.parameters(), lr=lr)

  # Set up a dataloader, which will divide the data into batches
  train_dataloader = DataLoader(
      tokenized_data, shuffle=True, batch_size=batch_size
      )

  num_training_steps = num_epochs * len(train_dataloader)
  lr_scheduler = get_scheduler("linear",
                               optimizer=optimizer,
                               num_warmup_steps=0,
                               num_training_steps=num_training_steps,
                               )

  # Enable training behaviour (e.g. dropout) for the duration of the loop and
  # put the model back into its previous mode afterwards.
  was_training = model.training
  model.train()
  try:
    for epoch in range(num_epochs):
      print("Epoch",epoch)
      for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Bring all three inputs to the same (batch, seq_len) shape. Squeezing
        # only input_ids leaves the other tensors with an extra dimension, and
        # a bare .squeeze() drops the batch dimension for single-example
        # batches.
        input_ids = batch['input_ids']
        input_ids = input_ids.reshape(-1, input_ids.size(-1))
        attention_mask = batch['attention_mask']
        attention_mask = attention_mask.reshape(-1, attention_mask.size(-1))
        token_type_ids = batch['token_type_ids']
        token_type_ids = token_type_ids.reshape(-1, token_type_ids.size(-1))
        labels = batch['labels']
        #forward pass
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        #compute loss and update weights
        loss = outputs[0]
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
  finally:
    model.train(was_training)

In [None]:
# Tokenize every example of the three splits for BERT.
# NOTE(review): train_dat / test_dat / dev_dat are only assigned in cells that
# are commented out earlier in this notebook - make sure they exist before
# running this cell.
bert_tokenized_train = list(map(tokenize_bert_function, train_dat))
bert_tokenized_test = list(map(tokenize_bert_function, test_dat))
bert_tokenized_dev = list(map(tokenize_bert_function, dev_dat))

In [None]:
# Save the BERT-tokenized data into pickle files.
# The bert_* files must receive the bert_tokenized_* lists; dumping the
# tokenized_* lists would store the DistilBERT tokens under the BERT names.
# The file handles use a neutral name so that they do not overwrite the
# train() function, and the `with` block already closes each file.
with open('bert_train_tokens.pickle', 'wb') as fh:
    pickle.dump(bert_tokenized_train, fh, protocol=pickle.HIGHEST_PROTOCOL)
with open('bert_dev_tokens.pickle', 'wb') as fh:
    pickle.dump(bert_tokenized_dev, fh, protocol=pickle.HIGHEST_PROTOCOL)
with open('bert_test_tokens.pickle', 'wb') as fh:
    pickle.dump(bert_tokenized_test, fh, protocol=pickle.HIGHEST_PROTOCOL)

Here we define a training function that trains the model on the training dataset created above for a specified number of epochs (EPOCH). An epoch is one complete pass of the data through the network.

Following events happen in this function to fine tune the neural network:
- The dataloader passes data to the model based on the batch size. 
- Subsequent output from the model and the actual category are compared to calculate the loss. 
- Loss value is used to optimize the weights of the neurons in the network.
- After every 5000 steps the loss value is printed in the console.

In [None]:
def get_predictions_bert(model, tokenized_dataset, tokenizer, n):
  """Run a BERT-style `model` on the first `n` examples.

  Args:
    model: model called as `model(input_ids=..., attention_mask=...,
      token_type_ids=...)` whose output exposes `.logits`.
    tokenized_dataset: sliceable collection of dicts with 'input_ids',
      'attention_mask', 'token_type_ids' and 'labels' tensors.
    tokenizer: accepted for signature parity with get_predictions(); not used
      by this function.
    n: number of examples from the dataset you want predictions for

  Returns:
    A list of dicts with the keys 'pred' (argmax of the logits) and 'gold'
    (the stored label).
  """
  preds = []
  eval_dataset = DataLoader(tokenized_dataset[:n], batch_size=1, shuffle=False)

  # Evaluate without dropout and without building autograd graphs; the
  # model's previous train/eval mode is restored afterwards.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for batch in eval_dataset:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Bring all three inputs to (1, seq_len). With batch_size=1 a bare
        # .squeeze() would strip the batch dimension from input_ids while the
        # other tensors keep their extra dimension.
        input_ids = batch['input_ids']
        input_ids = input_ids.reshape(-1, input_ids.size(-1))
        attention_mask = batch['attention_mask']
        attention_mask = attention_mask.reshape(-1, attention_mask.size(-1))
        token_type_ids = batch['token_type_ids']
        token_type_ids = token_type_ids.reshape(-1, token_type_ids.size(-1))
        # Labels are not passed to the model: the loss is not used here.
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        logits = outputs.logits
        pred = torch.argmax(logits[0]).item()

        preds.append({'pred': pred,
                      'gold': batch["labels"].reshape(-1)[0].item()})
  finally:
    model.train(was_training)
  return preds


In [None]:
args = {
    'num_epochs': 4,
    'batch_size': 16,
    'lr' : 3e-5
}

# `model_bert` is not created anywhere else in the visible notebook, so build
# it here: the same checkpoint as tokenizer_bert, with a two-way
# classification head like the DistilBERT model above.
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                           num_labels=2).to(device)

# train() does not pass token_type_ids to the model.
# NOTE(review): for single-sentence inputs the model's default segment ids are
# expected to be equivalent - confirm, or switch to bert_train().
train(model_bert,bert_tokenized_train, args)
torch.save(model_bert, 'model_bert.pt')
# Map the checkpoint onto the active device so that it also loads on a
# CPU-only machine.
trained_model_bert = torch.load('model_bert.pt', map_location=device)

In [None]:
# Map the checkpoint onto the active device so that a model saved on a GPU can
# also be evaluated on a CPU-only machine (the batches are moved to `device`).
bert_trained_model = torch.load('model_bert.pt', map_location=device)
bert_preds = get_predictions_bert(bert_trained_model, bert_tokenized_test, tokenizer_bert, 300)
print_scores("model after fine-tuning yields these scores", bert_preds)

In [None]:
# Same evaluation as the previous cell, but through the generic
# get_predictions() helper, which does not pass token_type_ids to the model.
# Map the checkpoint onto the active device so that a model saved on a GPU can
# also be evaluated on a CPU-only machine.
bert_trained_model = torch.load('model_bert.pt', map_location=device)
bert_preds = get_predictions(bert_trained_model, bert_tokenized_test, tokenizer_bert, 300)
print_scores("model after fine-tuning yields these scores", bert_preds)