**Jonathan Ng**

**Vinty Dong**

**CSE 354 Final Project Notebook 2**

Just like notebook 1, this notebook is heavily based off the code from HW assignment 3.

This notebook was used to train a classification layer for our DistilBert models finetuned using the MLM task on the math_qa data set. This classification layer was trained on the same dataset as the finetuning. This notebook then evaluates the final models, as well as DistilBert-based-uncased as a control, on the test set of the data. This is where we obtain our results seen in table 2 of our report.

In [1]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np
from transformers import AdamW, pipeline
import os
from sklearn.metrics import precision_score, recall_score, f1_score
from datasets import load_dataset

dataset = load_dataset("math_qa")

torch.manual_seed(42)
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset math_qa (C:/Users/waibong/.cache/huggingface/datasets/math_qa/default/0.1.0/67fc1cc5d22b185002c6fd16e19e4d5215eae01fb04d656bed83204ba6ee55ff)
100%|██████████| 3/3 [00:00<00:00, 428.66it/s]


In [3]:
from transformers import AutoTokenizer, BertForSequenceClassification, AutoModelForMaskedLM, AutoModelForSequenceClassification
class DistillBERT():
  def __init__(self, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=5)

  def get_tokenizer_and_model(self):
    return self.model, self.tokenizer
  
class DatasetLoader(Dataset):

  def __init__(self, data, tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def tokenize_data(self):
    print("Processing data..")
    tokens = []
    labels = []
    label_dict = {'a':0, 'b':1, 'c':2, 'd':3, 'e':4}

    for training_instance in self.data:
      problem = ' '.join(training_instance['Problem'].split())
      options = ' '.join(training_instance['options'].split())

      input = (problem + ' ' + options)
      input = self.tokenizer.encode_plus(input, max_length=512, truncation=True, pad_to_max_length=True, add_special_tokens=True, return_tensors='pt')['input_ids'].to("cuda:0" if torch.cuda.is_available() else "cpu")
      tokens.append(input)
      labels.append(label_dict[training_instance['correct']])
    
    tokens = pad_sequence(tokens, batch_first=True)
    labels = torch.tensor(labels)
    dataset = TensorDataset(tokens, labels)
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    processed_dataset = self.tokenize_data()

    data_loader = DataLoader(
        processed_dataset,
        shuffle=shuffle,
        batch_size=batch_size
    )

    return data_loader




In [4]:
class Trainer():

  def __init__(self, options):
    self.device = options['device']
    self.train_data = options['train_data']
    self.val_data = options['val_data']
    self.test_data = options['test_data']
    self.batch_size = options['batch_size']
    self.epochs = options['epochs']
    self.save_path = options['save_path']
    self.training_type = options['training_type']
    transformer = DistillBERT(options['model_name'])
    self.model, self.tokenizer = transformer.get_tokenizer_and_model()
    self.model.to(self.device)

  def get_performance_metrics(self, preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    precision = precision_score(labels_flat, pred_flat, average='micro', zero_division=0)
    recall = recall_score(labels_flat, pred_flat, average='micro', zero_division=0)
    f1 = f1_score(labels_flat, pred_flat, average='micro', zero_division=0)
    return precision, recall, f1

  def set_training_parameters(self):
    # TODO(students): start
    t = self.training_type
    if t == 'frozen_embeddings':
      # will not turn on require_grad = True for any layer
      for name, layer in self.model.named_parameters():
        if 'classifier' in name:
          continue
        layer.require_grad = False
    elif t == 'top_2_training':
      # require_grad = True for layers 4,5
      for name, layer in self.model.named_parameters():
        if 'classifier' in name:
          continue
        if 'layer.4' in name or 'layer.5' in name:
          layer.require_grad = True
        else:
          layer.require_grad = False  
    elif t == 'top_4_training':
      #require_grad = True for layers 2,3,4,5
      for name, layer in self.model.named_parameters():
        if 'classifier' in name:
          continue
        if 'layer.2' in name or 'layer.3' in name or 'layer.4' in name or 'layer.5' in name:
          layer.require_grad = True
        else:
          layer.require_grad = False 
    elif t == 'all_training':
      # require_grad = True for layers 0,1,2,3,4,5
      for name, layer in self.model.named_parameters():
        layer.require_grad = True
    else:
      raise KeyError(f"training_type={t} not found")
    # TODO(students): end

  def train(self, data_loader, optimizer):
    self.model.train()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    for (problem, labels) in tqdm(data_loader):
      self.model.zero_grad()

      problem = problem.squeeze(1).to(self.device)
      labels = labels.to(self.device)
      output = self.model(problem, labels=labels)

      logits = output.logits
      loss = output.loss

      loss.backward()
      optimizer.step()
      total_loss += loss
      precision, recall, f1 = self.get_performance_metrics(preds=logits.detach().cpu().numpy(), labels=labels.detach().cpu().numpy())

      total_recall += recall
      total_precision += precision
      total_f1 += f1

    precision = total_precision/len(data_loader)
    recall = total_recall/len(data_loader)
    f1 = total_f1/len(data_loader)
    loss = total_loss/len(data_loader)

    return precision, recall, f1, loss

  def eval(self, data_loader):
    self.model.eval()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    with torch.no_grad():
      for (problem, labels) in tqdm(data_loader):
        problem = problem.squeeze(1).to(self.device)
        labels = labels.to(self.device)

        output = self.model(problem, labels=labels)

        logits = output.logits
        loss = output.loss
        
        total_loss += loss
        precision, recall, f1 = self.get_performance_metrics(preds=logits.detach().cpu().numpy(), labels=labels.detach().cpu().numpy())

        total_recall += recall
        total_precision += precision
        total_f1 += f1
    
    precision = total_precision/len(data_loader)
    recall = total_recall/len(data_loader)
    f1 = total_f1/len(data_loader)
    loss = total_loss/len(data_loader)

    return precision, recall, f1, loss
  
  def eval_test(self):
    test_dataset = DatasetLoader(self.test_data, self.tokenizer)
    test_data_loader = test_dataset.get_data_loaders(self.batch_size)
    test_precision, test_recall, test_f1, test_loss = self.eval(test_data_loader)
    print(f'test_loss: {test_loss:.4f} test_precision: {test_precision:.4f} test_recall: {test_recall:.4f} test_f1: {test_f1:.4f}')


  def save_transformer(self):
    self.model.save_pretrained(self.save_path)
    self.tokenizer.save_pretrained(self.save_path)

  def execute(self):
    last_best = 0
    train_dataset = DatasetLoader(self.train_data, self.tokenizer)
    train_data_loader = train_dataset.get_data_loaders(self.batch_size)
    val_dataset = DatasetLoader(self.val_data, self.tokenizer)
    val_data_loader = val_dataset.get_data_loaders(self.batch_size)
    optimizer = torch.optim.AdamW(self.model.parameters(), lr = 3e-4, eps = 1e-8)
    self.set_training_parameters()

    print("Done processing data")

    for epoch_i in range(0, self.epochs):
      train_precision, train_recall, train_f1, train_loss = self.train(train_data_loader, optimizer)
      print(f'Epoch {epoch_i + 1}: train_loss: {train_loss:.4f} train_precision: {train_precision:.4f} train_recall: {train_recall:.4f} train_f1: {train_f1:.4f}')
      val_precision, val_recall, val_f1, val_loss = self.eval(val_data_loader)
      print(f'Epoch {epoch_i + 1}: val_loss: {val_loss:.4f} val_precision: {val_precision:.4f} val_recall: {val_recall:.4f} val_f1: {val_f1:.4f}')

      if val_f1 > last_best:
        print("Saving model..")
        self.save_transformer()
        last_best = val_f1
        print("Model saved.")

In [None]:
BATCH_SIZE = 4
EPOCHS = 10
SAVE_PATH = "models\DistilBERT_MC_30k"

options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = dataset['train']
options['test_data'] = dataset['test']
options['val_data'] = dataset['validation']
options['save_path'] = SAVE_PATH + '_top_2_training'
options['epochs'] = EPOCHS
options['training_type'] = 'top_2_training'
options['model_name'] = 'models\DistilBERT_MC_30k' #MUST BE CORRECT

print(options['device'])

torch.cuda.empty_cache()
import gc
gc.collect()

trainer = Trainer(options)
trainer.execute()

In [17]:
BATCH_SIZE = 8
EPOCHS = 10

SAVE_PATH = "models\DistilBERT_MC_5k"

options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = dataset['train']
options['test_data'] = dataset['test']
options['val_data'] = dataset['validation']
options['save_path'] = SAVE_PATH + '_top_2_training'
options['epochs'] = EPOCHS
options['training_type'] = 'top_2_training'
options['model_name'] = 'models\DistilBERT_MC_5k' #MUST BE CORRECT

print(options['device'])

torch.cuda.empty_cache()
import gc
gc.collect()

trainer = Trainer(options)
trainer.execute()

cuda:0


Some weights of the model checkpoint at DistilBERT_top_2_training_5k were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at DistilBERT_top_2_training_5k and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pr

Processing data..
Processing data..
Done processing data


100%|██████████| 3730/3730 [11:31<00:00,  5.39it/s]


Epoch 1: train_loss: 1.6080 train_precision: 0.2168 train_recall: 0.2168 train_f1: 0.2168


100%|██████████| 560/560 [00:33<00:00, 16.75it/s]


Epoch 1: val_loss: 1.6052 val_precision: 0.2196 val_recall: 0.2196 val_f1: 0.2196
Saving model..
Model saved.


100%|██████████| 3730/3730 [11:33<00:00,  5.38it/s]


Epoch 2: train_loss: 1.6055 train_precision: 0.2171 train_recall: 0.2171 train_f1: 0.2171


100%|██████████| 560/560 [00:33<00:00, 16.79it/s]


Epoch 2: val_loss: 1.6034 val_precision: 0.2165 val_recall: 0.2165 val_f1: 0.2165


100%|██████████| 3730/3730 [11:39<00:00,  5.34it/s]


Epoch 3: train_loss: 1.6050 train_precision: 0.2190 train_recall: 0.2190 train_f1: 0.2190


100%|██████████| 560/560 [00:33<00:00, 16.62it/s]


Epoch 3: val_loss: 1.6030 val_precision: 0.2165 val_recall: 0.2165 val_f1: 0.2165


100%|██████████| 3730/3730 [12:13<00:00,  5.08it/s]


Epoch 4: train_loss: 1.6048 train_precision: 0.2219 train_recall: 0.2219 train_f1: 0.2219


100%|██████████| 560/560 [00:34<00:00, 16.44it/s]


Epoch 4: val_loss: 1.6034 val_precision: 0.2165 val_recall: 0.2165 val_f1: 0.2165


100%|██████████| 3730/3730 [11:33<00:00,  5.38it/s]


Epoch 5: train_loss: 1.6043 train_precision: 0.2207 train_recall: 0.2207 train_f1: 0.2207


100%|██████████| 560/560 [00:32<00:00, 17.03it/s]


Epoch 5: val_loss: 1.6029 val_precision: 0.2165 val_recall: 0.2165 val_f1: 0.2165


100%|██████████| 3730/3730 [11:28<00:00,  5.41it/s]


Epoch 6: train_loss: 1.6042 train_precision: 0.2219 train_recall: 0.2219 train_f1: 0.2219


100%|██████████| 560/560 [00:33<00:00, 16.97it/s]


Epoch 6: val_loss: 1.6034 val_precision: 0.2165 val_recall: 0.2165 val_f1: 0.2165


100%|██████████| 3730/3730 [11:22<00:00,  5.47it/s]


Epoch 7: train_loss: 1.6044 train_precision: 0.2209 train_recall: 0.2209 train_f1: 0.2209


100%|██████████| 560/560 [00:32<00:00, 17.29it/s]


Epoch 7: val_loss: 1.6035 val_precision: 0.2169 val_recall: 0.2169 val_f1: 0.2169


100%|██████████| 3730/3730 [11:21<00:00,  5.48it/s]


Epoch 8: train_loss: 1.6038 train_precision: 0.2232 train_recall: 0.2232 train_f1: 0.2232


100%|██████████| 560/560 [00:32<00:00, 17.05it/s]


Epoch 8: val_loss: 1.6029 val_precision: 0.2196 val_recall: 0.2196 val_f1: 0.2196


100%|██████████| 3730/3730 [11:23<00:00,  5.46it/s]


Epoch 9: train_loss: 1.6037 train_precision: 0.2229 train_recall: 0.2229 train_f1: 0.2229


100%|██████████| 560/560 [00:32<00:00, 17.02it/s]


Epoch 9: val_loss: 1.6030 val_precision: 0.2173 val_recall: 0.2173 val_f1: 0.2173


100%|██████████| 3730/3730 [11:23<00:00,  5.45it/s]


Epoch 10: train_loss: 1.6036 train_precision: 0.2230 train_recall: 0.2230 train_f1: 0.2230


100%|██████████| 560/560 [00:32<00:00, 17.01it/s]


Epoch 10: val_loss: 1.6030 val_precision: 0.2173 val_recall: 0.2173 val_f1: 0.2173


In [39]:
BATCH_SIZE = 32
EPOCHS = 0
SAVE_PATH = ""

options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = dataset['train']
options['val_data'] = dataset['validation']
options['test_data'] = dataset['test']
options['save_path'] = SAVE_PATH
options['epochs'] = EPOCHS
options['training_type'] = 'top_2_training'
options['model_name'] = 'models\DistilBERT_MC_5k' #MUST BE CORRECT

print(options['device'])

torch.cuda.empty_cache()
import gc
gc.collect()

trainer = Trainer(options)
trainer.eval_test()





cuda:0
Processing data..


100%|██████████| 94/94 [00:20<00:00,  4.50it/s]

test_loss: 1.6051 test_precision: 0.2011 test_recall: 0.2011 test_f1: 0.2011





In [40]:
BATCH_SIZE = 32
EPOCHS = 0
SAVE_PATH = ""

options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = dataset['train']
options['val_data'] = dataset['validation']
options['test_data'] = dataset['test']
options['save_path'] = SAVE_PATH
options['epochs'] = EPOCHS
options['training_type'] = 'top_2_training'
options['model_name'] = 'distilbert-base-uncased'

print(options['device'])

torch.cuda.empty_cache()
import gc
gc.collect()

trainer = Trainer(options)
trainer.eval_test()


cuda:0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

Processing data..


100%|██████████| 94/94 [00:21<00:00,  4.43it/s]

test_loss: 1.6153 test_precision: 0.2015 test_recall: 0.2015 test_f1: 0.2015





In [42]:
BATCH_SIZE = 32
EPOCHS = 0
SAVE_PATH = ""

options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = dataset['train']
options['val_data'] = dataset['validation']
options['test_data'] = dataset['test']
options['save_path'] = SAVE_PATH 
options['epochs'] = EPOCHS
options['training_type'] = 'top_2_training'
options['model_name'] = 'models\DistilBERT_MC_30k' #MUST BE CORRECT

print(options['device'])

torch.cuda.empty_cache()
import gc
gc.collect()

trainer = Trainer(options)
trainer.eval_test()

cuda:0
Processing data..


100%|██████████| 94/94 [00:20<00:00,  4.48it/s]

test_loss: 2.2032 test_precision: 0.2636 test_recall: 0.2636 test_f1: 0.2636



