**Jonathan Ng**

**Vinty Dong**

**CSE 354 Final Project Notebook 1**

The code in this notebook is heavily based off the code from HW assignment 3

This notebook was used to finetune DistilBert-base-uncased using the MLM task on the math_qa dataset.



In [3]:
import torch
import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
#from transformers import AdamW, pipeline
#from sklearn.metrics import precision_score, recall_score, f1_score
from datasets import load_dataset
torch.manual_seed(42)
np.random.seed(42)
dataset = load_dataset("math_qa")

Found cached dataset math_qa (C:/Users/waibong/.cache/huggingface/datasets/math_qa/default/0.1.0/67fc1cc5d22b185002c6fd16e19e4d5215eae01fb04d656bed83204ba6ee55ff)
100%|██████████| 3/3 [00:00<00:00, 374.20it/s]


In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
class DistillBERT():
  def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = AutoModelForMaskedLM.from_pretrained('distilbert-base-uncased')

  def get_tokenizer_and_model(self):
    return self.model, self.tokenizer
  
class DatasetLoader(Dataset):

  def __init__(self, data, tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def tokenize_data(self):
    print("Processing data..")
    tokens = []
    labels = []
    # label_dict = {'a':0, 'b':1, 'c':2, 'd':3, 'e':4}


    def mask_input(problem, rationale, mask_prob=0.15,mask_token='[MASK]'):

      masked_output = []

      if np.random.random() > 0.5:
        # problem gets masked
        for token in problem:
          if np.random.random() <= mask_prob:
            if np.random.random() <= 0.85:
              masked_output.append(mask_token)
            else:
              masked_output.append(token)
          else:
            masked_output.append(token)

        masked_output += rationale
      else:
        for token in rationale:
          if np.random.random() <= mask_prob:
            if np.random.random() <= 0.85:
              masked_output.append(mask_token)
            else:
              masked_output.append(token)
          else:
            masked_output.append(token)
        masked_output += problem
      return " ".join(masked_output)


    c = 0
    for training_instance in self.data:
      if(c == 30000):
        break
      c = c + 1
      problem = training_instance['Problem'].split()
      rationale = training_instance['Rationale'].split()


      expected_output = "".join((problem + rationale))

      input = mask_input(problem, rationale)
      masked_encoding = self.tokenizer.encode_plus(input, max_length=512, truncation=True, pad_to_max_length=True, add_special_tokens=True, return_tensors='pt')['input_ids']#.to("cuda:0" if torch.cuda.is_available() else "cpu")
      unmasked_encoding = self.tokenizer.encode_plus(expected_output, max_length=512, truncation=True, pad_to_max_length=True, add_special_tokens=True, return_tensors='pt')['input_ids']#.to("cuda:0" if torch.cuda.is_available() else "cpu")

      tokens.append(masked_encoding)
      labels.append(unmasked_encoding)
    
    tokens = pad_sequence(tokens, batch_first=True)
    labels = pad_sequence(labels, batch_first=True)
    dataset = TensorDataset(tokens, labels)
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    processed_dataset = self.tokenize_data()

    data_loader = DataLoader(
        processed_dataset,
        shuffle=shuffle,
        batch_size=batch_size
    )

    return data_loader




In [5]:
class Trainer():

  def __init__(self, options):
    self.device = options['device']
    self.train_data = options['train_data']
    self.val_data = options['val_data']
    self.batch_size = options['batch_size']
    self.epochs = options['epochs']
    self.save_path = options['save_path']
    self.training_type = options['training_type']
    transformer = DistillBERT()
    self.model, self.tokenizer = transformer.get_tokenizer_and_model()
    self.model.to(self.device)

  # def get_performance_metrics(self, preds, labels):
  #   pred_flat = np.argmax(preds, axis=1).flatten()
  #   labels_flat = labels.flatten()
  #   precision = precision_score(labels_flat, pred_flat, average='micro', zero_division=0)
  #   recall = recall_score(labels_flat, pred_flat, average='micro', zero_division=0)
  #   f1 = f1_score(labels_flat, pred_flat, average='micro', zero_division=0)
  #   return precision, recall, f1
  
  def set_training_parameters(self):
    # TODO(students): start
    t = self.training_type
    if t == 'frozen_embeddings':
      # will not turn on require_grad = True for any layer
      for name, layer in self.model.named_parameters():
        if 'classifier' in name:
          continue
        layer.require_grad = False
    elif t == 'top_2_training':
      # require_grad = True for layers 4,5
      for name, layer in self.model.named_parameters():
        if 'classifier' in name:
          continue
        if 'layer.4' in name or 'layer.5' in name:
          layer.require_grad = True
        else:
          layer.require_grad = False  
    elif t == 'top_4_training':
      #require_grad = True for layers 2,3,4,5
      for name, layer in self.model.named_parameters():
        if 'classifier' in name:
          continue
        if 'layer.2' in name or 'layer.3' in name or 'layer.4' in name or 'layer.5' in name:
          layer.require_grad = True
        else:
          layer.require_grad = False 
    elif t == 'all_training':
      # require_grad = True for layers 0,1,2,3,4,5
      for name, layer in self.model.named_parameters():
        layer.require_grad = True
    else:
      raise KeyError(f"training_type={t} not found")
    # TODO(students): end

  def train(self, data_loader, optimizer):
    self.model.train()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    for (problem, labels) in tqdm(data_loader):
      self.model.zero_grad()

      problem = problem.squeeze(1).to(self.device)
      labels = labels.squeeze(1).to(self.device)
      output = self.model(problem, labels=labels)

      logits = output.logits
      loss = output.loss

      loss.backward()
      optimizer.step()
      total_loss += loss
      # precision, recall, f1 = self.get_performance_metrics(preds=logits.detach().cpu().numpy(), labels=labels.detach().cpu().numpy())

    #   total_recall += recall
    #   total_precision += precision
    #   total_f1 += f1

    precision = total_precision/len(data_loader)
    recall = total_recall/len(data_loader)
    f1 = total_f1/len(data_loader)
    loss = total_loss/len(data_loader)

    return precision, recall, f1, loss

  def eval(self, data_loader):
    self.model.eval()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    with torch.no_grad():
      for (problem, labels) in tqdm(data_loader):
        problem = problem.squeeze(1).to(self.device)
        labels = labels.squeeze(1).to(self.device)
        output = self.model(problem, labels=labels)

        logits = output.logits
        loss = output.loss
        
        total_loss += loss
        # precision, recall, f1 = self.get_performance_metrics(preds=logits.detach().cpu().numpy(), labels=labels.detach().cpu().numpy())

        # total_recall += recall
        # total_precision += precision
        # total_f1 += f1
    
    precision = total_precision/len(data_loader)
    recall = total_recall/len(data_loader)
    f1 = total_f1/len(data_loader)
    loss = total_loss/len(data_loader)

    return precision, recall, f1, loss

  def save_transformer(self):
    self.model.save_pretrained(self.save_path)
    self.tokenizer.save_pretrained(self.save_path)

  def execute(self):
    last_best = 0
    train_dataset = DatasetLoader(self.train_data, self.tokenizer)
    train_data_loader = train_dataset.get_data_loaders(self.batch_size)
    val_dataset = DatasetLoader(self.val_data, self.tokenizer)
    val_data_loader = val_dataset.get_data_loaders(self.batch_size)
    optimizer = torch.optim.AdamW(self.model.parameters(), lr = 3e-5, eps = 1e-8)
    self.set_training_parameters()

    print("Done processing data")

    for epoch_i in range(0, self.epochs):
      train_precision, train_recall, train_f1, train_loss = self.train(train_data_loader, optimizer)
      print(f'Epoch {epoch_i + 1}: train_loss: {train_loss:.4f} train_precision: {train_precision:.4f} train_recall: {train_recall:.4f} train_f1: {train_f1:.4f}')
      val_precision, val_recall, val_f1, val_loss = self.eval(val_data_loader)
      print(f'Epoch {epoch_i + 1}: val_loss: {val_loss:.4f} val_precision: {val_precision:.4f} val_recall: {val_recall:.4f} val_f1: {val_f1:.4f}')

      if val_f1 > last_best:
        print("Saving model..")
        self.save_transformer()
        last_best = val_f1
        print("Model saved.")

In [8]:
BATCH_SIZE = 4
EPOCHS = 10
# TEST_PATH = "data/test_data.csv"
# TRAIN_PATH = "data/train_data.csv"
# VAL_PATH = "data/val_data.csv"
SAVE_PATH = "models/DistilBERT"

options = {}
options['batch_size'] = BATCH_SIZE
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = dataset['train']
options['val_data'] = dataset['validation']
options['save_path'] = SAVE_PATH + '_top_2_training'
options['epochs'] = EPOCHS
options['training_type'] = 'top_2_training'

print(options['device'])

import gc
gc.collect()
torch.cuda.empty_cache()

trainer = Trainer(options)
trainer.execute()


cuda:0
Processing data..
Processing data..
Done processing data


100%|██████████| 7460/7460 [40:20<00:00,  3.08it/s]


Epoch 1: train_loss: 1.3548 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:56<00:00,  9.60it/s]


Epoch 1: val_loss: 1.1459 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [40:57<00:00,  3.04it/s]


Epoch 2: train_loss: 1.0567 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:55<00:00,  9.71it/s]


Epoch 2: val_loss: 0.9575 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [41:13<00:00,  3.02it/s]


Epoch 3: train_loss: 0.9164 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:55<00:00,  9.68it/s]


Epoch 3: val_loss: 0.8794 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [41:00<00:00,  3.03it/s]


Epoch 4: train_loss: 0.8306 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:55<00:00,  9.66it/s]


Epoch 4: val_loss: 0.8094 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [41:03<00:00,  3.03it/s]


Epoch 5: train_loss: 0.7687 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:54<00:00,  9.74it/s]


Epoch 5: val_loss: 0.7529 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [40:57<00:00,  3.04it/s]


Epoch 6: train_loss: 0.7230 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:55<00:00,  9.69it/s]


Epoch 6: val_loss: 0.7369 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [40:55<00:00,  3.04it/s]


Epoch 7: train_loss: 0.6850 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:55<00:00,  9.71it/s]


Epoch 7: val_loss: 0.7011 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [40:21<00:00,  3.08it/s]


Epoch 8: train_loss: 0.6536 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:53<00:00,  9.86it/s]


Epoch 8: val_loss: 0.6813 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [40:19<00:00,  3.08it/s]


Epoch 9: train_loss: 0.6270 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:53<00:00,  9.85it/s]


Epoch 9: val_loss: 0.6598 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [40:18<00:00,  3.08it/s]


Epoch 10: train_loss: 0.6041 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:53<00:00,  9.84it/s]


Epoch 10: val_loss: 0.6612 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


In [9]:
trainer.save_transformer()

In [10]:
trainer.execute()

Processing data..




Processing data..
Done processing data


100%|██████████| 7460/7460 [40:55<00:00,  3.04it/s]


Epoch 1: train_loss: 0.6445 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:57<00:00,  9.56it/s]


Epoch 1: val_loss: 0.6527 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [45:00<00:00,  2.76it/s]


Epoch 2: train_loss: 0.6080 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [02:04<00:00,  8.96it/s]


Epoch 2: val_loss: 0.6263 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [45:49<00:00,  2.71it/s]


Epoch 3: train_loss: 0.5813 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [02:08<00:00,  8.71it/s]


Epoch 3: val_loss: 0.6087 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [46:14<00:00,  2.69it/s]


Epoch 4: train_loss: 0.5606 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [02:08<00:00,  8.68it/s]


Epoch 4: val_loss: 0.6072 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [45:48<00:00,  2.71it/s]


Epoch 5: train_loss: 0.5408 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:54<00:00,  9.74it/s]


Epoch 5: val_loss: 0.5903 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [42:14<00:00,  2.94it/s]


Epoch 6: train_loss: 0.5237 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:55<00:00,  9.69it/s]


Epoch 6: val_loss: 0.5913 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [43:02<00:00,  2.89it/s]


Epoch 7: train_loss: 0.5104 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:55<00:00,  9.68it/s]


Epoch 7: val_loss: 0.5795 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [41:28<00:00,  3.00it/s]


Epoch 8: train_loss: 0.4980 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:54<00:00,  9.75it/s]


Epoch 8: val_loss: 0.5804 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [41:36<00:00,  2.99it/s]


Epoch 9: train_loss: 0.4853 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [02:04<00:00,  8.98it/s]


Epoch 9: val_loss: 0.5745 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


100%|██████████| 7460/7460 [41:01<00:00,  3.03it/s]


Epoch 10: train_loss: 0.4738 train_precision: 0.0000 train_recall: 0.0000 train_f1: 0.0000


100%|██████████| 1119/1119 [01:48<00:00, 10.30it/s]


Epoch 10: val_loss: 0.5684 val_precision: 0.0000 val_recall: 0.0000 val_f1: 0.0000


In [None]:
trainer.save_transformer()