Note: Get the data from the github repo: https://github.com/Georgepu1/cs263-final-project/

- Good resource: https://colab.research.google.com/github/zphang/zphang.github.io/blob/master/files/notebooks/Multi_task_Training_with_Transformers_NLP.ipynb#scrollTo=LlICaYzQan59



In [None]:
!pip install transformers
!pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Note first get the data from the github to start data prep phase
import pandas as pd

sst_tr_dataset = pd.read_csv('sst_train_data.csv')
sst_val_dataset = pd.read_csv('sst_val_data.csv')
sst_test_dataset = pd.read_csv('sst_test_data.csv')

cola_tr_dataset = pd.read_csv('cola_train_data.csv')
cola_val_dataset = pd.read_csv('cola_val_data.csv')
cola_test_dataset = pd.read_csv('cola_test_data.csv')

sst_tr_dataset.sample(5)

Unnamed: 0,sentences,labels
6542,A big meal of cliches that the talented cast g...,0
1714,Films are made of little moments .,1
7101,Chelsea Walls is a case of too many chefs fuss...,0
7210,( Jackson and Bledel ) seem to have been picke...,0
5597,"Despite the pyrotechnics , Narc is strictly by...",0


In [None]:
from transformers import BertTokenizer, BertModel
import torch

model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import random
import transformers
from transformers import pipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
from transformers import AutoConfig, AutoModelForPreTraining
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from collections import namedtuple


In [None]:
from collections import namedtuple

gen_batch_fields = ['sst_input_text', 'sst_id_text', 'sst_attention_mask', 'sst_target', 'cola_input_text', 'cola_id_text', 'cola_attention_mask', 'cola_target']
GenBatch = namedtuple('GenBatch', field_names=gen_batch_fields, defaults=[None] * len(gen_batch_fields))

In [None]:
# Dataset for LSTM + Embedding / BERT
class MultitaskDataset(torch.utils.data.Dataset):
  def __init__(self, tokenizer, sst_X, sst_y, cola_X, cola_y, max_len=10000, max_output_length=64):
    assert len(sst_X) == len(sst_y), "SST data not one to one"
    assert len(cola_X) == len(cola_y), "CoLA data not one to one"

    self.max_len = max_len
    self.tokenizer = tokenizer
    self.max_output_length = max_output_length
    min_samples = min(len(sst_X), len(cola_X))
    self.data = []
    sst_X = sst_X[:min(min_samples, max_len)]
    sst_y = sst_y[:min(min_samples, max_len)]
    cola_X = cola_X[:min(min_samples, max_len)]
    cola_y = cola_y[:min(min_samples, max_len)]
    self.load_data(sst_X, sst_y, cola_X, cola_y)

  def load_data(self, sst_X, sst_y, cola_X, cola_y):
    for (cur_sst_X, cur_sst_y, cur_cola_X, cur_cola_y) in zip(sst_X, sst_y, cola_X, cola_y):
      self.data.append({
          'sst_input': cur_sst_X, 
          'sst_target': cur_sst_y, 
          'cola_input': cur_cola_X, 
          'cola_target': cur_cola_y,
      })

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
      # Get element consisting of sst_X, sst_y, cola_X, and cola_y
    return self.data[index]
  
  def collate_fn(self, batch):
    sst_input_text = [x['sst_input'] for x in batch]
    sst_inputs = self.tokenizer(sst_input_text, return_tensors="pt", padding=True, max_length=self.max_output_length)
    cola_input_text = [x['cola_input'] for x in batch]
    cola_inputs = self.tokenizer(cola_input_text, return_tensors="pt", padding=True, max_length=self.max_output_length)

    return GenBatch(
      sst_input_text=sst_input_text,
      sst_id_text=sst_inputs['input_ids'],
      sst_attention_mask=sst_inputs['attention_mask'],
      sst_target=[x['sst_target'] for x in batch],
      cola_input_text=cola_input_text,
      cola_id_text=cola_inputs['input_ids'],
      cola_attention_mask=cola_inputs['attention_mask'],
      cola_target=[x['cola_target'] for x in batch]
    )

In [None]:
train_set = MultitaskDataset(tokenizer, sst_tr_dataset['sentences'], sst_tr_dataset['labels'], 
                             cola_tr_dataset['sentences'], cola_tr_dataset['labels'])
val_set = MultitaskDataset(tokenizer, sst_val_dataset['sentences'], sst_val_dataset['labels'], 
                           cola_val_dataset['sentences'], cola_val_dataset['labels'])
test_set = MultitaskDataset(tokenizer, sst_test_dataset['sentences'], sst_test_dataset['labels'], 
                             cola_test_dataset['sentences'], cola_test_dataset['labels'])
train_loader = DataLoader(train_set, shuffle=True, batch_size=8,
                          collate_fn=train_set.collate_fn)
val_loader  = DataLoader(val_set, shuffle=False, batch_size=8,
                          collate_fn=val_set.collate_fn)
test_loader  = DataLoader(test_set, shuffle=False, batch_size=8,
                          collate_fn=test_set.collate_fn)


In [None]:
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt", padding=True, max_length=64)
# inputs.keys()
# outputs = model(**inputs)

# last_hidden_states = outputs.last_hidden_state

In [None]:
import torch.nn.functional as F

class MultitaskBert(torch.nn.Module):
  def __init__(self, model_name, tokenizer, num_labels=2, hidden_size=768):
    super().__init__()

    self.num_labels = num_labels
    self.tokenizer = tokenizer
    self.bert = BertModel.from_pretrained(model_name)
    # self.dropout = nn.Dropout(.1)
    self.classifier = nn.ModuleList([nn.Linear(hidden_size, self.num_labels) for i in range(2)])
    self.init_weights()

  def init_weights(self):
    initrange = 0.5
    self.classifier[0].weight.data.uniform_(-initrange, initrange)
    self.classifier[1].weight.data.uniform_(-initrange, initrange)
    self.classifier[0].bias.data.zero_()
    self.classifier[1].bias.data.zero_()

  def forward(self, input_ids, attention_mask, ind):
    outputs = self.bert(
      input_ids,
      attention_mask=attention_mask
    )

    pooled_output = outputs[1]
    # Note BERT Model already applies dropout in output
    # pooled_output = self.dropout(pooled_output)
    logits = self.classifier[ind](pooled_output)
    # print(logits.shape)
    return F.softmax(logits, dim=-1)

In [None]:
from transformers import AdamW

model = MultitaskBert(model_name, tokenizer)
optimizer = AdamW(model.parameters())
criterion = nn.CrossEntropyLoss()

def eval_metrics(model, dl, criterion):
  model.eval()
  sst_acc = 0
  sst_loss = 0
  cola_acc = 0
  cola_loss = 0
  total = 0

  with torch.no_grad():
    for sample in dl:
      y1_pred = model(sample.sst_id_text, sample.sst_attention_mask, 0).reshape(-1, 2)
      y2_pred = model(sample.cola_id_text, sample.cola_attention_mask, 1).reshape(-1, 2)
      sample_sst_target = torch.tensor(sample.sst_target).type(torch.long).reshape(-1)
      sample_cola_target = torch.tensor(sample.cola_target).type(torch.long).reshape(-1)

      sst_loss = criterion(y1_pred, sample_sst_target)
      cola_loss = criterion(y2_pred, sample_cola_target)
      
      # print('Length: {}'.format(y1_pred.shape[0]))
      sst_acc += (torch.eq(sample_sst_target,y1_pred.argmax(1))).sum().item()
      cola_acc += (torch.eq(sample_cola_target, y2_pred.argmax(1))).sum().item()
      sst_loss += sst_loss.item()
      cola_loss += cola_loss.item()
      total += y1_pred.shape[0]

  return sst_acc/total, sst_loss/total, cola_acc/total, cola_loss/total

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# m = nn.Sigmoid()
# loss = nn.BCELoss()
# input = torch.randn(3, requires_grad=True)
# target = torch.empty(3).random_(2)

# output = loss(m(input), target)
# output.backward()
# print(input.shape, target.shape)

In [None]:
from tqdm import tqdm
epochs = 10

for _ in range(epochs):
  
  tr_count = 0
  sst_tr_acc = 0
  sst_tr_loss = 0
  cola_tr_acc = 0
  cola_tr_loss = 0
  
  model.train()
  for sample in tqdm(train_loader):
    # Note can also set the data to a decide (cuda)
    model.zero_grad()
    
    y1_pred = model(sample.sst_id_text, sample.sst_attention_mask, 0).reshape(-1, 2)
    y2_pred = model(sample.cola_id_text, sample.cola_attention_mask, 1).reshape(-1, 2)
    sample_sst_target = torch.tensor(sample.sst_target).type(torch.long).reshape(-1)
    sample_cola_target = torch.tensor(sample.cola_target).type(torch.long).reshape(-1)

    sst_loss = criterion(y1_pred, sample_sst_target)
    cola_loss = criterion(y2_pred, sample_cola_target)
    # multitask loss
    loss = sst_loss + cola_loss
    loss.backward()
    # intermediate variabels stores embedding of x and computes
    # m_output.grad w.r.t. this and calculate the MSE of the m_output.grad(emb_x)
    # norm and 1.0 and use relu on; before you do the square, pass it through a relu
    # so everything les than 1.0 wont be counted to the square.
    # MSE (m_output.grad(x), 1.0) calulate gradient of M output w.r.t. x's embedding space
    # To avoid overfitting, calculating regularization term can use a varied version of x
    # instead of the original (e.g. add gaussian noise around embeddings of x); can
    # also minimize discrepancy on the two for robustness of model
    # Lipschitz-regularized loss
    optimizer.step()
    # print('Length: {}'.format(y1_pred.shape[0]))
    sst_tr_acc += (torch.eq(sample_sst_target,y1_pred.argmax(1))).sum().item()
    cola_tr_acc += (torch.eq(sample_cola_target, y2_pred.argmax(1))).sum().item()
    sst_tr_loss += sst_loss.item()
    cola_tr_loss += cola_loss.item()
    tr_count += y1_pred.shape[0]
    
  print("SST Train accuracy: {}, CoLA Train accuracy: {}, SST Train loss: {}, CoLA Train loss: {}".format(sst_tr_acc/tr_count, cola_tr_acc/tr_count, sst_tr_loss/tr_count, cola_tr_loss/tr_count))
  val_sst_acc, val_sst_loss, val_cola_acc, val_cola_loss = eval_metrics(model, val_loader, criterion)
  print("SST Val accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(val_sst_acc, val_cola_acc, val_sst_loss, val_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
  0%|          | 2/962 [00:15<2:04:03,  7.75s/it]


KeyboardInterrupt: ignored

In [None]:
# learning_rate = 3e-4
# criterion = nn.BCELoss()
# # optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate)
# optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
# offset_input = torch.tensor([0])


In [None]:
test_sst_acc, test_sst_loss, test_cola_acc, test_cola_loss = eval_metrics(model, val_loader, criterion)
print("SST Test accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(test_sst_acc, test_cola_acc, test_sst_loss, test_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


SST Test accuracy: 0.39369158878504673, CoLA Val accuracy: 0.2850467289719626, SST Val loss: 0.0007324839825741947, CoLA Val loss: 0.0018050861544907093 


In [None]:
sst_tr_dataset.head()

Unnamed: 0,sentences,labels
0,The Rock is destined to be the 21st Century 's...,1
1,The gorgeously elaborate continuation of `` Th...,1
2,Singer\/composer Bryan Adams contributes a sle...,1
3,You 'd think by now America would have had eno...,0
4,Yet the act is still charming here .,1


In [None]:
test_sst_acc, test_sst_loss, test_cola_acc, test_cola_loss = eval_metrics(model, val_loader, criterion)
print("SST Test accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(test_sst_acc, test_cola_acc, test_sst_loss, test_cola_loss))

SST Test accuracy: 0.6063084112149533, CoLA Val accuracy: 0.6273364485981309, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.0034176348708570004 


### TODO: Adversarial attacks and Robustness Evaluation

In [None]:
def attack():
  pass