Note: Download the data files from the GitHub repo before running this notebook: https://github.com/Georgepu1/cs263-final-project/

In [None]:
# Install transformers and torchtext into the notebook environment.
!pip install transformers
!pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 27.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalli

In [None]:
# Note: first download the data from the GitHub repo, then start the data-prep phase.
import pandas as pd

# Read the train / validation / test CSVs for the SST task from the working
# directory; `map` applies pd.read_csv to each path in order.
sst_tr_dataset, sst_val_dataset, sst_test_dataset = map(
    pd.read_csv,
    ('sst_train_data.csv', 'sst_val_data.csv', 'sst_test_data.csv'),
)

# Same three splits for the CoLA task.
cola_tr_dataset, cola_val_dataset, cola_test_dataset = map(
    pd.read_csv,
    ('cola_train_data.csv', 'cola_val_data.csv', 'cola_test_data.csv'),
)

# Preview five random SST training rows (rendered as the cell output).
sst_tr_dataset.sample(5)

Unnamed: 0,sentences,labels
7681,The connected stories of Breitbart and Hanusse...,0
3008,But arriving at a particularly dark moment in ...,1
965,Majidi 's direction has never been smoother or...,1
27,They are what makes it worth the trip to the t...,1
4933,There 's already been too many of these films ...,0


In [None]:
from torchtext.vocab import GloVe, vocab
from torchtext.data.utils import get_tokenizer

# torchtext's built-in "basic_english" tokenizer; used to split raw sentences.
tokenizer = get_tokenizer("basic_english")

# 300-dimensional GloVe vectors from the '6B' set.
global_vectors = GloVe(name='6B', dim=300)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import random
import transformers
# from transformers import pipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig


# Build a torchtext Vocab over the GloVe tokens.
# `vocab()` interprets the mapping's values as token frequencies and drops
# every token whose value is below `min_freq` (default 1). The values passed
# here are GloVe row indices, so with the default the token stored at row 0
# would be discarded and every remaining token would end up one position off
# from its row in the embedding matrix. `min_freq=0` keeps all tokens.
glove_vocab = vocab(global_vectors.stoi, min_freq=0)

# Reserve index 0 for unknown tokens and make it the fallback for any
# out-of-vocabulary lookup (including the "" padding token used later).
unk_token = "<unk>"
unk_index = 0
glove_vocab.insert_token(unk_token, unk_index)
glove_vocab.set_default_index(unk_index)

# Inserting <unk> at 0 shifts every GloVe token up by one, so prepend one
# all-zero row to keep embedding row i aligned with vocabulary index i.
pretrained_embeddings = global_vectors.vectors
pretrained_embeddings = torch.cat(
    (torch.zeros(1, pretrained_embeddings.shape[1]), pretrained_embeddings)
)

In [None]:
# def tokenize_sentences(x, tokenizer, max_words = 48):
#   x = tokenizer(x)
#   if len(x) < max_words:
#     x = x + [""] * (max_words - len(x))
#   else:
#     x = x[:max_words]

#   return glove_vocab(x)

# sst_tr_dataset['sentences'] = sst_tr_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
# sst_val_dataset['sentences'] = sst_val_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
# sst_test_dataset['sentences'] = sst_test_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
# cola_tr_dataset['sentences'] = cola_tr_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
# cola_val_dataset['sentences'] = cola_val_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
# cola_test_dataset['sentences'] = cola_test_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))


In [None]:
from collections import namedtuple

# Layout of one collated multitask batch: raw text, vocabulary indices and
# targets for SST, followed by the same three fields for CoLA.
gen_batch_fields = [
    'sst_input_text',
    'sst_input_inds',
    'sst_target',
    'cola_input_text',
    'cola_input_inds',
    'cola_target',
]

# Every field defaults to None, so a batch can be built from any subset.
GenBatch = namedtuple(
    'GenBatch',
    gen_batch_fields,
    defaults=(None,) * len(gen_batch_fields),
)

In [None]:
# Dataset for LSTM + Embedding / BERT
class MultitaskDataset(torch.utils.data.Dataset):
  """Positionally paired SST / CoLA examples, one pair per index.

  Both tasks are cut to the same number of rows (the shorter task's length,
  capped at ``max_len``) and row ``i`` of SST is paired with row ``i`` of CoLA.

  Args:
    vocab: callable mapping a list of tokens to a list of indices.
    tokenizer: callable mapping a sentence to a list of tokens.
    sst_X, sst_y: SST inputs and labels (must have equal length).
    cola_X, cola_y: CoLA inputs and labels (must have equal length).
    max_len: upper bound on the number of pairs kept.
  """

  def __init__(self, vocab, tokenizer, sst_X, sst_y, cola_X, cola_y, max_len=10000):
    assert len(sst_X) == len(sst_y), "SST data not one to one"
    assert len(cola_X) == len(cola_y), "CoLA data not one to one"

    self.max_len = max_len
    self.tokenizer = tokenizer
    self.vocab = vocab
    self.data = []

    # Single cut-off shared by all four sequences.
    keep = min(len(sst_X), len(cola_X), max_len)
    self.load_data(sst_X[:keep], sst_y[:keep], cola_X[:keep], cola_y[:keep])

  def load_data(self, sst_X, sst_y, cola_X, cola_y):
    """Append one dict per aligned (SST, CoLA) pair to ``self.data``."""
    self.data.extend(
        {
            'sst_input': sst_text,
            'sst_target': sst_label,
            'cola_input': cola_text,
            'cola_target': cola_label,
        }
        for sst_text, sst_label, cola_text, cola_label in zip(sst_X, sst_y, cola_X, cola_y)
    )

  def __len__(self):
    """Number of stored pairs."""
    return len(self.data)

  def __getitem__(self, index):
    """Return the dict holding sst_input, sst_target, cola_input and cola_target."""
    return self.data[index]

  def tokenize_sentences(self, x, max_words=32):
    """Tokenize ``x``, force it to exactly ``max_words`` tokens, return vocab indices.

    Short sentences are right-padded with empty-string tokens; long ones are
    truncated.
    """
    tokens = self.tokenizer(x)
    tokens = (
        tokens[:max_words]
        if len(tokens) >= max_words
        else tokens + [""] * (max_words - len(tokens))
    )
    return self.vocab(tokens)

  def collate_fn(self, batch):
    """Turn a list of pair dicts into a ``GenBatch`` of plain Python lists."""

    def column(key):
      return [item[key] for item in batch]

    sst_text = column('sst_input')
    sst_inds = [self.tokenize_sentences(sentence) for sentence in sst_text]
    cola_text = column('cola_input')
    cola_inds = [self.tokenize_sentences(sentence) for sentence in cola_text]

    return GenBatch(
      sst_input_text=sst_text,
      sst_input_inds=sst_inds,
      sst_target=column('sst_target'),
      cola_input_text=cola_text,
      cola_input_inds=cola_inds,
      cola_target=column('cola_target'),
    )

In [None]:
# One paired SST/CoLA dataset per split, all sharing the GloVe vocabulary and
# the same tokenizer.
train_set = MultitaskDataset(
    vocab=glove_vocab, tokenizer=tokenizer,
    sst_X=sst_tr_dataset['sentences'], sst_y=sst_tr_dataset['labels'],
    cola_X=cola_tr_dataset['sentences'], cola_y=cola_tr_dataset['labels'],
)
val_set = MultitaskDataset(
    vocab=glove_vocab, tokenizer=tokenizer,
    sst_X=sst_val_dataset['sentences'], sst_y=sst_val_dataset['labels'],
    cola_X=cola_val_dataset['sentences'], cola_y=cola_val_dataset['labels'],
)
test_set = MultitaskDataset(
    vocab=glove_vocab, tokenizer=tokenizer,
    sst_X=sst_test_dataset['sentences'], sst_y=sst_test_dataset['labels'],
    cola_X=cola_test_dataset['sentences'], cola_y=cola_test_dataset['labels'],
)

In [None]:
# Batches of 8 pairs; only the training split is shuffled. Each loader uses
# its own dataset's collate_fn to tokenize and pad the sentences.
train_loader = DataLoader(
    dataset=train_set, batch_size=8, shuffle=True,
    collate_fn=train_set.collate_fn,
)
val_loader = DataLoader(
    dataset=val_set, batch_size=8, shuffle=False,
    collate_fn=val_set.collate_fn,
)
test_loader = DataLoader(
    dataset=test_set, batch_size=8, shuffle=False,
    collate_fn=test_set.collate_fn,
)

In [None]:
import torch.nn.functional as F

class MultiTaskLSTM(torch.nn.Module):
  """Shared embedding + bidirectional LSTM encoder with one 2-way head per task.

  Args:
    pretrain_emb: 2-D float tensor of pretrained embeddings (rows = vocabulary
      indices). Loaded through ``from_pretrained``, which keeps the weights
      frozen by default.
    emb_dim: width of one embedding row (LSTM input size).
    hidden_dim: LSTM hidden size per direction.
  """

  def __init__(self, pretrain_emb, emb_dim, hidden_dim):
    super().__init__()
    # Per-token lookup. The previous nn.EmbeddingBag pooled each sentence into
    # a single vector, so the LSTM received a 2-D tensor, treated the batch as
    # one unbatched sequence and produced one prediction per batch instead of
    # one per sentence.
    self.embeddings = nn.Embedding.from_pretrained(pretrain_emb)
    self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
    # linears[0] and linears[1] are the two task-specific classification heads.
    self.linears = nn.ModuleList([nn.Linear(hidden_dim, 2) for _ in range(2)])
    self.dropout = nn.Dropout(0.2)
    self.init_weights()

  def init_weights(self):
    """Uniform(-0.5, 0.5) weights and zero biases for both task heads."""
    initrange = 0.5
    for head in self.linears:
      head.weight.data.uniform_(-initrange, initrange)
      head.bias.data.zero_()

  def forward(self, x, ind, offset=None):
    """Score a batch of index sequences with the head selected by ``ind``.

    Args:
      x: token indices, shape (batch, seq_len); a tensor or nested list.
        A 1-D input is treated as a single sentence.
      ind: index of the task head to use (0 or 1).
      offset: unused; accepted only so existing call sites that pass an
        EmbeddingBag-style offsets argument keep working.

    Returns:
      Tensor of shape (batch, 2) holding raw (unnormalized) logits. They are
      meant to be fed directly to ``nn.CrossEntropyLoss``, which applies
      log-softmax itself; apply ``softmax`` to the result if probabilities
      are needed.
    """
    x = torch.as_tensor(x, dtype=torch.long, device=self.embeddings.weight.device)
    if x.dim() == 1:
      x = x.unsqueeze(0)

    embedded = self.dropout(self.embeddings(x))  # (batch, seq_len, emb_dim)
    # x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
    _, (ht, _) = self.lstm(embedded)             # ht: (2, batch, hidden_dim)
    # ht[-1] is the final hidden state of the last (backward) direction.
    return self.linears[ind](ht[-1])

# emb_dim=300 matches the GloVe vectors requested with dim=300; hidden_dim=64
# is the LSTM hidden size.
model = MultiTaskLSTM(pretrain_emb=pretrained_embeddings, emb_dim=300, hidden_dim=64)

In [None]:
# The same cross-entropy criterion is used for both task heads.
criterion = nn.CrossEntropyLoss()

learning_rate = 3e-4
# optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate)
# TODO: maybe need to truncate and clip gradient
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

def eval_metrics(model, dl, criterion, offset_input=None):
  """Evaluate both task heads of ``model`` over every batch in ``dl``.

  Args:
    model: module called as ``model(inputs, head_index, offset_input)``;
      head 0 is scored against the SST fields, head 1 against the CoLA fields.
    dl: iterable of batches exposing ``sst_input_inds``, ``sst_target``,
      ``cola_input_inds`` and ``cola_target``.
    criterion: loss callable taking ``(predictions, targets)``.
    offset_input: forwarded unchanged as the model's third argument.

  Returns:
    ``(sst_acc, sst_loss, cola_acc, cola_loss)`` as Python floats. Accuracies
    are correct predictions divided by the number of samples seen. Losses are
    the sum of the per-batch criterion values divided by that same sample
    count. All four are 0.0 when ``dl`` yields no samples.
  """
  # Remember the current mode so evaluating mid-training does not leave the
  # model stuck in eval mode (which would silently disable dropout).
  was_training = model.training
  model.eval()

  sst_correct = 0
  cola_correct = 0
  sst_loss_sum = 0.0
  cola_loss_sum = 0.0
  total = 0

  try:
    with torch.no_grad():  # no gradients are needed for evaluation
      for sample in dl:
        # The collate function yields plain lists; convert before the forward pass.
        sst_inputs = torch.as_tensor(sample.sst_input_inds, dtype=torch.long)
        cola_inputs = torch.as_tensor(sample.cola_input_inds, dtype=torch.long)
        sample_sst_target = torch.as_tensor(sample.sst_target, dtype=torch.long).reshape(-1)
        sample_cola_target = torch.as_tensor(sample.cola_target, dtype=torch.long).reshape(-1)

        y1_pred = model(sst_inputs, 0, offset_input).reshape(-1, 2)
        y2_pred = model(cola_inputs, 1, offset_input).reshape(-1, 2)

        # Accumulate into dedicated sums; the per-batch loss must not
        # overwrite the running totals.
        sst_loss_sum += criterion(y1_pred, sample_sst_target).item()
        cola_loss_sum += criterion(y2_pred, sample_cola_target).item()

        sst_correct += torch.eq(sample_sst_target, y1_pred.argmax(1)).sum().item()
        cola_correct += torch.eq(sample_cola_target, y2_pred.argmax(1)).sum().item()
        total += y1_pred.shape[0]
  finally:
    model.train(was_training)

  if total == 0:
    return 0.0, 0.0, 0.0, 0.0

  return sst_correct / total, sst_loss_sum / total, cola_correct / total, cola_loss_sum / total

In [None]:
from tqdm import tqdm
epochs = 10

# Every batch is a 2-D (batch, max_words) index tensor, so no embedding
# offsets are supplied; the value is still threaded through the model calls
# and eval_metrics so there is a single place to change it.
offset_input = None

for _ in range(epochs):
  # eval_metrics() puts the model in eval mode; switch back at the start of
  # every epoch so dropout is active while training.
  model.train()

  tr_count = 0
  sst_tr_acc = 0
  sst_tr_loss = 0
  cola_tr_acc = 0
  cola_tr_loss = 0
  for sample in tqdm(train_loader):
    # Note: the tensors could also be moved to a device (e.g. cuda) here.
    optimizer.zero_grad()

    model_sst_inp = torch.tensor(sample.sst_input_inds)
    model_cola_inp = torch.tensor(sample.cola_input_inds)
    y1_pred = model(model_sst_inp, 0, offset_input).reshape(-1, 2)
    y2_pred = model(model_cola_inp, 1, offset_input).reshape(-1, 2)
    sample_sst_target = torch.tensor(sample.sst_target).type(torch.long).reshape(-1)
    sample_cola_target = torch.tensor(sample.cola_target).type(torch.long).reshape(-1)

    sst_loss = criterion(y1_pred, sample_sst_target)
    cola_loss = criterion(y2_pred, sample_cola_target)
    # Multitask loss: unweighted sum of the two task losses.
    loss = sst_loss + cola_loss
    loss.backward()

    # Planned Lipschitz-regularized loss (not implemented yet):
    # - Keep the embedding of x as an intermediate variable and compute the
    #   gradient of the model output with respect to it.
    # - Penalize the MSE between that gradient's norm and 1.0, passing the
    #   difference through a ReLU before squaring so that norms below 1.0
    #   do not contribute.
    # - To avoid overfitting, the regularization term can be computed on a
    #   perturbed version of x instead of the original (e.g. Gaussian noise
    #   added around the embeddings of x); the discrepancy between the two can
    #   also be minimized for model robustness.
    optimizer.step()

    sst_tr_acc += (torch.eq(sample_sst_target, y1_pred.argmax(1))).sum().item()
    cola_tr_acc += (torch.eq(sample_cola_target, y2_pred.argmax(1))).sum().item()
    sst_tr_loss += sst_loss.item()
    cola_tr_loss += cola_loss.item()
    tr_count += y1_pred.shape[0]

  # Guard against an empty training loader so the summary below cannot divide by zero.
  tr_denominator = max(tr_count, 1)
  print("SST Train accuracy: {}, CoLA Train accuracy: {}, SST Train loss: {}, CoLA Train loss: {}".format(sst_tr_acc/tr_denominator, cola_tr_acc/tr_denominator, sst_tr_loss/tr_denominator, cola_tr_loss/tr_denominator))
  # eval_metrics takes the offsets as its fourth argument; omitting it raised a TypeError.
  val_sst_acc, val_sst_loss, val_cola_acc, val_cola_loss = eval_metrics(model, val_loader, criterion, offset_input)
  print("SST Val accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(val_sst_acc, val_cola_acc, val_sst_loss, val_cola_loss))

  0%|          | 0/962 [00:00<?, ?it/s]

torch.Size([8, 32])
torch.Size([8, 32])
inside forward
torch.Size([8, 32])
torch.Size([8, 300])
torch.Size([8, 300])
done with dropout
inside forward
torch.Size([8, 32])
torch.Size([8, 300])
torch.Size([8, 300])
done with dropout
torch.Size([1, 2])
torch.Size([8])





ValueError: ignored

In [None]:
# Final evaluation on the held-out test split. The previous version scored
# val_loader while reporting the numbers as test metrics, and omitted
# eval_metrics' fourth (offsets) argument.
test_sst_acc, test_sst_loss, test_cola_acc, test_cola_loss = eval_metrics(model, test_loader, criterion, None)
print("SST Test accuracy: {}, CoLA Test accuracy: {}, SST Test loss: {}, CoLA Test loss: {} ".format(test_sst_acc, test_cola_acc, test_sst_loss, test_cola_loss))

SST Test accuracy: 0.6063084112149533, CoLA Val accuracy: 0.6273364485981309, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.0034176348708570004 


### TODO: Adversarial attacks and Robustness Evaluation

In [None]:
def attack():
  """Placeholder for the planned adversarial-attack routine; does nothing yet."""
  return None