In [28]:
!pip install transformers
!pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [77]:
# Note first get the data from the github to start data prep phase
import pandas as pd

sst_tr_dataset = pd.read_csv('sst_train_data.csv')
sst_val_dataset = pd.read_csv('sst_val_data.csv')
sst_test_dataset = pd.read_csv('sst_test_data.csv')

cola_tr_dataset = pd.read_csv('cola_train_data.csv')
cola_val_dataset = pd.read_csv('cola_val_data.csv')
cola_test_dataset = pd.read_csv('cola_test_data.csv')

sst_tr_dataset.sample(5)

Unnamed: 0,sentences,labels
3847,"In his latest effort , Storytelling , Solondz ...",1
3264,Boasts a handful of virtuosic set pieces and o...,1
2421,The leanest and meanest of Solondz 's misanthr...,1
1896,This is SO De Palma .,1
7202,"For every articulate player , such as skateboa...",0


In [67]:
from torchtext.vocab import GloVe, vocab
global_vectors = GloVe(name='6B', dim=300)

In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import random
import transformers
from transformers import pipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
from transformers import AutoConfig, AutoModelForPreTraining
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from collections import namedtuple


glove_vocab = vocab(global_vectors.stoi)
unk_token = "<unk>"
unk_index = 0
glove_vocab.insert_token("<unk>", unk_index)
glove_vocab.set_default_index(unk_index)
pretrained_embeddings = global_vectors.vectors
pretrained_embeddings = torch.cat((torch.zeros(1,pretrained_embeddings.shape[1]),pretrained_embeddings))

In [83]:
# max_words = 48

# test = tokenizer(sst_tr_dataset['sentences'][0])
# test = test + [""] * (max_words - len(test))
# glove_vocab(test)

In [84]:
def tokenize_sentences(x, tokenizer, max_words = 48):
  x = tokenizer(x)
  if len(x) < max_words:
    x = x + [""] * (max_words - len(x))
  else:
    x = x[:max_words]

  return glove_vocab(x)

sst_tr_dataset['sentences'] = sst_tr_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
sst_val_dataset['sentences'] = sst_val_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
sst_test_dataset['sentences'] = sst_test_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
cola_tr_dataset['sentences'] = cola_tr_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
cola_val_dataset['sentences'] = cola_val_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))
cola_test_dataset['sentences'] = cola_test_dataset['sentences'].apply(lambda x: tokenize_sentences(x, tokenizer))


In [86]:
# Dataset for LSTM + Embedding / BERT
class MultitaskDataset(torch.utils.data.Dataset):
  def __init__(self, sst_X, sst_y, cola_X, cola_y, max_len=10000):
    assert len(sst_X) == len(sst_y), "SST data not one to one"
    assert len(cola_X) == len(cola_y), "CoLA data not one to one"

    self.max_len = max_len

    min_samples = min(len(sst_X), len(cola_X))
    sst_X = sst_X[:min(min_samples, max_len)]
    sst_y = sst_y[:min(min_samples, max_len)]
    cola_X = cola_X[:min(min_samples, max_len)]
    cola_y = cola_y[:min(min_samples, max_len)]

    self.sst_X = sst_X
    self.sst_y = sst_y
    self.cola_X = cola_X
    self.cola_y = cola_y

  def __len__(self):
    return len(self.sst_X)

  def __getitem__(self, index):
      # Get element consisting of sst_X, sst_y, cola_X, and cola_y
    return (self.sst_X[index], self.sst_y[index], self.cola_X[index], self.cola_y[index])

In [94]:
train_set = MultitaskDataset(sst_tr_dataset['sentences'], sst_tr_dataset['labels'], 
                             cola_tr_dataset['sentences'], cola_tr_dataset['labels'])
val_set = MultitaskDataset(sst_val_dataset['sentences'], sst_val_dataset['labels'], 
                           cola_val_dataset['sentences'], cola_val_dataset['labels'])
test_set = MultitaskDataset(sst_test_dataset['sentences'], sst_test_dataset['labels'], 
                             cola_test_dataset['sentences'], cola_test_dataset['labels'])

In [95]:
train_loader = DataLoader(train_set, shuffle=True)
val_loader  = DataLoader(val_set, shuffle=False)
test_loader  = DataLoader(test_set, shuffle=False)


In [117]:
class MultiTaskLSTM(torch.nn.Module):
  def __init__(self, pretrain_emb, emb_dim, hidden_dim):
    super().__init__()
    self.embeddings = nn.EmbeddingBag.from_pretrained(pretrain_emb)
    self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
    self.linears = nn.ModuleList([nn.Linear(hidden_dim, 1) for i in range(2)])
    self.dropout = nn.Dropout(0.2)
    self.init_weights()

  def init_weights(self):
    initrange = 0.5
    self.linears[0].weight.data.uniform_(-initrange, initrange)
    self.linears[1].weight.data.uniform_(-initrange, initrange)
    self.linears[0].bias.data.zero_()
    self.linears[1].bias.data.zero_()

  def forward(self, x, ind, offset):
    x = self.embeddings(x, offset)
    x = self.dropout(x)
    # x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
    lstm_out, (ht, ct) = self.lstm(x)
    return torch.sigmoid(self.linears[ind](ht[-1]))

model = MultiTaskLSTM(pretrained_embeddings, 300, 64)

In [118]:
learning_rate = 3e-4
criterion = nn.BCELoss()
# optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
offset_input = torch.tensor([0])


In [135]:
def eval_metrics(model, dl, criterion):
  model.eval()
  sst_acc = 0
  sst_loss = 0
  cola_acc = 0
  cola_loss = 0
  total = 0

  for (x1, y1, x2, y2) in dl:
    # Note can also set the data to a decide (cuda)
    x1 = torch.LongTensor(x1)
    x2 = torch.LongTensor(x2)
    y1 = y1.type(torch.float)
    y2 = y2.type(torch.float)
    y1_pred = model(x1, 0, offset_input)
    y2_pred = model(x2, 1, offset_input)
    sst_loss = criterion(y1_pred, y1)
    cola_loss = criterion(y2_pred, y2)
    # multitask loss
    loss = sst_loss + cola_loss
    loss.backward()
    
    optimizer.step()
    sst_acc += (round(y1_pred.item()) == y1.item())
    cola_acc += (round(y2_pred.item()) == y2.item())
    sst_loss += sst_loss.item()
    cola_loss += cola_loss.item()
    total += 1

  return sst_acc/total, sst_loss/total, cola_acc/total, cola_loss/total

In [136]:
from tqdm import tqdm
epochs = 10

for _ in range(epochs):
  
  tr_count = 0
  sst_tr_acc = 0
  sst_tr_loss = 0
  cola_tr_acc = 0
  cola_tr_loss = 0
  for (x1, y1, x2, y2) in tqdm(train_loader):
    # Note can also set the data to a decide (cuda)
    model.zero_grad()
    
    x1 = torch.LongTensor(x1)
    x2 = torch.LongTensor(x2)
    y1 = y1.type(torch.float)
    y2 = y2.type(torch.float)
    y1_pred = model(x1, 0, offset_input)
    y2_pred = model(x2, 1, offset_input)
    sst_loss = criterion(y1_pred, y1)
    cola_loss = criterion(y2_pred, y2)
    # multitask loss
    loss = sst_loss + cola_loss
    loss.backward()
    # intermediate variabels stores embedding of x and computes
    # m_output.grad w.r.t. this and calculate the MSE of the m_output.grad(emb_x)
    # norm and 1.0 and use relu on; before you do the square, pass it through a relu
    # so everything les than 1.0 wont be counted to the square.
    # MSE (m_output.grad(x), 1.0) calulate gradient of M output w.r.t. x's embedding space
    # To avoid overfitting, calculating regularization term can use a varied version of x
    # instead of the original (e.g. add gaussian noise around embeddings of x); can
    # also minimize discrepancy on the two for robustness of model
    # Lipschitz-regularized loss
    optimizer.step()
    sst_tr_acc += (round(y1_pred.item()) == y1.item())
    cola_tr_acc += (round(y2_pred.item()) == y2.item())
    sst_tr_loss += sst_loss.item()
    cola_tr_loss += cola_loss.item()
    tr_count += 1
    
  print("SST Train accuracy: {}, CoLA Train accuracy: {}, SST Train loss: {}, CoLA Train loss: {}".format(sst_tr_acc/tr_count, cola_tr_acc/tr_count, sst_tr_loss/tr_count, cola_tr_loss/tr_count))
  val_sst_acc, val_sst_loss, val_cola_acc, val_cola_loss = eval_metrics(model, val_loader, criterion)
  print("SST Val accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(val_sst_acc, val_cola_acc, val_sst_loss, val_cola_loss))

100%|██████████| 7695/7695 [01:02<00:00, 122.25it/s]


SST Train accuracy: 0.5316439246263808, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 0.6909473202226997, CoLA Train loss: 0.6087731322930803
SST Val accuracy: 0.8002336448598131, CoLA Val accuracy: 0.6740654205607477, SST Val loss: 9.163697001213222e-08, CoLA Val loss: 0.0003434980462770909 


100%|██████████| 7695/7695 [00:36<00:00, 212.74it/s]


SST Train accuracy: 0.46835607537361923, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 4.020406118157929, CoLA Train loss: 0.6594015607902026
SST Val accuracy: 0.5467289719626168, CoLA Val accuracy: 0.719626168224299, SST Val loss: 1.462268084395646e-08, CoLA Val loss: 0.0008913925266824663 


100%|██████████| 7695/7695 [00:38<00:00, 200.63it/s]


SST Train accuracy: 0.46835607537361923, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 4.076741085710828, CoLA Train loss: 0.609083831612362
SST Val accuracy: 0.5163551401869159, CoLA Val accuracy: 0.6647196261682243, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.00032364667276851833 


100%|██████████| 7695/7695 [00:35<00:00, 214.91it/s]


SST Train accuracy: 0.5316439246263808, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 46.83560753736192, CoLA Train loss: 0.6717794362990557
SST Val accuracy: 0.6063084112149533, CoLA Val accuracy: 0.719626168224299, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.0005660622846335173 


100%|██████████| 7695/7695 [00:36<00:00, 212.16it/s]


SST Train accuracy: 0.5316439246263808, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 46.83560753736192, CoLA Train loss: 0.6200647887256411
SST Val accuracy: 0.6063084112149533, CoLA Val accuracy: 0.6845794392523364, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.0006787392776459455 


100%|██████████| 7695/7695 [00:38<00:00, 201.27it/s]


SST Train accuracy: 0.5316439246263808, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 46.83560753736192, CoLA Train loss: 0.6121031098895603
SST Val accuracy: 0.6063084112149533, CoLA Val accuracy: 0.6682242990654206, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.000813046470284462 


100%|██████████| 7695/7695 [00:36<00:00, 212.02it/s]


SST Train accuracy: 0.5316439246263808, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 46.83560753736192, CoLA Train loss: 0.6087891669572368
SST Val accuracy: 0.6063084112149533, CoLA Val accuracy: 0.6647196261682243, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.0009991888655349612 


100%|██████████| 7695/7695 [00:36<00:00, 211.75it/s]


SST Train accuracy: 0.5316439246263808, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 46.83560753736192, CoLA Train loss: 0.6120928503500324
SST Val accuracy: 0.6063084112149533, CoLA Val accuracy: 0.6448598130841121, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.0011887570144608617 


100%|██████████| 7695/7695 [00:36<00:00, 210.28it/s]


SST Train accuracy: 0.5316439246263808, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 46.83560753736192, CoLA Train loss: 0.6220966569861474
SST Val accuracy: 0.6063084112149533, CoLA Val accuracy: 0.6343457943925234, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.0013979682698845863 


100%|██████████| 7695/7695 [00:36<00:00, 212.55it/s]


SST Train accuracy: 0.5316439246263808, CoLA Train accuracy: 0.7026640675763483, SST Train loss: 46.83560753736192, CoLA Train loss: 0.6380222904132201
SST Val accuracy: 0.6063084112149533, CoLA Val accuracy: 0.6273364485981309, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.0016023971838876605 


In [137]:
test_sst_acc, test_sst_loss, test_cola_acc, test_cola_loss = eval_metrics(model, val_loader, criterion)
print("SST Test accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(test_sst_acc, test_cola_acc, test_sst_loss, test_cola_loss))

SST Test accuracy: 0.6063084112149533, CoLA Val accuracy: 0.6273364485981309, SST Val loss: 0.23364485800266266, CoLA Val loss: 0.0034176348708570004 


### TODO: Adversarial attacks and Robustness Evaluation

In [None]:
def attack():
  pass