Note: Get the data from the github repo: https://github.com/Georgepu1/cs263-final-project/

- Good resource: https://colab.research.google.com/github/zphang/zphang.github.io/blob/master/files/notebooks/Multi_task_Training_with_Transformers_NLP.ipynb#scrollTo=LlICaYzQan59



# **Data preprocessing stage**

In [None]:
!pip install transformers
!pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 6.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 17.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

In [None]:
!pwd

/content


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd '/content/drive/Shareddrives/CS263/Final Project'

/content/drive/Shareddrives/CS263/Final Project


In [None]:
import torch
device = torch.device("cuda:0")

In [None]:
# Note first get the data from the github to start data prep phase
import pandas as pd

sst_tr_dataset = pd.read_csv('sst_train_data.csv')
sst_val_dataset = pd.read_csv('sst_val_data.csv')
sst_test_dataset = pd.read_csv('sst_test_data.csv')

cola_tr_dataset = pd.read_csv('cola_train_data.csv')
cola_val_dataset = pd.read_csv('cola_val_data.csv')
cola_test_dataset = pd.read_csv('cola_test_data.csv')

sst_tr_dataset.sample(5)

Unnamed: 0,sentences,labels
1345,Writer\/director Walter Hill is in his hyperma...,1
305,Its sheer dynamism is infectious .,1
4822,"Witless , pointless , tasteless and idiotic .",0
4006,The gags that fly at such a furiously funny pa...,1
6283,"but rather , ` How can you charge money for th...",0


In [None]:
#sst_tr_dataset.to(device)

In [None]:
from transformers import BertTokenizer, BertModel
import torch

model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import random
import transformers
from transformers import pipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
from transformers import AutoConfig, AutoModelForPreTraining
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from collections import namedtuple


In [None]:
from collections import namedtuple

gen_batch_fields = ['sst_input_text', 'sst_id_text', 'sst_attention_mask', 'sst_target', 'cola_input_text', 'cola_id_text', 'cola_attention_mask', 'cola_target']
GenBatch = namedtuple('GenBatch', field_names=gen_batch_fields, defaults=[None] * len(gen_batch_fields))

In [None]:
# Dataset for LSTM + Embedding / BERT
class MultitaskDataset(torch.utils.data.Dataset):
  def __init__(self, tokenizer, sst_X, sst_y, cola_X, cola_y, max_len=10000, max_output_length=64):
    assert len(sst_X) == len(sst_y), "SST data not one to one"
    assert len(cola_X) == len(cola_y), "CoLA data not one to one"

    self.max_len = max_len
    self.tokenizer = tokenizer
    self.max_output_length = max_output_length
    min_samples = min(len(sst_X), len(cola_X))
    self.data = []
    sst_X = sst_X[:min(min_samples, max_len)]
    sst_y = sst_y[:min(min_samples, max_len)]
    cola_X = cola_X[:min(min_samples, max_len)]
    cola_y = cola_y[:min(min_samples, max_len)]
    self.load_data(sst_X, sst_y, cola_X, cola_y)

  def load_data(self, sst_X, sst_y, cola_X, cola_y):
    for (cur_sst_X, cur_sst_y, cur_cola_X, cur_cola_y) in zip(sst_X, sst_y, cola_X, cola_y):
      self.data.append({
          'sst_input': cur_sst_X, 
          'sst_target': cur_sst_y, 
          'cola_input': cur_cola_X, 
          'cola_target': cur_cola_y,
      })

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
      # Get element consisting of sst_X, sst_y, cola_X, and cola_y
    return self.data[index]
  
  def collate_fn(self, batch):
    sst_input_text = [x['sst_input'] for x in batch]
    sst_inputs = self.tokenizer(sst_input_text, return_tensors="pt", padding=True, max_length=self.max_output_length)
    #print(sst_inputs)
    cola_input_text = [x['cola_input'] for x in batch]
    cola_inputs = self.tokenizer(cola_input_text, return_tensors="pt", padding=True, max_length=self.max_output_length)
    #sst_input_text = sst_input_text.to(device)
    #sst_inputs = sst_inputs.to(device)
    #cola_input_text = cola_input_text.to(device)
    #cola_inputs = cola_inputs.to(device)
    
    #if self.if_cuda:
    #sst_input_text = sst_input_text.cuda()
    #sst_inputs = sst_inputs.cuda()
    #cola_input_text = cola_input_text.cuda()
    #cola_inputs = cola_inputs.cuda()


    return GenBatch(
      sst_input_text=sst_input_text,
      sst_id_text=sst_inputs['input_ids'],
      sst_attention_mask=sst_inputs['attention_mask'],
      sst_target=[x['sst_target'] for x in batch],
      cola_input_text=cola_input_text,
      cola_id_text=cola_inputs['input_ids'],
      cola_attention_mask=cola_inputs['attention_mask'],
      cola_target=[x['cola_target'] for x in batch]
    )

In [None]:
train_set = MultitaskDataset(tokenizer, sst_tr_dataset['sentences'], sst_tr_dataset['labels'], 
                             cola_tr_dataset['sentences'], cola_tr_dataset['labels'])
val_set = MultitaskDataset(tokenizer, sst_val_dataset['sentences'], sst_val_dataset['labels'], 
                           cola_val_dataset['sentences'], cola_val_dataset['labels'])
test_set = MultitaskDataset(tokenizer, sst_test_dataset['sentences'], sst_test_dataset['labels'], 
                             cola_test_dataset['sentences'], cola_test_dataset['labels'])
train_loader = DataLoader(train_set, shuffle=True, batch_size=8,
                          collate_fn=train_set.collate_fn)
print(train_loader)
val_loader  = DataLoader(val_set, shuffle=False, batch_size=8,
                          collate_fn=val_set.collate_fn)
test_loader  = DataLoader(test_set, shuffle=False, batch_size=8,
                          collate_fn=test_set.collate_fn)


<torch.utils.data.dataloader.DataLoader object at 0x7feb811bc9d0>


In [None]:
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt", padding=True, max_length=64)
# inputs.keys()
# outputs = model(**inputs)

# last_hidden_states = outputs.last_hidden_state

In [None]:
import torch.nn.functional as F

class MultitaskBert(torch.nn.Module):
  def __init__(self, model_name, tokenizer, num_labels=2, hidden_size=768):
    super().__init__()

    self.num_labels = num_labels
    self.tokenizer = tokenizer
    self.bert = BertModel.from_pretrained(model_name)
    # self.dropout = nn.Dropout(.1)
    self.classifier = nn.ModuleList([nn.Linear(hidden_size, self.num_labels) for i in range(2)])
    self.init_weights()

  def init_weights(self):
    initrange = 0.5
    self.classifier[0].weight.data.uniform_(-initrange, initrange)
    self.classifier[1].weight.data.uniform_(-initrange, initrange)
    self.classifier[0].bias.data.zero_()
    self.classifier[1].bias.data.zero_()

  def forward(self, input_ids, attention_mask, ind):
    outputs = self.bert(
      input_ids,
      attention_mask=attention_mask
    )

    pooled_output = outputs[1]
    # Note BERT Model already applies dropout in output
    # pooled_output = self.dropout(pooled_output)
    logits = self.classifier[ind](pooled_output)
    # print(logits.shape)
    return F.softmax(logits, dim=-1)

In [None]:
from transformers import AdamW

model = MultitaskBert(model_name, tokenizer)
model = model.to(device)
param_groups = [{'params': model.parameters(), 'lr': 0.00001, 'weight_decay': 0.01}]
optimizer = AdamW(params=param_groups)
criterion = nn.CrossEntropyLoss()


def eval_metrics(model, dl, criterion):
  model.eval()
  sst_acc = 0
  sst_loss = 0
  cola_acc = 0
  cola_loss = 0
  total = 0

  with torch.no_grad():
    for sample in dl:
      x1_id = sample.sst_id_text.cuda()
      x2_id = sample.cola_id_text.cuda()
      x1_attention_mask = sample.sst_attention_mask.cuda()
      x2_attention_mask = sample.cola_attention_mask.cuda()
      y1_pred = model(x1_id, x1_attention_mask, 0).reshape(-1, 2)
      y2_pred = model(x2_id, x2_attention_mask, 1).reshape(-1, 2)
      #print(x1_id.shape,x1_attention_mask.shape)

      #y1_pred = model(sample.sst_id_text, sample.sst_attention_mask, 0).reshape(-1, 2)
      #y2_pred = model(sample.cola_id_text, sample.cola_attention_mask, 1).reshape(-1, 2)
      sample_sst_target = torch.tensor(sample.sst_target).type(torch.long).reshape(-1)
      sample_cola_target = torch.tensor(sample.cola_target).type(torch.long).reshape(-1)
      sample_sst_target = sample_sst_target.cuda()
      sample_cola_target = sample_cola_target.cuda()

      sst_loss = criterion(y1_pred, sample_sst_target)
      cola_loss = criterion(y2_pred, sample_cola_target)
      
      # print('Length: {}'.format(y1_pred.shape[0]))
      sst_acc += (torch.eq(sample_sst_target,y1_pred.argmax(1))).sum().item()
      cola_acc += (torch.eq(sample_cola_target, y2_pred.argmax(1))).sum().item()
      sst_loss += sst_loss.item()
      cola_loss += cola_loss.item()
      total += y1_pred.shape[0]

  return sst_acc/total, sst_loss/total, cola_acc/total, cola_loss/total

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# m = nn.Sigmoid()
# loss = nn.BCELoss()
# input = torch.randn(3, requires_grad=True)
# target = torch.empty(3).random_(2)

# output = loss(m(input), target)
# output.backward()
# print(input.shape, target.shape)

In [None]:
from tqdm import tqdm
epochs = 10

for _ in range(epochs):
  
  tr_count = 0
  sst_tr_acc = 0
  sst_tr_loss = 0
  cola_tr_acc = 0
  cola_tr_loss = 0
  
  model.train()
  for sample in tqdm(train_loader):
    # Note can also set the data to a decide (cuda)
    model.zero_grad()
    
    #print(sample)
    x1_id = sample.sst_id_text.cuda()
    x2_id = sample.cola_id_text.cuda()
    x1_attention_mask = sample.sst_attention_mask.cuda()
    x2_attention_mask = sample.cola_attention_mask.cuda()
    #print(x1_id.shape)
    #print(x1_attention_mask.shape)
    y1_pred = model(x1_id, x1_attention_mask, 0).reshape(-1, 2)
    y2_pred = model(x2_id, x2_attention_mask, 1).reshape(-1, 2)
    #y1_pred = model(sample.sst_id_text, sample.sst_attention_mask, 0).reshape(-1, 2)
    #y2_pred = model(sample.cola_id_text, sample.cola_attention_mask, 1).reshape(-1, 2)
    sample_sst_target = torch.tensor(sample.sst_target).type(torch.long).reshape(-1)
    sample_cola_target = torch.tensor(sample.cola_target).type(torch.long).reshape(-1)
    sample_sst_target = sample_sst_target.cuda()
    sample_cola_target = sample_cola_target.cuda()

    sst_loss = criterion(y1_pred, sample_sst_target)
    cola_loss = criterion(y2_pred, sample_cola_target)
    # multitask loss
    loss = sst_loss + cola_loss
    loss.backward()
    # intermediate variabels stores embedding of x and computes
    # m_output.grad w.r.t. this and calculate the MSE of the m_output.grad(emb_x)
    # norm and 1.0 and use relu on; before you do the square, pass it through a relu
    # so everything les than 1.0 wont be counted to the square.
    # MSE (m_output.grad(x), 1.0) calulate gradient of M output w.r.t. x's embedding space
    # To avoid overfitting, calculating regularization term can use a varied version of x
    # instead of the original (e.g. add gaussian noise around embeddings of x); can
    # also minimize discrepancy on the two for robustness of model
    # Lipschitz-regularized loss
    optimizer.step()
    # print('Length: {}'.format(y1_pred.shape[0]))
    sst_tr_acc += (torch.eq(sample_sst_target,y1_pred.argmax(1))).sum().item()
    cola_tr_acc += (torch.eq(sample_cola_target, y2_pred.argmax(1))).sum().item()
    sst_tr_loss += sst_loss.item()
    cola_tr_loss += cola_loss.item()
    tr_count += y1_pred.shape[0]
    
  print("SST Train accuracy: {}, CoLA Train accuracy: {}, SST Train loss: {}, CoLA Train loss: {}".format(sst_tr_acc/tr_count, cola_tr_acc/tr_count, sst_tr_loss/tr_count, cola_tr_loss/tr_count))
  val_sst_acc, val_sst_loss, val_cola_acc, val_cola_loss = eval_metrics(model, val_loader, criterion)
  print("SST Val accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(val_sst_acc, val_cola_acc, val_sst_loss, val_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
100%|██████████| 962/962 [02:16<00:00,  7.03it/s]


SST Train accuracy: 0.7846653671215075, CoLA Train accuracy: 0.749317738791423, SST Train loss: 0.06466381041317346, CoLA Train loss: 0.06906796868084467
SST Val accuracy: 0.8574766355140186, CoLA Val accuracy: 0.8002336448598131, SST Val loss: 0.0008775718160904944, CoLA Val loss: 0.0013164101401343942 


100%|██████████| 962/962 [02:19<00:00,  6.90it/s]


SST Train accuracy: 0.8600389863547758, CoLA Train accuracy: 0.8230019493177388, SST Train loss: 0.05596080538977114, CoLA Train loss: 0.060557639409072064
SST Val accuracy: 0.8516355140186916, CoLA Val accuracy: 0.8002336448598131, SST Val loss: 0.0007377471774816513, CoLA Val loss: 0.0010401384206488729 


100%|██████████| 962/962 [02:19<00:00,  6.92it/s]


SST Train accuracy: 0.8925276153346329, CoLA Train accuracy: 0.8530214424951267, SST Train loss: 0.052108456935101785, CoLA Train loss: 0.056879754768245167
SST Val accuracy: 0.8644859813084113, CoLA Val accuracy: 0.8119158878504673, SST Val loss: 0.000731973210349679, CoLA Val loss: 0.0013106378028169274 


100%|██████████| 962/962 [02:19<00:00,  6.91it/s]


SST Train accuracy: 0.9118908382066276, CoLA Train accuracy: 0.87953216374269, SST Train loss: 0.04999201962744903, CoLA Train loss: 0.05387017936037298
SST Val accuracy: 0.860981308411215, CoLA Val accuracy: 0.8271028037383178, SST Val loss: 0.0007319324649870396, CoLA Val loss: 0.0013160191010683775 


100%|██████████| 962/962 [02:18<00:00,  6.93it/s]


SST Train accuracy: 0.9191682910981157, CoLA Train accuracy: 0.8974658869395712, SST Train loss: 0.04915049990870257, CoLA Train loss: 0.0517420691162677
SST Val accuracy: 0.8621495327102804, CoLA Val accuracy: 0.8282710280373832, SST Val loss: 0.0007320609292946756, CoLA Val loss: 0.0013161024544388056 


100%|██████████| 962/962 [02:23<00:00,  6.70it/s]


SST Train accuracy: 0.9250162443144899, CoLA Train accuracy: 0.9090318388564003, SST Train loss: 0.04836834992504182, CoLA Train loss: 0.05037204566677334
SST Val accuracy: 0.8656542056074766, CoLA Val accuracy: 0.8247663551401869, SST Val loss: 0.0010230450425297022, CoLA Val loss: 0.0013160323724150658 


100%|██████████| 962/962 [02:18<00:00,  6.93it/s]


SST Train accuracy: 0.9319038336582196, CoLA Train accuracy: 0.908641975308642, SST Train loss: 0.04755533057191761, CoLA Train loss: 0.05039740754769482
SST Val accuracy: 0.866822429906542, CoLA Val accuracy: 0.8364485981308412, SST Val loss: 0.0010265494929626584, CoLA Val loss: 0.0013161360984668136 


100%|██████████| 962/962 [02:20<00:00,  6.87it/s]


SST Train accuracy: 0.9325536062378168, CoLA Train accuracy: 0.9196881091617933, SST Train loss: 0.047400659497021234, CoLA Train loss: 0.04910177421926135
SST Val accuracy: 0.8492990654205608, CoLA Val accuracy: 0.8341121495327103, SST Val loss: 0.0007900271448306739, CoLA Val loss: 0.0013160312082618475 


100%|██████████| 962/962 [02:20<00:00,  6.83it/s]


SST Train accuracy: 0.9347628330084471, CoLA Train accuracy: 0.9156595191682911, SST Train loss: 0.047145729738059486, CoLA Train loss: 0.0494918649424354
SST Val accuracy: 0.8492990654205608, CoLA Val accuracy: 0.8212616822429907, SST Val loss: 0.0007319203577935696, CoLA Val loss: 0.0013160372618585825 


100%|██████████| 962/962 [02:20<00:00,  6.83it/s]


SST Train accuracy: 0.938401559454191, CoLA Train accuracy: 0.9251461988304094, SST Train loss: 0.046716277962617084, CoLA Train loss: 0.048442217883853035
SST Val accuracy: 0.8679906542056075, CoLA Val accuracy: 0.8271028037383178, SST Val loss: 0.000731920066755265, CoLA Val loss: 0.0013161228271201253 


In [None]:
# learning_rate = 3e-4
# criterion = nn.BCELoss()
# # optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate)
# optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
# offset_input = torch.tensor([0])


In [None]:
test_sst_acc, test_sst_loss, test_cola_acc, test_cola_loss = eval_metrics(model, val_loader, criterion)
print("SST Val accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(test_sst_acc, test_cola_acc, test_sst_loss, test_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


SST Val accuracy: 0.8679906542056075, CoLA Val accuracy: 0.8271028037383178, SST Val loss: 0.000731920066755265, CoLA Val loss: 0.0013161228271201253 


In [None]:
test_sst_acc, test_sst_loss, test_cola_acc, test_cola_loss = eval_metrics(model, test_loader, criterion)
print("SST Test accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(test_sst_acc, test_cola_acc, test_sst_loss, test_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


SST Test accuracy: 0.889943074003795, CoLA Val accuracy: 0.825426944971537, SST Val loss: 0.0011977063259109855, CoLA Val loss: 0.0017255261773243546 


In [None]:
print(model.parameters())

<generator object Module.parameters at 0x7feb811e0dd0>


### TODO: Adversarial attacks and Robustness Evaluation

In [None]:
def attack():
  pass

In [None]:
cd '/content/drive/Shareddrives/CS263/Final Project'

/content/drive/Shareddrives/CS263/Final Project


In [None]:
import pandas as pd

sst_adv_dataset = pd.read_csv('adversial_sst.csv')
cola_adv_dataset = pd.read_csv('adversial_cola.csv')


In [None]:
adv_set = MultitaskDataset(tokenizer, sst_adv_dataset['sentences'], sst_adv_dataset['labels'], 
                             cola_adv_dataset['sentences'], cola_adv_dataset['labels'])
adv_loader  = DataLoader(adv_set, shuffle=False, batch_size=8,
                          collate_fn=adv_set.collate_fn)

In [None]:
eval_batch_size = 12
max_output_length = 100


In [None]:
adv_sst_acc, adv_sst_loss, adv_cola_acc, adv_cola_loss = eval_metrics(model, adv_loader, criterion)
print("SST ADV accuracy: {}, CoLA ADV accuracy: {}, SST ADV loss: {}, CoLA ADV loss: {} ".format(adv_sst_acc, adv_cola_acc, adv_sst_loss, adv_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


SST ADV accuracy: 0.5434782608695652, CoLA ADV accuracy: 0.2246376811594203, SST ADV loss: 0.011786402203142643, CoLA ADV loss: 0.0045721870847046375 


Transfer attacks

In [None]:
#BERT transfer attacks

sst_adv_dataset = pd.read_csv('bert_adversarial_sst.csv')
cola_adv_dataset = pd.read_csv('bert_adversarial_cola.csv')

adv_set = MultitaskDataset(tokenizer, sst_adv_dataset['sentences'], sst_adv_dataset['labels'], 
                             cola_adv_dataset['sentences'], cola_adv_dataset['labels'])
adv_loader  = DataLoader(adv_set, shuffle=False, batch_size=8,
                          collate_fn=adv_set.collate_fn)


In [None]:
adv_sst_acc, adv_sst_loss, adv_cola_acc, adv_cola_loss = eval_metrics(model, adv_loader, criterion)
print("SST ADV accuracy: {}, CoLA ADV accuracy: {}, SST ADV loss: {}, CoLA ADV loss: {} ".format(adv_sst_acc, adv_cola_acc, adv_sst_loss, adv_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


SST ADV accuracy: 0.4830508474576271, CoLA ADV accuracy: 0.21694915254237288, SST ADV loss: 0.0033328437712043524, CoLA ADV loss: 0.00445141363888979 


In [None]:
#ROBERTA transfer attack

sst_adv_dataset = pd.read_csv('roberta_adversarial_sst.csv')
cola_adv_dataset = pd.read_csv('roberta_adversarial_cola.csv')

adv_set = MultitaskDataset(tokenizer, sst_adv_dataset['sentences'], sst_adv_dataset['labels'], 
                             cola_adv_dataset['sentences'], cola_adv_dataset['labels'])
adv_loader  = DataLoader(adv_set, shuffle=False, batch_size=8,
                          collate_fn=adv_set.collate_fn)


In [None]:
adv_sst_acc, adv_sst_loss, adv_cola_acc, adv_cola_loss = eval_metrics(model, adv_loader, criterion)
print("SST ADV accuracy: {}, CoLA ADV accuracy: {}, SST ADV loss: {}, CoLA ADV loss: {} ".format(adv_sst_acc, adv_cola_acc, adv_sst_loss, adv_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


SST ADV accuracy: 0.5517857142857143, CoLA ADV accuracy: 0.37142857142857144, SST ADV loss: 0.003170850919559598, CoLA ADV loss: 0.003800953272730112 


In [None]:
#Data aug
sst_adv_dataset = pd.read_csv('augmented_sst.csv')
cola_adv_dataset = pd.read_csv('augmented_cola.csv')

adv_set = MultitaskDataset(tokenizer, sst_adv_dataset['sentences'], sst_adv_dataset['labels'], 
                             cola_adv_dataset['sentences'], cola_adv_dataset['labels'])
adv_loader  = DataLoader(adv_set, shuffle=False, batch_size=8,
                          collate_fn=adv_set.collate_fn)


In [None]:
from tqdm import tqdm
epochs = 10

for _ in range(epochs):
  
  tr_count = 0
  sst_tr_acc = 0
  sst_tr_loss = 0
  cola_tr_acc = 0
  cola_tr_loss = 0
  
  model.train()
  for sample in tqdm(adv_loader):
    # Note can also set the data to a decide (cuda)
    model.zero_grad()
    
    x1_id = sample.sst_id_text.cuda()
    x2_id = sample.cola_id_text.cuda()
    x1_attention_mask = sample.sst_attention_mask.cuda()
    x2_attention_mask = sample.cola_attention_mask.cuda()
    y1_pred = model(x1_id, x1_attention_mask, 0).reshape(-1, 2)
    y2_pred = model(x2_id, x2_attention_mask, 1).reshape(-1, 2)
    sample_sst_target = torch.tensor(sample.sst_target).type(torch.long).reshape(-1)
    sample_cola_target = torch.tensor(sample.cola_target).type(torch.long).reshape(-1)
    sample_sst_target = sample_sst_target.cuda()
    sample_cola_target = sample_cola_target.cuda()

    sst_loss = criterion(y1_pred, sample_sst_target)
    cola_loss = criterion(y2_pred, sample_cola_target)
    # multitask loss
    loss = sst_loss + cola_loss
    loss.backward()
    optimizer.step()
    sst_tr_acc += (torch.eq(sample_sst_target,y1_pred.argmax(1))).sum().item()
    cola_tr_acc += (torch.eq(sample_cola_target, y2_pred.argmax(1))).sum().item()
    sst_tr_loss += sst_loss.item()
    cola_tr_loss += cola_loss.item()
    tr_count += y1_pred.shape[0]
    
  print("SST Train accuracy: {}, CoLA Train accuracy: {}, SST Train loss: {}, CoLA Train loss: {}".format(sst_tr_acc/tr_count, cola_tr_acc/tr_count, sst_tr_loss/tr_count, cola_tr_loss/tr_count))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
100%|██████████| 250/250 [00:41<00:00,  5.97it/s]


SST Train accuracy: 0.9065, CoLA Train accuracy: 0.738, SST Train loss: 0.05048494467139244, CoLA Train loss: 0.07139900241792202


100%|██████████| 250/250 [00:41<00:00,  6.04it/s]


SST Train accuracy: 0.911, CoLA Train accuracy: 0.782, SST Train loss: 0.05021222200989723, CoLA Train loss: 0.06588066667318344


100%|██████████| 250/250 [00:41<00:00,  6.03it/s]


SST Train accuracy: 0.9285, CoLA Train accuracy: 0.8075, SST Train loss: 0.04797845709323883, CoLA Train loss: 0.06266209696233273


100%|██████████| 250/250 [00:41<00:00,  5.96it/s]


SST Train accuracy: 0.93, CoLA Train accuracy: 0.8315, SST Train loss: 0.04784189991652966, CoLA Train loss: 0.059951678588986396


100%|██████████| 250/250 [00:41<00:00,  6.01it/s]


SST Train accuracy: 0.9355, CoLA Train accuracy: 0.8255, SST Train loss: 0.047154362857341765, CoLA Train loss: 0.06059365563094616


100%|██████████| 250/250 [00:41<00:00,  6.00it/s]


SST Train accuracy: 0.944, CoLA Train accuracy: 0.844, SST Train loss: 0.045988764926791194, CoLA Train loss: 0.058350446492433546


100%|██████████| 250/250 [00:41<00:00,  6.01it/s]


SST Train accuracy: 0.943, CoLA Train accuracy: 0.849, SST Train loss: 0.04617068301141262, CoLA Train loss: 0.05769325143098831


100%|██████████| 250/250 [00:41<00:00,  6.01it/s]


SST Train accuracy: 0.9545, CoLA Train accuracy: 0.863, SST Train loss: 0.04468240733444691, CoLA Train loss: 0.05596238724887371


100%|██████████| 250/250 [00:41<00:00,  5.98it/s]


SST Train accuracy: 0.958, CoLA Train accuracy: 0.87, SST Train loss: 0.04434096233546734, CoLA Train loss: 0.055169729068875316


100%|██████████| 250/250 [00:41<00:00,  6.01it/s]

SST Train accuracy: 0.9565, CoLA Train accuracy: 0.8735, SST Train loss: 0.044428249910473824, CoLA Train loss: 0.054706901833415034





In [None]:
test_sst_acc, test_sst_loss, test_cola_acc, test_cola_loss = eval_metrics(model, test_loader, criterion)
print("SST Test accuracy: {}, CoLA Val accuracy: {}, SST Val loss: {}, CoLA Val loss: {} ".format(test_sst_acc, test_cola_acc, test_sst_loss, test_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


SST Test accuracy: 0.9070208728652751, CoLA Val accuracy: 0.8178368121442126, SST Val loss: 0.0011889474699273705, CoLA Val loss: 0.001749540911987424 


In [None]:
#BERT transfer attacks

sst_adv_dataset = pd.read_csv('bert_adversarial_sst.csv')
cola_adv_dataset = pd.read_csv('bert_adversarial_cola.csv')

adv_set = MultitaskDataset(tokenizer, sst_adv_dataset['sentences'], sst_adv_dataset['labels'], 
                             cola_adv_dataset['sentences'], cola_adv_dataset['labels'])
adv_loader  = DataLoader(adv_set, shuffle=False, batch_size=8,
                          collate_fn=adv_set.collate_fn)


In [None]:
adv_sst_acc, adv_sst_loss, adv_cola_acc, adv_cola_loss = eval_metrics(model, adv_loader, criterion)
print("SST ADV accuracy: {}, CoLA ADV accuracy: {}, SST ADV loss: {}, CoLA ADV loss: {} ".format(adv_sst_acc, adv_cola_acc, adv_sst_loss, adv_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


SST ADV accuracy: 0.588135593220339, CoLA ADV accuracy: 0.5474576271186441, SST ADV loss: 0.003890178631991148, CoLA ADV loss: 0.00232752226293087 


In [None]:
#ROBERTA transfer attack

sst_adv_dataset = pd.read_csv('roberta_adversarial_sst.csv')
cola_adv_dataset = pd.read_csv('roberta_adversarial_cola.csv')

adv_set = MultitaskDataset(tokenizer, sst_adv_dataset['sentences'], sst_adv_dataset['labels'], 
                             cola_adv_dataset['sentences'], cola_adv_dataset['labels'])
adv_loader  = DataLoader(adv_set, shuffle=False, batch_size=8,
                          collate_fn=adv_set.collate_fn)


In [None]:
adv_sst_acc, adv_sst_loss, adv_cola_acc, adv_cola_loss = eval_metrics(model, adv_loader, criterion)
print("SST ADV accuracy: {}, CoLA ADV accuracy: {}, SST ADV loss: {}, CoLA ADV loss: {} ".format(adv_sst_acc, adv_cola_acc, adv_sst_loss, adv_cola_loss))

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


SST ADV accuracy: 0.6, CoLA ADV accuracy: 0.6589285714285714, SST ADV loss: 0.004240900743752718, CoLA ADV loss: 0.0020172852091491222 
