In [1]:
import torch
import pandas as pd
from tqdm import tqdm 
import torch
import time
# We used pip install transformers, pip install sentencepiece
from transformers import BertTokenizer, BertForSequenceClassification, AlbertTokenizer, AlbertForSequenceClassification
from transformers import AdamW
from transformers import T5Tokenizer, T5ForConditionalGeneration

from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import accuracy_score
import plotly.express as px

In [2]:
data_path = 'projectData/'
tb_dir = 'tbs_HW2/'

In [3]:
#Define our device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
if torch.cuda.is_available():
    print(f'using: {torch.cuda.get_device_name(0)}')
else:
    print('using cpu')

using: NVIDIA GeForce GTX 970


### Load Data

In [4]:
raw_data = pd.read_csv(data_path+'dataset_raw.csv')

In [5]:
#Shuffle the data
raw_data = raw_data.sample(frac = 1)

In [6]:
#Create dataser class for T5
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, data ,label):
        self.data = data
        self.label = label
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        X = self.data[index]
        y = self.label[index]
        return X, y

In [7]:
def get_data(input_df, train_size,test_size,batch_size):
    train_end_idx = train_size
    test_end_idx = train_size + test_size
    #Make one list for all the reviews
    reviews = input_df['review'].tolist()

    #mini sample of reviews for train and test
    train_data = reviews[:train_end_idx]
    test_data = reviews[train_end_idx:test_end_idx]
    #Take mini sample of the labels and preprocess them such that we can use them in the model training loop
    labels = input_df['label'].tolist()
    train_labels = labels[:train_end_idx]
    test_labels = labels[train_end_idx:test_end_idx]

    #Use data set class in order to build train and test datasets
    train_dataset = SimpleDataset(train_data, train_labels)
    test_dataset = SimpleDataset(test_data, test_labels)

    #Build train and test dataloaders
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    return train_dataloader , test_dataloader

## Define Tokenizer and 'Bert' Model

In [8]:
#Define Bert tokenizer and model
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

#Define AlBert tokenizer and model
bert_tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
bert_model = AlbertForSequenceClassification.from_pretrained("albert-base-v2")

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.bias', 'predictions.decoder.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [9]:
#Bert base uncased architecture
bert_model

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [10]:
n_parameters = sum(p.numel() for p in bert_model.parameters())
print(f"Number of parameters in model: {n_parameters}")

Number of parameters in model: 11685122


In [11]:
def tokenizing_batch_Bert(X, y, tokenizer) :
    """
    Does ... TODO
    X: Batch of reviews 
    y: labels of reviews in batch
    """
    inputs =  tokenizer(X, max_length = 128, padding = 'max_length', truncation = True, return_tensors="pt")

    input_ids = inputs['input_ids'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    labels = torch.LongTensor(y).T.to(device)
    
    return input_ids, token_type_ids, attention_mask, labels

## Training Loop and Hyperparameter Selection

In [12]:
def calc_accuracy(target,pred):
    target, pred = target.to(device),pred.to(device)
    correct = torch.sum(target==pred)
    accuracy = correct/len(target)
    return accuracy

In [13]:
def fineTuning(model, optim, epochs, train_dataloader, test_dataloader, tb_dirname, tokenizer,model_type):
    train_loss_list = []
    test_loss_list = []
    train_acc_list = []
    test_acc_list = []

    writer = SummaryWriter(log_dir=f'{tb_dir}/{tb_dirname}_{time.time()}')

    for epoch in tqdm(range(epochs), desc = "epochs", position=0):
        running_loss = 0
        model.train()
        # setup loop with TQDM and dataloader
        train_loop = tqdm(train_dataloader, desc='train',position =1, leave=False)
        for X, y in train_loop:
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            if model_type == 'bert':
                input_ids, token_type_ids, attention_mask, labels = tokenizing_batch_Bert(X, y, tokenizer)
                outputs = model(input_ids, attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels)
                

            elif model_type == 't5':
                ids, mask, labels = tokenizing_batch_T5(X, y, tokenizer)
                outputs = model(
                input_ids=ids,
                attention_mask=mask,
                labels=labels
                )

            # process
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            running_loss+=loss.item()
            # print relevant info to progress bar
            # train_loop.set_description(f'Epoch {epoch}')
            train_loop.set_postfix(loss=loss.item())
        
        epoch_train_loss = running_loss/len(train_dataloader.dataset)
        train_loss_list.append(epoch_train_loss)


        test_loss = 0
        test_acc = 0
        model.eval()
        # we dont need to update weights, so we define no_grad() to save memory
        with torch.no_grad():
            eval_loop = tqdm(test_dataloader)
            for X_test, y_test in eval_loop:
                
                if model_type == 'bert':
                    input_ids, token_type_ids, attention_mask, labels = tokenizing_batch_Bert(X_test, y_test, tokenizer)
                    outputs = model(input_ids, attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                labels=labels)
                    # get classification
                    logits = outputs.logits
                    predicted_token_class_ids = logits.argmax(-1)
                    # calculate accuracy score
                    acc_score = calc_accuracy(y_test, predicted_token_class_ids)
                    loss = outputs.loss
                
                elif model_type == 't5':
                    ids, mask, labels = tokenizing_batch_T5(X_test, y_test, tokenizer)
                    
                    generated_ids = model.generate(
                    input_ids=ids,
                    attention_mask=mask,
                    max_length=2
                    )

                    # For printing of real words 
                    preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
                    target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in labels]

                    loss = model(input_ids=ids, labels=labels).loss
                    acc_score = calc_accuracy(generated_ids[:,1], labels[:,0])
                    
                test_acc += acc_score.item()
                batch_test_loss = outputs.loss.item()
                test_loss+= batch_test_loss
                eval_loop.set_description(f'Epoch {epoch}')
                eval_loop.set_postfix(loss=batch_test_loss)
                
        epoch_acc_score = test_acc/len(test_dataloader)
        test_acc_list.append(epoch_acc_score)
        epoch_test_loss = test_loss/len(test_dataloader.dataset)
        test_loss_list.append(epoch_test_loss)

        writer.add_scalar(tag='loss/train', scalar_value=epoch_train_loss, global_step=epoch)
        writer.add_scalar(tag='loss/test', scalar_value=epoch_test_loss, global_step=epoch)
       
    
    return {
            'train_loss':train_loss_list,
            'test_loss':test_loss_list,
            'train_acc':train_acc_list,
            'test_acc':test_acc_list
            }

In [43]:
#Enter model to device
bert_model = AlbertForSequenceClassification.from_pretrained("albert-base-v2")
bert_model.to(device)
#Define optimizer
optim = torch.optim.AdamW(bert_model.parameters(), lr = 5e-5 )
#Define epochs number
epochs = 10
train_dataloader, test_dataloader = get_data(raw_data,
                                            train_size = 500,
                                            test_size = 100,
                                            batch_size = 16)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [44]:
#Fine Tuning Bert
loss_dic = fineTuning(  model=bert_model,
                        optim=optim,
                        epochs= epochs,
                        train_dataloader = train_dataloader,
                        test_dataloader=test_dataloader,
                        tb_dirname ='Bert',
                        tokenizer=bert_tokenizer,
                        model_type='bert')

Epoch 0: 100%|██████████| 32/32 [00:23<00:00,  1.34it/s, loss=0.579]
Epoch 0: 100%|██████████| 7/7 [00:01<00:00,  4.61it/s, loss=0.708]
Epoch 1: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s, loss=0.575]
Epoch 1: 100%|██████████| 7/7 [00:01<00:00,  4.62it/s, loss=0.609]
Epoch 2: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s, loss=0.809]
Epoch 2: 100%|██████████| 7/7 [00:01<00:00,  4.57it/s, loss=0.49] 
Epoch 3: 100%|██████████| 32/32 [00:23<00:00,  1.35it/s, loss=0.317]
Epoch 3: 100%|██████████| 7/7 [00:01<00:00,  4.49it/s, loss=0.511]
Epoch 4: 100%|██████████| 32/32 [00:24<00:00,  1.33it/s, loss=0.178]
Epoch 4: 100%|██████████| 7/7 [00:01<00:00,  4.60it/s, loss=0.153]
Epoch 5: 100%|██████████| 32/32 [00:23<00:00,  1.36it/s, loss=0.345]
Epoch 5: 100%|██████████| 7/7 [00:01<00:00,  4.60it/s, loss=0.115]
Epoch 6: 100%|██████████| 32/32 [00:23<00:00,  1.37it/s, loss=0.607] 
Epoch 6: 100%|██████████| 7/7 [00:01<00:00,  4.53it/s, loss=0.518]
Epoch 7: 100%|██████████| 32/32 [00:23<00:00,  

In [105]:
loss_dic['test_acc']

[0.5803571428571429,
 0.6607142857142857,
 0.5892857142857143,
 0.7232142857142857,
 0.6696428571428571,
 0.8125,
 0.7232142857142857,
 0.7589285714285714,
 0.7767857142857143,
 0.7767857142857143]

In [24]:
torch.cuda.empty_cache()

# T5

## Preprocess


In [14]:
#Read the raw data
raw_data = pd.read_csv(data_path+'dataset_raw.csv')
#Shuffle the data
raw_data = raw_data.sample(frac = 1)
#Make copy
data_T5 = raw_data.copy()
#Mapping 0 and 1 to "positive" and "negative"
data_T5['label']=data_T5['label'].map({1: 'positive', 0: 'negative'})

In [15]:
#Adding the "sst2 sentence:" prefix for the reviews
def add_T5_preffix_sentimant_classiication(row):
    row['review'] = "sst2 sentence: " + row['review'] 
    row['label'] = row['label']
    return row

data_T5 = data_T5.apply(lambda row: add_T5_preffix_sentimant_classiication(row), axis =1)


In [16]:
#Make lists for reviews and lables
reviews = data_T5['review'].tolist()
labels  = data_T5['label'].tolist()

In [17]:
#Define T5 Tokenizer and model
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [18]:
#T5 small architecture
t5_model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [19]:
n_parameters = sum(p.numel() for p in t5_model.parameters())
print(f"Number of parameters in model: {n_parameters}")

Number of parameters in model: 60506624


In [20]:
def tokenizing_batch_T5(X, y, tokenizer) :
    train_encoder_inputs = tokenizer(X, padding="longest", max_length=128, truncation=True, return_tensors="pt")
    train_decoder_inputs = tokenizer(y, padding="longest", max_length=2, truncation=True, return_tensors="pt")

    labels = train_decoder_inputs["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100
    labels = labels.to(device)
    ids = train_encoder_inputs["input_ids"].to(device)
    mask = train_encoder_inputs["input_ids"].to(device)
    
    return ids, mask, labels

In [21]:
#Enter model to device
t5_model.to(device)
#Define optimizer
optim = torch.optim.AdamW(t5_model.parameters(), lr = 5e-5 )
#Define epochs number
epochs = 5
train_dataloader, test_dataloader = get_data(data_T5,
                                            train_size = 750,
                                            test_size = 250,
                                            batch_size = 16)

In [22]:
loss_dic_t5 = fineTuning(model=t5_model,
                        optim=optim,
                        epochs= epochs,
                        train_dataloader = train_dataloader,
                        test_dataloader=test_dataloader,
                        tb_dirname ='T5',
                        tokenizer=t5_tokenizer,
                        model_type='t5')

Epoch 0: 100%|██████████| 16/16 [00:02<00:00,  5.75it/s, loss=7.18]
Epoch 1: 100%|██████████| 16/16 [00:02<00:00,  5.49it/s, loss=3.78]
Epoch 2: 100%|██████████| 16/16 [00:02<00:00,  5.91it/s, loss=1.21]
Epoch 3: 100%|██████████| 16/16 [00:02<00:00,  6.01it/s, loss=0.515]
Epoch 4: 100%|██████████| 16/16 [00:02<00:00,  6.15it/s, loss=0.474]
epochs: 100%|██████████| 5/5 [01:14<00:00, 14.93s/it]


In [23]:
loss_dic_t5['test_acc']

[0.0, 0.0, 0.28046875074505806, 0.49765625037252903, 0.48984375037252903]