# Import Libraries

In [None]:
!pip install transformers==2.7.0

In [2]:
!git clone https://github.com/KennethSunn/Intro-to-NLP-Final-Project-Fall2021.git

Cloning into 'Intro-to-NLP-Final-Project-Fall2021'...
remote: Enumerating objects: 60, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 60 (delta 13), reused 37 (delta 3), pack-reused 0[K
Unpacking objects: 100% (60/60), done.


In [3]:
import sys
sys.path.append('/content/Intro-to-NLP-Final-Project-Fall2021/src/')
import collections
from typing import Callable
import numpy as np
np.random.seed(42)
import pandas as pd
from tqdm import notebook
import importlib
import pprint
import nltk
import datetime
import os
from argparse import Namespace
import re
from collections import Counter

In [4]:
import utils.general as general_utils
import utils.transformer.data as transformer_data_utils
import utils.transformer.general as transformer_general_utils
general_utils.set_seed_everywhere() #set the seed for reproducibility

In [5]:
import logging
logging.basicConfig(level=logging.INFO) 

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

## Import Optimzer and transformers Models

In [7]:
# Import RAdam and Lookahead
from radam.radam import RAdam
from lookahead.optimizer import Lookahead

In [8]:
from transformers import GPT2Tokenizer, GPT2Model
from transformers import AlbertTokenizer, AlbertModel
from transformers import RobertaTokenizer, RobertaModel

INFO:transformers.file_utils:PyTorch version 1.10.0+cu111 available.
INFO:transformers.file_utils:TensorFlow version 2.7.0 available.


# Set up the argspace/important_variables


In [9]:
args = Namespace(
        #use cuda by default
        device = 'cuda' if torch.cuda.is_available() else 'cpu',
    
        #set batch size and number of epochs
        batch_size = 32,
        num_epochs = 10,
    
        #set the learning rate
        learning_rate = 0.0001,

        #location of the train, dev and test csv
        train_val_csv = '/content/Intro-to-NLP-Final-Project-Fall2021/TaskA_data/train_a.csv',
        test_csv = '/content/Intro-to-NLP-Final-Project-Fall2021/TaskA_data/test_a_tweets.tsv',
        test_label = '/content/Intro-to-NLP-Final-Project-Fall2021/TaskA_data/test_a_labels.csv',
        #directory to save our models at
        directory = './', 
        model_name = 'distilgpt2.pt',
)

## Loading a presplit subset of the full dataset data into DataFrames. 
Here, 0 : NOT, 1 : HOF

In [10]:
label = pd.read_csv(args.test_label, header=None, names=['id', 'label'])
map_of_label = {'OFF': 1, 'NOT': 0}
label['label'] = label['label'].map(map_of_label)
label['label'].value_counts()

0    2807
1    1080
Name: label, dtype: int64

In [11]:
test_set = pd.read_csv(args.test_csv, sep='\t')
test_set.rename(columns={'tweet': 'text'}, inplace=True)
test_set['split'] = 'test'
test_set['label'] = label['label'].values

In [12]:
data_df =  pd.read_csv(args.train_val_csv)
data_df.drop(columns=['average', 'std'], inplace=True)
data_df.drop(data_df[data_df['split'] == 'test'].index, inplace = True)
data_df = pd.concat([data_df, test_set])

## Create the text preprocessor

In [13]:
class GPT2Preprocessor:
    def __init__(self, transformer_tokenizer, sentence_detector):
        self.transformer_tokenizer = transformer_tokenizer
        self.sentence_detector = sentence_detector

    def add_eos_tokens(self, text):
        eos_token = " " + self.transformer_tokenizer.eos_token + " "
        sentences = self.sentence_detector.tokenize(text)
        eos_added_text = (
            eos_token.join(sentences) + " " + self.transformer_tokenizer.eos_token
        )
        return eos_added_text

class AlbertPreprocessor:
    def __init__(self, transformer_tokenizer, sentence_detector):
        self.transformer_tokenizer = transformer_tokenizer
        self.sentence_detector = sentence_detector

    def add_eos_tokens(self, text):
        eos_token = " " + self.transformer_tokenizer.eos_token + " "
        sentences = self.sentence_detector.tokenize(text)
        eos_added_text = (
            eos_token.join(sentences) + " " + self.transformer_tokenizer.eos_token
        )
        return eos_added_text

class RobertaPreprocessor:
    def __init__(self, transformer_tokenizer, sentence_detector):
        self.transformer_tokenizer = transformer_tokenizer
        self.sentence_detector = sentence_detector

    def add_eos_tokens(self, text):
        eos_token = " " + self.transformer_tokenizer.eos_token + " "
        sentences = self.sentence_detector.tokenize(text)
        eos_added_text = (
            eos_token.join(sentences) + " " + self.transformer_tokenizer.eos_token
        )
        return eos_added_text

In [None]:
!python -c 'import nltk; nltk.download("punkt")'

In [None]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
roberta_tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
#roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')

In [16]:
gpt2_preproc = GPT2Preprocessor(gpt2_tokenizer, punkt_sentence_detector)
albert_preproc = AlbertPreprocessor(albert_tokenizer, punkt_sentence_detector)
roberta_preproc = RobertaPreprocessor(roberta_tokenizer, punkt_sentence_detector)

In [17]:
#add the special tokens
data_df["text"] = data_df["text"].map(gpt2_preproc.add_eos_tokens)
#data_df["text"] = data_df["text"].map(albert_preproc.add_eos_tokens)
#data_df["text"] = data_df["text"].map(roberta_preproc.add_eos_tokens)

### Create the torch torch Dataset

In [18]:
dataset = transformer_data_utils.HateDataset(
         data_df=data_df, tokenizer=gpt2_tokenizer
     )
#dataset = transformer_data_utils.HateDataset(
#        data_df=data_df, tokenizer=albert_tokenizer
#    )
#dataset = transformer_data_utils.HateDataset(
#    data_df=data_df, tokenizer=roberta_tokenizer 
#)
assert dataset._max_seq_length <= 512

In [19]:
print(dataset.__getitem__(0)['x_data'].shape)

(349,)


# Creating the Classifier 

In [20]:
class SimpleGPT2SequenceClassifier(nn.Module):
     def __init__(
         self, 
         hidden_size: int,
         num_classes:int ,
         max_seq_len:int,
         gpt_model_name:str, 
     ):
         super(SimpleGPT2SequenceClassifier,self).__init__()
         self.gpt2model = GPT2Model.from_pretrained(
             gpt_model_name
         )
         self.fc1 = nn.Linear(hidden_size, num_classes)
        
     def forward(self, x_in):
         gpt_out = self.gpt2model(x_in)[0] #returns tuple
         batch_size = gpt_out.shape[0]
         prediction_vector = self.fc1(gpt_out.view(batch_size,-1)) #(batch_size , max_len, num_classes)
         return prediction_vector

class SimpleRobertaSequenceClassifier(nn.Module):
     def __init__(
         self, 
         hidden_size: int,
         num_classes:int ,
         max_seq_len:int,
         roberta_model_name:str, 
     ):
         super(SimpleRobertaSequenceClassifier,self).__init__()
         self.robertamodel = RobertaModel.from_pretrained(
             roberta_model_name
         )
         self.fc1 = nn.Linear(hidden_size, num_classes)
        
     def forward(self, x_in):
         gpt_out = self.robertamodel(x_in)[0] #returns tuple
         batch_size = gpt_out.shape[0]
         prediction_vector = self.fc1(gpt_out.view(batch_size,-1)) #(batch_size , max_len, num_classes)
         return prediction_vector

class SimpleAlbertSequenceClassifier(nn.Module):
    def __init__(
        self, 
        hidden_size: int,
        num_classes:int ,
        max_seq_len:int,
        albert_model_name:str, 
    ):
        super(SimpleAlbertSequenceClassifier,self).__init__()
        self.albertmodel = AlbertModel.from_pretrained(
            albert_model_name
        )
        self.fc1 = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x_in):
        albert_out = self.albertmodel(x_in)[0] #returns tuple
        batch_size = albert_out.shape[0]
        prediction_vector = self.fc1(albert_out.view(batch_size,-1)) #(batch_size , max_len, num_classes)
        return prediction_vector

In [None]:
print("Loading Pretrained distilgpt2...")
#print("Loading Pretrained albert...")
#print('Loading Pretrained roberta...')
num_classes = len(set(data_df.label))
hidden_size = dataset._max_seq_length * 768
model = SimpleGPT2SequenceClassifier(
     hidden_size=hidden_size,
     num_classes=num_classes,
     gpt_model_name="distilgpt2",
     max_seq_len=dataset._max_seq_length,
)
#model = SimpleAlbertSequenceClassifier(
#    hidden_size=hidden_size,
#    num_classes=num_classes,
#    albert_model_name='albert-base-v2',
#    max_seq_len=dataset._max_seq_length,
#)
#model = SimpleRobertaSequenceClassifier(
#    hidden_size=hidden_size,
#    num_classes=num_classes,
#    max_seq_len=dataset._max_seq_length,
#    roberta_model_name="distilroberta-base"
#)
model.to(args.device)
print("Finished")

# Begin Training

In [115]:
loss_func = nn.CrossEntropyLoss()
base_optimizer = RAdam(model.parameters(), lr=args.learning_rate)
optimizer = Lookahead(optimizer=base_optimizer, k=5, alpha=0.5)
args.num_epochs = 10

In [112]:
train_state = general_utils.make_train_state()
train_state["ckpt"] = 0
train_state['max_seq_len'] = dataset._max_seq_length

In [113]:
args.batch_size = 14
early_stopping = transformer_general_utils.EarlyStopping(patience=4)

In [None]:
epoch_bar = notebook.tqdm(
    desc = 'training_routine',
    total = args.num_epochs,
    position=0,
    leave = True,
)
dataset.set_split('train')
train_bar = notebook.tqdm(
    desc = 'split=train ',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
dataset.set_split('val')
eval_bar = notebook.tqdm(
    desc = 'split=eval',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)

for epoch_index in range(args.num_epochs):
    train_state['epoch_in'] = epoch_index

    dataset.set_split('train')
    batch_generator = transformer_data_utils.generate_batches(
        dataset=dataset,
        batch_size=args.batch_size,
        shuffle=True,
        device=args.device,
        drop_last=False,
    )


    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    model.train()

    train_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    model.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()

        y_pred = model(batch_dict["x_data"])

        loss = loss_func(y_pred, batch_dict["y_target"])
        loss_t = loss.item()


        loss.backward()
        optimizer.step()
                             
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

                             
        y_pred = y_pred.detach().cpu()

        batch_dict['y_target'] = batch_dict['y_target'].cpu()
        
        acc_t = transformer_general_utils.compute_accuracy(
            y_pred, batch_dict["y_target"]
        )

        f1_t = transformer_general_utils.compute_macro_f1(
            y_pred, batch_dict["y_target"]
        )

        train_state["batch_preds"].append(y_pred)
        train_state["batch_targets"].append(batch_dict["y_target"])
        train_state["batch_indexes"].append(batch_dict["x_index"])

        running_acc += (acc_t - running_acc) / (batch_index + 1)
        running_f1 += (f1_t - running_f1) / (batch_index + 1)

        train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                             epoch=epoch_index)

        train_bar.update()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    train_state['train_accuracies'].append(running_acc)
    train_state['train_losses'].append(running_loss)
    
    train_state['train_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )
    train_state['train_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['train_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    train_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['train_preds'][-1],
                                  train_state['train_targets'][-1],
                                 )
                                 
    train_state['train_f1s'].append(train_f1)
    
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    
    dataset.set_split('val')
    batch_generator = transformer_data_utils.generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = False, n_workers = 2, 
    )
    eval_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    
    model.eval()
    with torch.no_grad():
        optimizer._backup_and_load_cache()
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = model(batch_dict["x_data"])

            loss = loss_func(y_pred, batch_dict["y_target"])
            loss_t = loss.item()

            running_loss += (loss_t - running_loss) / (batch_index + 1)

            y_pred = y_pred.detach()
            
            acc_t = transformer_general_utils.compute_accuracy(
                y_pred, batch_dict["y_target"]
            )

            f1_t = transformer_general_utils.compute_macro_f1(
                y_pred, batch_dict["y_target"]
            )

            train_state['batch_preds'].append(y_pred.cpu())
            train_state['batch_targets'].append(batch_dict['y_target'])
            train_state['batch_indexes'].append(batch_dict['x_index'].cpu())

            running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_f1 += (f1_t - running_f1) / (batch_index + 1)
            

            eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                                 epoch=epoch_index)
            eval_bar.update()
            
    train_state['val_accuracies'].append(running_acc)
    train_state['val_losses'].append(running_loss)
    
        
    train_state['val_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )

    train_state['val_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )

    train_state['val_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )

    val_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['val_preds'][-1],
                                  train_state['val_targets'][-1],
                                 )
          
    train_state['val_f1s'].append(val_f1)
    
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    torch.save(
        {
            'model':model.state_dict(),
        },
        args.directory + f'_epoc_{epoch_index}_' + args.model_name,
    )
    
    optimizer._clear_and_load_backup()
    epoch_bar.set_postfix(best_f1 = early_stopping.best_score, current = val_f1)
    epoch_bar.update()

    epoch_bar.set_postfix(best_f1 = early_stopping.best_score, current = val_f1)
    epoch_bar.update()
    print(epoch_index)

In [94]:
print(train_state['train_f1s'])

[0.8171802897875236]


In [95]:
print(train_state['val_f1s'])

[0.909488974752727]


In [96]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [97]:
best_run_index = train_state['val_f1s'].index(max(train_state['val_f1s']))
print(f'Best run at epoch {best_run_index}')
print('Train:',classification_report(
    y_pred=(torch.argmax(train_state['train_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['train_targets'][best_run_index].cpu().numpy(), 
    digits=4)
)
print('Dev:',classification_report(
    y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['val_targets'][best_run_index].cpu().numpy(), 
    digits=4)
)

Best run at epoch 0
Train:               precision    recall  f1-score   support

           0     0.9355    0.9543    0.9448     53481
           1     0.7306    0.6529    0.6895     10141

    accuracy                         0.9063     63622
   macro avg     0.8330    0.8036    0.8172     63622
weighted avg     0.9028    0.9063    0.9041     63622

Dev:               precision    recall  f1-score   support

           0     0.9699    0.9729    0.9714     11460
           1     0.8545    0.8408    0.8476      2173

    accuracy                         0.9518     13633
   macro avg     0.9122    0.9068    0.9095     13633
weighted avg     0.9515    0.9518    0.9516     13633



## Checkpoint ensemble

In [None]:
def sort_preds(indexes, preds):
    """Sorts the predictions in order, to reverse the effects of shuffle
    done by dataloader"""
    indexes = indexes.cpu().numpy().reshape(-1,1)
    preds = preds.cpu().numpy()
    arr_concat = np.hstack((indexes,preds)) #concat the preds and their indexes
    sort_arr = arr_concat[ arr_concat[:,0].argsort()] #sort based on the indexes
    sorted_preds = np.delete(sort_arr,0,axis=1)
    return sorted_preds

def get_optimal_models(train_state, split, reverse=False ):
    """Naive Ensembling"""
    trgts= sort_preds(train_state[f'{split}_indexes'][-1],train_state[f'{split}_targets'][-1].reshape(-1,1))
    total_preds = len(train_state[f'{split}_indexes'])
    init = np.zeros(train_state[f'{split}_preds'][-1].shape)
    max_f1 = 0
    idxes = []
    rng = range(0,total_preds)
    if reverse:
        rng = reversed(rng)
    for i in rng:
        temp = sort_preds(train_state[f'{split}_indexes'][i],train_state[f'{split}_preds'][i])
        temp2 = init+temp
        f1 = f1_score(
            y_pred=temp2.argmax(axis=1),
            y_true= trgts, average ='weighted'
        )
        if f1 > max_f1:
            max_f1 = f1
            init = init+temp
            idxes.append(i)
    print(f'Taking preds from {idxes} | Dev f1:{f1}')
    return (idxes,max_f1)

In [None]:
train_state['val_f1s']

[0.4567010720121149,
 0.4567010720121149,
 0.4567010720121149,
 0.4567010720121149,
 0.4567010720121149,
 0.4567010720121149,
 0.8359722219773102,
 0.8640951104943737,
 0.8930587491997887,
 0.8915854573179369]

In [None]:
best_model_f1_score = f1_score(
    y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['val_targets'][best_run_index].cpu().numpy(), 
    average='weighted'
)
_models= [get_optimal_models(train_state,'val', reverse=False),
                 get_optimal_models(train_state,'val', reverse=True),
                 ([best_run_index],best_model_f1_score),]
optimal_models = max(_models, key=lambda x:x[1]) #select ensembles or best model 
print(f'Optimal models chosen: {optimal_models}')

Taking preds from [0, 6, 7, 8, 9] | Dev f1:0.9448525010850208
Taking preds from [9, 8, 7] | Dev f1:0.9473387834097712
Optimal models chosen: ([9, 8, 7], 0.9484107144067528)


In [None]:
all_models= [os.path.join(args.directory,i) for i in os.listdir(args.directory) if args.model_name in i]
all_models = sorted(all_models, key = lambda x: int(x[8])) #sort by epoch num.
all_models

['./_epoc_0_distilgpt2.pt',
 './_epoc_1_distilgpt2.pt',
 './_epoc_2_distilgpt2.pt',
 './_epoc_3_distilgpt2.pt',
 './_epoc_4_distilgpt2.pt',
 './_epoc_5_distilgpt2.pt',
 './_epoc_6_distilgpt2.pt',
 './_epoc_7_distilgpt2.pt',
 './_epoc_8_distilgpt2.pt',
 './_epoc_9_distilgpt2.pt']

In [None]:
selected_models = [all_models[i] for i in optimal_models[0]]
pprint.pprint(selected_models)

['./_epoc_9_distilgpt2.pt',
 './_epoc_8_distilgpt2.pt',
 './_epoc_7_distilgpt2.pt']


## Loading test set


In [None]:
test_dataset = dataset

In [None]:
test_dataset.set_split('test')

In [None]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=False, device="cpu", pinned_memory = False, n_workers = 0): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last,
                            pin_memory= pinned_memory,
                            num_workers = n_workers,
                            )
    for data_dict in dataloader:
        out_data_dict = {}
        # print(data_dict.items())
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device, non_blocking= (True if pinned_memory else False) )
        yield out_data_dict

In [None]:
test_state = general_utils.make_train_state() 
test_dataset.set_split('test')
eval_bar = notebook.tqdm(
    desc = 'split=train ',
    total=test_dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
model.eval()
for m in notebook.tqdm(selected_models, total=len(selected_models)):
    eval_bar.reset(
        total=test_dataset.get_num_batches(args.batch_size),
    )
    model.load_state_dict(torch.load(m)['model'])
    batch_generator = generate_batches(
        dataset= test_dataset, batch_size= args.batch_size, shuffle=False,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 1, 
    )

    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):

            y_pred = model(batch_dict["x_data"])

            y_pred = y_pred.detach()
            
            batch_dict['y_target'] = batch_dict['y_target'].cpu()
            test_state['batch_preds'].append(y_pred.cpu())
            test_state['batch_targets'].append(batch_dict['y_target'].cpu())
            test_state['batch_indexes'].append(batch_dict['x_index'].cpu())
            eval_bar.update()

    test_state['val_preds'].append(
        torch.cat(test_state['batch_preds']).cpu()
    )
    test_state['val_targets'].append(
        torch.cat(test_state['batch_targets']).cpu()
    )
    test_state['val_indexes'].append(
        torch.cat(test_state['batch_indexes']).cpu()
    )
    
    test_state['batch_preds'] = []
    test_state['batch_targets'] = []
    test_state['batch_indexes'] = []


split=train :   0%|          | 0/277 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
assert len(test_state['val_preds']) == len(optimal_models[0])

### Ensemble

In [None]:
ensemble = torch.zeros_like(test_state['val_preds'][-1])
for i in test_state['val_preds']:
    ensemble += i

In [None]:
test_preds = torch.argmax(ensemble, dim=1).tolist()

In [None]:
#test_df = data_df.loc[data_df['split'] == 'test']
y_true = np.array(data_df.loc[data_df['split'] == 'test']['label'])
y_pred = np.array(test_preds)
with open('y_pred_gpt2.npy', 'wb') as f:
  np.save(f, y_pred)
from sklearn.metrics import f1_score
test_acc = accuracy_score(y_pred, y_true)
test_f1 = f1_score(y_pred, y_true, average='macro')
print(test_acc)
print(test_f1)