# Funniness Estimation System v3.0

In [None]:
"""
@author: Ziyang Lin
         zlin19@sheffield.ac.uk
         University of Sheffield, UK
"""

'''
A system for
"Assessing the Funniness of Edited News Headlines (SemEval-2020)" task 2.
'''

import random

import pandas as pd
import numpy as np

import os
import re
import time
import math

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
import torch.utils.data as tud

from google.colab import drive 
drive.mount('/content/gdrive')

import nltk
nltk.download('punkt')
from nltk import word_tokenize


# Seed every random number generator the notebook relies on, so that a
# training loop started afterwards sees the same random sequence each time.
def fix_seed(seed=123):
    """Seed the torch (CPU and CUDA), NumPy and `random` generators with `seed`."""
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)


# Helper: split an elapsed interval into whole minutes and whole seconds
def run_time(start_time, end_time):
    """Return (minutes, seconds) elapsed between two timestamps given in seconds.

    Both parts are truncated to integers; the seconds part is what remains
    after the whole minutes have been removed.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (60 * whole_minutes))
    return whole_minutes, leftover_seconds

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Running log counter and the list of per-run statistics rows; both are
# passed to and returned from add_training_stats().
log_num = 0
training_stats = []

def add_training_stats(training_stats, log_num, MODEL_NAME, BATCH_SIZE, N_EPOCHS, LRATE, FRATE, EPS, WU, WDECAY, train_loss, train_accuracy, val_loss, val_accuracy, test_accuracy):
    """Append one run's settings and results to `training_stats`.

    The list is extended in place with a dict whose 'log' entry is
    `log_num + 1`; the remaining entries hold the hyperparameters and the
    metrics passed in.

    Returns:
        (training_stats, incremented log number)
    """
    next_log = log_num + 1

    column_names = (
        'log', 'Model Name', 'Batch Size', 'N_Epochs',
        'lr', 'fr', 'eps', 'wu', 'wd',
        'Training Loss', 'Training Accur.',
        'Valid. Loss', 'Valid. Accur.',
        'Testing Accur.',
    )
    column_values = (
        next_log, MODEL_NAME, BATCH_SIZE, N_EPOCHS,
        LRATE, FRATE, EPS, WU, WDECAY,
        train_loss, train_accuracy,
        val_loss, val_accuracy,
        test_accuracy,
    )
    training_stats.append(dict(zip(column_names, column_values)))

    return training_stats, next_log

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 8.3MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 29.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 59.1MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K 

In [None]:
# Use the first CUDA GPU when one is available, otherwise stay on the CPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
if DEVICE != 'cpu':
    # only relevant on GPU: ask cuDNN for deterministic behaviour
    torch.backends.cudnn.deterministic = True

print('Device is', DEVICE)

Device is cuda:0


# Preprocessing Datasets

## Read data from csv files

In [None]:
# CSV files of the three subtask-2 splits (paths are relative to the
# current working directory).
train_loc = 'gdrive/My Drive/subtask-2/train.csv'
dev_loc = 'gdrive/My Drive/subtask-2/dev.csv'
test_loc = 'gdrive/My Drive/subtask-2/test.csv'

# Load each split into its own DataFrame.
train, valid, test = (
    pd.read_csv(csv_path) for csv_path in (train_loc, dev_loc, test_loc)
)

In [None]:
# For the normal version:

def get_edited_headlines_list(headls_words):
    """Build the edited headlines by inserting each edit word into its headline.

    Args:
        headls_words: iterable of (original_headline, new_word) pairs. The
            span to be replaced is marked in the headline as ``<.../>``.

    Returns:
        list of str: one headline per pair, with every ``<.../>`` span
        replaced by the pair's new_word.
    """
    # Compile the marker pattern once instead of once per headline.
    edit_span = re.compile(r'\<(.*?)\/\>')

    # A callable replacement inserts new_word literally. Passing the word as
    # a replacement *string* would make `re` interpret backslashes and group
    # references inside it (e.g. "\1" would re-insert the original word).
    return [
        edit_span.sub(lambda _match, word=new_word: word, origin_headl)
        for origin_headl, new_word in headls_words
    ]


def processed_data_to_lists(train):
    """Turn a split's DataFrame into two lists of edited headlines plus labels.

    Reads the columns original1/edit1, original2/edit2 and label from
    `train` and returns (headlines_1, headlines_2, labels).
    """
    pairs_1 = list(zip(train.original1.to_list(), train.edit1.to_list()))
    pairs_2 = list(zip(train.original2.to_list(), train.edit2.to_list()))
    labels_list = train.label.to_list()

    edited_1 = get_edited_headlines_list(pairs_1)
    edited_2 = get_edited_headlines_list(pairs_2)
    return edited_1, edited_2, labels_list



In [None]:
# For cut-headlines version:

def cut_headline(new_word, new_headl, num_context):
    """Crop an edited headline to a window of words around the edit word.

    Args:
        new_word: the edit word to centre the window on.
        new_headl: the edited headline (whitespace-separated words).
        num_context: number of words to keep on each side of the edit word.

    Returns:
        str: up to `num_context` words before the edit word, the edit word
        itself and up to `num_context` words after it, joined by single
        spaces. When the edit word cannot be found as a single token (or the
        headline is empty), all words of the headline are returned.
    """
    words = new_headl.split()

    # First token that equals the edit word once surrounding punctuation is
    # stripped; None when there is no such token.
    index = next(
        (i for i, word in enumerate(words) if word.strip(",'\".!") == new_word),
        None,
    )
    if index is None:
        # No position to centre on: keep the whole headline rather than an
        # arbitrary slice of it.
        return " ".join(words)

    # Clamp the left edge at 0. A negative slice start would count from the
    # end of the list and silently drop the left context for edit words
    # close to the beginning of the headline.
    start = max(index - num_context, 0)
    return " ".join(words[start:index + num_context + 1])


def get_edited_headlines_list_cut(headls_words, num_context):
    """Build edited headlines and crop each one around its edit word.

    Args:
        headls_words: iterable of (original_headline, new_word) pairs. The
            span to be replaced is marked in the headline as ``<.../>``.
        num_context: number of context words passed on to cut_headline().

    Returns:
        list of str: one cropped, edited headline per pair.
    """
    # Compile the marker pattern once instead of once per headline.
    edit_span = re.compile(r'\<(.*?)\/\>')

    headls_list = []
    for origin_headl, new_word in headls_words:
        # A callable replacement inserts new_word literally; a replacement
        # string would have its backslashes / group references interpreted.
        new_headl = edit_span.sub(lambda _match, word=new_word: word, origin_headl)
        # keep only the window around the edit word
        headls_list.append(cut_headline(new_word, new_headl, num_context))

    return headls_list


def processed_data_to_lists_cut(train, num_context):
    """Like the normal version, but each edited headline is cropped.

    Reads original1/edit1, original2/edit2 and label from `train`; the
    headlines are built with get_edited_headlines_list_cut() using
    `num_context`. Returns (headlines_1, headlines_2, labels).
    """
    pairs_1 = list(zip(train.original1.to_list(), train.edit1.to_list()))
    pairs_2 = list(zip(train.original2.to_list(), train.edit2.to_list()))
    labels_list = train.label.to_list()

    cropped_1 = get_edited_headlines_list_cut(pairs_1, num_context)
    cropped_2 = get_edited_headlines_list_cut(pairs_2, num_context)
    return cropped_1, cropped_2, labels_list

In [None]:
# For the punctuation removal and new words returned version:

def remove_punctuation(sentence):
    """Return `sentence` with every non-alphanumeric token removed.

    The sentence is tokenized with nltk.word_tokenize; only tokens for
    which str.isalnum() is true are kept, joined by single spaces.
    """
    kept_tokens = (
        str(token)
        for token in nltk.word_tokenize(sentence)
        if token.isalnum()
    )
    return ' '.join(kept_tokens)


def get_edited_headlines_list_pv(headls_words):
    """Build edited headlines and strip punctuation tokens from each of them.

    Args:
        headls_words: iterable of (original_headline, new_word) pairs. The
            span to be replaced is marked in the headline as ``<.../>``.

    Returns:
        list of str: one edited headline per pair, passed through
        remove_punctuation().
    """
    # Compile the marker pattern once instead of once per headline.
    edit_span = re.compile(r'\<(.*?)\/\>')

    headls_list = []
    for origin_headl, new_word in headls_words:
        # A callable replacement inserts new_word literally; a replacement
        # string would have its backslashes / group references interpreted.
        new_headl = edit_span.sub(lambda _match, word=new_word: word, origin_headl)
        headls_list.append(remove_punctuation(new_headl))

    return headls_list


def processed_data_to_lists_pv(train):
    """Punctuation-free variant that also hands back the edit words.

    Reads original1/edit1, original2/edit2 and label from `train`.

    Returns:
        (headlines_1, headlines_2, labels, new_words_1, new_words_2), where
        the headlines come from get_edited_headlines_list_pv() and the
        new_words lists hold the edit word of every pair.
    """
    pairs_1 = list(zip(train.original1.to_list(), train.edit1.to_list()))
    pairs_2 = list(zip(train.original2.to_list(), train.edit2.to_list()))
    labels_list = train.label.to_list()

    clean_1 = get_edited_headlines_list_pv(pairs_1)
    clean_2 = get_edited_headlines_list_pv(pairs_2)

    # second element of every (headline, word) pair
    edit_words_1 = [pair[1] for pair in pairs_1]
    edit_words_2 = [pair[1] for pair in pairs_2]

    return clean_1, clean_2, labels_list, edit_words_1, edit_words_2

## Get lists of headlines and list of labels

In [None]:
# For the normal version:
# Full edited headlines for every split. The two alternative cells that
# follow assign the same variable names, so the cell executed last decides
# which version of the data is used afterwards.

train_headls_1, train_headls_2, train_labels_list = processed_data_to_lists(train)
valid_headls_1, valid_headls_2, valid_labels_list = processed_data_to_lists(valid)
test_headls_1, test_headls_2, test_labels_list = processed_data_to_lists(test)

In [None]:
# For cut-headlines version:
# Overwrites the headline/label lists with headlines cropped around the
# edit word.

# number of context words passed to the cropping helper
num_context = 4

train_headls_1, train_headls_2, train_labels_list = processed_data_to_lists_cut(train, num_context)
valid_headls_1, valid_headls_2, valid_labels_list = processed_data_to_lists_cut(valid, num_context)
test_headls_1, test_headls_2, test_labels_list = processed_data_to_lists_cut(test, num_context)

In [None]:
# For the punctuation removal and new words returned version:
# Overwrites the headline/label lists with punctuation-free headlines and
# additionally keeps the edit words of both headlines for every split.

train_headls_1, train_headls_2, train_labels_list, train_new_words_1, train_new_words_2 = processed_data_to_lists_pv(train)
valid_headls_1, valid_headls_2, valid_labels_list, valid_new_words_1, valid_new_words_2  = processed_data_to_lists_pv(valid)
test_headls_1, test_headls_2, test_labels_list, test_new_words_1, test_new_words_2 = processed_data_to_lists_pv(test)


In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
# Each tokenizer cell binds the same global name `tokenizer`; run only the
# one that matches the model being fine-tuned.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
from transformers import AlbertTokenizer

# Load the ALBERT tokenizer.
# Rebinds the global `tokenizer`.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

In [None]:
from transformers import ElectraTokenizer

# Load the ELECTRA tokenizer.
# Rebinds the global `tokenizer`.
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator', do_lower_case=True)

In [None]:
from transformers import RobertaTokenizer

# Load the Roberta tokenizer.
# Rebinds the global `tokenizer`.
# NOTE(review): do_lower_case does not look like a RobertaTokenizer option —
# confirm whether it has any effect here.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [None]:
from transformers import XLNetTokenizer 

# Load the XLNet tokenizer.
# Rebinds the global `tokenizer`.
# NOTE(review): the checkpoint is the *cased* model but do_lower_case=True
# is requested — confirm that lower-casing the input is intended.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [None]:
# Sanity check on the first training headline: raw text, its tokens and
# the corresponding vocabulary ids.
sample_headline = train_headls_1[0]
print(' Original: ', sample_headline)

sample_tokens = tokenizer.tokenize(sample_headline)
print('Tokenized: ', sample_tokens)

print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

 Original:  " Gene Cernan , Last Dancer on the Moon , Dies at 82 "
Tokenized:  ['"', 'gene', 'ce', '##rna', '##n', ',', 'last', 'dancer', 'on', 'the', 'moon', ',', 'dies', 'at', '82', '"']
Token IDs:  [1000, 4962, 8292, 12789, 2078, 1010, 2197, 8033, 2006, 1996, 4231, 1010, 8289, 2012, 6445, 1000]


## Max sequence length for pretrained LMs

In [None]:
# Alternative (disabled): measure the length with the tokenizer instead.
#   for headl in train_headls_1:
#       # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
#       input_ids = tokenizer.encode(headl, add_special_tokens=True)
#       # Update the maximum sentence length.
#       max_one_len = max(max_one_len, len(input_ids))
#   print('Max sequence length: ', (max_one_len-1)*2)

# Longest first headline of the training split, counted in
# whitespace-separated words (0 when there are no headlines).
# NOTE(review): a word can be split into several sub-word tokens, so this
# count may be lower than the tokenized length — confirm it is sufficient.
max_one_len = max([0] + [len(headl.split()) for headl in train_headls_1])

# Two headlines of that length plus 3 extra positions (presumably the
# special tokens of a sentence pair — TODO confirm).
print('Max sequence length: ', (max_one_len * 2) + 3 )

Max sequence length:  55


## Get encoded inputs for pretrained LMs

In [None]:
# Encode every split as headline pairs with identical settings: pad or
# truncate to 55 positions and return PyTorch tensors.
pair_encoding_options = dict(
    padding='max_length',
    max_length=55,
    truncation=True,
    return_tensors="pt",
)
train_encoded_inputs = tokenizer(train_headls_1, train_headls_2, **pair_encoding_options)
valid_encoded_inputs = tokenizer(valid_headls_1, valid_headls_2, **pair_encoding_options)
test_encoded_inputs = tokenizer(test_headls_1, test_headls_2, **pair_encoding_options)

#45
# display the encoded training split
train_encoded_inputs

{'input_ids': tensor([[  101,  1000,  4962,  ...,     0,     0,     0],
        [  101,  1000,  1045,  ...,     0,     0,     0],
        [  101,  1000,  1045,  ...,     0,     0,     0],
        ...,
        [  101,  1523,  2009,  ...,  3043,  1524,   102],
        [  101,  1523, 12849,  ...,  3302,  1029,   102],
        [  101,  1523,  2365,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# Pull the individual tensors out of each encoded split and convert the
# label lists to tensors.
# NOTE(review): this requires 'token_type_ids' in the tokenizer output —
# confirm that every tokenizer used above provides it.
train_input_ids = train_encoded_inputs['input_ids']
train_attention_mask = train_encoded_inputs['attention_mask']
train_token_type_ids = train_encoded_inputs['token_type_ids']
train_labels = torch.tensor(train_labels_list)

valid_input_ids = valid_encoded_inputs['input_ids']
valid_attention_mask = valid_encoded_inputs['attention_mask']
valid_token_type_ids = valid_encoded_inputs['token_type_ids']
valid_labels = torch.tensor(valid_labels_list)

test_input_ids = test_encoded_inputs['input_ids']
test_attention_mask = test_encoded_inputs['attention_mask']
test_token_type_ids = test_encoded_inputs['token_type_ids']
test_labels = torch.tensor(test_labels_list)

# Spot checks: segment ids of the first example, then the second example
# decoded back to text.
train_token_type_ids[0]
tokenizer.decode(train_input_ids.tolist()[1])

'[CLS] " i\'m done " : fed up with california, some vagrants look to texas [SEP] " i\'m done " : fed up with pancakes, some conservatives look to texas [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## Prepare mini-batches

In [None]:
class BERT_Dataset(tud.Dataset):
    """Dataset over four parallel tensors, all stored on DEVICE.

    Item `i` is the tuple (x1[i], x2[i], x3[i], y1[i]). In this notebook the
    tensors are the input ids, attention masks, token type ids and labels,
    in that order.
    """

    def __init__(self, x1, x2, x3, y1):
        # number of examples = size of the first dimension of x1
        self.len = x1.shape[0]

        # move every tensor to the compute device once, up front
        (self.x1_data,
         self.x2_data,
         self.x3_data,
         self.y1_data) = (tensor.to(DEVICE) for tensor in (x1, x2, x3, y1))

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        columns = (self.x1_data, self.x2_data, self.x3_data, self.y1_data)
        return tuple(column[index] for column in columns)

In [None]:
fix_seed()
# Batching for BERT
BATCH_SIZE = 32

train_dataset = BERT_Dataset(train_input_ids, train_attention_mask, train_token_type_ids, train_labels)
valid_dataset = BERT_Dataset(valid_input_ids, valid_attention_mask, valid_token_type_ids, valid_labels)
test_dataset = BERT_Dataset(test_input_ids, test_attention_mask, test_token_type_ids, test_labels)

# Only the training batches are shuffled. The evaluation loaders keep the
# dataset order: the test logits are later collected batch by batch and
# assigned to the rows of the `test` DataFrame by position, so a shuffled
# test loader would pair predictions with the wrong ids.
train_dataloader = tud.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = tud.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = tud.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


##### demo #####
# Fetch a single training batch and show the shape of each of its tensors
# and the number of batches per epoch.
print(train_dataloader)

for x1, x2, x3, y1 in train_dataloader:
    demo_x1 = x1
    demo_x2 = x2
    demo_x3 = x3
    demo_y1 = y1
    break

print(x1.shape)
print(x2.shape)
print(x3.shape)
print(y1.shape)
print(len(train_dataloader))

<torch.utils.data.dataloader.DataLoader object at 0x7f509778dcf8>
torch.Size([32, 55])
torch.Size([32, 55])
torch.Size([32, 55])
torch.Size([32])
294


In [None]:
class ROBERTA_Dataset(tud.Dataset):
    """Dataset over three parallel tensors, all stored on DEVICE.

    Item `i` is the tuple (x1[i], x2[i], y1[i]). In this notebook the
    tensors are the input ids, attention masks and labels, in that order.
    """

    def __init__(self, x1, x2, y1):
        # number of examples = size of the first dimension of x1
        self.len = x1.shape[0]

        # move every tensor to the compute device once, up front
        (self.x1_data,
         self.x2_data,
         self.y1_data) = (tensor.to(DEVICE) for tensor in (x1, x2, y1))

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        columns = (self.x1_data, self.x2_data, self.y1_data)
        return tuple(column[index] for column in columns)

In [None]:
fix_seed()
# Batching for ROBERTA_Dataset
BATCH_SIZE = 16

train_dataset = ROBERTA_Dataset(train_input_ids, train_attention_mask, train_labels)
valid_dataset = ROBERTA_Dataset(valid_input_ids, valid_attention_mask, valid_labels)
test_dataset = ROBERTA_Dataset(test_input_ids, test_attention_mask, test_labels)

# Only the training batches are shuffled. The evaluation loaders keep the
# dataset order: the test logits are later collected batch by batch and
# assigned to the rows of the `test` DataFrame by position, so a shuffled
# test loader would pair predictions with the wrong ids.
train_dataloader = tud.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = tud.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = tud.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Training Models

## Define accuracy

In [None]:
def accuracy(preds, y):
    """Return the fraction of rows of `preds` whose top class equals `y`.

    `preds` holds one row of class scores per example (softmax is applied
    over dimension 1 before taking the argmax); `y` holds the target class
    of each example. The result is a scalar tensor.
    """
    probabilities = F.softmax(preds, dim=1)
    predicted_classes = probabilities.argmax(dim=1)
    # float so that the hits can be summed and divided
    hits = predicted_classes.eq(y).float()
    return hits.sum() / len(hits)

## Define train and evaluate

In [None]:
# define train_BERT and evaluate_BERT
def train_BERT(model, train_dataloader, valid_dataloader, optimizer, scheduler, N_EPOCHS):
    """Fine-tune `model` for `N_EPOCHS` epochs and validate after each epoch.

    Every batch of `train_dataloader` is unpacked as
    (input_ids, attention_mask, token_type_ids, labels). The model is called
    with the labels; its first output is used as the loss and its second as
    the logits. After each batch the optimizer and then the scheduler are
    stepped. Per-epoch averages (over batches) are printed together with the
    validation results from evaluate_BERT().

    Returns:
        (train loss, train accuracy, validation loss, validation accuracy)
        of the last epoch.
    """
    fix_seed()
    model = model.to(DEVICE)

    # start of the whole run, for the total time report
    run_start = time.time()

    for epoch in range(N_EPOCHS):
        epoch_start = time.time()

        # training mode, so that dropout is active
        model.train()

        loss_total, acc_total = 0, 0

        for batch in train_dataloader:
            # each tensor: input ids / attention mask [B, T], labels [B]
            input_ids_batch, attention_mask_batch, token_type_ids_batch, labels = batch

            # clear the gradients left over from the previous step
            optimizer.zero_grad()

            outputs = model(
                input_ids_batch,
                attention_mask=attention_mask_batch,
                token_type_ids=token_type_ids_batch,
                labels=labels,
            )
            loss, logits = outputs[0], outputs[1]

            # accuracy of this batch
            acc = accuracy(logits, labels)

            # backpropagate, update the weights, then the learning rate
            loss.backward()
            optimizer.step()
            scheduler.step()

            loss_total += loss.item()
            acc_total += acc.item()

        n_batches = len(train_dataloader)
        average_epoch_loss = loss_total / n_batches
        average_epoch_acc = acc_total / n_batches

        # the epoch time covers the training batches only
        epoch_mins, epoch_secs = run_time(epoch_start, time.time())

        average_epoch_valid_loss, average_epoch_valid_acc = evaluate_BERT(model, valid_dataloader)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {average_epoch_loss:.5f} | Train Acc: {average_epoch_acc*100:.4f}%')
        print(f'\t Val. Loss: {average_epoch_valid_loss:.5f} |  Val. Acc: {average_epoch_valid_acc*100:.4f}%')

    print("")
    print("***Completed***")
    total_mins, total_secs = run_time(run_start, time.time())
    print(f'Total time spent: {total_mins}m {total_secs}s')

    return average_epoch_loss, average_epoch_acc, average_epoch_valid_loss, average_epoch_valid_acc

def evaluate_BERT(model, dataloader):
    """Compute the average loss and accuracy of `model` over `dataloader`.

    Every batch is unpacked as
    (input_ids, attention_mask, token_type_ids, labels). The model is put in
    evaluation mode and called with the labels without tracking gradients;
    its first output is used as the loss and its second as the logits.

    Returns:
        (mean batch loss, mean batch accuracy)
    """
    # The random generators are deliberately NOT reseeded here. This function
    # runs after every training epoch; resetting the global seed at that
    # point would make each following epoch reuse the same shuffle order and
    # the same dropout randomness.

    epoch_loss = 0
    epoch_acc = 0

    # Turn on evaluate mode. This de-activates dropout.
    model.eval()

    # We do not compute gradients within this block, i.e. no training
    with torch.no_grad():

        for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in dataloader:

            # get the output
            outputs = model(input_ids_batch,
                            attention_mask=attention_mask_batch,
                            token_type_ids=token_type_ids_batch,
                            labels=labels)

            loss = outputs[0]
            logits = outputs[1]
            acc = accuracy(logits, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

In [None]:
# define train_ROBERTA and evaluate_ROBERTA for RoBerta
def train_ROBERTA(model, train_dataloader, valid_dataloader, optimizer, scheduler, N_EPOCHS):
    """Fine-tune `model` for `N_EPOCHS` epochs and validate after each epoch.

    Every batch of `train_dataloader` is unpacked as
    (input_ids, attention_mask, labels) — no token type ids. The model is
    called with the labels; its first output is used as the loss and its
    second as the logits. After each batch the optimizer and then the
    scheduler are stepped. Per-epoch averages (over batches) are printed
    together with the validation results from evaluate_ROBERTA().

    Returns:
        (train loss, train accuracy, validation loss, validation accuracy)
        of the last epoch.
    """
    fix_seed()
    model = model.to(DEVICE)

    # start of the whole run, for the total time report
    run_start = time.time()

    for epoch in range(N_EPOCHS):
        epoch_start = time.time()

        # training mode, so that dropout is active
        model.train()

        loss_total, acc_total = 0, 0

        for batch in train_dataloader:
            # input ids / attention mask [B, T], labels [B]
            input_ids_batch, attention_mask_batch, labels = batch

            # clear the gradients left over from the previous step
            optimizer.zero_grad()

            outputs = model(
                input_ids_batch,
                attention_mask=attention_mask_batch,
                labels=labels,
            )
            loss, logits = outputs[0], outputs[1]

            # accuracy of this batch
            acc = accuracy(logits, labels)

            # backpropagate, update the weights, then the learning rate
            loss.backward()
            optimizer.step()
            scheduler.step()

            loss_total += loss.item()
            acc_total += acc.item()

        n_batches = len(train_dataloader)
        average_epoch_loss = loss_total / n_batches
        average_epoch_acc = acc_total / n_batches

        # the epoch time covers the training batches only
        epoch_mins, epoch_secs = run_time(epoch_start, time.time())

        average_epoch_valid_loss, average_epoch_valid_acc = evaluate_ROBERTA(model, valid_dataloader)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {average_epoch_loss:.5f} | Train Acc: {average_epoch_acc*100:.4f}%')
        print(f'\t Val. Loss: {average_epoch_valid_loss:.5f} |  Val. Acc: {average_epoch_valid_acc*100:.4f}%')

    print("")
    print("***Completed***")
    total_mins, total_secs = run_time(run_start, time.time())
    print(f'Total time spent: {total_mins}m {total_secs}s')

    return average_epoch_loss, average_epoch_acc, average_epoch_valid_loss, average_epoch_valid_acc

def evaluate_ROBERTA(model, dataloader):
    """Compute the average loss and accuracy of `model` over `dataloader`.

    Every batch is unpacked as (input_ids, attention_mask, labels). The
    model is put in evaluation mode and called with the labels without
    tracking gradients; its first output is used as the loss and its second
    as the logits.

    Returns:
        (mean batch loss, mean batch accuracy)
    """
    # The random generators are deliberately NOT reseeded here. This function
    # runs after every training epoch; resetting the global seed at that
    # point would make each following epoch reuse the same shuffle order and
    # the same dropout randomness.

    epoch_loss = 0
    epoch_acc = 0

    # Turn on evaluate mode. This de-activates dropout.
    model.eval()

    # We do not compute gradients within this block, i.e. no training
    with torch.no_grad():

        for input_ids_batch, attention_mask_batch, labels in dataloader:

            # get the output
            outputs = model(input_ids_batch,
                            attention_mask=attention_mask_batch,
                            labels=labels)

            loss = outputs[0]
            logits = outputs[1]
            acc = accuracy(logits, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

## Load pretrained LMs

In [None]:
from transformers import BertForSequenceClassification, AdamW

# Load the BertForSequenceClassification model
# Each model cell binds the same global name `model`; the classifier is
# configured for three labels and returns neither attentions nor hidden
# states.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels = 3,   
                                                        output_attentions = False,
                                                        output_hidden_states = False)

In [None]:
from transformers import AlbertForSequenceClassification, AdamW

# Load the AlbertForSequenceClassification model
# Rebinds the global `model`; three labels, no attentions or hidden states
# in the output.
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2",
                                                        num_labels = 3,   
                                                        output_attentions = False,
                                                        output_hidden_states = False)

In [None]:
from transformers import ElectraForSequenceClassification, AdamW

# Load the ElectraForSequenceClassification model
# Rebinds the global `model`; three labels, no attentions or hidden states
# in the output.
model = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator",
                                                        num_labels = 3,   
                                                        output_attentions = False,
                                                        output_hidden_states = False)

In [None]:
from transformers import RobertaForSequenceClassification, AdamW

# Load the RobertaForSequenceClassification model
# Rebinds the global `model`; three labels, no attentions or hidden states
# in the output.
model = RobertaForSequenceClassification.from_pretrained('roberta-base',
                                                    num_labels = 3,   
                                                    output_attentions = False,
                                                    output_hidden_states = False)

In [None]:
from transformers import XLNetForSequenceClassification, AdamW

# Load the XLNetForSequenceClassification model
# Rebinds the global `model`; three labels, no attentions or hidden states
# in the output.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',
                                                    num_labels = 3,   
                                                    output_attentions = False,
                                                    output_hidden_states = False)

## Hyperparameters

In [None]:
# Hyperparameters for BERT:

# Number of training epochs. The BERT authors recommend between 2 and 4.
N_EPOCHS = 4

LRATE = 1e-5      # learning rate
FRATE = "none"    # in this notebook only recorded in the statistics log
EPS = "none"      # in this notebook only recorded in the statistics log
WU = 0.06         # fraction of all training steps used for warm-up
WDECAY = 0.1      # weight decay of the decayed parameter group

# Total number of training steps is [number of batches] x [number of epochs].
TOTSTEPS = N_EPOCHS * len(train_dataloader)
WUSTEPS = int(WU * TOTSTEPS)

# Apply weight decay to all parameters other than bias and layer normalization terms.
# NOTE(review): parameters are matched by these name fragments — confirm
# they fit the parameter names of every model used.
no_decay = ['bias', 'LayerNorm.weight']
decayed_params, undecayed_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': WDECAY},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

"""optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if "transformer" not in n], 'lr': LRATE, 'weight_decay': WDECAY},
    {'params': [p for n, p in model.named_parameters() if "transformer" in n], 'weight_decay': WDECAY}
]"""

'optimizer_grouped_parameters = [\n    {\'params\': [p for n, p in model.named_parameters() if "transformer" not in n], \'lr\': LRATE, \'weight_decay\': WDECAY},\n    {\'params\': [p for n, p in model.named_parameters() if "transformer" in n], \'weight_decay\': WDECAY}\n]'

## Optimizer & Learning Rate Scheduler

In [None]:
# Create the AdamW optimizer over the two parameter groups defined above.
# Only the learning rate is set here; no epsilon is passed, so the
# optimizer's own default is used.
optimizer = AdamW(optimizer_grouped_parameters, lr=LRATE)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Create the learning rate scheduler: WUSTEPS warm-up steps out of
# TOTSTEPS training steps in total. The training loop steps it once per
# batch.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = WUSTEPS,
                                            num_training_steps = TOTSTEPS)

## Start training

In [None]:
# Fine-tune the loaded model; the returned values are the metrics of the
# last epoch. train_BERT expects batches that include token type ids, i.e.
# data loaders built from BERT_Dataset.
train_loss, train_accuracy, val_loss, val_accuracy = train_BERT(model,
                                                                train_dataloader,
                                                                valid_dataloader,
                                                                optimizer,
                                                                scheduler,
                                                                N_EPOCHS)

Epoch: 01 | Epoch Time: 1m 31s
	Train Loss: 0.99479 | Train Acc: 45.0191%
	 Val. Loss: 0.96465 |  Val. Acc: 45.7548%
Epoch: 02 | Epoch Time: 1m 32s
	Train Loss: 0.95419 | Train Acc: 47.6531%
	 Val. Loss: 0.96574 |  Val. Acc: 47.0817%
Epoch: 03 | Epoch Time: 1m 33s
	Train Loss: 0.80342 | Train Acc: 66.9430%
	 Val. Loss: 1.05948 |  Val. Acc: 47.4884%
Epoch: 04 | Epoch Time: 1m 33s
	Train Loss: 0.52005 | Train Acc: 84.2368%
	 Val. Loss: 1.14986 |  Val. Acc: 48.5442%

***Completed***
Total time spent: 6m 40s


# Testing Models

## Start testing

In [None]:
# Evaluate the fine-tuned model on the test split (batches that include
# token type ids) and collect the logits of every test example.
fix_seed()

test_loss = 0
test_acc = 0
# logits of all test examples, grown batch by batch
test_logits_all = torch.tensor([], device=DEVICE)

# Turn on evaluate mode. This de-activates dropout. 
model.eval()

# We do not compute gradients within this block, i.e. no training
with torch.no_grad():

    for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in test_dataloader:
        
        # get the output
        outputs = model(input_ids_batch,
                        attention_mask=attention_mask_batch,
                        token_type_ids=token_type_ids_batch,
                        labels=labels)

        loss_batch = outputs[0]
        logits_batch = outputs[1]
        #test_logits_all += logits_batch.tolist()
        # Rows are appended in the order test_dataloader yields them. The
        # write-results step assigns the predictions to the rows of `test`
        # by position, so this order has to match the DataFrame row order
        # (i.e. the loader must not shuffle).
        test_logits_all = torch.cat((test_logits_all, logits_batch), 0)
        acc_batch = accuracy(logits_batch, labels)

        test_loss += loss_batch.item()
        test_acc += acc_batch.item()

    # averages over batches
    average_test_loss = test_loss / len(test_dataloader)
    average_test_acc = test_acc / len(test_dataloader)

print(f'Test Loss: {average_test_loss:.5f} | Test Acc: {average_test_acc*100:.4f}%')

Test Loss: 1.16847 | Test Acc: 45.1613%


In [None]:
# Test for the ROBERTA
# Same evaluation as the BERT test cell, for batches without token type
# ids; collects the logits of every test example.
fix_seed()

test_loss = 0
test_acc = 0
# logits of all test examples, grown batch by batch
test_logits_all = torch.tensor([], device=DEVICE)

# Turn on evaluate mode. This de-activates dropout. 
model.eval()

# We do not compute gradients within this block, i.e. no training
with torch.no_grad():

    for input_ids_batch, attention_mask_batch, labels in test_dataloader:
        
        # get the output
        outputs = model(input_ids_batch,
                        attention_mask=attention_mask_batch,
                        labels=labels)

        loss_batch = outputs[0]
        logits_batch = outputs[1]
        #test_logits_all += logits_batch.tolist()
        # Rows are appended in the order test_dataloader yields them. The
        # write-results step assigns the predictions to the rows of `test`
        # by position, so this order has to match the DataFrame row order
        # (i.e. the loader must not shuffle).
        test_logits_all = torch.cat((test_logits_all, logits_batch), 0)
        acc_batch = accuracy(logits_batch, labels)

        test_loss += loss_batch.item()
        test_acc += acc_batch.item()

    # averages over batches
    average_test_loss = test_loss / len(test_dataloader)
    average_test_acc = test_acc / len(test_dataloader)

print(f'Test Loss: {average_test_loss:.5f} | Test Acc: {average_test_acc*100:.4f}%')

Test Loss: 1.04915 | Test Acc: 45.0338%


In [None]:
# Shape check: one row of logits per test example, one column per class.
test_logits_all.shape

torch.Size([2960, 3])

## Write results

In [None]:
def write_predictions(predictions, test_data_frame, out_loc):
    """Write the 'id' and 'pred' columns of `test_data_frame` to a CSV file.

    `predictions` is stored in place as the 'pred' column of
    `test_data_frame` (the caller's frame is modified), then the two columns
    are written to `out_loc` without the index and the absolute output path
    is printed.
    """
    test_data_frame['pred'] = predictions
    test_data_frame[['id','pred']].to_csv(out_loc, index=False)

    resolved_path = os.path.abspath(out_loc)
    print('Output file created:\n\t- '+resolved_path)


# write the predictions for the test data into 'task-2-output.csv'
out_loc = 'gdrive/My Drive/subtask-2/task-2-output.csv'

# Predicted class per test example = index of the largest logit. The
# predictions are assigned to the rows of `test` by position, so the rows of
# test_logits_all must be in the same order as the rows of `test`.
test_class_preds = torch.argmax(F.softmax(test_logits_all, dim = 1), 1)
write_predictions(test_class_preds.cpu().numpy(), test, out_loc)

Output file created:
	- /content/gdrive/My Drive/subtask-2/task-2-output.csv


## Check final results

In [None]:
def score(truth_loc, prediction_loc):
    """Print and return the accuracy (in percent) of a prediction file.

    Reads 'id'/'label' from `truth_loc` and 'id'/'pred' from
    `prediction_loc`, checks that both files contain the same ids, joins
    them and ignores every row whose label is 0 before comparing label and
    prediction.
    """
    gold = pd.read_csv(truth_loc, usecols=['id','label'])
    guessed = pd.read_csv(prediction_loc, usecols=['id','pred'])

    assert(sorted(gold.id) == sorted(guessed.id)),"ID mismatch between ground truth and prediction!"

    merged = gold.merge(guessed)
    # rows with label 0 do not count towards the score
    scored = merged[merged.label != 0]
    n_correct = np.sum(scored.label == scored.pred)
    accuracy = (n_correct*1.0/len(scored))*100

    print("Accuracy = %.6f" % accuracy)

    return accuracy

# print Accuracy
# Score the prediction file written above against the labels in the test CSV.
truth_loc = 'gdrive/My Drive/subtask-2/test.csv'
prediction_loc = 'gdrive/My Drive/subtask-2/task-2-output.csv'
test_accuracy = score(truth_loc, prediction_loc)

Accuracy = 50.304414


# Logging Statistics

In [None]:
# Short names for the model identifiers used in the log.
B1 = "bert-base-uncased"
A2 = "albert-base-v2"
E = "electra"
XL = "xlnet"
R = "roberta"

# Append the current run to the statistics log.
# The model name is passed explicitly (B1 here) — change it to match the
# model that was actually trained. Accuracies are logged in percent.
training_stats, log_num = add_training_stats(training_stats, 
                                             log_num,
                                             B1,
                                             BATCH_SIZE, 
                                             N_EPOCHS,
                                             "{:.0e}".format(LRATE),
                                             FRATE, 
                                             EPS, 
                                             WU,
                                             WDECAY, 
                                             train_loss, 
                                             train_accuracy*100, 
                                             val_loss, 
                                             val_accuracy*100,
                                             test_accuracy
                                             )

In [None]:
# Display floats with five decimal places.
# The fully qualified option name is used: the short form 'precision' is
# resolved by pattern matching and is not accepted by newer pandas versions.
pd.set_option('display.precision', 5)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'log' as the row index.
df_stats = df_stats.set_index('log')

# Display the table.
df_stats

Unnamed: 0_level_0,Model Name,Batch Size,N_Epochs,lr,fr,eps,wu,wd,Training Loss,Training Accur.,Valid. Loss,Valid. Accur.,Testing Accur.
log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,xlnet,32,4,3e-05,none,1e-08,0.1,0.01,0.4545,83.26956,1.45902,43.38994,48.8965
2,xlnet,32,2,0.008,3e-05,1e-06,0.3,0.01,1.04101,43.85842,0.96483,43.24769,50.95129
3,xlnet,32,2,0.005,2e-05,1e-08,0.1,0.01,1.00768,44.62372,0.96349,44.73906,48.8965
4,albert-base-v2,32,2,0.005,2e-05,1e-08,0.1,0.01,1.01367,44.53869,0.96444,43.24769,50.95129
5,albert-base-v2,32,2,0.005,2e-05,1e-08,0.1,0.01,0.96267,44.72151,0.96402,43.24769,50.95129
6,albert-base-v2,32,3,0.005,2e-05,1e-08,0.1,0.01,0.97209,47.04719,0.96623,43.24769,50.95129
7,albert-base-v2,32,4,3e-05,none,1e-08,0.1,0.01,0.8883,57.72534,0.99317,43.24769,50.95129
8,albert-base-v2,32,2,3e-05,none,1e-08,0.1,0.01,0.96056,48.11437,0.95214,51.54027,50.41857
9,albert-base-v2,32,2,3e-05,none,1e-08,none,0.01,0.96824,44.85332,0.96558,43.24769,50.91324
10,albert-base-v2,32,2,3e-05,none,1e-08,0.3,0.01,0.9561,47.67432,0.96057,47.07726,50.87519


## Write statistics

In [None]:
# Save the statistics table as CSV.
# NOTE(review): 'log' was made the index of df_stats and the file is written
# with index=False, so the log numbers are not stored — confirm this is
# intended.
log_loc = 'gdrive/My Drive/subtask-2/log4.csv'
df_stats.to_csv(log_loc, index=False)

In [None]:
# Undo the most recent log entry: drop the last statistics row and step the
# log counter back by one.
training_stats.pop(-1)
log_num -= 1