# Funniness Estimation Regression System

In [None]:
"""
@author: Ziyang Lin
         zlin19@sheffield.ac.uk
         University of Sheffield, UK
"""

'''
A system for
"Assessing the Funniness of Edited News Headlines (SemEval-2020)" task 2.
'''

import random

import pandas as pd
import numpy as np

import os
import re
import time
import math

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
import torch.utils.data as tud

from google.colab import drive 
drive.mount('/content/gdrive')

import nltk
nltk.download('punkt')
from nltk import word_tokenize


# fix the seeds to get consistent results before every training
# loop in what follows
def fix_seed(seed=1234):
    """Seed every random number generator this notebook relies on.

    The same ``seed`` is handed, in this order, to PyTorch, PyTorch's CUDA
    generator, NumPy's global generator and Python's ``random`` module.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Helper function to print the run time
def run_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: Start of the interval, in seconds.
        end_time: End of the interval, in seconds.

    Returns:
        A ``(minutes, seconds)`` tuple of ints; both parts are truncated
        with ``int()``.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - (60 * whole_minutes))
    return whole_minutes, leftover_seconds

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# One dict per logged training run is appended to this list by add_training_stats.
training_stats = []
# Number of runs logged so far; add_training_stats returns the incremented value.
log_num = 0

def add_training_stats(training_stats, log_num, MODEL_NAME, BATCH_SIZE, N_EPOCHS, LRATE, FRATE, EPS, WU, WDECAY, train_loss, val_loss, test_loss):
    """Append one run record to ``training_stats``.

    Args:
        training_stats: List of run records; it is modified in place.
        log_num: Number of the previously logged run.
        MODEL_NAME, BATCH_SIZE, N_EPOCHS, LRATE, FRATE, EPS, WU, WDECAY:
            Settings of the run, stored as given.
        train_loss, val_loss, test_loss: Losses of the run, stored as given.

    Returns:
        ``(training_stats, log_num + 1)``: the same list object and the
        number assigned to the new record.
    """
    entry_number = log_num + 1

    record = {
        'log': entry_number,
        'Model Name': MODEL_NAME,
        'Batch Size': BATCH_SIZE,
        'N_Epochs': N_EPOCHS,
        'lr': LRATE,
        'fr': FRATE,
        'eps': EPS,
        'wu': WU,
        'wd': WDECAY,
        'Training Loss': train_loss,
        'Valid. Loss': val_loss,
        'Test Loss': test_loss,
    }
    training_stats.append(record)

    return training_stats, entry_number

In [None]:
# Install the `transformers` package into the notebook runtime.
# NOTE(review): the version is not pinned; the recorded output of this cell
# shows transformers-3.0.2 being installed.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 6.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 12.8MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 32.1MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

In [None]:
# Pick the compute device: the first GPU when CUDA is usable, otherwise the CPU.
if not torch.cuda.is_available():
    DEVICE = 'cpu'
else:
    # Only touched on the GPU path: request deterministic cuDNN behaviour.
    torch.backends.cudnn.deterministic = True
    DEVICE = 'cuda:0'

print('Device is', DEVICE)

Device is cuda:0


# Preprocessing Datasets

## Read data from csv files

In [None]:
# Locations of the three subtask-1 CSV files (paths are relative to the working directory).
train_loc = 'gdrive/My Drive/subtask-1/train.csv'
dev_loc = 'gdrive/My Drive/subtask-1/dev.csv'
test_loc = 'gdrive/My Drive/subtask-1/test.csv'
# Load each split into its own data frame; the dev file is kept under the name `valid`.
train = pd.read_csv(train_loc)  
valid = pd.read_csv(dev_loc)
test = pd.read_csv(test_loc)

In [None]:
def processed_data_to_lists(train):
    """Turn a headline data frame into parallel Python lists.

    Every entry of the ``original`` column marks the word to be replaced as
    ``<word/>``. For each row the marker is expanded twice: once with the
    marked word itself (the plain original headline) and once with the
    row's ``edit`` word (the edited headline).

    Args:
        train: Data frame with ``original``, ``edit`` and ``meanGrade``
            columns. Despite the parameter name, any split can be passed.

    Returns:
        ``(o_headls, n_headls, new_word_list, labels_list)``: the plain
        original headlines, the edited headlines, the edit words and the
        ``meanGrade`` values, all in row order.
    """
    # Compiled once for all rows; captures the word inside the `<word/>` marker.
    marker = re.compile(r'\<(.*?)\/\>')

    new_word_list = train.edit.to_list()
    labels_list = train.meanGrade.to_list()

    o_headls = []
    n_headls = []
    for origin_headl, new_word in zip(train.original.to_list(), new_word_list):
        # All marked words of the headline, joined without a separator.
        origin_word = ''.join(marker.findall(origin_headl))
        # A callable replacement is inserted verbatim. A plain string would be
        # treated as a template, so a backslash or group reference inside the
        # data (e.g. "\1") would be interpreted instead of copied.
        o_headls.append(marker.sub(lambda _match: origin_word, origin_headl))
        n_headls.append(marker.sub(lambda _match: new_word, origin_headl))

    return o_headls, n_headls, new_word_list, labels_list


## Get lists of headlines and list of labels

In [None]:
# Expand every split into four parallel lists: original headlines, edited
# headlines, edit words and labels.
train_o_headls, train_n_headls, train_new_word_list, train_labels_list = processed_data_to_lists(train)
valid_o_headls, valid_n_headls, valid_new_word_list, valid_labels_list = processed_data_to_lists(valid)
test_o_headls, test_n_headls, test_new_word_list, test_labels_list = processed_data_to_lists(test)

len(train_n_headls)

9652

In [None]:
# extra data for training (train_funlines.csv), appended to the training split

train_loc_extra = 'gdrive/My Drive/subtask-1/train_funlines.csv'
train_extra = pd.read_csv(train_loc_extra)
train_o_headls_extra, train_n_headls_extra, train_new_word_list_extra, train_labels_list_extra = processed_data_to_lists(train_extra)

# Rebuild the base lists from `train` instead of extending the current
# globals, so that re-running this cell does not append the extra data again.
_base_o_headls, _base_n_headls, _base_new_word_list, _base_labels_list = processed_data_to_lists(train)

# The original headlines are extended too: all four train_* lists must keep
# one entry per example, otherwise pairing original and edited headlines
# breaks.
train_o_headls = _base_o_headls + train_o_headls_extra
train_n_headls = _base_n_headls + train_n_headls_extra
train_new_word_list = _base_new_word_list + train_new_word_list_extra
train_labels_list = _base_labels_list + train_labels_list_extra

len(train_n_headls)

17900

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
# This rebinds the global `tokenizer`; whichever tokenizer cell is executed
# last is the one every later cell uses.
# NOTE(review): presumably it should match the checkpoint chosen under "Load BERT".
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
from transformers import AlbertTokenizer

# Load the ALBERT tokenizer.
# This rebinds the global `tokenizer`; whichever tokenizer cell is executed
# last is the one every later cell uses.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

In [None]:
from transformers import AlbertTokenizer

# Load the ALBERT tokenizer (xxlarge checkpoint).
# This rebinds the global `tokenizer`; whichever tokenizer cell is executed
# last is the one every later cell uses.
tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




In [None]:
from transformers import ElectraTokenizer

# Load the ELECTRA tokenizer.
# This rebinds the global `tokenizer`; whichever tokenizer cell is executed
# last is the one every later cell uses.
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
from transformers import XLNetTokenizer 

# Load the XLNet tokenizer.
# This rebinds the global `tokenizer`; whichever tokenizer cell is executed
# last is the one every later cell uses.
# NOTE(review): do_lower_case=True is combined with the *cased* checkpoint
# 'xlnet-base-cased' — confirm that lower-casing is intended here.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [None]:
# Sanity check of the active tokenizer on the first original training headline:
# show the raw text, its tokens and the corresponding token ids.
print(' Original: ', train_o_headls[0])

print('Tokenized: ', tokenizer.tokenize(train_o_headls[0]))

print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_o_headls[0])))

 Original:  France is ‘ hunting down its citizens who joined Isis ’ without trial in Iraq
Tokenized:  ['▁france', '▁is', '▁', '‘', '▁hunting', '▁down', '▁its', '▁citizens', '▁who', '▁joined', '▁is', 'is', '▁', '’', '▁without', '▁trial', '▁in', '▁iraq']
Token IDs:  [714, 25, 13, 1, 5038, 125, 82, 2888, 72, 670, 25, 403, 13, 1, 366, 2178, 19, 4903]


## Max sequence length for BERT

In [None]:
# Rough length estimate: the largest whitespace-separated word count among the
# edited training headlines (0 if there are none), plus a margin of 4.
# This counts words, not tokenizer sub-word tokens. A tokenizer-based variant
# (tokenizer.encode(headl, add_special_tokens=True) over train_o_headls) used
# to sit here as a disabled string literal and has been removed.
max_one_len = max((len(headl.split()) for headl in train_n_headls), default=0)

print('Max sequence length: ', max_one_len + 4 )


Max sequence length:  31


## Get encoded inputs for BERT

In [None]:
# the version that concatenates original sentences and new sentences
# Each (original, edited) headline pair is encoded as one sequence, padded or
# truncated to 90 tokens. The encoding cells all rebind the same
# *_encoded_inputs names, so the one executed last decides the model input.
train_encoded_inputs = tokenizer(train_o_headls, train_n_headls, padding='max_length', max_length=90, truncation=True, return_tensors="pt")
valid_encoded_inputs = tokenizer(valid_o_headls, valid_n_headls, padding='max_length', max_length=90, truncation=True, return_tensors="pt")
test_encoded_inputs = tokenizer(test_o_headls, test_n_headls, padding='max_length', max_length=90, truncation=True, return_tensors="pt")

train_encoded_inputs

{'input_ids': tensor([[  101,  2605,  2003,  ...,     0,     0,     0],
        [  101, 20864,  4447,  ...,     0,     0,     0],
        [  101, 10399,  7610,  ...,     0,     0,     0],
        ...,
        [  101,  8592,  2240,  ...,     0,     0,     0],
        [  101,  3996,  2610,  ...,     0,     0,     0],
        [  101,  2182,  1005,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# the version that only contains new sentences
# padding=True is used without max_length, so each split is padded separately.
# The encoding cells all rebind the same *_encoded_inputs names, so the one
# executed last decides the model input.
train_encoded_inputs = tokenizer(train_n_headls, padding=True, truncation=True, return_tensors="pt")
valid_encoded_inputs = tokenizer(valid_n_headls, padding=True, truncation=True, return_tensors="pt")
test_encoded_inputs = tokenizer(test_n_headls, padding=True, truncation=True, return_tensors="pt")

train_encoded_inputs

{'input_ids': tensor([[    2,   714,    25,  ...,     0,     0,     0],
        [    2, 21213,  2810,  ...,     0,     0,     0],
        [    2, 10659,  6736,  ...,     0,     0,     0],
        ...,
        [    2,  6581,   293,  ...,     0,     0,     0],
        [    2,  2368,   698,  ...,     0,     0,     0],
        [    2,   235,    13,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# the version that concatenates new sentences and new words
# Each edited headline is paired with its edit word and padded or truncated to
# 38 tokens. The encoding cells all rebind the same *_encoded_inputs names, so
# the one executed last decides the model input.

train_encoded_inputs = tokenizer(train_n_headls, train_new_word_list, padding='max_length', max_length=38, truncation=True, return_tensors="pt")
valid_encoded_inputs = tokenizer(valid_n_headls, valid_new_word_list, padding='max_length', max_length=38, truncation=True, return_tensors="pt")
test_encoded_inputs = tokenizer(test_n_headls, test_new_word_list, padding='max_length', max_length=38, truncation=True, return_tensors="pt")

# best so far: max_length=38
train_encoded_inputs

{'input_ids': tensor([[  101,  2605,  2003,  ...,     0,     0,     0],
        [  101, 20864,  4447,  ...,     0,     0,     0],
        [  101, 10399,  7610,  ...,     0,     0,     0],
        ...,
        [  101, 15432,  6284,  ...,     0,     0,     0],
        [  101,  3533, 20996,  ...,     0,     0,     0],
        [  101,  2096,  7513,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# Pull the three encoder inputs out of each tokenizer result and turn the
# label lists into tensors.
# NOTE(review): this reads 'token_type_ids' unconditionally — confirm that the
# active tokenizer returns that key.
train_input_ids = train_encoded_inputs['input_ids']
train_attention_mask = train_encoded_inputs['attention_mask']
train_token_type_ids = train_encoded_inputs['token_type_ids']
train_labels = torch.tensor(train_labels_list)

valid_input_ids = valid_encoded_inputs['input_ids']
valid_attention_mask = valid_encoded_inputs['attention_mask']
valid_token_type_ids = valid_encoded_inputs['token_type_ids']
valid_labels = torch.tensor(valid_labels_list)

test_input_ids = test_encoded_inputs['input_ids']
test_attention_mask = test_encoded_inputs['attention_mask']
test_token_type_ids = test_encoded_inputs['token_type_ids']
test_labels = torch.tensor(test_labels_list)

train_token_type_ids[0]
tokenizer.decode(train_input_ids.tolist()[1])

'[CLS] pentagon claims 2, 000 % increase in russian trolls after bowling strikes. what does that mean? [SEP] bowling [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## Prepare mini-batches

In [None]:
class BERT_Dataset(tud.Dataset):
    """Map-style dataset over four row-aligned tensors.

    Item ``i`` is ``(x1[i], x2[i], x3[i], y1[i])``. In this notebook the
    tensors are the input ids, attention mask, token type ids and labels.
    All four tensors are moved to ``DEVICE`` once, at construction time.
    """

    def __init__(self, x1, x2, x3, y1):
        """Store the tensors on ``DEVICE``.

        Args:
            x1, x2, x3, y1: Tensors with the same size along dimension 0.

        Raises:
            ValueError: If the tensors differ in their first dimension.
        """
        # Checked before anything is moved: a length mismatch would otherwise
        # only surface later as an IndexError, or pair inputs with the wrong
        # labels without any error.
        sizes = [tensor.shape[0] for tensor in (x1, x2, x3, y1)]
        if len(set(sizes)) != 1:
            raise ValueError(
                'x1, x2, x3 and y1 must have the same first dimension, got %s' % sizes
            )

        self.len = sizes[0]

        self.x1_data = x1.to(DEVICE)
        self.x2_data = x2.to(DEVICE)
        self.x3_data = x3.to(DEVICE)
        self.y1_data = y1.to(DEVICE)

    def __getitem__(self, index):
        """Return the ``index``-th entry of each stored tensor as a 4-tuple."""
        return self.x1_data[index], self.x2_data[index], self.x3_data[index], self.y1_data[index]

    def __len__(self):
        """Return the number of examples."""
        return self.len

In [None]:
fix_seed()
# Batching for BERT
BATCH_SIZE = 16

train_dataset = BERT_Dataset(train_input_ids, train_attention_mask, train_token_type_ids, train_labels)
valid_dataset = BERT_Dataset(valid_input_ids, valid_attention_mask, valid_token_type_ids, valid_labels)
test_dataset = BERT_Dataset(test_input_ids, test_attention_mask, test_token_type_ids, test_labels)

# Only the training batches are shuffled. The evaluation loaders keep the row
# order of their data frames, so predictions collected batch by batch line up
# with the original rows and evaluation does not depend on the RNG state.
train_dataloader = tud.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = tud.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = tud.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


##### demo #####
# Peek at a single training batch to check the tensor shapes.
print(train_dataloader)

# Keep the first batch under demo_* names and stop iterating.
for x1, x2, x3, y1 in train_dataloader:
    demo_x1 = x1
    demo_x2 = x2
    demo_x3 = x3
    demo_y1 = y1
    break
    
# x1..y1 are the loop variables left over from that first batch.
print(x1.shape)
print(x2.shape)
print(x3.shape)
print(y1.shape)
# Number of batches per epoch.
print(len(train_dataloader))

<torch.utils.data.dataloader.DataLoader object at 0x7faf0dcb2f60>
torch.Size([16, 38])
torch.Size([16, 38])
torch.Size([16, 38])
torch.Size([16])
1119


# Training Models

## Define train and evaluate

In [None]:
# define train_BERT and evaluate_BERT for the concatenation version
def train_BERT(model, train_dataloader, valid_dataloader, optimizer, scheduler, criterion, N_EPOCHS):
    """Fine-tune ``model`` and report the losses after every epoch.

    This variant passes ``token_type_ids`` to the model.

    Args:
        model: Model whose first output holds the predictions.
        train_dataloader: Yields ``(input_ids, attention_mask,
            token_type_ids, labels)`` batches for training.
        valid_dataloader: Same batch layout, handed to ``evaluate_BERT``.
        optimizer: Optimizer, stepped once per batch.
        scheduler: Learning-rate scheduler, stepped once per batch.
        criterion: Callable ``criterion(predictions, labels)`` giving the loss.
        N_EPOCHS: Number of passes over ``train_dataloader``.

    Returns:
        ``(train_loss, valid_loss)`` of the last epoch; the training loss is
        the mean of the per-batch losses.
    """
    fix_seed()
    model = model.to(DEVICE)

    # Start of the whole run, used for the total-time report.
    run_started = time.time()

    for epoch in range(N_EPOCHS):
        epoch_started = time.time()

        # Training mode keeps dropout active.
        model.train()

        running_loss = 0
        for batch in train_dataloader:
            # input_ids / attention_mask: [B, T]; labels: [B]
            input_ids_batch, attention_mask_batch, token_type_ids_batch, labels = batch

            optimizer.zero_grad()

            outputs = model(
                input_ids_batch,
                attention_mask=attention_mask_batch,
                token_type_ids=token_type_ids_batch,
            )

            # Drop dimension 1 of the first output so it lines up with the labels.
            predictions = outputs[0].squeeze(1)
            loss = criterion(predictions, labels)

            # Back-propagate, update the weights, then advance the schedule.
            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

        average_epoch_loss = running_loss / len(train_dataloader)

        # The epoch time is taken before validation, so it covers training only.
        epoch_mins, epoch_secs = run_time(epoch_started, time.time())

        average_epoch_valid_loss = evaluate_BERT(model, criterion, valid_dataloader)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {average_epoch_loss:.5f} |')
        print(f'\t Val. Loss: {average_epoch_valid_loss:.5f} |')

    print("")
    print("***Completed***")
    total_mins, total_secs = run_time(run_started, time.time())
    print(f'Total time spent: {total_mins}m {total_secs}s')

    return average_epoch_loss, average_epoch_valid_loss


def evaluate_BERT(model, criterion, dataloader):
    """Compute ``criterion`` over all examples yielded by ``dataloader``.

    This variant passes ``token_type_ids`` to the model.

    Predictions and labels of every batch are gathered first and the
    criterion is applied once to the complete set. Averaging per-batch values
    is only exact for criteria that are plain means over equally sized
    batches; for RMSE the mean of the batch values is not the RMSE of the
    whole set.

    The global random seeds are deliberately left untouched. ``train_BERT``
    calls this function after every epoch, and re-seeding here would make the
    following epochs replay the same random stream. The result does not need
    it: the model runs in eval mode and the aggregate does not depend on the
    batch order.

    Args:
        model: Model whose first output holds the predictions. It is switched
            to eval mode and left there.
        criterion: Callable ``criterion(predictions, labels)`` returning a
            one-element tensor.
        dataloader: Yields ``(input_ids, attention_mask, token_type_ids,
            labels)`` batches.

    Returns:
        The criterion value over all examples, as a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Eval mode switches dropout off.
    model.eval()

    gathered_predictions = []
    gathered_labels = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in dataloader:
            outputs = model(input_ids_batch,
                            attention_mask=attention_mask_batch,
                            token_type_ids=token_type_ids_batch)

            gathered_predictions.append(outputs[0].squeeze(1))
            gathered_labels.append(labels)

        if not gathered_predictions:
            raise ValueError('dataloader yielded no batches')

        loss = criterion(torch.cat(gathered_predictions), torch.cat(gathered_labels))

    return loss.item()

In [None]:
# define train_BERT and evaluate_BERT for only-new-sentences version
def train_BERT(model, train_dataloader, valid_dataloader, optimizer, scheduler, criterion, N_EPOCHS):
    """Fine-tune ``model`` and report the losses after every epoch.

    This variant does not pass ``token_type_ids`` to the model; batches still
    carry them, but only the input ids and the attention mask are used.

    Args:
        model: Model whose first output holds the predictions.
        train_dataloader: Yields ``(input_ids, attention_mask,
            token_type_ids, labels)`` batches for training.
        valid_dataloader: Same batch layout, handed to ``evaluate_BERT``.
        optimizer: Optimizer, stepped once per batch.
        scheduler: Learning-rate scheduler, stepped once per batch.
        criterion: Callable ``criterion(predictions, labels)`` giving the loss.
        N_EPOCHS: Number of passes over ``train_dataloader``.

    Returns:
        ``(train_loss, valid_loss)`` of the last epoch; the training loss is
        the mean of the per-batch losses.
    """
    fix_seed()
    model = model.to(DEVICE)

    # Start of the whole run, used for the total-time report.
    run_started = time.time()

    for epoch in range(N_EPOCHS):
        epoch_started = time.time()

        # Training mode keeps dropout active.
        model.train()

        running_loss = 0
        for batch in train_dataloader:
            # input_ids / attention_mask: [B, T]; labels: [B]
            input_ids_batch, attention_mask_batch, token_type_ids_batch, labels = batch

            optimizer.zero_grad()

            outputs = model(
                input_ids_batch,
                attention_mask=attention_mask_batch,
            )

            # Drop dimension 1 of the first output so it lines up with the labels.
            predictions = outputs[0].squeeze(1)
            loss = criterion(predictions, labels)

            # Back-propagate, update the weights, then advance the schedule.
            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

        average_epoch_loss = running_loss / len(train_dataloader)

        # The epoch time is taken before validation, so it covers training only.
        epoch_mins, epoch_secs = run_time(epoch_started, time.time())

        average_epoch_valid_loss = evaluate_BERT(model, criterion, valid_dataloader)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {average_epoch_loss:.5f} |')
        print(f'\t Val. Loss: {average_epoch_valid_loss:.5f} |')

    print("")
    print("***Completed***")
    total_mins, total_secs = run_time(run_started, time.time())
    print(f'Total time spent: {total_mins}m {total_secs}s')

    return average_epoch_loss, average_epoch_valid_loss


def evaluate_BERT(model, criterion, dataloader):
    """Compute ``criterion`` over all examples yielded by ``dataloader``.

    This variant does not pass ``token_type_ids`` to the model; batches still
    carry them, but only the input ids and the attention mask are used.

    Predictions and labels of every batch are gathered first and the
    criterion is applied once to the complete set. Averaging per-batch values
    is only exact for criteria that are plain means over equally sized
    batches; for RMSE the mean of the batch values is not the RMSE of the
    whole set.

    The global random seeds are deliberately left untouched. ``train_BERT``
    calls this function after every epoch, and re-seeding here would make the
    following epochs replay the same random stream. The result does not need
    it: the model runs in eval mode and the aggregate does not depend on the
    batch order.

    Args:
        model: Model whose first output holds the predictions. It is switched
            to eval mode and left there.
        criterion: Callable ``criterion(predictions, labels)`` returning a
            one-element tensor.
        dataloader: Yields ``(input_ids, attention_mask, token_type_ids,
            labels)`` batches.

    Returns:
        The criterion value over all examples, as a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Eval mode switches dropout off.
    model.eval()

    gathered_predictions = []
    gathered_labels = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in dataloader:
            outputs = model(input_ids_batch,
                            attention_mask=attention_mask_batch)

            gathered_predictions.append(outputs[0].squeeze(1))
            gathered_labels.append(labels)

        if not gathered_predictions:
            raise ValueError('dataloader yielded no batches')

        loss = criterion(torch.cat(gathered_predictions), torch.cat(gathered_labels))

    return loss.item()

## Load BERT

In [None]:
from transformers import AlbertForSequenceClassification, AdamW

# Load the AlbertForSequenceClassification model
# num_labels = 1 requests a single output per example.
# This rebinds the global `model`; whichever model cell is executed last is
# the one that gets trained.
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2",
                                                        num_labels = 1,   
                                                        output_attentions = False,
                                                        output_hidden_states = False)

In [None]:
from transformers import XLNetForSequenceClassification, AdamW

# Load the XLNetForSequenceClassification model
# num_labels = 1 requests a single output per example.
# This rebinds the global `model`; whichever model cell is executed last is
# the one that gets trained.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',
                                                    num_labels = 1,   
                                                    output_attentions = False,
                                                    output_hidden_states = False)

In [None]:
from transformers import ElectraForSequenceClassification, AdamW

# Load the ElectraForSequenceClassification model
# num_labels = 1 requests a single output per example.
# This rebinds the global `model`; whichever model cell is executed last is
# the one that gets trained.
model = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator",
                                                        num_labels = 1,   
                                                        output_attentions = False,
                                                        output_hidden_states = False)

In [None]:
from transformers import BertForSequenceClassification, AdamW

# Load the BertForSequenceClassification model
# num_labels = 1 requests a single output per example.
# This rebinds the global `model`; whichever model cell is executed last is
# the one that gets trained.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels = 1,   
                                                        output_attentions = False,
                                                        output_hidden_states = False)

## Hyperparameters

In [None]:
# Hyperparameters for BERT:

# Number of training epochs. The BERT authors recommend between 2 and 4.
N_EPOCHS = 1

LRATE = 8e-3     # learning rate of the parameters outside the pretrained encoder
FRATE = 3e-5     # learning rate of the pretrained encoder (passed as the optimizer default)
EPS = 1e-8       # Adam epsilon
WU = 0.2         # fraction of the scheduler steps used for warm-up
WDECAY = 0.005   # weight decay applied to both parameter groups

# best so far: N_EPOCHS = 2, LRATE = 8e-3, FRATE = 3e-5 EPS = 1e-8, WU = 0.3, WDECAY = 0.01

# Number of scheduler steps: [number of batches] x [number of epochs] x 2.
# NOTE(review): the extra factor 2 means the schedule is twice as long as the
# training run; it looks like the "Stp*2" experiment named in the logging
# cell — confirm it is intended.
TOTSTEPS = len(train_dataloader) * N_EPOCHS * 2
WUSTEPS = int(TOTSTEPS * WU)

# Alternative grouping, currently unused: weight decay for everything except
# bias and layer normalization terms.
no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': WDECAY},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]

# Split the parameters into the pretrained encoder and everything else.
# The encoder is recognised by the model's own `base_model_prefix` ("bert",
# "albert", "electra", "transformer", ...). A plain `"bert" in name` test
# matches nothing in ELECTRA or XLNet, which would train their whole encoder
# at the large LRATE.
encoder_prefix = model.base_model_prefix + '.'
optimizer_grouped_parameters = [
    # Everything outside the encoder: explicit learning rate LRATE.
    {'params': [p for n, p in model.named_parameters() if not n.startswith(encoder_prefix)], 'lr': LRATE, 'weight_decay': WDECAY},
    # Pretrained encoder: no 'lr' key, so the optimizer's default rate applies.
    {'params': [p for n, p in model.named_parameters() if n.startswith(encoder_prefix)], 'weight_decay': WDECAY}
]

## Optimizer & Learning Rate Scheduler

In [None]:
# Create the optimizer, 
# the epsilon parameter is a very small number to prevent any division by zero
# lr=FRATE is the default learning rate; a parameter group that carries its
# own 'lr' entry overrides it.
optimizer = AdamW(optimizer_grouped_parameters, lr=FRATE, eps = EPS)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Create the learning rate scheduler.
# WUSTEPS warm-up steps out of TOTSTEPS scheduler steps; the training loop
# advances the scheduler once per batch.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = WUSTEPS,
                                            num_training_steps = TOTSTEPS)

## Define RMSE

In [None]:
# define rmse
def rmse(predictions, labels):
    """Return the root mean squared error between two tensors.

    Args:
        predictions: Tensor of predicted values.
        labels: Tensor of target values, combined element-wise with
            ``predictions``.

    Returns:
        A zero-dimensional tensor: the square root of the mean squared
        difference.
    """
    residuals = predictions - labels
    mean_squared_error = (residuals ** 2).mean()
    return torch.sqrt(mean_squared_error)

## Start training

In [None]:
# Train with RMSE as the loss; the returned values are the training and
# validation losses of the last epoch.
criterion = rmse

train_loss, val_loss = train_BERT(model,
                                  train_dataloader,
                                  valid_dataloader,
                                  optimizer,
                                  scheduler,
                                  criterion,
                                  N_EPOCHS)

Epoch: 01 | Epoch Time: 4m 50s
	Train Loss: 0.58582 |
	 Val. Loss: 0.51137 |

***Completed***
Total time spent: 5m 1s


# Testing Models

## Start testing

In [None]:
# Evaluate on the whole test split in a single forward pass.
fix_seed()
model.eval()

# Move the complete test tensors to the compute device.
test_input_ids = test_input_ids.to(DEVICE)
test_attention_mask = test_attention_mask.to(DEVICE)
test_token_type_ids = test_token_type_ids.to(DEVICE)
test_labels = test_labels.to(DEVICE)

# test_predictions keeps the row order of the test tensors.
# NOTE(review): token_type_ids are passed here — confirm that the active
# train_BERT variant passed them during training as well.
with torch.no_grad():
  test_predictions = model(test_input_ids,
                           attention_mask=test_attention_mask,
                           token_type_ids=test_token_type_ids)[0].squeeze(1)
  # RMSE over all test examples at once.
  test_loss = torch.sqrt(((test_predictions - test_labels)**2).mean()).item()

print(f'| Test Loss: {test_loss:.5f} |')

| Test Loss: 0.52763 |


In [None]:
# Batched evaluation on the test split.
fix_seed()

logits_per_batch = []
labels_per_batch = []

# Turn on evaluate mode. This de-activates dropout. 
model.eval()

# We do not compute gradients within this block, i.e. no training
with torch.no_grad():

    for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in test_dataloader:

        # get the output
        outputs = model(input_ids_batch,
                        attention_mask=attention_mask_batch,
                        token_type_ids=token_type_ids_batch)

        logits_per_batch.append(outputs[0].squeeze(1))
        labels_per_batch.append(labels)

    # All predictions and labels, in the iteration order of test_dataloader.
    # They only line up with the rows of `test` if that loader does not shuffle.
    test_logits_all = torch.cat(logits_per_batch, 0)
    test_labels_all = torch.cat(labels_per_batch, 0)

    # RMSE over the whole test set. The mean of per-batch RMSE values is not
    # the dataset RMSE and would disagree with the full-batch evaluation.
    average_test_loss = rmse(test_logits_all, test_labels_all).item()

# Keep test_loss meaningful (it used to end up holding a sum of batch losses).
test_loss = average_test_loss

print(f'Test Loss: {average_test_loss:.5f}')

Test Loss: 0.51948


## Write results

In [None]:
def write_predictions(predictions, test_data_frame, out_loc):
    """Write an ``id,pred`` CSV file pairing each row id with its prediction.

    The caller's data frame is not modified: the ``pred`` column is added to
    a copy of the ``id`` column only.

    Args:
        predictions: One prediction per row of ``test_data_frame``, in row
            order (anything pandas accepts as column values).
        test_data_frame: Data frame with an ``id`` column.
        out_loc: Path of the CSV file to create.
    """
    output = test_data_frame[['id']].assign(pred=predictions)
    output.to_csv(out_loc, index=False)

    print('Output file created:\n\t- '+os.path.abspath(out_loc))


# write the predictions for the test data into 'task-1-output.csv'
# (test_predictions is moved to the CPU first)
out_loc = 'gdrive/My Drive/subtask-1/task-1-output.csv'
write_predictions(test_predictions.cpu(), test, out_loc)

Output file created:
	- /content/gdrive/My Drive/subtask-1/task-1-output.csv


## Check final results

In [None]:
def score(truth_loc, prediction_loc):
    """Compute and print the RMSE between a truth file and a prediction file.

    Args:
        truth_loc: CSV file with ``id`` and ``meanGrade`` columns.
        prediction_loc: CSV file with ``id`` and ``pred`` columns.

    Returns:
        The RMSE over the rows obtained by merging both files on their
        shared column.

    Raises:
        AssertionError: If the two files do not contain the same ids.
    """
    gold = pd.read_csv(truth_loc, usecols=['id','meanGrade'])
    predicted = pd.read_csv(prediction_loc, usecols=['id','pred'])

    ids_match = sorted(gold.id) == sorted(predicted.id)
    assert ids_match, "ID mismatch between ground truth and prediction!"

    # Only `id` is common to both frames, so the merge pairs rows by id.
    merged = gold.merge(predicted)
    squared_errors = (merged['meanGrade'] - merged['pred'])**2
    rmse = np.sqrt(np.mean(squared_errors))

    print("RMSE = %.6f" % rmse)

    return rmse

# print RMSE
# Scores the written prediction file against the test CSV and rebinds
# test_loss to the result, which is the value logged further down.
truth_loc = 'gdrive/My Drive/subtask-1/test.csv'
prediction_loc = 'gdrive/My Drive/subtask-1/task-1-output.csv'
test_loss = score(truth_loc, prediction_loc)

RMSE = 0.527628


# Logging Statistics

In [None]:
# Labels available for the 'Model Name' column of the log.
B1 = "bert-base-uncased"
A2 = "albert-base-v2"
A2XX = "albert-xxlarge-v2"
E = "electra"
XL = "xlnet"
B1M = "bert-base-uncased more_data"
B1MS = "BertBaseUncasedMDataStp*2"

# Log the current run. The label is chosen by hand (B1MS here) and the three
# rates are stored as strings in scientific notation.
# NOTE(review): every execution of this cell appends another record.
training_stats, log_num = add_training_stats(training_stats, 
                                             log_num,
                                             B1MS,
                                             BATCH_SIZE, 
                                             N_EPOCHS,
                                             "{:.0e}".format(LRATE),
                                             "{:.0e}".format(FRATE), 
                                             "{:.0e}".format(EPS), 
                                             WU,
                                             WDECAY, 
                                             train_loss,
                                             val_loss,
                                             test_loss
                                             )

In [None]:
# Display floats with five decimal places.
# The full option name is used: the bare 'precision' pattern can match more
# than one option in newer pandas releases, which makes set_option fail.
pd.set_option('display.precision', 5)

# Create a DataFrame from our training statistics (one row per logged run).
df_stats = pd.DataFrame(data=training_stats)

# Use the 'log' as the row index.
df_stats = df_stats.set_index('log')

# Display the table.
df_stats

Unnamed: 0_level_0,Model Name,Batch Size,N_Epochs,lr,fr,eps,wu,wd,Training Loss,Valid. Loss,Test Loss
log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,bert-base-uncased,16,1,0.008,3e-05,1e-08,0.3,0.01,0.59192,0.54155,0.54946
2,bert-base-uncased,16,2,0.008,3e-05,1e-08,0.3,0.01,0.51444,0.53222,0.54059
3,bert-base-uncased,16,2,0.008,3e-05,1e-08,0.3,0.01,0.52616,0.53481,0.54498
4,bert-base-uncased,16,2,0.008,3e-05,1e-08,0.3,0.01,0.52624,0.53648,0.54421
5,bert-base-uncased,16,2,0.008,3e-05,1e-08,0.3,0.01,0.52289,0.53178,0.54142
6,bert-base-uncased,8,2,0.008,3e-05,1e-08,0.3,0.01,0.50253,0.52634,0.54085
7,bert-base-uncased,16,2,0.008,3e-05,1e-08,0.3,0.01,0.51758,0.52993,0.53935
8,bert-base-uncased,32,2,0.009,3e-05,1e-08,0.3,0.01,0.53778,0.53758,0.54469
9,bert-base-uncased,16,2,0.009,3e-05,1e-08,0.3,0.01,0.49362,0.52963,0.53184
10,bert-base-uncased,16,2,0.009,3e-05,1e-08,0.3,0.01,0.49701,0.53673,0.53775


## Write statistics

In [None]:
# Save the statistics table as CSV.
log_loc = 'gdrive/My Drive/subtask-1/log_2.csv'
# NOTE(review): df_stats is indexed by 'log' and index=False leaves the index
# out, so the log numbers are not written to the file — confirm this is intended.
df_stats.to_csv(log_loc, index=False)

In [None]:
# Undo the most recent logging step: drop the last record and step the counter back.
# NOTE(review): not idempotent — every execution removes one more record, and
# it raises IndexError once training_stats is empty.
training_stats.pop(-1)
log_num -= 1