# Funniness Estimation System  v4.5

In [1]:
"""
@author: Ziyang Lin
         zlin19@sheffield.ac.uk
         University of Sheffield, UK
"""

'''
A system for
"Assessing the Funniness of Edited News Headlines (SemEval-2020)" task 2.
'''

import random

import pandas as pd
import numpy as np

import os
import re
import time
import math

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
import torch.utils.data as tud

from google.colab import drive 
drive.mount('/content/gdrive')

import nltk
nltk.download('punkt')
from nltk import word_tokenize


# fix the seeds to get consistent results before every training
# loop in what follows
def fix_seed(seed=1234):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


# Helper function to print the run time
def run_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
log_num_r = 0
real_task_stats = []

def add_real_task_stats(real_task_stats, log_num_r, MODEL_NAME, BATCH_SIZE, N_EPOCHS, LRATE, FRATE, EPS, WU, WDECAY, train_loss, train_accuracy, val_loss, val_accuracy, test_accuracy):
    log_num_r += 1

    real_task_stats.append(
        {
            'log': log_num_r,
            'Model Name': MODEL_NAME,
            'Batch Size': BATCH_SIZE,
            'N_Epochs': N_EPOCHS,
            'lr': LRATE,
            'fr': FRATE,
            'eps': EPS,
            'wu': WU,
            'wd': WDECAY,
            'Training Loss': train_loss,
            'Training Accur.': train_accuracy,
            'Valid. Loss': val_loss,
            'Valid. Accur.': val_accuracy,
            'Testing Accur.': test_accuracy
        }
    )
    
    return real_task_stats, log_num_r

In [4]:
log_num_f = 0
fake_task_stats = []

def add_fake_task_stats(fake_task_stats, log_num_f, MODEL_NAME, BATCH_SIZE, N_EPOCHS, LRATE, FRATE, EPS, WU, WDECAY, train_loss, val_loss):
    log_num_f += 1

    fake_task_stats.append(
        {
            'log': log_num_f,
            'Model Name': MODEL_NAME,
            'Batch Size': BATCH_SIZE,
            'N_Epochs': N_EPOCHS,
            'lr': LRATE,
            'fr': FRATE,
            'eps': EPS,
            'wu': WU,
            'wd': WDECAY,
            'Training Loss': train_loss,
            'Valid. Loss': val_loss
        }
    )
    
    return fake_task_stats, log_num_f

In [5]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 8.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 23.7MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 45.2MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K 

In [6]:
# do computation on a GPU if possible 
if torch.cuda.is_available():
  torch.backends.cudnn.deterministic = True
  DEVICE='cuda:0'
else:
  DEVICE='cpu'

print('Device is', DEVICE)

Device is cuda:0


# Preprocessing Datasets

## Read data from csv files

In [7]:
train_loc = 'gdrive/My Drive/subtask-2/train.csv'
dev_loc = 'gdrive/My Drive/subtask-2/dev.csv'
test_loc = 'gdrive/My Drive/subtask-2/test.csv'
train = pd.read_csv(train_loc)  
valid = pd.read_csv(dev_loc)
test = pd.read_csv(test_loc)

In [8]:
def get_edited_headlines_list(headls_words):
    # list of new edited headlines
    headls_list = []
    
    for origin_headl, new_word in headls_words:
      # pattern
      p = re.compile(r'\<(.*?)\/\>')
      # get the new edited headline
      new_headl = p.sub(new_word, origin_headl)
      # add it to the list
      headls_list.append(new_headl)

    return headls_list


def processed_data_to_lists(train):
    headls_words_1 = [(origin_headl_1, new_word_1) for (origin_headl_1, new_word_1) in zip(train.original1.to_list(), train.edit1.to_list())]
    headls_words_2 = [(origin_headl_2, new_word_2) for (origin_headl_2, new_word_2) in zip(train.original2.to_list(), train.edit2.to_list())]
    
    labels_list = train.label.to_list()

    meanGrade1_list = train.meanGrade1.to_list()
    meanGrade2_list = train.meanGrade2.to_list()
    meanGrade_list = meanGrade1_list + meanGrade2_list

    meanGrade1_list = train.meanGrade1.to_list()
    meanGrade2_list = train.meanGrade2.to_list()
    meanGrade_list = meanGrade1_list + meanGrade2_list

    new_word1_list = train.edit1.to_list()
    new_word2_list = train.edit2.to_list()
    new_word_list = new_word1_list + new_word2_list
    
    headls_1 = get_edited_headlines_list(headls_words_1)
    headls_2 = get_edited_headlines_list(headls_words_2)

    return headls_1, headls_2, labels_list, meanGrade_list, new_word_list



## Get lists of headlines and list of labels

In [9]:
train_headls_1, train_headls_2, train_labels_list, train_meanGrade_list, train_new_word_list = processed_data_to_lists(train)
valid_headls_1, valid_headls_2, valid_labels_list, valid_meanGrade_list, valid_new_word_list = processed_data_to_lists(valid)
test_headls_1, test_headls_2, test_labels_list, test_meanGrade_list, test_new_word_list = processed_data_to_lists(test)

In [10]:
train_headls = train_headls_1 + train_headls_2
valid_headls = valid_headls_1 + valid_headls_2
test_headls = test_headls_1 + test_headls_2

In [248]:
# extra data for training

train_loc_extra = 'gdrive/My Drive/subtask-2/train_funlines.csv'
train_extra = pd.read_csv(train_loc_extra)
train_headls_1_extra, train_headls_2_extra, train_labels_list_extra, train_meanGrade_list_extra, train_new_word_list_extra = processed_data_to_lists(train_extra)

train_headls_1 = train_headls_1 + train_headls_1_extra
train_headls_2 = train_headls_2 + train_headls_2_extra
train_meanGrade_list = train_meanGrade_list + train_meanGrade_list_extra
train_labels_list = train_labels_list + train_labels_list_extra
train_new_word_list = train_new_word_list + train_new_word_list_extra

train_headls = train_headls + train_headls_1_extra + train_headls_2_extra

len(train_headls_1)

15255

In [11]:
from transformers import AlbertTokenizer

# Load the ALBERT tokenizer.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




In [12]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [245]:
print(' Original: ', train_headls_1[0])

print('Tokenized: ', tokenizer.tokenize(train_headls_1[0]))

print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_headls_1[0])))

 Original:  " Gene Cernan , Last Dancer on the Moon , Dies at 82 "
Tokenized:  ['"', 'gene', 'ce', '##rna', '##n', ',', 'last', 'dancer', 'on', 'the', 'moon', ',', 'dies', 'at', '82', '"']
Token IDs:  [1000, 4962, 8292, 12789, 2078, 1010, 2197, 8033, 2006, 1996, 4231, 1010, 8289, 2012, 6445, 1000]


## Max sequence length for pre-trained LMs

In [249]:
max_one_len = 0

"""for headl in train_headls:
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(headl, add_special_tokens=True)
    # Update the maximum sentence length.
    max_one_len = max(max_one_len, len(input_ids))

print('Max sequence length for two sentences: ', (max_one_len-1)*2)
print('Max sequence length for one sentence: ', max_one_len)"""


for headl in train_headls:
    headl = headl.split()
    max_one_len = max(len(headl), max_one_len)

print('Max sequence length for two headlines: ', max_one_len*2 + 3 )
print('Max sequence length for new \'headlines + new words\': ', max_one_len + 4 )


Max sequence length for two headlines:  57
Max sequence length for new 'headlines + new words':  31


## Get encoded inputs for pre-trained LMs

In [250]:
# prepare encoded inputs for real task
train_encoded_inputs = tokenizer(train_headls_1, train_headls_2, padding='max_length', max_length=57, truncation=True, return_tensors="pt")
valid_encoded_inputs = tokenizer(valid_headls_1, valid_headls_2, padding='max_length', max_length=57, truncation=True, return_tensors="pt")
test_encoded_inputs = tokenizer(test_headls_1, test_headls_2, padding='max_length', max_length=57, truncation=True, return_tensors="pt")

# prepare encoded inputs for fake task
pre_train_encoded = tokenizer(train_headls, train_new_word_list, padding='max_length', max_length=31, truncation=True, return_tensors="pt")
pre_valid_encoded = tokenizer(valid_headls, valid_new_word_list, padding='max_length', max_length=31, truncation=True, return_tensors="pt")

In [251]:
# get input_ids, attention_mask, token_type_ids and labels for real task
train_input_ids = train_encoded_inputs['input_ids']
train_attention_mask = train_encoded_inputs['attention_mask']
train_token_type_ids = train_encoded_inputs['token_type_ids']
train_labels = torch.tensor(train_labels_list)

valid_input_ids = valid_encoded_inputs['input_ids']
valid_attention_mask = valid_encoded_inputs['attention_mask']
valid_token_type_ids = valid_encoded_inputs['token_type_ids']
valid_labels = torch.tensor(valid_labels_list)

test_input_ids = test_encoded_inputs['input_ids']
test_attention_mask = test_encoded_inputs['attention_mask']
test_token_type_ids = test_encoded_inputs['token_type_ids']
test_labels = torch.tensor(test_labels_list)


# get input_ids, attention_mask and labels for fake task
pre_train_input_ids = pre_train_encoded['input_ids']
pre_train_attention_mask = pre_train_encoded['attention_mask']
pre_train_token_type_ids = pre_train_encoded['token_type_ids']
pre_train_labels = torch.tensor(train_meanGrade_list)

pre_valid_input_ids = pre_valid_encoded['input_ids']
pre_valid_attention_mask = pre_valid_encoded['attention_mask']
pre_valid_token_type_ids = pre_valid_encoded['token_type_ids']
pre_valid_labels = torch.tensor(valid_meanGrade_list)

pre_train_input_ids[0]
pre_train_labels

tensor([1.2000, 0.6000, 0.6000,  ..., 1.4000, 1.4000, 2.2000])

## Prepare mini-batches

In [252]:
class BERT_Dataset(tud.Dataset):
    def __init__(self, x1, x2, x3, y1):
        self.len = x1.shape[0]

        self.x1_data = x1.to(DEVICE)
        self.x2_data = x2.to(DEVICE)
        self.x3_data = x3.to(DEVICE)
        self.y1_data = y1.to(DEVICE)


    def __getitem__(self, index):
        return self.x1_data[index], self.x2_data[index], self.x3_data[index], self.y1_data[index]


    def __len__(self):
        return self.len


class Pre_BERT_Dataset(tud.Dataset):
    def __init__(self, x1, x2, y1):
        self.len = x1.shape[0]

        self.x1_data = x1.to(DEVICE)
        self.x2_data = x2.to(DEVICE)
        self.y1_data = y1.to(DEVICE)


    def __getitem__(self, index):
        return self.x1_data[index], self.x2_data[index], self.y1_data[index]


    def __len__(self):
        return self.len

In [253]:
fix_seed()
# Batching for BERT
BATCH_SIZE = 16

# For real task
train_dataset = BERT_Dataset(train_input_ids, train_attention_mask, train_token_type_ids, train_labels)
valid_dataset = BERT_Dataset(valid_input_ids, valid_attention_mask, valid_token_type_ids, valid_labels)
test_dataset = BERT_Dataset(test_input_ids, test_attention_mask, test_token_type_ids, test_labels)

train_dataloader = tud.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = tud.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = tud.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

# For fake task
pre_train_dataset = BERT_Dataset(pre_train_input_ids, pre_train_attention_mask, pre_train_token_type_ids, pre_train_labels)
pre_valid_dataset = BERT_Dataset(pre_valid_input_ids, pre_valid_attention_mask, pre_valid_token_type_ids, pre_valid_labels)

pre_train_dataloader = tud.DataLoader(pre_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
pre_valid_dataloader = tud.DataLoader(pre_valid_dataset, batch_size=BATCH_SIZE, shuffle=True)


##### demo #####
print(train_dataloader)

for x1, x2, x3, y1 in train_dataloader:
    demo_x1 = x1
    demo_x2 = x2
    demo_x3 = x3
    demo_y1 = y1
    break
    
print(x1.shape)
print(x2.shape)
print(x3.shape)
print(y1.shape)
print(len(train_dataloader))

<torch.utils.data.dataloader.DataLoader object at 0x7f1e3e147be0>
torch.Size([16, 57])
torch.Size([16, 57])
torch.Size([16, 57])
torch.Size([16])
954


# Training Preparation 

## Define accuracy

In [254]:
def accuracy(preds, y):
    """
    returns accuracy per batch
    """

    class_preds =  torch.argmax(F.softmax(preds, dim = 1), 1)
    correct = (class_preds == y).float() # convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

## Define train and evaluate

In [255]:
# define train_BERT and evaluate_BERT for real task
def train_BERT(model, train_dataloader, valid_dataloader, optimizer, scheduler, criterion, N_EPOCHS):
    fix_seed()
    model = model.to(DEVICE)
    model_list = []
    # Measure the total time for the whole run.
    t0 = time.time()

    for epoch in range(N_EPOCHS):
    
        start_time = time.time()
        # To ensure the dropout is "turned on" while training
        model.train()
        
        epoch_loss = 0
        epoch_acc = 0
    
        for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in train_dataloader:
                        
            # Zero the gradients
            optimizer.zero_grad()
            # shape(input_ids_batch) = [B, T]
            # shape(attention_mask_batch) = [B, T]
            # shape(labels) = [B]

            # get the output
            predictions = model(input_ids_batch,
                                attention_mask_batch,
                                token_type_ids_batch)
            
            # calculate the loss
            loss = criterion(predictions.view(-1, 3), labels.view(-1))

            # calculate training accuracy
            acc = accuracy(predictions, labels)
                      
            # calculate the gradient of each parameter
            loss.backward()
        
            # update the parameters using the gradients and optimizer algorithm 
            optimizer.step()

            # update the learning rate
            scheduler.step()
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
        average_epoch_loss = epoch_loss / len(train_dataloader)
        average_epoch_acc = epoch_acc / len(train_dataloader)
        
        end_time = time.time()
               
        epoch_mins, epoch_secs = run_time(start_time, end_time)
    
        average_epoch_valid_loss, average_epoch_valid_acc = evaluate_BERT(model, criterion, valid_dataloader)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {average_epoch_loss:.5f} | Train Acc: {average_epoch_acc*100:.4f}%')
        print(f'\t Val. Loss: {average_epoch_valid_loss:.5f} |  Val. Acc: {average_epoch_valid_acc*100:.4f}%')

        model_list.append(model)

    print("")
    print("***Completed***")
    total_mins, total_secs = run_time(t0, time.time())
    print(f'Total time spent: {total_mins}m {total_secs}s')

    return average_epoch_loss, average_epoch_acc, average_epoch_valid_loss, average_epoch_valid_acc, model_list

def evaluate_BERT(model, criterion, dataloader):
    fix_seed()
    epoch_loss = 0
    epoch_acc = 0
    # Turn on evaluate mode. This de-activates dropout. 
    model.eval()

    # We do not compute gradients within this block, i.e. no training
    with torch.no_grad():

        for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in dataloader:
            
            # get the output
            predictions = model(input_ids_batch,
                            attention_mask_batch,
                            token_type_ids_batch)

            loss = criterion(predictions.view(-1, 3), labels.view(-1))
            acc = accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

In [256]:
# define train_BERT and evaluate_BERT for fake regression task
def pre_train_BERT(model, train_dataloader, valid_dataloader, optimizer, scheduler, criterion, N_EPOCHS):
    fix_seed()
    model = model.to(DEVICE)
    model_list = []
    t0 = time.time()

    for epoch in range(N_EPOCHS):    
        start_time = time.time()
        model.train()       
        epoch_loss = 0
        epoch_acc = 0
    
        for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in train_dataloader:                       
            optimizer.zero_grad()

            predictions = model(input_ids_batch,
                                attention_mask_batch,
                                token_type_ids_batch)
            
            loss = criterion(predictions.view(-1), labels.view(-1))                      
            loss.backward()        
            optimizer.step()
            scheduler.step()          
            epoch_loss += loss.item()
            
        average_epoch_loss = epoch_loss / len(train_dataloader)
        
        end_time = time.time()               
        epoch_mins, epoch_secs = run_time(start_time, end_time)
    
        average_epoch_valid_loss = pre_evaluate_BERT(model, criterion, valid_dataloader)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {average_epoch_loss:.5f} ')
        print(f'\t Val. Loss: {average_epoch_valid_loss:.5f} ')
        
        model_list.append(model)

    print("")
    print("***Completed***")
    total_mins, total_secs = run_time(t0, time.time())
    print(f'Total time spent: {total_mins}m {total_secs}s')

    return average_epoch_loss, average_epoch_valid_loss, model_list

def pre_evaluate_BERT(model, criterion, dataloader):
    fix_seed()
    epoch_loss = 0
    model.eval()

    with torch.no_grad():

        for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in dataloader:
            
            predictions = model(input_ids_batch,
                                attention_mask_batch,
                                token_type_ids_batch)

            loss = criterion(predictions.view(-1), labels.view(-1))
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

## Define models

In [214]:
from transformers import AlbertModel, BertModel, ElectraModel, AdamW

class AlbertModel_FakeTask(nn.Module):
    def __init__(self):
        super(AlbertModel_FakeTask, self).__init__()
        self.albert = AlbertModel.from_pretrained('albert-base-v2')
        self.linear = nn.Linear(768, 1)

    def forward(self, source, mask):
        output = self.albert(source, attention_mask=mask)
        # take the last hidden state
        hidden = output[0]
        # take the representations for CLS
        all_cls = hidden[:, 0, :]
        # pass to linear layer to get the score for each sentence(cls here)
        pred = self.linear(all_cls)

        return pred


class AlbertModel_Real(nn.Module):
    def __init__(self, albert_model):
        super(AlbertModel_Real, self).__init__()
        self.albert = albert_model
        self.classifier = nn.Linear(768, 3)

    def forward(self, source, mask, type_ids):
        output = self.albert(source, attention_mask=mask, token_type_ids=type_ids)
        # take the last hidden state
        hidden = output[0]
        # take the representations for CLS
        all_cls = hidden[:, 0, :]
        # pass to linear layer to get the prediction for every sentence pairs(cls here)
        pred = self.classifier(all_cls)

        return pred


class BertModel_FakeTask(nn.Module):
    def __init__(self):
        super(BertModel_FakeTask, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(768, 1)

    def forward(self, source, mask, type_ids):
        output = self.bert(source, attention_mask=mask, token_type_ids=type_ids)
        # take the last hidden state
        hidden = output[0]
        # take the representations for CLS
        all_cls = hidden[:, 0, :]
        # pass to linear layer to get the score for each sentence(cls here)
        pred = self.linear(all_cls)

        return pred


class BertModel_Real(nn.Module):
    def __init__(self, bert_model, dropout_prob):
        super(BertModel_Real, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(768, 3)

    def forward(self, source, mask, type_ids):
        output = self.bert(source, attention_mask=mask, token_type_ids=type_ids)
        # take the last hidden state
        hidden = output[0]
        # take the representations for CLS
        all_cls = hidden[:, 0, :]

        all_cls = self.dropout(all_cls)

        # pass to linear layer to get the prediction for every sentence pairs(cls here)
        pred = self.classifier(all_cls)

        return pred        

# Fake Task

In [96]:
fix_seed()
# Create the pre_model for the fake task
pre_model_al = AlbertModel_FakeTask()

pre_model_ber = BertModel_FakeTask()


param_names_list = [n for n, p in pre_model_ber.named_parameters()]
param_names_list

['bert.embeddings.word_embeddings.weight',
 'bert.embeddings.position_embeddings.weight',
 'bert.embeddings.token_type_embeddings.weight',
 'bert.embeddings.LayerNorm.weight',
 'bert.embeddings.LayerNorm.bias',
 'bert.encoder.layer.0.attention.self.query.weight',
 'bert.encoder.layer.0.attention.self.query.bias',
 'bert.encoder.layer.0.attention.self.key.weight',
 'bert.encoder.layer.0.attention.self.key.bias',
 'bert.encoder.layer.0.attention.self.value.weight',
 'bert.encoder.layer.0.attention.self.value.bias',
 'bert.encoder.layer.0.attention.output.dense.weight',
 'bert.encoder.layer.0.attention.output.dense.bias',
 'bert.encoder.layer.0.attention.output.LayerNorm.weight',
 'bert.encoder.layer.0.attention.output.LayerNorm.bias',
 'bert.encoder.layer.0.intermediate.dense.weight',
 'bert.encoder.layer.0.intermediate.dense.bias',
 'bert.encoder.layer.0.output.dense.weight',
 'bert.encoder.layer.0.output.dense.bias',
 'bert.encoder.layer.0.output.LayerNorm.weight',
 'bert.encoder.layer

## Hyperparameters

In [101]:
# Hyperparameters for the fake task LM:

# Number of training epochs. The BERT authors recommend between 2 and 4.
N_EPOCHS = 1

LRATE = 5e-3
FRATE = 2e-5
EPS = 1e-8
WU = 0.1
WDECAY = 0.01

# Total number of training steps is [number of batches] x [number of epochs]. 
TOTSTEPS = len(train_dataloader) * N_EPOCHS
WUSTEPS = int(TOTSTEPS * WU)

# Apply weight decay to all parameters other than bias and layer normalization terms
# Optimize the parameters of the head layer by the learning rate
# Optimize the parameters of the pretrain LM by the fine-tuning rate
no_decay = ['bias', 'LayerNorm.weight']
named_parameters = pre_model_ber.named_parameters()
optimizer_grouped_parameters = [
    {'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay)], 'weight_decay': WDECAY},
    {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    {'params': [p for n, p in named_parameters if "albert" not in n], 'lr': LRATE}
]
#    {'params': [p for n, p in named_parameters if "albert" in n], 'lr': FRATE},

## Optimizer & learning rate scheduler

In [102]:
# Create the optimizer, 
# the epsilon parameter is a very small number to prevent any division by zero
optimizer = AdamW(optimizer_grouped_parameters, lr=FRATE, eps = EPS)

In [103]:
from transformers import get_linear_schedule_with_warmup

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = WUSTEPS,
                                            num_training_steps = TOTSTEPS)

## Define RMSE

In [104]:
# define rmse
def rmse(predictions, labels):
    loss = torch.sqrt(((predictions - labels)**2).mean())

    return loss

## Fake task training

In [105]:
criterion_r = rmse

train_loss, val_loss, model_list = pre_train_BERT(pre_model_ber,
                                                  pre_train_dataloader,
                                                  pre_valid_dataloader,
                                                  optimizer,
                                                  scheduler,
                                                  criterion_r,
                                                  N_EPOCHS)

Epoch: 01 | Epoch Time: 2m 24s
	Train Loss: 0.52952 
	 Val. Loss: 0.52524 

***Completed***
Total time spent: 2m 34s


## Fake task logging

In [106]:
B1 = "bert-base-uncased"
A2 = "albert-base-v2"
E = "electra"

fake_task_stats, log_num_f = add_fake_task_stats(fake_task_stats, 
                                             log_num_f,
                                             B1,
                                             BATCH_SIZE, 
                                             N_EPOCHS,
                                             "{:.0e}".format(LRATE),
                                             "{:.0e}".format(FRATE), 
                                             "{:.0e}".format(EPS), 
                                             WU,
                                             WDECAY, 
                                             train_loss,
                                             val_loss
                                             )

pd.set_option('precision', 5)
df_stats = pd.DataFrame(data=fake_task_stats)
df_stats = df_stats.set_index('log')
# Display the table.
df_stats

Unnamed: 0_level_0,Model Name,Batch Size,N_Epochs,lr,fr,eps,wu,wd,Training Loss,Valid. Loss
log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,bert-base-uncased,16,3,0.005,2e-05,1e-08,0.1,0.01,0.32183,0.54243
2,bert-base-uncased,16,1,0.005,2e-05,1e-08,0.1,0.01,0.52952,0.52524


In [107]:
log_loc = 'gdrive/My Drive/subtask-2/log_fake_task2.csv'
df_stats.to_csv(log_loc, index=False)

# Real Task

In [297]:
import copy

fix_seed()
# Create the model for the real task
albert_model = pre_model_al.albert
real_model_al = AlbertModel_Real(albert_model)

bert_model = copy.deepcopy(pre_model_ber.bert)
drop_prob = 0.1
real_model_ber = BertModel_Real(bert_model, drop_prob)

"""param_names_list = [n for n, p in real_model_ber.named_parameters()]
param_names_list"""

'param_names_list = [n for n, p in real_model_ber.named_parameters()]\nparam_names_list'

## Hyperparameters

In [270]:
# Hyperparameters for the real task model:

# Number of training epochs. The BERT authors recommend between 2 and 4.
N_EPOCHS_r = 2

LRATE_r = 3e-5
FRATE_r = 5e-5
EPS_r = 1e-8
WU_r = 0.1
WDECAY_r = 0.5

# Total number of training steps is [number of batches] x [number of epochs]. 
TOTSTEPS_r = len(train_dataloader) * N_EPOCHS_r
WUSTEPS_r = int(TOTSTEPS_r * WU_r)

# Apply weight decay to all parameters other than bias and layer normalization terms
# Optimize the parameters of the head layer by the learning rate
# Optimize the parameters of the pretrain LM by the fine-tuning rate
no_decay_r = ['bias', 'LayerNorm.weight']
named_parameters_r = real_model_ber.named_parameters()
optimizer_grouped_parameters_r = [
    {'params': [p for n, p in named_parameters_r if not any(nd in n for nd in no_decay_r)], 'weight_decay': WDECAY_r},
    {'params': [p for n, p in named_parameters_r if any(nd in n for nd in no_decay_r)], 'weight_decay': 0.0}
]

In [298]:
# Hyperparameters for the real task LM (2nd v):

# Number of training epochs. The BERT authors recommend between 2 and 4.
N_EPOCHS_r = 1

LRATE_r = 8e-3
FRATE_r = 3e-5
EPS_r = 1e-8
WU_r = 0.3
WDECAY_r = 0.05

"""N_EPOCHS_r = 2

LRATE_r = 5e-3
FRATE_r = 2e-5
EPS_r = 1e-8
WU_r = 0.1
WDECAY_r = 0.1"""

# Total number of training steps is [number of batches] x [number of epochs]. 
TOTSTEPS_r = len(train_dataloader) * N_EPOCHS_r * 2
WUSTEPS_r = int(TOTSTEPS_r * WU_r)

# Apply weight decay to all parameters other than bias and layer normalization terms
# Optimize the parameters of the head layer by the learning rate
# Optimize the parameters of the pretrain LM by the fine-tuning rate
no_decay_r = ['bias', 'LayerNorm.weight']
named_parameters_r = real_model_ber.named_parameters()
optimizer_grouped_parameters_r = [
    {'params': [p for n, p in named_parameters_r if not any(nd in n for nd in no_decay_r)], 'weight_decay': WDECAY_r},
    {'params': [p for n, p in named_parameters_r if any(nd in n for nd in no_decay_r)], 'weight_decay': 0.0},
    {'params': [p for n, p in named_parameters_r if "albert" not in n], 'lr': LRATE_r}
]
#    {'params': [p for n, p in named_parameters if "albert" in n], 'lr': FRATE},

## Optimizer & learning rate scheduler

In [299]:
# Create the optimizer, 
# the epsilon parameter is a very small number to prevent any division by zero
optimizer_r = AdamW(optimizer_grouped_parameters_r, lr=FRATE_r, eps = EPS_r)

In [300]:
# Create the learning rate scheduler.
scheduler_r = get_linear_schedule_with_warmup(optimizer_r, 
                                            num_warmup_steps = WUSTEPS_r,
                                            num_training_steps = TOTSTEPS_r)

## Real task training

In [301]:
criterion_c = nn.CrossEntropyLoss()

train_loss_r, train_accuracy_r, val_loss_r, val_accuracy_r, model_list_r = train_BERT(real_model_ber,
                                                                            train_dataloader,
                                                                            valid_dataloader,
                                                                            optimizer_r,
                                                                            scheduler_r,
                                                                            criterion_c,
                                                                            N_EPOCHS_r)


Epoch: 01 | Epoch Time: 3m 7s
	Train Loss: 0.95008 | Train Acc: 49.9981%
	 Val. Loss: 0.94389 |  Val. Acc: 54.5608%

***Completed***
Total time spent: 3m 16s


# Testing Real Task Models

## Start testing

In [302]:
fix_seed()

test_loss = 0
test_acc = 0
test_logits_all = []

# Turn on evaluate mode. This de-activates dropout. 
real_model_ber.eval()

# We do not compute gradients within this block, i.e. no training
with torch.no_grad():

    for input_ids_batch, attention_mask_batch, token_type_ids_batch, labels in test_dataloader:
        
        # get the output
        predictions = real_model_ber(input_ids_batch,
                                     attention_mask_batch,
                                     token_type_ids_batch)

        loss_batch = criterion_c(predictions, labels)
        test_logits_all += predictions.tolist()
        #test_logits_all = torch.cat((test_logits_all, predictions), 0)
        acc_batch = accuracy(predictions, labels)

        test_loss += loss_batch.item()
        test_acc += acc_batch.item()

    average_test_loss = test_loss / len(test_dataloader)
    average_test_acc = test_acc / len(test_dataloader)

print(f'Test Loss: {average_test_loss:.5f} | Test Acc: {average_test_acc*100:.7f}%')

Test Loss: 0.95342 | Test Acc: 52.9391892%


In [303]:
#test_logits_all.shape

## Write results

In [304]:
def write_predictions(predictions, test_data_frame, out_loc):
    test_data_frame['pred'] = predictions
    output = test_data_frame[['id','pred']]
    output.to_csv(out_loc, index=False)
        
    print('Output file created:\n\t- '+os.path.abspath(out_loc))


# write the predictions for the dev data into 'task-2-output.csv'
out_loc = 'gdrive/My Drive/subtask-2/task-2-output.csv'

test_class_preds = torch.argmax(F.softmax(torch.FloatTensor(test_logits_all), dim = 1), 1)
write_predictions(test_class_preds, test, out_loc)

"""test_class_preds = torch.argmax(F.softmax(test_logits_all, dim = 1), 1)
write_predictions(test_class_preds.cpu().numpy(), test, out_loc)"""

Output file created:
	- /content/gdrive/My Drive/subtask-2/task-2-output.csv


'test_class_preds = torch.argmax(F.softmax(test_logits_all, dim = 1), 1)\nwrite_predictions(test_class_preds.cpu().numpy(), test, out_loc)'

## Check final results

In [305]:
def score(truth_loc, prediction_loc):
    truth = pd.read_csv(truth_loc, usecols=['id','label'])
    pred = pd.read_csv(prediction_loc, usecols=['id','pred'])
         
    assert(sorted(truth.id) == sorted(pred.id)),"ID mismatch between ground truth and prediction!"
    
    data = pd.merge(truth,pred)
    data = data[data.label != 0]
    accuracy = (np.sum(data.label == data.pred)*1.0/len(data))*100
    
    print("Accuracy = %.6f" % accuracy)

    return accuracy   

# print Accuracy
truth_loc = 'gdrive/My Drive/subtask-2/test.csv'
prediction_loc = 'gdrive/My Drive/subtask-2/task-2-output.csv'
test_accuracy = score(truth_loc, prediction_loc)

Accuracy = 50.304414


# Logging Real Task Statistics

In [306]:
B1 = "bert-base-uncased"
A2 = "albert-base-v2"
E = "electra"
B1M = "bert-base-uncased more_data"

real_task_stats, log_num_r = add_real_task_stats(real_task_stats, 
                                             log_num_r,
                                             B1M,
                                             BATCH_SIZE, 
                                             N_EPOCHS_r,
                                             "{:.0e}".format(LRATE_r),
                                             "{:.0e}".format(FRATE_r), 
                                             "{:.0e}".format(EPS_r), 
                                             WU_r,
                                             WDECAY_r, 
                                             train_loss_r, 
                                             train_accuracy_r*100, 
                                             val_loss_r, 
                                             val_accuracy_r*100,
                                             test_accuracy
                                             )

In [307]:
# Display floats with five decimal places.
pd.set_option('precision', 5)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=real_task_stats)

# Use the 'log' as the row index.
df_stats = df_stats.set_index('log')

# Display the table.
df_stats

Unnamed: 0_level_0,Model Name,Batch Size,N_Epochs,lr,fr,eps,wu,wd,Training Loss,Training Accur.,Valid. Loss,Valid. Accur.,Testing Accur.
log,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,bert-base-uncased,16,3,3e-05,5e-05,1e-08,0.1,0.01,0.23012,92.14225,1.68649,51.29505,46.34703
2,bert-base-uncased,16,1,3e-05,5e-05,1e-08,0.1,0.01,0.30659,88.454,1.79389,49.22579,44.71081
3,bert-base-uncased,16,1,3e-05,5e-05,1e-08,0.1,0.01,0.1074,96.72913,2.53388,46.77646,41.55251
4,bert-base-uncased,16,1,3e-05,5e-05,1e-08,0.1,0.1,0.04975,98.69889,2.84458,47.62106,41.93303
5,bert-base-uncased,16,1,3e-05,5e-05,1e-08,0.1,0.1,0.96322,47.4063,0.94663,52.36486,49.42922
6,bert-base-uncased,16,3,3e-05,5e-05,1e-08,0.1,0.1,0.29321,89.28876,1.63454,53.40653,46.27093
7,bert-base-uncased,16,2,3e-05,5e-05,1e-08,0.1,0.5,0.82296,63.49446,0.95054,56.58784,50.0
8,bert-base-uncased,16,2,0.005,2e-05,1e-08,0.1,0.01,0.86409,60.43654,0.93353,56.84122,50.53272
9,bert-base-uncased,16,2,0.005,2e-05,1e-08,0.1,0.7,0.86533,60.06388,0.93377,57.01014,50.34247
10,bert-base-uncased,16,2,0.005,2e-05,1e-08,0.3,0.01,0.89612,57.24446,0.93043,56.02477,49.61948


## Write statistics

In [308]:
log_loc = 'gdrive/My Drive/subtask-2/log2.csv'
df_stats.to_csv(log_loc, index=False)

In [None]:
# remove the last row:
#training_stats.pop(-1)
#log_num -= 1