# Funniness Estimation System v1.0

In [None]:
"""
@author: Ziyang Lin
         zlin19@sheffield.ac.uk
         University of Sheffield, UK
"""

'''
A two-input neural-network regression system for Task 1 of
"Assessing the Funniness of Edited News Headlines (SemEval-2020)",
in which, given the original and the edited headline, the system
is required to predict the mean funniness of the edited headline.
'''

import random

import pandas as pd
import numpy as np

import os
import re

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from google.colab import drive 
drive.mount('/content/gdrive')

import nltk
nltk.download('punkt')
from nltk import word_tokenize

# fix the seeds to get consistent results before every training
# loop in what follows
def fix_seed(seed=234):
    """Seed every random number generator used in this notebook.

    Args:
        seed: value handed, in turn, to the PyTorch, PyTorch-CUDA, NumPy
            and Python ``random`` seeding functions.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Preprocessing Datasets

In [None]:
def processed_data_to_lists(train):
    """Turn a task data frame into headline pairs, labels and edit words.

    Every value of the ``original`` column marks the word to be replaced as
    ``<word/>``. For each row the marker is resolved twice: once with the
    marked word itself (the plain original headline) and once with the value
    of the ``edit`` column (the edited headline).

    Args:
        train: data frame providing the ``original``, ``edit`` and
            ``meanGrade`` columns.

    Returns:
        A tuple ``(o_headls_n_headls, labels_list, new_word_list)``:
        a list of ``(original_headline, edited_headline)`` string pairs,
        the ``meanGrade`` values as a list, and the ``edit`` values as a list.
    """
    # Compiled once for the whole corpus instead of once per row.
    p = re.compile(r'\<(.*?)\/\>')

    labels_list = train.meanGrade.to_list()
    new_word_list = train.edit.to_list()

    # list of tuple for original headlines and new edited headlines
    o_headls_n_headls = []

    for origin_headl, new_word in zip(train.original.to_list(), new_word_list):
        # the word(s) between "<" and "/>" in the original headline
        origin_word = ''.join(p.findall(origin_headl))

        # A callable replacement inserts the text verbatim. Passing the word
        # as a string would make ``re`` treat it as a template, so a
        # backslash in the word would be expanded (``\n``) or rejected
        # (``\d``) instead of being copied into the headline.
        normal_origin_headl = p.sub(lambda _match: origin_word, origin_headl)
        new_headl = p.sub(lambda _match: new_word, origin_headl)

        o_headls_n_headls.append((normal_origin_headl, new_headl))

    return o_headls_n_headls, labels_list, new_word_list


# tokenize both the original headlines and the corresponding new edited headlines
def get_tokenized_headls(o_headls_n_headls):
    """Tokenize and lowercase every (original, edited) headline pair.

    Args:
        o_headls_n_headls: list of ``(original_headline, edited_headline)``
            string pairs.

    Returns:
        A list with one ``(original_tokens, edited_tokens)`` tuple per input
        pair; each element is a list of lowercased token strings.
    """
    def lowercase_tokens(headline):
        # The NLTK tokens are joined with single spaces and split again on
        # a single space, exactly as before, so the result stays the same
        # (e.g. an empty token list still yields ['']).
        respaced = " ".join(word_tokenize(headline))
        return [piece.lower() for piece in respaced.split(' ')]

    return [
        (lowercase_tokens(before), lowercase_tokens(after))
        for before, after in o_headls_n_headls
    ]


def get_word2idx(tokenized_headls, new_word_list):
    """Build the token -> index vocabulary used to vectorize headlines.

    Index 0 is reserved for the padding token ``'<pad>'``. Tokens receive
    consecutive indices starting at 1 in order of first appearance: first the
    tokens of the original headlines, then the raw edit words, and finally
    the tokens of the edited headlines.

    The edited-headline tokens are included because the headlines are
    lowercased and tokenized while ``new_word_list`` holds the raw edit
    strings. Without them an edit word such as ``'Twitter'`` would be stored
    under a key that the lowercased token ``'twitter'`` never matches, and
    the word would silently vanish from the edited headline when it is
    vectorized.

    Args:
        tokenized_headls: list of ``(original_tokens, edited_tokens)`` pairs.
        new_word_list: list of raw edit words.

    Returns:
        A dict mapping every token to a unique integer index; the indices
        are exactly ``0 .. len(word2idx) - 1``.
    """
    # Reserving the padding entry first keeps the indices contiguous even if
    # the literal token '<pad>' occurs in the data.
    word2idx = {'<pad>': 0}

    def register(token):
        # dict membership is O(1); scanning a list made this step quadratic
        if token not in word2idx:
            word2idx[token] = len(word2idx)

    for origin_headl, _ in tokenized_headls:
        for token in origin_headl:
            register(token)

    for token in new_word_list:
        register(token)

    for _, new_headl in tokenized_headls:
        for token in new_headl:
            register(token)

    return word2idx


def get_model_inputs(tokenized_headls, word2idx, labels):
    """Convert tokenized headline pairs and labels into padded tensors.

    Tokens that are missing from ``word2idx`` are skipped. Both returned
    index tensors share one width, the length of the longest vectorized
    headline on either side, and are right-padded with 0.

    Args:
        tokenized_headls: list of ``(original_tokens, edited_tokens)`` pairs.
        word2idx: dict mapping tokens to integer indices.
        labels: sequence of numeric labels, one per headline pair.

    Returns:
        A tuple ``(origin_tensor, new_tensor, label_tensor)``: two int64
        tensors of shape ``(len(tokenized_headls), max_len)`` and one float32
        tensor built from ``labels``.
    """
    # we index our original headlines and the corresponding new edited headlines
    vectorized_headls = [
        ([word2idx[tk] for tk in origin if tk in word2idx],
         [word2idx[tk] for tk in new if tk in word2idx])
        for origin, new in tokenized_headls
    ]

    # The width must cover BOTH sides: an edited headline can be longer than
    # every original one, and sizing by the originals alone makes the copy
    # into new_tensor fail. default=0 keeps an empty corpus from raising.
    max_len = max(
        (len(ids) for pair in vectorized_headls for ids in pair),
        default=0,
    )

    # two tensors of the same fixed size filled with zeroes for padding
    shape = (len(vectorized_headls), max_len)
    origin_tensor = torch.zeros(shape, dtype=torch.long)
    new_tensor = torch.zeros(shape, dtype=torch.long)

    # we fill them with our vectorized headlines
    for row, (origin_ids, new_ids) in enumerate(vectorized_headls):
        if origin_ids:
            origin_tensor[row, :len(origin_ids)] = torch.tensor(origin_ids, dtype=torch.long)
        if new_ids:
            new_tensor[row, :len(new_ids)] = torch.tensor(new_ids, dtype=torch.long)

    # Label tensor
    label_tensor = torch.FloatTensor(labels)

    return origin_tensor, new_tensor, label_tensor

In [None]:
# Training split and dev split (the dev split is used for validation below)
train_loc = 'gdrive/My Drive/subtask-1/train.csv'
test_loc = 'gdrive/My Drive/subtask-1/dev.csv'
train, test = pd.read_csv(train_loc), pd.read_csv(test_loc)

# Training corpus: headline pairs -> tokens -> vocabulary -> padded tensors
o_headls_n_headls, labels_list, new_word_list = processed_data_to_lists(train)
tokenized_headls = get_tokenized_headls(o_headls_n_headls)
word2idx = get_word2idx(tokenized_headls, new_word_list)
origin_tensor, new_tensor, label_tensor = get_model_inputs(
    tokenized_headls, word2idx, labels_list
)

# one caption line followed by one value line per item
print(
    'origin_tensor:', origin_tensor,
    'new_tensor:', new_tensor,
    'label_tensor:', label_tensor,
    'vocab_size:', len(word2idx),
    sep='\n',
)

# two empty lines between the training and the validation dump
print(end='\n\n')

# Validation corpus: same pipeline, vectorized with the TRAINING vocabulary
valid_o_headls_n_headls, valid_labels_list, valid_new_word_list = processed_data_to_lists(test)
valid_tokenized_headls = get_tokenized_headls(valid_o_headls_n_headls)
valid_origin_tensor, valid_new_tensor, valid_label_tensor = get_model_inputs(
    valid_tokenized_headls, word2idx, valid_labels_list
)

print(
    'valid_origin_tensor:', valid_origin_tensor,
    'valid_new_tensor:', valid_new_tensor,
    'valid_label_tensor:', valid_label_tensor,
    sep='\n',
)

origin_tensor:
tensor([[   1,    2,    3,  ...,    0,    0,    0],
        [  16,   17,   18,  ...,    0,    0,    0],
        [  32,   33,   34,  ...,    0,    0,    0],
        ...,
        [5728, 2737, 5729,  ...,    0,    0,    0],
        [7010,   80, 2169,  ...,    0,    0,    0],
        [ 105,   93,   27,  ...,    0,    0,    0]])
new_tensor:
tensor([[   1,    2,    3,  ...,    0,    0,    0],
        [  16,   17,   18,  ...,    0,    0,    0],
        [  32,   33,   34,  ...,    0,    0,    0],
        ...,
        [5728, 2737, 5729,  ...,    0,    0,    0],
        [7010,   80, 2169,  ...,    0,    0,    0],
        [ 105,   93,   27,  ...,    0,    0,    0]])
label_tensor:
tensor([0.2000, 1.6000, 1.0000,  ..., 0.6000, 1.4000, 0.4000])
vocab_size:
11722


valid_origin_tensor:
tensor([[1674,  323, 1832,  ...,    0,    0,    0],
        [ 509, 2944,  855,  ...,    0,    0,    0],
        [1598,   80,  749,  ...,    0,    0,    0],
        ...,
        [  88,  903,  398,  ...,  

# Define Model

In [None]:
class TwoInputsNN(nn.Module):
    """Two-input regression network with a shared encoder.

    Both inputs go through the same embedding layer and the same three
    linear layers. The prediction is the dot product of the two resulting
    vectors.

    Args:
        embedding_dim: size of the token embeddings.
        hidden_dim_1: output size of the first linear layer.
        hidden_dim_2: output size of the second linear layer.
        hidden_dim_3: output size of the third linear layer.
        vocab_size: number of rows of the embedding table.
    """

    def __init__(self, embedding_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, vocab_size):
        super(TwoInputsNN, self).__init__()

        # embedding (lookup) layer
        # padding_idx=0 makes index 0 the padding token, i.e. its embedding
        # is a 0-vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # hidden layer 1
        self.fc1 = nn.Linear(embedding_dim, hidden_dim_1)

        # hidden layer 2
        self.fc2 = nn.Linear(hidden_dim_1, hidden_dim_2)

        # activation (applied after hidden layers 1 and 2)
        self.relu1 = nn.ReLU()

        # hidden layer 3
        self.fc3 = nn.Linear(hidden_dim_2, hidden_dim_3)

    def _encode(self, token_ids):
        """Map a (batch_size, max_headl_len) index tensor to (batch_size, hidden_dim_3)."""
        # (batch_size, max_headl_len, embedding_dim)
        embedded = self.embedding(token_ids)

        # Number of non-padding positions per row. A row made only of padding
        # (e.g. every token was out of vocabulary) has length 0; clamping to
        # 1 turns the 0/0 = NaN average into a zero vector.
        lengths = token_ids.ne(0).sum(1, keepdim=True).clamp(min=1)

        # Padding embeddings are zero, so sum / true length is the mean over
        # the real tokens only.
        averaged = embedded.sum(1) / lengths

        hidden = self.relu1(self.fc1(averaged))
        hidden = self.relu1(self.fc2(hidden))
        return self.fc3(hidden)

    def forward(self, x, y):
        """Score a batch of headline pairs.

        Args:
            x: index tensor of shape (batch_size, max_headl_len), 0-padded.
            y: index tensor of shape (batch_size, max_headl_len), 0-padded.

        Returns:
            Tensor of shape (batch_size, 1): the dot product of the two
            encoded inputs.
        """
        x_out = self._encode(x)
        y_out = self._encode(y)

        # output layer: row-wise dot product
        out = x_out * y_out
        out = torch.sum(out, 1, keepdim=True)

        return out




Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Start Training

In [None]:
# Reset the seed before every model construction for reproducible results
fix_seed()

# we will train for N epochs (The model will see the corpus N times)
EPOCHS = 100

# Initial learning rate; it is lowered every epoch by the scheduler below
LRATE = 0.145

# embedding dimension (size of the averaged headline representation)
EMBEDDING_DIM = 300

# dimensionality of the output of hidden layer 1 (fc1)
HIDDEN_DIM_1 = 100

# dimensionality of the output of hidden layer 2 (fc2)
HIDDEN_DIM_2 = 50

# dimensionality of the output of hidden layer 3 (fc3)
HIDDEN_DIM_3 = 10

# Construct the model
model = TwoInputsNN(EMBEDDING_DIM, HIDDEN_DIM_1, HIDDEN_DIM_2, HIDDEN_DIM_3, len(word2idx))

# Print the model
print(model)

# we use the stochastic gradient descent (SGD) optimizer
optimizer = optim.SGD(model.parameters(), lr=LRATE)

# Cosine-annealed learning rate. The scheduler is stepped once per epoch, so
# its horizon is tied to EPOCHS: changing the number of epochs can no longer
# leave the schedule out of sync with the training loop.
steps = EPOCHS
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, steps)

# Input and label tensors for training
x_feature = origin_tensor
y_feature = new_tensor
target = label_tensor

# Input and label tensors for validation
valid_x_feature = valid_origin_tensor
valid_y_feature = valid_new_tensor
valid_target = valid_label_tensor


################
# Start training
################
print(f'Will train for {EPOCHS} epochs')
for epoch in range(1, EPOCHS + 1):
  # switch to training mode; the model has no dropout or batch-norm layers,
  # but this is good practice to include
  model.train()

  # we zero the gradients as they are not removed automatically
  optimizer.zero_grad()

  # the whole training set is passed in one batch;
  # squeeze is needed as the predictions will have the shape (batch size, 1)
  # and we need to remove the dimension of size 1
  predictions = model(x_feature, y_feature).squeeze(1)

  # RMSE loss
  loss = torch.sqrt(((predictions - target)**2).mean())
  train_loss = loss.item()

  # calculate the gradient of each parameter
  loss.backward()

  # update the parameters using the gradients and optimizer algorithm
  optimizer.step()

  # update the learning rate
  scheduler.step()

  # switch to evaluation mode before scoring the validation set
  model.eval()

  # we do not compute gradients within this block, i.e. no training
  with torch.no_grad():
    valid_predictions = model(valid_x_feature, valid_y_feature).squeeze(1)
    valid_loss = torch.sqrt(((valid_predictions - valid_target)**2).mean()).item()

  print(f'| Epoch: {epoch:02} | Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f} |')

TwoInputsNN(
  (embedding): Embedding(11722, 300, padding_idx=0)
  (fc1): Linear(in_features=300, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (relu1): ReLU()
  (fc3): Linear(in_features=50, out_features=10, bias=True)
)
Will train for 100 epochs
| Epoch: 01 | Train Loss: 1.050 | Val. Loss: 1.007 |
| Epoch: 02 | Train Loss: 1.012 | Val. Loss: 0.938 |
| Epoch: 03 | Train Loss: 0.944 | Val. Loss: 0.819 |
| Epoch: 04 | Train Loss: 0.826 | Val. Loss: 0.653 |
| Epoch: 05 | Train Loss: 0.660 | Val. Loss: 0.589 |
| Epoch: 06 | Train Loss: 0.590 | Val. Loss: 0.586 |
| Epoch: 07 | Train Loss: 0.589 | Val. Loss: 0.587 |
| Epoch: 08 | Train Loss: 0.588 | Val. Loss: 0.584 |
| Epoch: 09 | Train Loss: 0.587 | Val. Loss: 0.586 |
| Epoch: 10 | Train Loss: 0.586 | Val. Loss: 0.583 |
| Epoch: 11 | Train Loss: 0.586 | Val. Loss: 0.584 |
| Epoch: 12 | Train Loss: 0.585 | Val. Loss: 0.583 |
| Epoch: 13 | Train Loss: 0.584 | Val. Loss: 0.583 |
| Epoch: 14 | Tra

# Start Testing

In [None]:
# From here on `test_loc` / `test` refer to the held-out test split
test_loc = 'gdrive/My Drive/subtask-1/test.csv'
test = pd.read_csv(test_loc)

# Test corpus: same pipeline as before, vectorized with the training vocabulary
test_o_headls_n_headls, test_labels_list, test_new_word_list = processed_data_to_lists(test)
test_tokenized_headls = get_tokenized_headls(test_o_headls_n_headls)
test_origin_tensor, test_new_tensor, test_label_tensor = get_model_inputs(
    test_tokenized_headls, word2idx, test_labels_list
)

# one caption line followed by one value line per tensor
print(
    'test_origin_tensor:', test_origin_tensor,
    'test_new_tensor:', test_new_tensor,
    'test_label_tensor:', test_label_tensor,
    sep='\n',
)

# evaluation mode for the run on the test corpus
model.eval()

test_x_feature, test_y_feature, test_target = (
    test_origin_tensor,
    test_new_tensor,
    test_label_tensor,
)

# no gradients are needed for scoring
with torch.no_grad():
  test_predictions = model(test_x_feature, test_y_feature).squeeze(1)
  # RMSE between predictions and labels
  test_loss = (test_predictions - test_target).pow(2).mean().sqrt().item()

print(f'| Test Loss: {test_loss:.3f} |')

test_origin_tensor:
tensor([[  87, 2816,  234,  ...,    0,    0,    0],
        [ 392, 1532,  425,  ...,    0,    0,    0],
        [ 212,    2, 7535,  ...,    0,    0,    0],
        ...,
        [ 538,  234,  224,  ...,    0,    0,    0],
        [4808, 2153, 5571,  ...,    0,    0,    0],
        [  58,  429, 1988,  ...,    0,    0,    0]])
test_new_tensor:
tensor([[  87, 2816,  234,  ...,    0,    0,    0],
        [ 392, 1532,  773,  ...,    0,    0,    0],
        [ 212,    2, 7535,  ...,    0,    0,    0],
        ...,
        [ 538,  234,  224,  ...,    0,    0,    0],
        [4808, 2153, 5571,  ...,    0,    0,    0],
        [  58,  429, 1988,  ...,    0,    0,    0]])
test_label_tensor:
tensor([1.2000, 0.4000, 1.0000,  ..., 0.4000, 0.0000, 0.8000])
| Test Loss: 0.576 |


# Write Results

In [None]:
def write_predictions(predictions, test_data_frame, out_loc):
    """Write the predictions to a CSV file with the columns ``id`` and ``pred``.

    The caller's data frame is left untouched: the ``pred`` column is added
    to a copy of its ``id`` column rather than to the frame itself.

    Args:
        predictions: one predicted value per row of ``test_data_frame``
            (a list, an array, or a torch tensor).
        test_data_frame: data frame providing the ``id`` column.
        out_loc: path of the CSV file to create.
    """
    # Tensors are converted explicitly so the values are stored as plain
    # numbers regardless of the device they live on.
    if hasattr(predictions, 'detach'):
        predictions = predictions.detach().cpu().numpy()

    output = test_data_frame[['id']].copy()
    output['pred'] = predictions
    output.to_csv(out_loc, index=False)

    print('Output file created:\n\t- '+os.path.abspath(out_loc))


# write the predictions for the test data (test.csv) into 'task-1-output.csv'
out_loc = 'gdrive/My Drive/subtask-1/task-1-output.csv'
write_predictions(test_predictions, test, out_loc)   

Output file created:
	- /content/gdrive/My Drive/subtask-1/task-1-output.csv


# Check Final Results

In [None]:
def score(truth_loc, prediction_loc):
    """Print and return the RMSE between ground truth and predictions.

    Args:
        truth_loc: path of a CSV file with the columns ``id`` and
            ``meanGrade``.
        prediction_loc: path of a CSV file with the columns ``id`` and
            ``pred``.

    Returns:
        The root mean squared error as a float.

    Raises:
        AssertionError: if the two files do not contain the same ids.
    """
    truth = pd.read_csv(truth_loc, usecols=['id','meanGrade'])
    pred = pd.read_csv(prediction_loc, usecols=['id','pred'])

    # Raised explicitly (same exception type as before) so the check also
    # runs when Python is started with -O, which strips assert statements.
    if sorted(truth.id) != sorted(pred.id):
        raise AssertionError("ID mismatch between ground truth and prediction!")

    # pair every prediction with its label through the shared id column
    data = pd.merge(truth, pred, on='id')
    rmse = float(np.sqrt(np.mean((data['meanGrade'] - data['pred'])**2)))

    print("RMSE = %.3f" % rmse)
    return rmse

# Print the RMSE of the written predictions (task-1-output.csv) against the
# labels in test.csv
truth_loc = 'gdrive/My Drive/subtask-1/test.csv'
prediction_loc = 'gdrive/My Drive/subtask-1/task-1-output.csv'
score(truth_loc, prediction_loc)

RMSE = 0.576
