In [10]:
"""
@author: Ziyang Lin
         zlin19@sheffield.ac.uk
         University of Sheffield, UK
"""

'''
A two inputs NN regression system for
"Assessing the Funniness of Edited News Headlines (SemEval-2020)" task 1
in which given the original and the edited headline, the system
is required to predict the mean funniness of the edited headline.
'''

import random

import pandas as pd
import numpy as np

import os
import re
import time
import math

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data
# This time we will work with a dataset from the torchtext package consists of data processing utilities and popular datasets for NLP
from torchtext import datasets
import torch.utils.data as tud

from google.colab import drive 
drive.mount('/content/gdrive')

import nltk
nltk.download('punkt')
from nltk import word_tokenize


# fix the seeds to get consistent results before every training
# loop in what follows
def fix_seed(seed=234):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


# Helper function to print time between epochs
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


def processed_data_to_lists(train):
    headls_words = [(origin_headl, new_word) for (origin_headl, new_word) in zip(train.original.to_list(), train.edit.to_list())]
    labels_list = train.meanGrade.to_list()

    # list of tuple for original headlines and new edited headlines
    o_headls_n_headls = []
    
    new_word_list = []

    for origin_headl, new_word in headls_words:
      # pattern
      p = re.compile(r'\<(.*?)\/\>')
      # get the normal version of the original headline
      origin_word = ''.join(re.findall(p, origin_headl))
      normal_origin_headl = p.sub(origin_word, origin_headl)
      # get the new edited headline
      new_headl = p.sub(new_word, origin_headl)
      # pair them and put them into the list
      o_headls_n_headls.append((normal_origin_headl,new_headl))

      new_word_list.append(new_word)

    return o_headls_n_headls, labels_list, new_word_list


# tokenize both the original headlines and the corresponding new edited headlines
def get_tokenized_headls(o_headls_n_headls):
    tokenized_headls = [] 
    for origin_headl, new_headl in o_headls_n_headls:
      origin_headl = " ".join(word_tokenize(origin_headl))
      new_headl = " ".join(word_tokenize(new_headl))    

      tokenized_origin = []
      tokenized_new = []

      for token in origin_headl.split(' '):
        token = token.lower()
        tokenized_origin.append(token)

      for token in new_headl.split(' '):
        token = token.lower()
        tokenized_new.append(token)

      tokenized_headls.append((tokenized_origin, tokenized_new))

    return tokenized_headls


def get_word2idx(tokenized_headls, new_word_list):
    vocabulary = []
    for origin_headl, new_headl in tokenized_headls:
      for token in origin_headl:
          if token not in vocabulary:
              vocabulary.append(token)
              
    for token in new_word_list:
      if token not in vocabulary:
          vocabulary.append(token)
  
    word2idx = {w: idx+1 for (idx, w) in enumerate(vocabulary)}
    # we reserve the 0 index for the padding token
    word2idx['<pad>'] = 0
      
    return word2idx


def get_model_inputs(tokenized_headls, word2idx, labels):
    # we index our original headlines and the corresponding new edited headlines
    vectorized_headls = [([word2idx[tk] for tk in origin if tk in word2idx],[word2idx[tk] for tk in new if tk in word2idx]) for origin, new in tokenized_headls]

    # the original headlines lengths and the new headlines lengths
    origin_headl_lengths = [len(origin_headl) for origin_headl, new_headl in vectorized_headls]
    new_headl_lengths = [len(new_headl) for origin_headl, new_headl in vectorized_headls]

    # Get maximum length
    max_len = max(origin_headl_lengths)
    
    # we create two tensors of the same fixed size filled with zeroes for padding
    origin_tensor = torch.zeros((len(vectorized_headls), max_len)).long()
    new_tensor = torch.zeros((len(vectorized_headls), max_len)).long()

    # we fill them with our vectorized headlines 
    for idx, ((origin_headl, new_headl), origin_headllen) in enumerate(zip(vectorized_headls, origin_headl_lengths)):
      origin_tensor[idx, :origin_headllen] = torch.LongTensor(origin_headl)

    for idx, ((origin_headl, new_headl), new_headllen) in enumerate(zip(vectorized_headls, new_headl_lengths)):
      new_tensor[idx, :new_headllen] = torch.LongTensor(new_headl)  

    # Label tensor
    label_tensor = torch.FloatTensor(labels)
    
    return origin_tensor, new_tensor, label_tensor


class TwoInputsNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, vocab_size):  
        super(TwoInputsNN, self).__init__()
        
        # embedding (lookup layer) layer
        # padding_idx argument makes sure that the 0-th token in the vocabulary
        # is used for padding purposes i.e. its embedding will be a 0-vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # hidden layer 1
        self.fc1 = nn.Linear(embedding_dim, hidden_dim_1)

        # hidden layer 2
        self.fc2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        
        # activation
        self.relu1 = nn.ReLU()
        
        # hidden layer 3
        self.fc3 = nn.Linear(hidden_dim_2, hidden_dim_3)


    def forward(self, x, y):
        # tensor x and tensor y have shape (batch_size, max_headl_len)
        
        # put x into embedding layer
        x_embedded = self.embedding(x)
        # Now `embedding` has shape (batch size, max_headl_len, embedding dim)
        # Compute the average embeddings of shape (batch_size, embedding_dim)
        # Implement averaging that ignores padding (average using actual headline lengths).        
        x_headl_lens = x.ne(0).sum(1, keepdims=True)
        x_averaged = x_embedded.sum(1) / x_headl_lens

        # put y into embedding layer
        y_embedded = self.embedding(y)       
        y_headl_lens = y.ne(0).sum(1, keepdims=True)
        y_averaged = y_embedded.sum(1) / y_headl_lens

        # hidden layer 1
        x_out = self.fc1(x_averaged)
        y_out = self.fc1(y_averaged)

        x_out = self.relu1(x_out)
        y_out = self.relu1(y_out)

        # hidden layer 2
        x_out = self.fc2(x_out)
        y_out = self.fc2(y_out)

        x_out = self.relu1(x_out)
        y_out = self.relu1(y_out)

        # hidden layer 3
        x_out = self.fc3(x_out)
        y_out = self.fc3(y_out)


        # output layer
        out = x_out * y_out 
        out = torch.sum(out, 1, keepdim = True)

        return out

class TwoInputsCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, out_channels, window_size, fc_out_dim, dropout):
        super(TwoInputsCNN, self).__init__()
        
        # Create the embedding layer as usual
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # in_channels -- 1 text channel
        # out_channels -- the number of output channels
        # kernel_size is (window size x embedding dim)
        self.conv = nn.Conv2d(
          in_channels=1, out_channels=out_channels,
          kernel_size=(window_size, embedding_dim))
        
        # the dropout layer
        self.dropout = nn.Dropout(dropout)

        # the fully connected layer
        self.fc = nn.Linear(out_channels, fc_out_dim)
          
    def forward(self, x, y):
        # x -> (batch size, max_sent_length)
        
        # embedded -> (batch size, max_sent_length, embedding_dim)
        # images have 3 RGB channels 
        # for the text we add 1 channel
        # embedded -> (batch size, 1, max_sent_length, embedding_dim)
        embedded_x = self.embedding(x).unsqueeze(1)
        embedded_y = self.embedding(y).unsqueeze(1)

        # Compute the feature maps      
        feature_maps_x = self.conv(embedded_x).squeeze(3)
        feature_maps_y = self.conv(embedded_y).squeeze(3)
       
        # Apply ReLU
        feature_maps_x = F.relu(feature_maps_x)
        feature_maps_y = F.relu(feature_maps_y)
        
        # Apply the max pooling layer
        pooled_x = F.max_pool1d(feature_maps_x, feature_maps_x.shape[2]).squeeze(2)
        pooled_y = F.max_pool1d(feature_maps_y, feature_maps_y.shape[2]).squeeze(2)

        dropped_x = self.dropout(pooled_x)
        dropped_y = self.dropout(pooled_y)

        # Pass the fully connected layer
        #out_x = self.fc(pooled_x)
        #out_y = self.fc(pooled_y)

        # Apply ReLU
        #out_x = F.relu(out_x)
        #out_y = F.relu(out_y)
        
        # output layer
        out = dropped_x * dropped_y 
        preds = torch.sum(out, 1, keepdim = True)


        return preds


class TwoInputsRNN(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, fc_out_dim,
                 bidirectional, dropout, embeddings):

        super().__init__()

        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        
        # Here, we initialize our model with pre-trained embeddings (50D pre-trained GloVe embeddings in our case).
        # This layer will fine-tune these embeddings, specific to this model/dataset.
        #self.embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors)

        # We can also train the embeddings from scratch:
        #self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0) 
        self.embedding = nn.Embedding.from_pretrained(embeddings, padding_idx=0)
        
        # An RNN layer. We specify that the batch dimension goes first
        # We have a bidirectional flag which indicates whether the model is unidirectional or bidirectional
        # RNNs can be stacked - i.e. have multiple layers. Here, we will only look at the 1 layer case.
        self.rnn = nn.RNN(embedding_dim,
                          hidden_dim,
                          batch_first=True,
                          bidirectional=bidirectional,
                          num_layers=1)

          # The linear layer takes the final hidden state and feeds it through a fully connected layer.
          # The dimensionality of the output is equal to the output class count.
          # For classification in a bidirectional RNN we concatenate:
            #  - The last hidden state from the forward RNN (obtained from final word of the sentence)
            #  - The last hidden state from the backward RNN (obtained from the first word of the sentence)
          # Due to the concatenation, our hidden size is doubled.
        
        if self.bidirectional:
            linear_hidden_in = hidden_dim * 2
        else:
            linear_hidden_in = hidden_dim

        # The linear layer
        self.fc = nn.Linear(linear_hidden_in, fc_out_dim)
        
        # We apply dropout technique that sets a random set of activations of a layer to zero.
        # This prevents the network from learning to rely on specific weights and helps to prevent overfitting. 
        # Note that the dropout layer is only used during training, and not during test time.
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text1, text2):

        # ACRONYMS:
          # B = Batch size
          # T = Max sentence length
          # E = Embedding dimension
          # D = Hidden dimension
          # O = FC Output dimension

        # shape(text1) = [B, T]

        embedded1 = self.dropout(self.embedding(text1))
        embedded2 = self.dropout(self.embedding(text2))
        # shape(embedded1) = [B, T, E]
        
        # An RNN in PyTorch returns two values:
        # (1) All hidden states of the last RNN layer
        # (2) Hidden state of the last timestep for every layer
        # Note: we are only using 1 layer
        all_hidden1, last_hidden1 = self.rnn(embedded1)
        all_hidden2, last_hidden2 = self.rnn(embedded2)
        # shape(all_hidden1) = [B, T, D*num_directions]
        # shape(last_hidden1) = [num_layers*num_directions, B, D].  num_layers = 1
        # NOTE. If we were to NOT use the `batch_first` flag, shape of all_hidden would be [T, B, D*num_directions]
        
        if self.bidirectional:
            # Concat the final forward (hidden[0,:,:]) and backward (hidden[1,:,:]) hidden layers
            last_hidden1 = torch.cat((last_hidden1[0, :, :], last_hidden1[1, :, :]), dim=-1)
            last_hidden2 = torch.cat((last_hidden2[0, :, :], last_hidden2[1, :, :]), dim=-1)
            # shape(last_hidden1) = [B, D*2]

        else:
            last_hidden1 = last_hidden1.squeeze(0)
            last_hidden2 = last_hidden2.squeeze(0)
            # shape(last_hidden1) = [B, D]

        out1 = self.fc(self.dropout(last_hidden1))
        out2 = self.fc(self.dropout(last_hidden2))
        # shape(out1) = [B, O]

        # Our predictions.
        out = out1 * out2 
        preds = torch.sum(out, 1, keepdim = True)   
        # shape(preds) = [B, 1]
        
        return preds


class TwoInputsConcatRNN(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, fc1_out_dim, fc2_out_dim, out_dim,
                 bidirectional, dropout):

        super().__init__()

        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0) 
        
        self.rnn = nn.RNN(embedding_dim,
                          hidden_dim,
                          batch_first=True,
                          bidirectional=bidirectional,
                          num_layers=1)

        # Due to the concatenation, our hidden size is doubled.        
        if self.bidirectional:
            linear_hidden_in = hidden_dim * 2
        else:
            linear_hidden_in = hidden_dim

        # The linear layer 1, the input dim is linear_hidden_in * 2 after concatenation 
        self.fc1 = nn.Linear(linear_hidden_in * 2, fc1_out_dim)
        
        # The linear layer 2
        self.fc2 = nn.Linear(fc1_out_dim, fc2_out_dim)

        # The output layer
        self.fc3 = nn.Linear(fc2_out_dim, out_dim)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text1, text2):
        # ACRONYMS:
          # B = Batch size
          # T = Max sentence length
          # E = Embedding dimension
          # D = Hidden dimension
          # O = FC Output dimension

        # shape(text1) = [B, T]
        embedded1 = self.dropout(self.embedding(text1))
        embedded2 = self.dropout(self.embedding(text2))
        # shape(embedded1) = [B, T, E]
        
        all_hidden1, last_hidden1 = self.rnn(embedded1)
        all_hidden2, last_hidden2 = self.rnn(embedded2)
        
        if self.bidirectional:
            # Concat the final forward (hidden[0,:,:]) and backward (hidden[1,:,:]) hidden layers
            last_hidden1 = torch.cat((last_hidden1[0, :, :], last_hidden1[1, :, :]), dim=-1)
            last_hidden2 = torch.cat((last_hidden2[0, :, :], last_hidden2[1, :, :]), dim=-1)
            # shape(last_hidden1) = [B, D*2]

        else:
            last_hidden1 = last_hidden1.squeeze(0)
            last_hidden2 = last_hidden2.squeeze(0)
            # shape(last_hidden1) = [B, D]

        # Concat the last_hidden1 and last_hidden2
        last_hidden12 = torch.cat((last_hidden1, last_hidden2), dim=-1)

        # pass to linear layer 1
        out = self.fc1(self.dropout(last_hidden12))

        # pass to linear layer 2
        out = self.fc2(self.dropout(out))

        # Our predictions.
        preds = self.fc3(self.dropout(out))   
        # shape(preds) = [B, 1]
        
        return preds


class ClassifyRNN(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 bidirectional, dropout):

        super().__init__()

        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim       

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0) 
        
        self.rnn = nn.RNN(embedding_dim,
                          hidden_dim,
                          batch_first=True,
                          bidirectional=bidirectional,
                          num_layers=1)
      
        if self.bidirectional:
            linear_hidden_in = hidden_dim * 2
        else:
            linear_hidden_in = hidden_dim

        # The classification (linear) layer
        self.fc = nn.Linear(linear_hidden_in, output_dim)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        # ACRONYMS:
          # B = Batch size
          # T = Max sentence length
          # E = Embedding dimension
          # D = Hidden dimension
          # O = FC Output dimension

        # shape(text) = [B, T]
        embedded = self.dropout(self.embedding(text))
        
        all_hidden, last_hidden = self.rnn(embedded)
        
        if self.bidirectional:
            # Concat the final forward (hidden[0,:,:]) and backward (hidden[1,:,:]) hidden layers
            last_hidden = torch.cat((last_hidden[0, :, :], last_hidden[1, :, :]), dim=-1)
            # shape(last_hidden) = [B, D*2]

        else:
            last_hidden = last_hidden.squeeze(0)
            # shape(last_hidden1) = [B, D]

        # Our predictions.
        logits = self.fc(self.dropout(last_hidden))
        # shape(logits) = [B, O]
          
        return logits


class ClassifyCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, out_channels, window_size, output_dim, dropout):
        super(ClassifyCNN, self).__init__()
        
        # Create the embedding layer as usual
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # in_channels -- 1 text channel
        # out_channels -- the number of output channels
        # kernel_size is (window size x embedding dim)
        self.conv = nn.Conv2d(
          in_channels=1, out_channels=out_channels,
          kernel_size=(window_size, embedding_dim))
        
        # the dropout layer
        self.dropout = nn.Dropout(dropout)

        # the fully connected layer
        self.fc = nn.Linear(out_channels, output_dim)
          
    def forward(self, x):
        # x -> (batch size, max_sent_length)
        
        # embedded -> (batch size, max_sent_length, embedding_dim)
        # images have 3 RGB channels 
        # for the text we add 1 channel
        # embedded -> (batch size, 1, max_sent_length, embedding_dim)
        embedded = self.embedding(x).unsqueeze(1)
 
        # Compute the feature maps      
        feature_maps = self.conv(embedded).squeeze(3)
       
        # Apply ReLU
        feature_maps = F.relu(feature_maps)
        
        # Apply the max pooling layer
        pooled = F.max_pool1d(feature_maps, feature_maps.shape[2]).squeeze(2)

        dropped = self.dropout(pooled)
 
        # output layer
        preds = self.fc(dropped)

        return preds        

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
train_loc = 'gdrive/My Drive/subtask-1/train.csv'
dev_loc = 'gdrive/My Drive/subtask-1/dev.csv'
test_loc = 'gdrive/My Drive/subtask-1/test.csv'
train = pd.read_csv(train_loc)    
valid = pd.read_csv(dev_loc)
test = pd.read_csv(test_loc)

# Prepare the training corpus and labels
o_headls_n_headls, labels_list, new_word_list = processed_data_to_lists(train)
tokenized_headls = get_tokenized_headls(o_headls_n_headls)
word2idx = get_word2idx(tokenized_headls, new_word_list)
origin_tensor, new_tensor, label_tensor = get_model_inputs(tokenized_headls, word2idx, labels_list)

print('origin_tensor:')
print(origin_tensor)
print('new_tensor:')
print(new_tensor)
print('label_tensor:')
print(label_tensor)
print('vocab_size:')
print(len(word2idx))
print(origin_tensor.shape)

print()
print()

# Prepare the validation corpus and labels
valid_o_headls_n_headls, valid_labels_list, valid_new_word_list = processed_data_to_lists(valid)
valid_tokenized_headls = get_tokenized_headls(valid_o_headls_n_headls)
valid_origin_tensor, valid_new_tensor, valid_label_tensor = get_model_inputs(valid_tokenized_headls, word2idx, valid_labels_list)

print('valid_origin_tensor:')
print(valid_origin_tensor)
print('valid_new_tensor:')
print(valid_new_tensor)
print('valid_label_tensor:')
print(valid_label_tensor)
print(valid_origin_tensor.shape)

print()
print()

# Prepare the test corpus and labels
test_o_headls_n_headls, test_labels_list, test_new_word_list = processed_data_to_lists(test)
test_tokenized_headls = get_tokenized_headls(test_o_headls_n_headls)
test_origin_tensor, test_new_tensor, test_label_tensor = get_model_inputs(test_tokenized_headls, word2idx, test_labels_list)

print('test_origin_tensor:')
print(test_origin_tensor)
print('test_new_tensor:')
print(test_new_tensor)
print('test_label_tensor:')
print(test_label_tensor)
print(test_origin_tensor.shape)

origin_tensor:
tensor([[   1,    2,    3,  ...,    0,    0,    0],
        [  16,   17,   18,  ...,    0,    0,    0],
        [  32,   33,   34,  ...,    0,    0,    0],
        ...,
        [5728, 2737, 5729,  ...,    0,    0,    0],
        [7010,   80, 2169,  ...,    0,    0,    0],
        [ 105,   93,   27,  ...,    0,    0,    0]])
new_tensor:
tensor([[   1,    2,    3,  ...,    0,    0,    0],
        [  16,   17,   18,  ...,    0,    0,    0],
        [  32,   33,   34,  ...,    0,    0,    0],
        ...,
        [5728, 2737, 5729,  ...,    0,    0,    0],
        [7010,   80, 2169,  ...,    0,    0,    0],
        [ 105,   93,   27,  ...,    0,    0,    0]])
label_tensor:
tensor([0.2000, 1.6000, 1.0000,  ..., 0.6000, 1.4000, 0.4000])
vocab_size:
11722
torch.Size([9652, 27])


valid_origin_tensor:
tensor([[1674,  323, 1832,  ...,    0,    0,    0],
        [ 509, 2944,  855,  ...,    0,    0,    0],
        [1598,   80,  749,  ...,    0,    0,    0],
        ...,
        [  

In [12]:
# prepare class labels for classification task
round_labels = torch.round(label_tensor).long()
round_valid_labels = torch.round(valid_label_tensor).long()

print(round_labels)
print(round_valid_labels)

tensor([0, 2, 1,  ..., 1, 1, 0])
tensor([1, 1, 1,  ..., 1, 1, 1])


In [13]:
# do computation on a GPU if possible 
if torch.cuda.is_available():
  torch.backends.cudnn.deterministic = True
  DEVICE='cuda:0'
else:
  DEVICE='cpu'

print('Device is', DEVICE)

Device is cuda:0


In [14]:
class NewDataset(tud.Dataset):
    def __init__(self, x1, x2, y1):
        self.len = x1.shape[0]

        self.x1_data = x1.to(DEVICE)
        self.x2_data = x2.to(DEVICE)
        self.y1_data = y1.to(DEVICE)


    def __getitem__(self, index):
        return self.x1_data[index], self.x2_data[index], self.y1_data[index]


    def __len__(self):
        return self.len

# Batching
BATCH_SIZE = 36

train_dataset = NewDataset(origin_tensor, new_tensor, label_tensor)
valid_dataset = NewDataset(valid_origin_tensor, valid_new_tensor, valid_label_tensor)
test_dataset = NewDataset(test_origin_tensor, test_new_tensor, test_label_tensor)

train_dataloader = tud.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = tud.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = tud.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)


##### demo #####
print(train_dataloader)

for x1, x2, y1 in train_dataloader:
    demo_x1 = x1
    demo_x2 = x2
    demo_y1 = y1
    break
    
print(x1.shape)
print(x2.shape)
print(y1.shape)
print(len(train_dataloader))

<torch.utils.data.dataloader.DataLoader object at 0x7f3cb36b8c18>
torch.Size([36, 27])
torch.Size([36, 27])
torch.Size([36])
269


In [15]:
# prepare dataloader for classification task
classify_train_dataset = NewDataset(origin_tensor, new_tensor, round_labels)
classify_valid_dataset = NewDataset(valid_origin_tensor, valid_new_tensor, round_valid_labels)

classify_train_dataloader = tud.DataLoader(classify_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
classify_valid_dataloader = tud.DataLoader(classify_valid_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [16]:
# define rmse
def rmse(predictions, labels):
    loss = torch.sqrt(((predictions - labels)**2).mean())

    return loss
    

In [17]:
# define train and evaluate
def train(model, train_dataloader, valid_dataloader, optimizer, scheduler, criterion, N_EPOCHS, are_two_input):
    optimizer = optimizer
    model = model.to(DEVICE)

    for epoch in range(N_EPOCHS):
    
        start_time = time.time()

        # To ensure the dropout is "turned on" while training
        # (good practice to include in your projects even if it is not used)
        model.train()
        
        epoch_loss = 0
    
        for origin_batch, new_batch, labels in train_dataloader:
                        
            # Zero the gradients
            optimizer.zero_grad()

            # shape(origin_batch) = [B, T]
            # shape(new_batch) = [B, T]
            # shape(label) = [B]

            if are_two_input:
               predictions = model(origin_batch, new_batch)
            else:
               predictions = model(new_batch)
            
            # compute the loss
            loss = criterion(predictions, labels)
            #print(loss)
                      
            # calculate the gradient of each parameter
            loss.backward()
        
            # update the parameters using the gradients and optimizer algorithm 
            optimizer.step()

            # update the learning rate
            scheduler.step()
            
            epoch_loss += loss.item()
            
        average_epoch_loss = epoch_loss / len(train_dataloader)
        
        end_time = time.time()
               
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
        average_epoch_valid_loss = evaluate(model, criterion, are_two_input, valid_dataloader)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {average_epoch_loss:.3f} | Val. Loss: {average_epoch_valid_loss:.3f} ')


def evaluate(model, criterion, are_two_input, dataloader):

    epoch_loss = 0

    # Turn on evaluate mode. This de-activates dropout. 
    model.eval()

    # We do not compute gradients within this block, i.e. no training
    with torch.no_grad():

        for origin_batch, new_batch, labels in dataloader:
            
            if are_two_input:
               predictions = model(origin_batch, new_batch)
            else:
               predictions = model(new_batch)

            loss = criterion(predictions, labels)

            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


In [21]:
# classification task
fix_seed()

INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 50
HIDDEN_DIM = 128
OUTPUT_DIM = 4
BIDIRECTIONAL = True
DROPOUT = 0.4

LRATE = 1e-4
N_EPOCHS = 10

classify_RNN_model = ClassifyRNN(INPUT_DIM, 
                                 EMBEDDING_DIM, 
                                 HIDDEN_DIM, 
                                 OUTPUT_DIM,
                                 BIDIRECTIONAL, 
                                 DROPOUT)

N_OUT_CHANNELS = 100
WINDOW_SIZE = 3

classify_CNN_model = ClassifyCNN(INPUT_DIM, 
                                 EMBEDDING_DIM, 
                                 N_OUT_CHANNELS,
                                 WINDOW_SIZE,
                                 OUTPUT_DIM,
                                 DROPOUT)

print(classify_RNN_model)

optimizer = optim.AdamW(classify_RNN_model.parameters(), lr=LRATE)
# schedule learning rate using scheduler
steps = 36
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, steps)
# we use the Cross Entropy Loss for classification
criterion = nn.CrossEntropyLoss()
# note that by default losses are averaged over the minibatch
train(classify_RNN_model, classify_train_dataloader, classify_valid_dataloader, optimizer, scheduler, criterion, N_EPOCHS, are_two_input=False)

ClassifyRNN(
  (embedding): Embedding(11722, 50, padding_idx=0)
  (rnn): RNN(50, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
)
Epoch: 01 | Epoch Time: 0m 1s
	Train Loss: 1.133 | Val. Loss: 1.014 
Epoch: 02 | Epoch Time: 0m 1s
	Train Loss: 1.031 | Val. Loss: 0.998 
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 1.028 | Val. Loss: 1.000 
Epoch: 04 | Epoch Time: 0m 1s
	Train Loss: 1.020 | Val. Loss: 1.004 
Epoch: 05 | Epoch Time: 0m 1s
	Train Loss: 1.024 | Val. Loss: 1.007 
Epoch: 06 | Epoch Time: 0m 1s
	Train Loss: 1.018 | Val. Loss: 0.997 
Epoch: 07 | Epoch Time: 0m 1s
	Train Loss: 1.018 | Val. Loss: 0.998 
Epoch: 08 | Epoch Time: 0m 1s
	Train Loss: 1.015 | Val. Loss: 1.000 
Epoch: 09 | Epoch Time: 0m 1s
	Train Loss: 1.014 | Val. Loss: 1.002 
Epoch: 10 | Epoch Time: 0m 1s
	Train Loss: 1.008 | Val. Loss: 1.001 


In [160]:
ebd = model.embedding.weight.data
print(ebd)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.8249, -0.4135,  0.9061,  ...,  1.2568, -1.6915, -0.2913],
        [-0.3912,  0.3121,  0.0772,  ...,  0.8107,  1.0791,  1.0931],
        ...,
        [ 0.5560,  1.6289, -0.0073,  ..., -0.1796,  0.5554,  0.4185],
        [ 1.6986,  0.6687, -0.5823,  ..., -2.1529,  0.5683, -0.5236],
        [ 1.2224,  0.8689, -1.2513,  ..., -0.4243,  0.7523, -0.1753]],
       device='cuda:0')


In [175]:
# regression task 
fix_seed()

INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 50
HIDDEN_DIM = 128
FC_OUTPUT_DIM = 32
BIDIRECTIONAL = True
DROPOUT = 0.3

LRATE = 1e-4
N_EPOCHS = 30

model = TwoInputsRNN(INPUT_DIM, 
                     EMBEDDING_DIM, 
                     HIDDEN_DIM, 
                     FC_OUTPUT_DIM,
                     BIDIRECTIONAL, 
                     DROPOUT,
                     ebd)

print(model)

optimizer = optim.AdamW(model.parameters(), lr=LRATE)
# schedule learning rate using scheduler
steps = 36
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, steps)
# we use the RMSE loss
criterion = rmse
# note that by default losses are averaged over the minibatch
train(model, train_dataloader, valid_dataloader, optimizer, scheduler, criterion, N_EPOCHS, are_two_input=True)

TwoInputsRNN(
  (embedding): Embedding(11722, 50, padding_idx=0)
  (rnn): RNN(50, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=32, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)
Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 0.672 | Val. Loss: 0.607 
Epoch: 02 | Epoch Time: 0m 2s
	Train Loss: 0.637 | Val. Loss: 0.595 
Epoch: 03 | Epoch Time: 0m 2s
	Train Loss: 0.623 | Val. Loss: 0.589 
Epoch: 04 | Epoch Time: 0m 2s
	Train Loss: 0.615 | Val. Loss: 0.597 
Epoch: 05 | Epoch Time: 0m 2s
	Train Loss: 0.608 | Val. Loss: 0.585 
Epoch: 06 | Epoch Time: 0m 2s
	Train Loss: 0.604 | Val. Loss: 0.580 
Epoch: 07 | Epoch Time: 0m 2s
	Train Loss: 0.599 | Val. Loss: 0.581 
Epoch: 08 | Epoch Time: 0m 2s
	Train Loss: 0.599 | Val. Loss: 0.582 
Epoch: 09 | Epoch Time: 0m 2s
	Train Loss: 0.596 | Val. Loss: 0.579 
Epoch: 10 | Epoch Time: 0m 2s
	Train Loss: 0.593 | Val. Loss: 0.577 
Epoch: 11 | Epoch Time: 0m 2s
	Train Loss: 0.594 | Val. Loss: 0.581 
Epoch: 12 |

In [178]:
# run on the test corpus


test_loss = 0
test_predictions = []

# Turn on evaluate mode. This de-activates dropout. 
model.eval()

# We do not compute gradients within this block, i.e. no training
with torch.no_grad():

    for origin_batch, new_batch, labels in test_dataloader:
        
        predictions__batch = model(origin_batch, new_batch).squeeze(1)
        test_predictions += predictions__batch.tolist()

        loss = torch.sqrt(((predictions__batch - labels)**2).mean())

        test_loss += loss.item()

    average_test_loss = test_loss / len(test_dataloader)

print(f'| Test Loss: {average_test_loss:.6f} |')
print(test_predictions)

| Test Loss: 0.570986 |
[0.911490797996521, 0.9284030795097351, 0.9483744502067566, 0.9401677846908569, 0.9436121582984924, 0.9446495771408081, 0.9446260929107666, 0.9508267045021057, 0.9472134113311768, 0.9080698490142822, 0.950901985168457, 0.9367788434028625, 0.9169864654541016, 0.96546471118927, 0.9398969411849976, 0.9585702419281006, 0.962973415851593, 0.9767415523529053, 0.9482578635215759, 0.9442061185836792, 0.9615516066551208, 0.9299001097679138, 0.9455171823501587, 0.9355592131614685, 0.9311229586601257, 0.9594352841377258, 0.9309986233711243, 0.9331187009811401, 0.9600181579589844, 0.9398894309997559, 0.9448714852333069, 0.935460090637207, 0.9531043171882629, 0.9369505047798157, 0.9360882639884949, 0.9581944346427917, 0.9523606896400452, 0.9517067074775696, 0.9230743646621704, 0.967362642288208, 0.9616958498954773, 0.9221483469009399, 0.9278527498245239, 0.9509905576705933, 0.9426507949829102, 0.9549456834793091, 0.9229129552841187, 0.9282147884368896, 0.962388277053833, 0.9

In [185]:
# CNN for regression task
fix_seed()

EPOCHS = 500
LRATE = 5e-3

EMBEDDING_DIM = 50
FC_OUT_DIM = 25

# the hyperparameters specific to CNN
# we define the number of filters
N_OUT_CHANNELS = 100

# we define the window size
WINDOW_SIZE = 3

# we apply the dropout with the probability 0.2
DROPOUT = 0.7

# Construct the model
model = TwoInputsCNN(len(word2idx), EMBEDDING_DIM, N_OUT_CHANNELS, WINDOW_SIZE, FC_OUT_DIM, DROPOUT)

# Print the model
print(model)

model = model.to(DEVICE)

# we use the stochastic gradient descent (SGD) optimizer
#optimizer = optim.SGD(model.parameters(), lr=LRATE)

optimizer = optim.Adam(model.parameters(), lr=LRATE)

# schedule learning rate using scheduler
steps = 150
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, steps)

# Input and label tensors for training
x_feature = origin_tensor.to(DEVICE)
y_feature = new_tensor.to(DEVICE)
target = label_tensor.to(DEVICE)

# Input and label tensors for validation
valid_x_feature = valid_origin_tensor.to(DEVICE)
valid_y_feature = valid_new_tensor.to(DEVICE)
valid_target = valid_label_tensor.to(DEVICE)


################
# Start training
################
print(f'Will train for {EPOCHS} epochs')
for epoch in range(1, EPOCHS + 1):
  model.train()
  
  optimizer.zero_grad()
  
  # squeeze is needed as the predictions will have the shape (batch size, 1)
  # and we need to remove the dimension of size 1
  predictions = model(x_feature, y_feature).squeeze(1)

  # Compute here the RMSE loss
  loss = torch.sqrt(((predictions - target)**2).mean())
  train_loss = loss.item()

  # calculate the gradient of each parameter
  loss.backward()

  # update the parameters using the gradients and optimizer algorithm 
  optimizer.step()
  
  # update the learning rate
  scheduler.step()

  # "evaluation mode" (turns off dropout and batch normalization)
  model.eval()

  # we do not compute gradients within this block, i.e. no training
  with torch.no_grad():
    valid_predictions = model(valid_x_feature, valid_y_feature).squeeze(1)
    valid_loss = torch.sqrt(((valid_predictions - valid_target)**2).mean()).item()
  
  print(f'| Epoch: {epoch:02} | Train Loss: {train_loss:.6f} | Val. Loss: {valid_loss:.6f} |')

TwoInputsCNN(
  (embedding): Embedding(11722, 50, padding_idx=0)
  (conv): Conv2d(1, 100, kernel_size=(3, 50), stride=(1, 1))
  (dropout): Dropout(p=0.7, inplace=False)
  (fc): Linear(in_features=100, out_features=25, bias=True)
)
Will train for 500 epochs
| Epoch: 01 | Train Loss: 98.611549 | Val. Loss: 59.526054 |
| Epoch: 02 | Train Loss: 66.814018 | Val. Loss: 39.622589 |
| Epoch: 03 | Train Loss: 45.480709 | Val. Loss: 25.952444 |
| Epoch: 04 | Train Loss: 30.376968 | Val. Loss: 16.712650 |
| Epoch: 05 | Train Loss: 20.173019 | Val. Loss: 10.630790 |
| Epoch: 06 | Train Loss: 12.964316 | Val. Loss: 6.785033 |
| Epoch: 07 | Train Loss: 8.638209 | Val. Loss: 4.454254 |
| Epoch: 08 | Train Loss: 5.851375 | Val. Loss: 3.113842 |
| Epoch: 09 | Train Loss: 4.222412 | Val. Loss: 2.395776 |
| Epoch: 10 | Train Loss: 3.270630 | Val. Loss: 2.041988 |
| Epoch: 11 | Train Loss: 2.837114 | Val. Loss: 1.889567 |
| Epoch: 12 | Train Loss: 2.606101 | Val. Loss: 1.839073 |
| Epoch: 13 | Train Loss

In [182]:
ebd = model.embedding.weight.data
print(ebd)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.8684, -0.4288,  0.9058,  ...,  1.3469, -1.8879, -0.2659],
        [-0.5214,  0.2853, -0.0157,  ...,  0.8558,  1.1096,  1.1314],
        ...,
        [ 0.6183,  1.8114, -0.0081,  ..., -0.1997,  0.6177,  0.4655],
        [ 1.8379,  0.6741, -0.6462,  ..., -2.3496,  0.6110, -0.5293],
        [ 1.2835,  0.9197, -1.3363,  ..., -0.4337,  0.7760, -0.2058]],
       device='cuda:0')


In [None]:
# FFNN for regression task
# Reset the seed before every model construction for reproducible results
fix_seed()

# we will train for N epochs (The model will see the corpus N times)
EPOCHS = 200

# Learning rate is initially set to 0.145
LRATE = 0.145

# we define our embedding dimension (dimensionality of the output of the first layer)
EMBEDDING_DIM = 300

# dimensionality of the output of the second hidden layer
HIDDEN_DIM_1 = 100

# dimensionality of the output of the third hidden layer
HIDDEN_DIM_2 = 50

# dimensionality of the output of the fourth hidden layer
HIDDEN_DIM_3 = 10

# Construct the model
model = TwoInputsNN(EMBEDDING_DIM, HIDDEN_DIM_1, HIDDEN_DIM_2, HIDDEN_DIM_3, len(word2idx))

# Print the model
print(model)

model = model.to(DEVICE)

# we use the stochastic gradient descent (SGD) optimizer
#optimizer = optim.SGD(model.parameters(), lr=LRATE)

LRATE = 1e-1
optimizer = optim.Adam(model.parameters(), lr=LRATE)

# schedule learning rate using scheduler
steps = 50
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, steps)

# Input and label tensors for training
x_feature = origin_tensor.to(DEVICE)
y_feature = new_tensor.to(DEVICE)
target = label_tensor.to(DEVICE)

# Input and label tensors for validation
valid_x_feature = valid_origin_tensor.to(DEVICE)
valid_y_feature = valid_new_tensor.to(DEVICE)
valid_target = valid_label_tensor.to(DEVICE)


################
# Start training
################
print(f'Will train for {EPOCHS} epochs')
for epoch in range(1, EPOCHS + 1):
  # to ensure the dropout (explained later) is "turned on" while training
  # good practice to include even if do not use here
  model.train()
  
  # we zero the gradients as they are not removed automatically
  optimizer.zero_grad()
 
  # squeeze is needed as the predictions will have the shape (batch size, 1)
  # and we need to remove the dimension of size 1
  predictions = model(x_feature, y_feature).squeeze(1)

  # Compute here the RMSE loss
  loss = torch.sqrt(((predictions - target)**2).mean())
  train_loss = loss.item()

  # calculate the gradient of each parameter
  loss.backward()

  # update the parameters using the gradients and optimizer algorithm 
  optimizer.step()
  
  # update the learning rate
  scheduler.step()

  # this puts the model in "evaluation mode" (turns off dropout and batch normalization)
  # good practise to include even if we do not use them right now
  model.eval()

  # we do not compute gradients within this block, i.e. no training
  with torch.no_grad():
    valid_predictions = model(valid_x_feature, valid_y_feature).squeeze(1)
    valid_loss = torch.sqrt(((valid_predictions - valid_target)**2).mean()).item()
  
  print(f'| Epoch: {epoch:02} | Train Loss: {train_loss:.6f} | Val. Loss: {valid_loss:.6f} |')

TwoInputsNN(
  (embedding): Embedding(11722, 300, padding_idx=0)
  (fc1): Linear(in_features=300, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (relu1): ReLU()
  (fc3): Linear(in_features=50, out_features=10, bias=True)
)
Will train for 200 epochs
| Epoch: 01 | Train Loss: 1.050 | Val. Loss: 22464.941 |
| Epoch: 02 | Train Loss: 18924.811 | Val. Loss: 9.176 |
| Epoch: 03 | Train Loss: 6.613 | Val. Loss: 1.084 |
| Epoch: 04 | Train Loss: 1.088 | Val. Loss: 1.086 |
| Epoch: 05 | Train Loss: 1.090 | Val. Loss: 1.058 |
| Epoch: 06 | Train Loss: 1.062 | Val. Loss: 1.014 |
| Epoch: 07 | Train Loss: 1.018 | Val. Loss: 0.962 |
| Epoch: 08 | Train Loss: 0.965 | Val. Loss: 0.906 |
| Epoch: 09 | Train Loss: 0.910 | Val. Loss: 0.850 |
| Epoch: 10 | Train Loss: 0.855 | Val. Loss: 0.798 |
| Epoch: 11 | Train Loss: 0.802 | Val. Loss: 0.750 |
| Epoch: 12 | Train Loss: 0.755 | Val. Loss: 0.708 |
| Epoch: 13 | Train Loss: 0.713 | Val. Loss: 0.672 |
| Epoch: 

In [None]:
# run on the test corpus
model.eval()

test_x_feature = test_origin_tensor.to(DEVICE)
test_y_feature = test_new_tensor.to(DEVICE)
test_target = test_label_tensor.to(DEVICE)

with torch.no_grad():
  test_predictions = model(test_x_feature, test_y_feature).squeeze(1)
  test_loss = torch.sqrt(((test_predictions - test_target)**2).mean()).item()

print(f'| Test Loss: {test_loss:.3f} |')

test_origin_tensor:
tensor([[  87, 2816,  234,  ...,    0,    0,    0],
        [ 392, 1532,  425,  ...,    0,    0,    0],
        [ 212,    2, 7535,  ...,    0,    0,    0],
        ...,
        [ 538,  234,  224,  ...,    0,    0,    0],
        [4808, 2153, 5571,  ...,    0,    0,    0],
        [  58,  429, 1988,  ...,    0,    0,    0]])
test_new_tensor:
tensor([[  87, 2816,  234,  ...,    0,    0,    0],
        [ 392, 1532,  773,  ...,    0,    0,    0],
        [ 212,    2, 7535,  ...,    0,    0,    0],
        ...,
        [ 538,  234,  224,  ...,    0,    0,    0],
        [4808, 2153, 5571,  ...,    0,    0,    0],
        [  58,  429, 1988,  ...,    0,    0,    0]])
test_label_tensor:
tensor([1.2000, 0.4000, 1.0000,  ..., 0.4000, 0.0000, 0.8000])
| Test Loss: 0.575 |


In [None]:
def write_predictions(predictions, test_data_frame, out_loc):
    test_data_frame['pred'] = predictions
    output = test_data_frame[['id','pred']]
    output.to_csv(out_loc, index=False)
        
    print('Output file created:\n\t- '+os.path.abspath(out_loc))


# write the predictions for the dev data into 'task-1-output.csv'
out_loc = 'gdrive/My Drive/subtask-1/task-1-output.csv'
write_predictions(test_predictions, test, out_loc)

Output file created:
	- /content/gdrive/My Drive/subtask-1/task-1-output.csv


In [None]:
def score(truth_loc, prediction_loc):
    truth = pd.read_csv(truth_loc, usecols=['id','meanGrade'])
    pred = pd.read_csv(prediction_loc, usecols=['id','pred'])
    
    assert(sorted(truth.id) == sorted(pred.id)),"ID mismatch between ground truth and prediction!"
    
    data = pd.merge(truth,pred)
    rmse = np.sqrt(np.mean((data['meanGrade'] - data['pred'])**2))
    
    print("RMSE = %.6f" % rmse)    

# print RMSE
truth_loc = 'gdrive/My Drive/subtask-1/test.csv'
prediction_loc = 'gdrive/My Drive/subtask-1/task-1-output.csv'
score(truth_loc, prediction_loc)

RMSE = 0.575020
