### Kaggle CommonLit Readability Challenge
Python version: 3.7 \
Date: 24-Nov-2021 \
This code uses PyTorch's transformer encoder to estimate the readability score of sentence(s) written in English

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from datetime import datetime
# from plotnine import *

from torch import Tensor
import math

In [None]:
# Wall-clock timestamp taken before any data is loaded; the last cells of the
# notebook subtract it from a later timestamp to report the total run time.
start_time = datetime.now()

In [None]:
# Run the companion notebook through IPython's %run magic.
# NOTE(review): Transformer_Encoder_Pos_Embed (used below) is neither defined nor
# imported in this notebook, so it presumably comes from this file — confirm.
%run Kaggle_CommonLit_Challenge_TE_Func.ipynb

### Multi-batch transformer-encoder based Predictive Model 

In [None]:
# Hyperparameters shared by the data-preparation and model cells

# Number of word vectors (embedding rows) that make up one essay
words_per_essay = 77

# Length of each word vector; alternative settings are 100, 200 and 300
vec_len_per_word = 50

# Transformer encoder: attention heads and stacked encoder layers
number_of_heads, number_of_layers = 1, 1

# Width of the encoder's feed-forward sub-layer
num_feed_fwd_dim = 2048

### Step 1: Read the excerpt embeddings

In [None]:
# Pick the embeddings file from the configured word-vector length so the file
# and vec_len_per_word cannot drift apart when the hyperparameter is changed.
embed_file_by_dim = {
    50: 'Embeddings_Valid_Excerpt.npy',
    100: 'Embeddings_Valid_Excerpt_Dim_100.npy',
    200: 'Embeddings_Valid_Excerpt_Dim_200.npy',
    300: 'Embeddings_Valid_Excerpt_Dim_300.npy',
}
essay_embed = np.load(embed_file_by_dim[vec_len_per_word])

# Fail early with a clear message instead of a shape error further down when
# the stored vectors do not have the expected length.
if essay_embed.ndim != 2 or essay_embed.shape[1] != vec_len_per_word:
    raise ValueError(
        f"Expected embeddings of shape (rows, {vec_len_per_word}), "
        f"got {essay_embed.shape}"
    )

essay_embed.shape

In [None]:
# Scores file: the 'target' column holds one value per essay
lit_score_frame = pd.read_csv('Scores_Valid_Excerpt.csv')

# Convert the column to a float32 tensor (float64 would not do here)
lit_score = torch.tensor(lit_score_frame['target'].to_numpy(), dtype=torch.float)

In [None]:
# Spread of the 'target' values: summary statistics of the score column
lit_score_frame['target'].describe()

In [None]:
num_essays = lit_score_frame.shape[0]

# essay_embed stores the essays back to back, words_per_essay rows each.
# Reshape the first num_essays * words_per_essay rows to
# (essay, word, vector) in one step, then swap the first two axes to obtain
# the (word, essay, vector) layout used by the rest of the notebook.
# This replaces a Python loop that copied one essay at a time.
torch_essay = (
    torch.as_tensor(essay_embed[:num_essays * words_per_essay, :], dtype=torch.float)
    .reshape(num_essays, words_per_essay, vec_len_per_word)
    .permute(1, 0, 2)
    .contiguous()  # same dense memory layout as a freshly allocated tensor
)

# number_of_words x number_of_essays x word_vec
torch_essay.size()

In [None]:
batch_size = 32
total_essays = lit_score_frame.shape[0]

# Enough batch ids for every essay at the nominal batch size
num_batches = math.ceil(total_essays / batch_size)
print(num_batches)

# Draw one batch id per essay from a seeded generator. Ids are sampled
# independently, so the actual batch sizes vary around batch_size.
rng = np.random.default_rng(100)
elements_per_batch = rng.choice(num_batches, size=total_essays, replace=True)
# np.unique(elements_per_batch, return_counts=True) lists the resulting batch sizes

In [None]:
# Distinct batch ids that were actually drawn. Because ids are sampled with
# replacement, this list may be shorter than num_batches.
unique_batches = list(np.unique(elements_per_batch))
len(unique_batches)

In [None]:
# Number of essays (one batch id was drawn per row of lit_score_frame)
len(elements_per_batch)

In [None]:
# 70:30 split between training and test set: number of essays the training
# set should hold, computed from the data instead of a typed-in row count
0.7 * len(elements_per_batch)

In [None]:
# Essay count the training set should reach (70% of all essays, rounded down),
# derived from the data instead of a hard-coded number
train_target = int(0.7 * len(elements_per_batch))

# Essays per batch, in ascending order of batch id
idx_count_per_batch = np.unique(elements_per_batch, return_counts=True)[1]

# Position of the first batch at which the running total reaches the target
np.min(np.where(idx_count_per_batch.cumsum() >= train_target)[0])

In [None]:
# Batch ids below first_test_batch go to the training set, the rest to the
# test set. The upper bound is num_batches rather than a repeated literal.
# NOTE(review): 48 was presumably read off the cumulative-count lookup in the
# preceding cell — confirm it still matches when the data or seed changes.
first_test_batch = 48
available_batches = set(unique_batches)

# Keep only ids that were actually drawn; sorted ascending to facilitate debugging
trng_set_batches = sorted(set(range(0, first_test_batch)) & available_batches)
test_set_batches = sorted(set(range(first_test_batch, num_batches)) & available_batches)

In [None]:
# Batch ids held out for evaluation
test_set_batches

In [None]:
# Seed PyTorch's random number generator before the model is built
torch.manual_seed(10)

lit_loss_function = nn.MSELoss()

# Alternative model classes: Transformer_Encoder and
# Transformer_Encoder_Self_Decoder take the same embed_size / n_heads /
# n_layers / output_size arguments (without dim_feedfwd).
lit_model = Transformer_Encoder_Pos_Embed(
    embed_size=vec_len_per_word,  # embed_size = size of input vector
    n_heads=number_of_heads,
    n_layers=number_of_layers,
    output_size=[30, 10, 1],
    dim_feedfwd=num_feed_fwd_dim,
)

# The optimizer is tied to the parameters of the model created just above
lit_optimizer = torch.optim.Adam(lit_model.parameters(), lr=0.001)

In [None]:
# Number of trainable model parameters for every combination of word-vector
# length and feed-forward width.
#
# This cell uses its own names on purpose. Rebinding vec_len_per_word,
# num_feed_fwd_dim or lit_model here would turn the hyperparameters into lists
# and replace the model that lit_optimizer was built for, so the training loop
# would run on a model whose parameters the optimizer never updates.
candidate_vec_lens = [50, 100, 200, 300]
candidate_feed_fwd_dims = [2048, 1024, 512, 256]

num_param_set = []

for vec_len in candidate_vec_lens:
    for dim_len in candidate_feed_fwd_dims:
        # Throw-away model, built only to count its parameters; heads and
        # layers follow the notebook's configured hyperparameters
        candidate_model = Transformer_Encoder_Pos_Embed(
            embed_size=vec_len,  # embed_size = size of input vector
            n_heads=number_of_heads,
            n_layers=number_of_layers,
            output_size=[30, 10, 1],
            dim_feedfwd=dim_len,
        )

        num_param_set.append(
            sum(p.numel() for p in candidate_model.parameters() if p.requires_grad)
        )

# Ordered by vec_len first, then dim_len
num_param_set

In [None]:
num_epochs = 20
# num_epochs = 1
test_loss_per_epoch = []

# Batch membership does not change between epochs, so look up the essay
# indices of every batch once instead of rescanning elements_per_batch for
# each batch in each epoch.
batch_members = {
    batch_id: np.where(elements_per_batch == batch_id)[0]
    for batch_id in list(trng_set_batches) + list(test_set_batches)
}

for epoch in range(num_epochs):
    print(epoch)

    # Training pass: one optimizer step per training batch.
    # NOTE: lit_optimizer must have been created from this lit_model's
    # parameters, otherwise step() does not change the model trained here.
    lit_model.train()
    for trng_batch_idx in trng_set_batches:
        current_batch_idx = batch_members[trng_batch_idx]
        current_fit = lit_model(torch_essay[:, current_batch_idx, :])
        # Targets get a trailing axis of size 1 before being compared with the fit
        current_loss = lit_loss_function(current_fit, torch.unsqueeze(lit_score[current_batch_idx], 1))
        lit_optimizer.zero_grad()     # clear gradients left from the previous batch
        current_loss.backward()       # back-propagate this batch's loss
        lit_optimizer.step()          # update the parameters

    # Evaluation pass: no gradients, model in eval mode
    lit_model.eval()
    with torch.no_grad():
        temp_loss_per_batch = 0
        for test_batch_idx in test_set_batches:
            current_batch_idx = batch_members[test_batch_idx]
            current_fit = lit_model(torch_essay[:, current_batch_idx, :])
            current_loss = lit_loss_function(current_fit, torch.unsqueeze(lit_score[current_batch_idx], 1))
            temp_loss_per_batch += current_loss.item()

    # Unweighted mean of the per-batch MSE values (batches differ in size)
    test_loss_per_epoch.append(temp_loss_per_batch/len(test_set_batches))

In [None]:
# Lowest test-set loss (mean of per-batch MSE) reached in any epoch
np.min(test_loss_per_epoch)

In [None]:
# The same minimum on the RMSE scale (square root of each epoch's mean batch MSE)
np.min(np.sqrt(test_loss_per_epoch))

In [None]:
# per_itr_test_loss_frame = pd.DataFrame({'Itr':range(20), 'MSE_Loss':test_loss_per_epoch})
# Code is split across multiple lines for readability and ease of modification
# loss_plot = ggplot(per_itr_test_loss_frame, aes(x='Itr', y='MSE_Loss'))
# loss_plot = loss_plot + geom_point() + geom_line() + scale_x_continuous(breaks=range(0, 20, 1))
# loss_plot = loss_plot + labs(title='MSE Loss Across Iterations (Test Set)', x='Iteration', y='MSE Loss')
# loss_plot = loss_plot + theme(plot_title=element_text(face='bold')
#                              , axis_title_x=element_text(face='plain', size=12)
#                              , axis_title_y=element_text(face='plain', size=12)
#                              , figure_size=(15, 5))

# loss_plot

In [None]:
# Elapsed wall-clock time since start_time was recorded at the top of the notebook
end_time_2 = datetime.now()
time_diff_2 = end_time_2 - start_time
time_diff_2

In [None]:
# Total elapsed run time, converted from seconds to minutes
time_diff_2.total_seconds()/60