# Load Libraries

In [1]:
import numpy as np 
import pandas as pd
import math

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt

from sklearn.model_selection  import train_test_split
from sklearn.model_selection import KFold


from transformers import AutoModel, AutoTokenizer, BertTokenizer, AutoConfig

import itertools
import gc
import os 
import random

import spacy
import time
import timeit

ModuleNotFoundError: No module named 'torch'

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), exports ``PYTHONHASHSEED``, and configures
    cuDNN for repeatable kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU rather than only the current device;
    # it is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning benchmarks several kernels and may pick a different one on
    # each run, so it is switched off alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=1326)

# HyperParameters

In [None]:

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# TPU alternative (`xm` is not imported anywhere in this notebook):
#   device = xm.xla_device()
#   torch.set_default_tensor_type('torch.FloatTensor')

# Training schedule.
batch_size, epochs = 16, 15  # samples per optimisation step, number of epochs
embedding_dim = 300  # not read by any cell visible in this notebook

# Pretrained checkpoints. MODEL_NAME2 enables an optional second encoder;
# leaving it as None keeps the single-encoder configuration.
MODEL_NAME = '../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base'
MODEL_NAME2 = None
# Other checkpoint paths noted for reference:
#   '../input/huggingface-roberta-variants/roberta-base/roberta-base'
#   '../input/huggingface-bert/bert-base-cased'

# When True, the dataset returns the extra feature columns and the model
# concatenates them to the encoder output before the final linear layer.
useFeatures = False

# Load DataFrame

In [None]:
# Kaggle competition paths, kept for reference:
#   datadir = '/kaggle/input/commonlitreadabilityprize'
#   traindir = datadir + '/train.csv'

# Read the CSV, then shuffle it: sample(frac=1) returns every row in random order.
df = pd.read_csv('OHABotData.csv')
df = df.sample(frac=1)

# OHADataset

In [None]:
class CommonLitDataset(Dataset):
    """Map-style dataset over a DataFrame laid out as (text, label, features...).

    Column 0 is returned as the text, column 1 as the label, and the columns
    from position 2 onward as the per-sample feature vector.

    Args:
        df: Source DataFrame. Rows are addressed by position (``.iloc``), so a
            shuffled or non-contiguous index is fine.
        use_features: Whether ``__getitem__`` returns the feature columns.
            ``None`` (the default) defers to the notebook-level
            ``useFeatures`` flag, which is read on every access.
    """

    def __init__(self, df, use_features=None):
        self.df = df
        self.use_features = use_features

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(text, label, readability)`` for the row at position ``idx``."""
        # Two-axis .iloc is purely positional. The earlier `iloc[idx][1]` form
        # indexed a Series with an integer key, which pandas deprecates when
        # the Series is labelled by column names.
        text = self.df.iloc[idx, 0]
        label = self.df.iloc[idx, 1]

        with_features = useFeatures if self.use_features is None else self.use_features
        if with_features:
            readability = np.array(list(self.df.iloc[idx, 2:]))
        else:
            # Placeholder so every sample still yields a three-tuple; the
            # (1, 7) shape is kept unchanged from the original implementation.
            readability = torch.zeros((1, 7))
        return text, label, readability
    

    
# Disabled single-split loaders (train and validation were both built from the same `df`):
# train_data = CommonLitDataset(df)
# train_dataloader = DataLoader(train_data, batch_size = batch_size, shuffle = True)

# val_data = CommonLitDataset(df)
# val_dataloader = DataLoader(val_data, batch_size = batch_size, shuffle = True)

# Dataset over the complete shuffled frame. The name suggests it was meant for
# K-fold splitting, but no cell visible in this notebook reads it - TODO confirm.
KFold_data = CommonLitDataset(df)

# Transformer Model

In [None]:
class CommonLitModel(nn.Module):
    """Pretrained-transformer regressor that emits one scalar per input text.

    One or two Hugging Face encoders are loaded with ``AutoModel``. Element 1
    of each encoder's output (presumably the pooled sentence vector for
    BERT/RoBERTa-style checkpoints - confirm for other architectures) is
    concatenated along the feature axis, passed through dropout and mapped to
    a single value by a linear layer.

    Args:
        path: Checkpoint path or name of the primary encoder.
        path2: Optional checkpoint of a second encoder; ``None`` disables it.
    """

    HIDDEN_SIZE = 768       # output width assumed for each encoder
    NUM_READABILITY = 7     # width of the optional extra-feature vector

    def __init__(self, path, path2=None):
        super(CommonLitModel, self).__init__()
        # Kept as an attribute for compatibility. It is not passed to AutoModel
        # below, so the encoders themselves do not return hidden states.
        self.config = AutoConfig.from_pretrained(path)
        self.config.update({'output_hidden_states': True})
        self.bert = AutoModel.from_pretrained(path, output_hidden_states=False)

        if path2:
            self.bert2 = AutoModel.from_pretrained(path2, output_hidden_states=False)
            pooled_dim = 2 * self.HIDDEN_SIZE
        else:
            print('768 Features used')
            pooled_dim = self.HIDDEN_SIZE

        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict keys are unchanged.
        # linear1 and lrelu are not applied in forward(); they stay so that
        # previously saved state_dicts still load.
        self.linear1 = nn.Linear(pooled_dim, pooled_dim)
        self.linear2 = nn.Linear(pooled_dim, 1)
        # Head for the "encoder output + extra features" path. It is sized from
        # the actual encoder width so it also fits the two-encoder case
        # (previously hard-coded to 768 + 7 = 775).
        self.linear = nn.Linear(pooled_dim + self.NUM_READABILITY, 1)
        self.dropout = nn.Dropout(0.50)
        self.lrelu = nn.LeakyReLU()

    def forward(self, xb, x2=None, readability=None):
        """Compute the prediction for a batch.

        Args:
            xb: Tokenizer output, unpacked as keyword arguments into the
                primary encoder.
            x2: Tokenizer output for the second encoder, or ``None``.
            readability: Extra features concatenated to the encoder output on
                axis 1; only read when the notebook-level ``useFeatures`` flag
                is true.

        Returns:
            Output of the final linear layer (one value per row).

        Raises:
            ValueError: If ``useFeatures`` is true but ``readability`` is None.
        """
        x = self.bert(**xb)[1]
        if x2 is not None:
            x1 = self.bert2(**x2)[1]
            # Join on the feature axis. The default dim=0 would stack the two
            # encoders' outputs as additional batch rows instead.
            x = torch.cat((x, x1), dim=1)

        if useFeatures:
            if readability is None:
                raise ValueError('readability features are required when useFeatures is True')
            x = torch.cat((x, readability), 1)
            x = self.dropout(x)
            x = self.linear(x)
        else:
            x = self.dropout(x)
            x = self.linear2(x)
        return x
    
    

# Build the regressor, then move it to the selected device.
model = CommonLitModel(MODEL_NAME, MODEL_NAME2)
model = model.to(device)

# Snapshot the untrained weights; a later cell reloads 'initialModel' before
# the final training run.
torch.save(model.state_dict(), 'initialModel')

# One tokenizer per encoder checkpoint; the second exists only when a second
# checkpoint is configured.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if MODEL_NAME2:
    tokenizer2 = AutoTokenizer.from_pretrained(MODEL_NAME2)

In [None]:
import time

# Only referenced by commented-out logging code inside train().
bptt = 35

# Mean-squared-error objective for the scalar regression target.
criterion = nn.MSELoss()

lr = 2e-5  # learning rate
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, weight_decay=1e-5)
# Multiplies the learning rate by 0.95 each time scheduler.step() is called.
# Verbose variant, kept for reference:
#   torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95, verbose = True)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)


def train(model):
    """Run one optimisation pass over ``train_loader``.

    Relies on notebook-level names: ``train_loader``, ``tokenizer`` (and
    ``tokenizer2`` when ``MODEL_NAME2`` is set), ``optimizer``, ``criterion``
    and ``device``.

    Args:
        model: Network to optimise; called as
            ``model(encoding, encoding2_or_None, readability)``.

    Returns:
        0-dim tensor holding the unweighted mean of the per-batch losses.
    """
    model.train()  # enable dropout; nothing inside the loop changes the mode
    batch_losses = []

    for data, targets, readability in train_loader:
        optimizer.zero_grad()

        texts = list(data)
        # padding=True pads to the longest text in the batch; truncation=True
        # caps inputs at the tokenizer's maximum length. This replaces the
        # deprecated `pad_to_max_length` argument.
        data1 = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        if MODEL_NAME2:
            data2 = tokenizer2(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        else:
            data2 = None

        # as_tensor avoids re-wrapping a value that the DataLoader has already
        # collated into a tensor.
        readability = torch.as_tensor(readability).float().to(device)
        # The features are forwarded in both configurations; the two-encoder
        # branch used to drop them.
        final_output = model(data1, data2, readability)

        targets = targets.float().to(device)
        # Remove only the trailing axis so a batch of one keeps its batch
        # dimension and still lines up with `targets`.
        final_output = final_output.float().squeeze(-1)
        loss = criterion(final_output, targets)

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    return torch.mean(torch.tensor(batch_losses))

def evaluate(eval_model):
    """Compute the mean validation loss of ``eval_model`` over ``val_loader``.

    Relies on notebook-level names: ``val_loader`` (not assigned by any active
    cell visible in this notebook), ``tokenizer`` (and ``tokenizer2`` when
    ``MODEL_NAME2`` is set), ``criterion`` and ``device``.

    Args:
        eval_model: Network to score; it is switched to evaluation mode.

    Returns:
        0-dim tensor holding the mean of the per-batch losses (batches are not
        weighted by their size).
    """
    eval_model.eval()  # disable dropout
    batch_losses = []

    with torch.no_grad():
        for data, targets, readability in val_loader:
            texts = list(data)
            # padding=True pads to the longest text in the batch;
            # truncation=True caps inputs at the tokenizer's maximum length.
            data1 = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
            if MODEL_NAME2:
                data2 = tokenizer2(texts, padding=True, truncation=True, return_tensors='pt').to(device)
            else:
                data2 = None

            readability = torch.as_tensor(readability).float().to(device)
            # Score the model that was passed in. The previous code called the
            # notebook-level `model` and ignored the `eval_model` argument.
            final_output = eval_model(data1, data2, readability)

            targets = targets.float().to(device)
            # Remove only the trailing axis so a batch of one keeps its batch
            # dimension and still lines up with `targets`.
            final_output = final_output.float().squeeze(-1)
            batch_losses.append(criterion(final_output, targets).item())

    return torch.mean(torch.tensor(batch_losses))

# Regular Split

In [None]:
# best_val_loss = float("inf")
# best_model = None

# train_data = CommonLitDataset(df)
# train_dataloader = DataLoader(train_data, batch_size = batch_size, shuffle = True)

# val_data = CommonLitDataset(df)
# val_dataloader = DataLoader(val_data, batch_size = batch_size, shuffle = True)


# train_losses = []
# val_losses = []
# for epoch in range(1, epochs + 1):
#     epoch_start_time = time.time()
#     train_loss = train(best_val_loss)
#     val_loss = evaluate(model)
#     print('-' * 89)
#     print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '.format(epoch, (time.time() - epoch_start_time),val_loss))
#     print('-' * 89)
    
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         best_model = model
#         torch.save(best_model, 'bestModel')

#     scheduler.step()
#     val_losses.append(val_loss)
#     train_losses.append(train_loss)
    
# #     print('Hi')
# #     print(train_losses)
# #     print('*' * 80)

#     if epoch % 5 == 0:
#         plt.plot(train_losses, label = "Train_Loss")
#         plt.plot(val_losses, label = "Val_Loss")
#         plt.show()

In [None]:
# Final fit on the complete dataset (no hold-out split).
train_data = CommonLitDataset(df)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Start again from the untrained snapshot written right after model creation.
model.load_state_dict(torch.load('initialModel', map_location=device))

# No active cell in this notebook assigns `best_epoch`, so a fresh
# "Restart & Run All" raised NameError here. Use it when an earlier
# model-selection step defined it, otherwise fall back to `epochs`.
num_epochs = globals().get('best_epoch', epochs)

for epoch in range(1, num_epochs + 1):
    print('-' * 89)
    print(f'Starting epoch {epoch}')
    epoch_start_time = time.time()
    train_loss = train(model)
    # Decay the learning rate once per epoch, as the disabled validation loop
    # above did; the scheduler was otherwise created but never stepped.
    scheduler.step()
    print(f'| end of epoch: {epoch} | time: {time.time() - epoch_start_time}s  | train loss: {train_loss} |')
    print('-' * 89)

# Saving for inference

In [None]:
# Serialise the entire `model` object to the file 'bestModel'. Unlike the
# 'initialModel' snapshot, this is the whole module rather than its state_dict.
# NOTE(review): loading it back presumably needs the CommonLitModel class to be
# defined in the loading session - confirm in the inference notebook.
torch.save(model, 'bestModel')