In [None]:
import string
from collections import Counter

import nltk
import pandas as pd
import torch
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.optim import lr_scheduler

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases read it
# from the 'punkt' package; newer releases look for 'punkt_tab' instead, so
# fetch both (an unavailable package only logs a message, it does not raise).
nltk.download('punkt')
nltk.download('punkt_tab')

## Loading dataset

In [None]:
# Locations of the input CSV files.
PATH_TO_TRAIN_DATA = '../input/hseds-texts-2020/train.csv'
PATH_TO_TEST_DATA = '../input/hseds-texts-2020/test.csv'

# `df` is the labelled data (its 'score' column is the training target);
# `test` is the frame the final submission is predicted for.
df = pd.read_csv(PATH_TO_TRAIN_DATA)
test = pd.read_csv(PATH_TO_TEST_DATA)


## Preprocessing data

In [None]:
def process(text):
    """Lower-case ``text``, tokenize it and drop punctuation tokens.

    Args:
        text: the raw review string.

    Returns:
        A list of token strings produced by ``word_tokenize``.
    """
    tokens = word_tokenize(text.lower())
    # `string.punctuation` is a str, so this is a substring test: it removes
    # single punctuation characters (and any token that is a contiguous run
    # of that string) while keeping ordinary words.
    return [token for token in tokens if token not in string.punctuation]

def merge(text):
    """Add an ``'all'`` column with the tokenized positive + negative text.

    The two review parts are joined with a single space and passed through
    ``process``. The frame is modified in place and also returned.
    """
    combined = text['positive'] + ' ' + text['negative']
    text['all'] = combined
    text['all'] = combined.apply(process)
    return text

# Tokenize both frames (merge() adds the 'all' column and returns the frame).
test = merge(test)
df = merge(df)

# Hold out a validation split. A fixed random_state keeps the split, and
# therefore the reported validation loss, the same across kernel restarts.
df_train, df_test = train_test_split(df, random_state=42)
y_train = df_train['score']
y_test = df_test['score']

# Renumber the rows 0..n-1 (the previous index is kept as an 'index' column)
# and drop the identifier column, which is not a model input.
df_train = df_train.reset_index().drop(['review_id'], axis=1)
df_test = df_test.reset_index().drop(['review_id'], axis=1)

## Creating a vocabulary with a given minimal word frequency

In [None]:
def get_vocab(texts, min_count=8):
    """Build the vocabulary of tokens that occur at least ``min_count`` times.

    Args:
        texts: iterable of token sequences (for example a Series of token lists).
        min_count: minimum number of occurrences, summed over all sequences,
            a token needs in order to be kept.

    Returns:
        A ``(words, size)`` tuple: the set of kept tokens plus the special
        ``'<UNK>'`` token, and the number of entries in that set.
    """
    counts = Counter()
    for sent in texts:
        counts.update(sent)

    words = {word for word, count in counts.items() if count >= min_count}
    # Added after filtering so the unknown-word token is always present, even
    # if the literal string '<UNK>' appears in the corpus as a rare token.
    words.add('<UNK>')
    return words, len(words)

# The vocabulary is built from every labelled review in `df`
# (training and validation rows alike).
WORDS, len_vocab = get_vocab(df['all'], min_count=8)
len_vocab

## Creating datasets and dataloaders with padding and truncation

In [None]:
def get_padded_data(texts, vocab=None, seq_length=200):
    """Encode token sequences as integer ids and right-pad them into one tensor.

    Args:
        texts: iterable of token sequences (each one sliceable, e.g. a list).
        vocab: collection of known tokens; it must contain ``'<UNK>'``.
            Defaults to the module-level ``WORDS``.
        seq_length: sequences longer than this are truncated to their first
            ``seq_length`` tokens.

    Returns:
        A ``torch.long`` tensor of shape ``(len(texts), L)`` where ``L`` is the
        length of the longest encoded sequence (at most ``seq_length``).
        Shorter rows are padded on the right with 0.
    """
    if vocab is None:
        vocab = WORDS

    # Sorting gives every token the same id on every run. Iterating a set of
    # strings directly follows hash order, which differs between interpreter
    # sessions and would make saved model weights meaningless after a restart.
    word2int = {word: idx for idx, word in enumerate(sorted(vocab))}
    unk = word2int['<UNK>']

    # An explicit dtype keeps empty sequences integer-typed; without it an
    # empty list becomes a float tensor that nn.Embedding rejects.
    encoded = [
        torch.as_tensor([word2int.get(w, unk) for w in seq[:seq_length]],
                        dtype=torch.long)
        for seq in texts
    ]

    # NOTE: pad_sequence pads with 0, which is also the id of the first
    # vocabulary entry; no separate padding id is reserved.
    return pad_sequence(encoded, batch_first=True)



# Encode and pad each split: training rows, held-out validation rows,
# the submission set, and the full labelled frame.
train_pos_pad = get_padded_data(df_train['all'])
test_pos_pad = get_padded_data(df_test['all'])
sub_pos_pad = get_padded_data(test['all'])
all_train_pos_pad = get_padded_data(df['all'])

In [None]:
# Quick shape check of the padded submission tensor.
sub_pos_pad.size()

In [None]:
class ReviewsDataset(torch.utils.data.Dataset):
    """Map-style dataset over encoded reviews, optionally paired with scores.

    Args:
        data: indexable collection of encoded reviews (e.g. a 2-D tensor with
            one row per review).
        target: scores aligned with ``data`` by position. Only read when
            ``include_score`` is true.
        include_score: when true, items are ``(review, score)`` pairs;
            otherwise just the review.
    """

    def __init__(self, data, target=0, include_score = True):
        self.review = data
        self.score = target
        self.include_score = include_score
        # A pandas Series indexed with [] looks rows up by label, so a frame
        # whose index is not 0..n-1 would return the wrong score or raise
        # KeyError. Keep a positional view for item access instead.
        self._score_values = target.to_numpy() if hasattr(target, 'to_numpy') else target

    def __len__(self):
        return len(self.review)

    def __getitem__(self, idx):
        text = self.review[idx]
        if not self.include_score:
            return text
        return text, self._score_values[idx]
    

In [None]:
BATCH_SIZE = 750

train_dataset = ReviewsDataset(train_pos_pad, df_train['score'], include_score = True)
test_dataset = ReviewsDataset(test_pos_pad, df_test['score'], include_score = True)
all_train_dataset = ReviewsDataset(all_train_pos_pad,df['score'], include_score = True)
sub_dataset = ReviewsDataset(sub_pos_pad, include_score = False)

# Training loaders reshuffle every epoch so batches are not always made of the
# same rows in the same order (all_train_dataset would otherwise follow the
# raw file order).
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
all_train_dataloader = torch.utils.data.DataLoader(all_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation loaders keep their order.
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)
# to predict: order must match the rows of `test` when the submission is built
sub_dataloader = torch.utils.data.DataLoader(sub_dataset, batch_size=BATCH_SIZE)

## Specifying our model

In [None]:
class GRU(nn.Module):
    """GRU regressor: embedding -> stacked GRU -> two linear layers -> one value.

    Args:
        embedding_dim: size of each word-embedding vector.
        vocab_size: number of rows in the embedding table. Defaults to
            ``len(WORDS)``, resolved when the model is constructed.
        n_hidden: hidden size of the GRU.
        n_layers: number of stacked GRU layers.
        lr: stored on the instance as ``self.lr``; not used by the model itself.
        drop_prob: dropout probability applied between GRU layers.
    """

    def __init__(self, embedding_dim, vocab_size=None, n_hidden=150, n_layers=2, lr=0.01, drop_prob=0.4):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(WORDS)
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # Sub-modules are registered in the same order as before so state-dict
        # keys and parameter initialisation order are unchanged.
        self.gru = nn.GRU(embedding_dim, n_hidden, n_layers,
                          batch_first=True, dropout=drop_prob)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, scale_grad_by_freq=True)
        self.fc = nn.Linear(n_hidden, 50)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(50, 1)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, 1)`` outputs."""
        # The GRU is batch_first, so embeddings are fed as (batch, seq, dim);
        # the previous permute() call discarded its result and had no effect.
        embeds = self.word_embeddings(x)
        _, hidden = self.gru(embeds)
        # hidden[-1] is the last layer's final hidden state: (batch, n_hidden).
        out = self.fc(hidden[-1])
        return self.fc2(self.relu(out))

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape ``(n_layers, batch_size, n_hidden)``.

        The tensor takes the dtype and device of the model's parameters, so no
        global GPU flag is needed.
        """
        weight = next(self.parameters())
        return weight.new_zeros(self.n_layers, batch_size, self.n_hidden)



## Building train/val loop

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before training.
torch.cuda.empty_cache()

In [None]:


def train(model,
          NUM_EPOCHS,
          optim,
          criterion,
          train_dataloader,
          val_dataloader,
          batch_size=BATCH_SIZE,
          clip=5,
          print_every=10,
         ):
    """Train ``model`` and periodically report the validation loss.

    Args:
        model: module called as ``model(x)`` on each batch of inputs.
        NUM_EPOCHS: number of passes over ``train_dataloader``.
        optim: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(prediction, target)`` on
            flattened double-precision tensors.
        train_dataloader: yields ``(x, y)`` training batches.
        val_dataloader: yields ``(x, y)`` validation batches.
        batch_size: unused; kept so existing calls remain valid.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        print_every: run validation and print a log line every this many
            training steps within an epoch.
    """
    # Decided locally rather than read from a notebook-level flag defined in a
    # later cell.
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        model.cuda()

    for n in range(NUM_EPOCHS):
        model.train()
        # Wipe the previous epoch's log lines from the cell output.
        clear_output()
        counter = 0
        for x, y in train_dataloader:
            counter += 1
            if use_gpu:
                x, y = x.cuda(), y.cuda()

            model.zero_grad()

            output = model(x)
            # Both sides are cast to double so the loss never sees mixed dtypes.
            loss = criterion(output.double().flatten(), y.double().flatten())
            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optim.step()

            # validation frequency is set by print_every param
            if counter % print_every == 0:
                val_losses = []
                model.eval()
                with torch.no_grad():
                    for inputs, targets in val_dataloader:
                        if use_gpu:
                            inputs, targets = inputs.cuda(), targets.cuda()

                        val_output = model(inputs.long())
                        val_loss = criterion(val_output.double().flatten(),
                                             targets.double().flatten())
                        val_losses.append(val_loss.item())

                model.train()

                # Report against the NUM_EPOCHS argument, not a notebook global.
                print("Epoch: {}/{}...".format(n+1, NUM_EPOCHS),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
        


## Train our model

In [None]:
from IPython.display import clear_output
import numpy as np

# Hardware selection: a boolean flag plus the matching torch.device.
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper-parameters of this run.
lr = 0.001
epochs = 6

# Model, loss (mean absolute error) and optimizer.
model = GRU(embedding_dim=100)
model.to(device)
criterion = torch.nn.L1Loss()
optim = torch.optim.Adam(model.parameters(), lr=lr)

# Train on the training split, validating on the held-out split.
train(model, epochs, optim, criterion, train_dataloader, test_dataloader)

## Save model

In [None]:
# Show the names of the entries stored in the model's state dict.
print("The state dict keys: \n\n", model.state_dict().keys())

In [None]:
# The checkpoint bundles three things:
#   'model'      - a freshly constructed GRU(100) serving as the architecture
#                  shell that load_checkpoint() fills with weights,
#   'state_dict' - the trained weights,
#   'optimizer'  - the optimizer state.
checkpoint = dict(
    model=GRU(100),
    state_dict=model.state_dict(),
    optimizer=optim.state_dict(),
)

torch.save(checkpoint, 'checkpoint.pth')

## Load model

In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_checkpoint(filepath, map_location=None):
    """Rebuild a frozen, eval-mode model from a checkpoint file.

    The checkpoint must be a dict with a ``'model'`` entry (a module instance)
    and a ``'state_dict'`` entry (the weights to load into it).

    Args:
        filepath: path of the checkpoint written by ``torch.save``.
        map_location: forwarded to ``torch.load``. Defaults to the module-level
            ``device`` so a checkpoint saved on a GPU also loads on a
            CPU-only machine.

    Returns:
        The model with weights loaded, gradients disabled on every parameter,
        and switched to evaluation mode.
    """
    if map_location is None:
        map_location = device

    # SECURITY: the checkpoint embeds a pickled module object, and unpickling
    # can execute arbitrary code. Only load checkpoint files you trust.
    try:
        # Recent PyTorch releases default to weights_only=True, which refuses
        # pickled module objects, so full unpickling is requested explicitly.
        checkpoint = torch.load(filepath, map_location=map_location, weights_only=False)
    except TypeError:
        # Older PyTorch releases do not accept the weights_only argument.
        checkpoint = torch.load(filepath, map_location=map_location)

    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = False

    model.eval()

    return model
#model = load_checkpoint('../input/review-classification-model/checkpoint_0.65.pth').to(device)


## Make predictions

In [None]:
def predict(dataloader, model):
    """Run ``model`` over every batch of ``dataloader`` and collect the outputs.

    Batches are moved to the device the model's parameters are on (CPU if the
    model has no parameters), so this works with or without a GPU. The model
    is put in evaluation mode for the duration of the call, which disables
    dropout, and its previous mode is restored afterwards.

    Args:
        dataloader: iterable yielding input batches (no targets).
        model: module called as ``model(x)``.

    Returns:
        A CPU tensor with all batch outputs concatenated along dimension 0.
    """
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    model.eval()
    pred = []
    try:
        with torch.no_grad():
            for x in dataloader:
                # move prediction to cpu to empty gpu memory
                pred.append(model(x.to(target_device)).cpu())
    finally:
        model.train(was_training)
    return torch.cat(pred)

In [None]:
# Predict scores for the submission set with the model currently in memory.
pred = predict(sub_dataloader, model)

In [None]:
# Flatten the (n, 1) prediction tensor into a plain list of Python floats.
sub = pred.flatten().tolist()
len(sub)

In [None]:
# One row per test review: its id and the predicted score.
submission = pd.DataFrame({'review_id': test['review_id'], 'score': sub})
submission


In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission8.csv', index=False)