In [29]:
import pandas 
import numpy

import nltk

import gensim.downloader
from gensim.models import Word2Vec
from gensim import models
from gensim import utils

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import accuracy_score

import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F

from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
#Setting up cuda environment to be used
device = torch.device("cuda:0" if torch.cuda.is_available() else "CPU")
torch.backends.cudnn.benchmark = True

#print(torch.cuda.is_available(), torch.__version__)

In [31]:
#importing data as dataframe
#data = pandas.read_csv('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz', sep='\t', on_bad_lines='skip')
data = pandas.read_csv('data.tsv', sep='\t', on_bad_lines='skip')

  data = pandas.read_csv('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz', sep='\t', on_bad_lines='skip')


In [32]:
#Remove null value rows and reset index
data = data.dropna()
data = data.reset_index(drop=True)

#Keep only review_body column and corresponding star_rating column
data = data[['review_body', 'star_rating']]

#Removing all non-integer star_rating
data['star_rating'] = data['star_rating'].astype(int)

In [33]:
#Sample 100000 having 20000 from each star_rating class
data_1 = data[data['star_rating'] == 1].sample(n = 20000, random_state = 1)
data_2 = data[data['star_rating'] == 2].sample(n = 20000, random_state = 1)
data_3 = data[data['star_rating'] == 3].sample(n = 20000, random_state = 1)
data_4 = data[data['star_rating'] == 4].sample(n = 20000, random_state = 1)
data_5 = data[data['star_rating'] == 5].sample(n = 20000, random_state = 1)
dataset = pandas.concat([data_1, data_2, data_3, data_4, data_5])

#print(len(dataset))

In [34]:
#Loading pre-trained model
model_pre_trained = gensim.downloader.load('word2vec-google-news-300')

In [35]:
#Check the similarity between two similar words
print('Similarity between excellent and outstanding = ', model_pre_trained.similarity('excellent', 'outstanding'))

#Find the corresponding word given that king - man + woman = queen 
print(model_pre_trained.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))

Similarity between excellent and outstanding =  0.55674857
[('queen', 0.7118193507194519)]


In [36]:
#Training my own Word2Vec Model
class MyCorpus:
    def __iter__(self):
        for line in dataset['review_body']:
            yield utils.simple_preprocess(line)

In [37]:
#Loading my own Word2Vec Model
model_own_trained = models.Word2Vec(sentences = MyCorpus(), vector_size = 300, window = 11, min_count = 10)

In [38]:
#Check the similarity between two similar words
print('Similarity between excellent and outstanding = ', model_own_trained.wv.similarity('excellent', 'outstanding'))

#Find the corresponding word given that king - man + woman = queen 
print(model_own_trained.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))

Similarity between excellent and outstanding =  0.78527915
[('avenue', 0.5512246489524841)]


The pretrained model was trained by a much larger corpus (in terms of both quantities and varieties) than my own training dataset. Therefore, the pretrained model encodes better similar words between words than my own model. But since it has such a vast corpus, my model encodes better similarity between words.

In [39]:
#Function to find the mean of Word2Vec vectors for each review as the input feature

def word2vec_mean(sentences, model):
    sentence_split = sentences.split(' ')
    sum = numpy.zeros(shape = (300,))
    count = 0

    for word in sentence_split:
        if word in model:
            word_vector = model[word]
            sum += word_vector
            count += 1
    mean_vector = sum/count

    return mean_vector

In [40]:
#Function to concatenate the first 10 Word2Vec vectors for each review as the input feature

def word2vec_concatenation(sentence, model):

  ls = []
  if type(sentence) == list:
    ls = sentence
  if type(sentence) == str:
    ls = sentence.split(' ')

  i = 0 
  j = 0 

  while (i < len(ls)) & (j < 10):
    if ls[i] in model:
      wv = model[ls[i]]
      if j == 0:
        a = wv
      else:
        a = numpy.concatenate((a, wv))
      i += 1
      j += 1
    else:
      i += 1
      
  if j < 10:
    n = 10 - j
    zeros = numpy.zeros(shape=(300*n, ))
    if n == 10:
      a = zeros
    else:
      a = numpy.concatenate((a, zeros))
  return a

In [41]:
#Function to truncate longer reviews to the length of 20 and padding smaller reviews

def word2vec_sequence(sentence, model):

    if type(sentence) == str:
        ls = sentence.split(' ')
    if type(sentence) == list:
        ls = sentence

    word_vector = []
    for i in range(20):
      try:
        wv = model[ls[i]]
        word_vector.append(wv)
      except:
        pass

    if len(word_vector) < 20:
      for _ in range(20-len(word_vector)):
        word_vector.append([0 for _ in range(300)])
        
    return word_vector

In [42]:
#Function to get the indexes of NaN in the dataset

def idx_nan(matrix):
  if numpy.any(numpy.isnan(matrix)):
    arr_nan = numpy.argwhere(numpy.isnan(matrix))
    num_nan = arr_nan.shape[0]
    arr = numpy.arange(0, num_nan, 300)
    idx = []
    for i in arr:
      idx.append(arr_nan[i][0])
    return idx
  else:
    return None

In [43]:
#Splitting data into train and test dataset

labels = dataset['star_rating'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(dataset['review_body'], labels, test_size=0.2, random_state=1)

In [44]:
#Getting average of Word2Vec vectors as input for training and testing data and removing the NaN values

X_train_average = X_train.apply(lambda x: word2vec_mean(x, model_pre_trained))
X_train_pre_trained = numpy.array(X_train_average.values.tolist())

X_test_average = X_test.apply(lambda x: word2vec_mean(x, model_pre_trained))
X_test_pre_trained = numpy.array(X_test_average.values.tolist())

idx_nan_train = idx_nan(X_train_pre_trained)
if idx_nan_train == None:
  y_train_pre_trained = y_train
else:
  X_train_pre_trained = numpy.delete(X_train_pre_trained, idx_nan_train, 0)
  y_train_pre_trained = numpy.delete(y_train, idx_nan_train)

idx_nan_test = idx_nan(X_test_pre_trained)
if idx_nan_test == None:
  y_test_pre_trained = y_test
else:
  X_test_pre_trained = numpy.delete(X_test_pre_trained, idx_nan_test, 0)
  y_test_pre_trained = numpy.delete(y_test, idx_nan_test)

In [45]:
#Perceptron Analysis

perceptron_pre_trained = Perceptron()
perceptron_pre_trained.fit(X_train_pre_trained, y_train_pre_trained)

prediction_perceptron_pre_trained = perceptron_pre_trained.predict(X_test_pre_trained)

#print(metrics.classification_report(y_test_pre_trained, prediction_perceptron_pre_trained))
print('Perceptron Accuracy = ', accuracy_score(y_test_pre_trained, prediction_perceptron_pre_trained))
print('Perceptron Accuracy using TFIDF Feature Extraction = 0.40675')


Perceptron Accuracy =  0.3560122606904176
Perceptron Accuracy using TFIDF Feature Extraction = 0.40675


In [46]:
#SVM Analysis

svm_pre_trained = LinearSVC()
svm_pre_trained.fit(X_train_pre_trained, y_train_pre_trained)

prediction_svm_pre_trained = svm_pre_trained.predict(X_test_pre_trained)

#print(metrics.classification_report(y_test_pre_trained, prediction_svm_pre_trained))
print('SVM Accuracy = ', accuracy_score(y_test_pre_trained, prediction_svm_pre_trained))
print('SVM Accuracy using TFIDF Feature Extraction = 0.4897')

SVM Accuracy =  0.4649515099743731
SVM Accuracy using TFIDF Feature Extraction = 0.4897


TF-IDF model's performance is better than the Word2vec model because the number of data in each rating class is less.

In [47]:
#Getting average of Word2Vec vectors as input for training and testing data and removing NaN Values

X_train_nn = X_train.apply(lambda x: word2vec_mean(x, model_pre_trained))
X_train_fnn = numpy.array(X_train_nn.values.tolist())

X_test_nn = X_test.apply(lambda x: word2vec_mean(x, model_pre_trained))
X_test_fnn = numpy.array(X_test_nn.values.tolist())

idx_nan_train = idx_nan(X_train_fnn)
if idx_nan_train != None:
  X_train_fnn_pre_trained = numpy.delete(X_train_fnn, idx_nan_train, 0)
  y_train_fnn_pre_trained = numpy.delete(y_train, idx_nan_train)

idx_nan_test = idx_nan(X_test_fnn)
if idx_nan_test != None:
  X_test_fnn_pre_trained = numpy.delete(X_test_fnn, idx_nan_test, 0)
  y_test_fnn_pre_trained = numpy.delete(y_test, idx_nan_test)

In [48]:
#Defining the dataset classes

class Train(Dataset):
  def __init__(self, Xtrain, ytrain):
    'Initialization'
    self.data = Xtrain
    self.labels = ytrain

  def __len__(self):
    'Denotes the total number of samples'
    return len(self.data)

  def __getitem__(self, index):
    'Generates one sample of data'
    X = self.data[index]
    y = self.labels[index]

    return X, y

class Test(Dataset):
  def __init__(self, Xtest, ytest):
    'Initialization'
    self.data = Xtest
    self.labels = ytest

  def __len__(self):
    'Denotes the total number of samples'
    return len(self.data)

  def __getitem__(self, index):
    'Generates one sample of data'
    X = self.data[index]
    y = self.labels[index]

    return X, y

In [49]:
#Creating the training and testing dataset for the FNN Model

train_fnn_pre_trained = Train(X_train_fnn_pre_trained, y_train_fnn_pre_trained - 1)
test_fnn_pre_trained = Test(X_test_fnn_pre_trained, y_test_fnn_pre_trained - 1)

In [50]:
#Batching and Loading data for the FNN Model

num_workers = 0
batch_size = 100
valid_size = 0.2

num_train = len(train_fnn_pre_trained)
indices = list(range(num_train))
numpy.random.shuffle(indices)
split = int(numpy.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader_fnn = torch.utils.data.DataLoader(train_fnn_pre_trained, batch_size=batch_size,
                                           sampler=train_sampler, num_workers
                                           =num_workers)
valid_loader_fnn = torch.utils.data.DataLoader(train_fnn_pre_trained, batch_size=batch_size,
                                           sampler=valid_sampler, num_workers
                                           =num_workers)
test_loader_fnn = torch.utils.data.DataLoader(test_fnn_pre_trained, batch_size=batch_size,
                                           num_workers=num_workers)

In [51]:
#Defining the MLP Architecture

class ThreeLayerMLP(torch.nn.Module):
  def __init__(self, D_in, H1, H2, D_out):
    super().__init__()
    self.linear1 = torch.nn.Linear(D_in, H1)
    self.linear2 = torch.nn.Linear(H1, H2)
    self.linear3 = torch.nn.Linear(H2, D_out)
    self.dropout = nn.Dropout(0.2)

  def forward(self, x):
    h1_relu = torch.nn.functional.relu(self.linear1(x))
    h1_drop = self.dropout(h1_relu)
    h2_relu = torch.nn.functional.relu(self.linear2(h1_drop))
    h2_drop = self.dropout(h2_relu)
    h2_output = self.linear3(h2_drop)

    return h2_output

In [52]:
#Initializing the FNN Model

model_fnn = ThreeLayerMLP(300, 50, 10, 5)
model_fnn.cuda()
print(model_fnn)

ThreeLayerMLP(
  (linear1): Linear(in_features=300, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=10, bias=True)
  (linear3): Linear(in_features=10, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [53]:
#Loading the parameters for the FNN Model

criterion = torch.nn.CrossEntropyLoss()
optimizer_fnn = torch.optim.SGD(model_fnn.parameters(), lr=0.0065)

In [54]:
#Training the FNN Model

n_epochs = 200

valid_loss_min = numpy.Inf 

for epoch in range(n_epochs):

  train_loss = 0.0
  valid_loss = 0.0

  model_fnn.train() 
  for data, target in train_loader_fnn:
    target = target.type(torch.LongTensor) 
    data, target = data.to(device), target.to(device)
    optimizer_fnn.zero_grad()
    output = model_fnn(data.float())
    loss = criterion(output, target)
    loss.backward()
    optimizer_fnn.step()
    train_loss += loss.item()*data.size(0)

  model_fnn.eval() 
  for data, target in valid_loader_fnn:
    target = target.type(torch.LongTensor) 
    data, target = data.to(device), target.to(device)
    output = model_fnn(data.float())
    loss = criterion(output, target)
    valid_loss += loss.item()*data.size(0)

  train_loss = train_loss/len(train_loader_fnn.dataset)
  valid_loss = valid_loss/len(valid_loader_fnn.dataset)

  print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, train_loss, valid_loss))
  if valid_loss <= valid_loss_min:
    print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'
        .format(valid_loss_min, valid_loss))
    torch.save(model_fnn.state_dict(), 'model_fnn.pt')
    valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 1.292683 	Validation Loss: 0.322149
Validation loss decreased (inf --> 0.322149).  Saving model ...
Epoch: 2 	Training Loss: 1.287957 	Validation Loss: 0.321724
Validation loss decreased (0.322149 --> 0.321724).  Saving model ...
Epoch: 3 	Training Loss: 1.286769 	Validation Loss: 0.321563
Validation loss decreased (0.321724 --> 0.321563).  Saving model ...
Epoch: 4 	Training Loss: 1.286202 	Validation Loss: 0.321425
Validation loss decreased (0.321563 --> 0.321425).  Saving model ...
Epoch: 5 	Training Loss: 1.285575 	Validation Loss: 0.321263
Validation loss decreased (0.321425 --> 0.321263).  Saving model ...
Epoch: 6 	Training Loss: 1.284920 	Validation Loss: 0.321063
Validation loss decreased (0.321263 --> 0.321063).  Saving model ...
Epoch: 7 	Training Loss: 1.284021 	Validation Loss: 0.320808
Validation loss decreased (0.321063 --> 0.320808).  Saving model ...
Epoch: 8 	Training Loss: 1.282907 	Validation Loss: 0.320470
Validation loss decreased (0.32080

In [55]:
#Load the model with the lowest validation loss
model_fnn.load_state_dict(torch.load('model_fnn.pt'))

<All keys matched successfully>

In [56]:
#Evaluating the FNN Model

correct = 0
total = 0

with torch.no_grad():
  for data in test_loader_fnn:
    embeddings, labels = data
    model_fnn.to("cpu")
    outputs = model_fnn(embeddings.float())
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()

print('Accuracy: %d %%' % (100 * correct / total))

Accuracy: 46 %


In [98]:
#Using the function word2vec_concatenation to get values for the datasets and removing NaN values

X_train_fnn_10_val = X_train.apply(lambda x: word2vec_concatenation(x, model_pre_trained))
X_train_fnn_10 = numpy.array(X_train_fnn_10_val.values.tolist())

X_test_fnn_10_val = X_test.apply(lambda x: word2vec_concatenation(x, model_pre_trained))
X_test_fnn_10 = numpy.array(X_test_fnn_10_val.values.tolist())

idx_nan_train = idx_nan(X_train_fnn_10)
if idx_nan_train != None:
    X_train_fnn_10_pre_trained = numpy.delete(X_train_fnn_10, idx_nan_train, 0)
    y_train_fnn_10_pre_trained = numpy.delete(y_train, idx_nan_train)
else:
    X_train_fnn_10_pre_trained = X_train_fnn_10
    y_train_fnn_10_pre_trained = y_train

idx_nan_test = idx_nan(X_test_fnn_10)
if idx_nan_test != None:
    X_test_fnn_10_pre_trained = numpy.delete(X_test_fnn_10, idx_nan_test, 0)
    y_test_fnn_10_pre_trained = numpy.delete(y_test, idx_nan_test)
else:
    X_test_fnn_10_pre_trained = X_test_fnn_10
    y_test_fnn_10_pre_trained = y_test

In [99]:
#Defining the dataset classes

class Train(Dataset):
  def __init__(self, xtrain, ytrain):
    'Initialization'
    self.data = xtrain
    self.labels = ytrain

  def __len__(self):
    'Denotes the total number of samples'
    return len(self.data)

  def __getitem__(self, index):
    'Generates one sample of data'
    X = self.data[index]
    y = self.labels[index]

    return X, y
class Test(Dataset):
  def __init__(self, xtest, ytest):
    'Initialization'
    self.data = xtest
    self.labels = ytest

  def __len__(self):
    'Denotes the total number of samples'
    return len(self.data)

  def __getitem__(self, index):
    'Generates one sample of data'
    X = self.data[index]
    y = self.labels[index]

    return X, y

In [100]:
#Creating the training and testing dataset for FNN Model

train_fnn_10 = Train(X_train_fnn_10_pre_trained, y_train_fnn_10_pre_trained-1)
test_fnn_10 = Test(X_test_fnn_10_pre_trained, y_test_fnn_10_pre_trained-1)

In [101]:
#Batching and Loadind data for the FNN Model

num_workers = 0
batch_size = 100
valid_size = 0.2

num_train = len(train_fnn_10)
indices = list(range(num_train))
numpy.random.shuffle(indices)
split = int(numpy.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader_fnn_10 = torch.utils.data.DataLoader(train_fnn_10, batch_size=batch_size,
                                           sampler=train_sampler, num_workers
                                           =num_workers)
valid_loader_fnn_10 = torch.utils.data.DataLoader(train_fnn_10, batch_size=batch_size,
                                           sampler=valid_sampler, num_workers
                                           =num_workers)
test_loader_fnn_10 = torch.utils.data.DataLoader(test_fnn_10, batch_size=batch_size,
                                           num_workers=num_workers)

In [124]:
#Defining the MLP Architecture and loading the model

class ThreeLayerMLP(nn.Module):
  def __init__(self, D_in, H1, H2, D_out):
    super().__init__()
    self.linear1 = nn.Linear(D_in, H1)
    self.linear2 = nn.Linear(H1, H2)
    self.linear3 = nn.Linear(H2, D_out)
    self.dropout = nn.Dropout(0.2)

  def forward(self, x):
    h1_relu = F.relu(self.linear1(x))
    h1_drop = self.dropout(h1_relu)
    h2_relu = F.relu(self.linear2(h1_drop))
    h2_drop = self.dropout(h2_relu)
    h2_output = self.linear3(h2_drop)

    return h2_output
    
model_fnn_10 = ThreeLayerMLP(3000, 50, 10, 5)
model_fnn_10.to(device)
print(model_fnn_10)

ThreeLayerMLP(
  (linear1): Linear(in_features=3000, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=10, bias=True)
  (linear3): Linear(in_features=10, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [125]:
#Specifying the parameters

criterion = nn.CrossEntropyLoss()
optimizer_fnn_10 = torch.optim.SGD(model_fnn_10.parameters(), lr=0.007)

In [126]:
#Training the FNN Model

n_epochs = 100

valid_loss_min = numpy.Inf 

for epoch in range(n_epochs):
  train_loss = 0.0
  valid_loss = 0.0

  model_fnn_10.train() 
  for data, target in train_loader_fnn_10:
    target = target.type(torch.LongTensor)
    data, target = data.to(device), target.to(device)
    optimizer_fnn_10.zero_grad()
    output = model_fnn_10(data.float())
    loss = criterion(output, target)
    loss.backward()
    optimizer_fnn_10.step()
    train_loss += loss.item()*data.size(0)

  for data, target in valid_loader_fnn_10:
    target = target.type(torch.LongTensor)
    data, target = data.to(device), target.to(device)
    output = model_fnn_10(data.float())
    loss = criterion(output, target)
    valid_loss += loss.item()*data.size(0)

  train_loss = train_loss/len(train_loader_fnn_10.dataset)
  valid_loss = valid_loss/len(valid_loader_fnn_10.dataset)

  print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, train_loss, valid_loss))

  if valid_loss <= valid_loss_min:
    print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'
        .format(valid_loss_min, valid_loss))
    torch.save(model_fnn_10.state_dict(), 'model_fnn_10.pt')
    valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 1.295137 	Validation Loss: 0.322230
Validation loss decreased (inf --> 0.322230).  Saving model ...
Epoch: 2 	Training Loss: 1.286968 	Validation Loss: 0.321317
Validation loss decreased (0.322230 --> 0.321317).  Saving model ...
Epoch: 3 	Training Loss: 1.283529 	Validation Loss: 0.320560
Validation loss decreased (0.321317 --> 0.320560).  Saving model ...
Epoch: 4 	Training Loss: 1.279664 	Validation Loss: 0.319484
Validation loss decreased (0.320560 --> 0.319484).  Saving model ...
Epoch: 5 	Training Loss: 1.274466 	Validation Loss: 0.317722
Validation loss decreased (0.319484 --> 0.317722).  Saving model ...
Epoch: 6 	Training Loss: 1.265770 	Validation Loss: 0.315138
Validation loss decreased (0.317722 --> 0.315138).  Saving model ...
Epoch: 7 	Training Loss: 1.253480 	Validation Loss: 0.311501
Validation loss decreased (0.315138 --> 0.311501).  Saving model ...
Epoch: 8 	Training Loss: 1.235391 	Validation Loss: 0.306813
Validation loss decreased (0.31150

In [127]:
#Load the model with the lowest validation loss
model_fnn_10.load_state_dict(torch.load('model_fnn_10.pt'))

<All keys matched successfully>

In [128]:
#Evaluating the FNN Model

correct = 0
total = 0

with torch.no_grad():
  for data in test_loader_fnn_10:
    embeddings, labels = data
    model_fnn_10.to("cpu")
    outputs = model_fnn_10(embeddings.float())
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
print('Accuracy: %d %%' % (100 * correct / total))

Accuracy: 39 %


The accuracies of FNN is better as compared to the simple models. This is so because of the learning algorithms for FNN. 

In [66]:
#Calculating values for training and testing dataset using word2vec_sequence and removing NaN values

X_train_rnn_val = X_train.apply(lambda x: word2vec_sequence(x, model_pre_trained))
X_train_rnn = numpy.array(X_train_rnn_val.values.tolist())

X_test_rnn_val = X_test.apply(lambda x: word2vec_sequence(x, model_pre_trained))
X_test_rnn = numpy.array(X_test_rnn_val.values.tolist())

idx_nan_train = idx_nan(X_train_rnn)
if idx_nan_train != None:
  X_train_rnn_pre_trained = numpy.delete(X_train_rnn, idx_nan_train, 0)
  y_train_rnn_pre_trained = numpy.delete(y_train, idx_nan_train)
else:
  X_train_rnn_pre_trained = X_train_rnn
  y_train_rnn_pre_trained = y_train

idx_nan_test = idx_nan(X_test_rnn)
if idx_nan_test != None:
  X_test_rnn_pre_trained = numpy.delete(X_test_rnn, idx_nan_test, 0)
  y_test_rnn_pre_trained = numpy.delete(y_test, idx_nan_test)
else:
  X_test_rnn_pre_trained = X_test_rnn
  y_test_rnn_pre_trained = y_test

In [67]:
#Creating the training and testing dataset

train_rnn_pre_trained = Train(X_train_rnn_pre_trained, y_train_rnn_pre_trained - 1)
test_rnn_pre_trained = Test(X_test_rnn_pre_trained, y_test_rnn_pre_trained - 1)

In [68]:
#Batching and Loading data for the RNN Model

num_workers = 0
batch_size = 100
valid_size = 0.2

num_train = len(train_rnn_pre_trained)
indices = list(range(num_train))
numpy.random.shuffle(indices)
split = int(numpy.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader_rnn = torch.utils.data.DataLoader(train_rnn_pre_trained, batch_size=batch_size,
                                           sampler=train_sampler, num_workers
                                           =num_workers)
valid_loader_rnn = torch.utils.data.DataLoader(train_rnn_pre_trained, batch_size=batch_size,
                                           sampler=valid_sampler, num_workers
                                           =num_workers)
test_loader_rnn = torch.utils.data.DataLoader(test_rnn_pre_trained, batch_size=batch_size,
                                           num_workers=num_workers)

In [69]:
#Defining the RNN Architecture

class RNNModel(nn.Module):
  def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
    super().__init__()
    self.hidden_dim = hidden_dim
    self.layer_dim = layer_dim
    self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True,
                     nonlinearity='relu')
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    
    h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(device)
    out, hn = self.rnn(x, h0)
    out = self.fc(out[:, -1, :])
    return out

In [70]:
#Initializing the RNN Model

model_rnn = RNNModel(300, 20, 1, 5)
model_rnn.cuda()
print(model_rnn)

RNNModel(
  (rnn): RNN(300, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=5, bias=True)
)


In [71]:
#Loading the Parameters for the RNN Model

optimizer_rnn = torch.optim.SGD(model_rnn.parameters(), lr=0.0075)

In [72]:
#Training the RNN Model

n_epochs = 200

valid_loss_min = numpy.Inf 

for epoch in range(n_epochs):

  train_loss = 0.0
  valid_loss = 0.0

  model_rnn.train() 

  for data, target in train_loader_rnn:
    target = target.type(torch.LongTensor)
    data, target = data.to(device), target.to(device)
    optimizer_rnn.zero_grad()
    output = model_rnn(data.float())
    loss = criterion(output, target)
    loss.backward()
    optimizer_rnn.step()
    train_loss += loss.item()*data.size(0)

  model_rnn.eval() 

  for data, target in valid_loader_rnn:
    target = target.type(torch.LongTensor)
    data, target = data.to(device), target.to(device)
    output = model_rnn(data.float())
    loss = criterion(output, target)
    valid_loss += loss.item()*data.size(0)
    

  train_loss = train_loss/len(train_loader_rnn.dataset)
  valid_loss = valid_loss/len(valid_loader_rnn.dataset)

  print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, train_loss, valid_loss))

  if valid_loss <= valid_loss_min:
    print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'
        .format(valid_loss_min, valid_loss))
    torch.save(model_rnn.state_dict(), 'model_rnn.pt')
    valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 1.288773 	Validation Loss: 0.321832
Validation loss decreased (inf --> 0.321832).  Saving model ...
Epoch: 2 	Training Loss: 1.287371 	Validation Loss: 0.321800
Validation loss decreased (0.321832 --> 0.321800).  Saving model ...
Epoch: 3 	Training Loss: 1.287258 	Validation Loss: 0.321784
Validation loss decreased (0.321800 --> 0.321784).  Saving model ...
Epoch: 4 	Training Loss: 1.287198 	Validation Loss: 0.321772
Validation loss decreased (0.321784 --> 0.321772).  Saving model ...
Epoch: 5 	Training Loss: 1.287117 	Validation Loss: 0.321756
Validation loss decreased (0.321772 --> 0.321756).  Saving model ...
Epoch: 6 	Training Loss: 1.287017 	Validation Loss: 0.321729
Validation loss decreased (0.321756 --> 0.321729).  Saving model ...
Epoch: 7 	Training Loss: 1.286842 	Validation Loss: 0.321672
Validation loss decreased (0.321729 --> 0.321672).  Saving model ...
Epoch: 8 	Training Loss: 1.286565 	Validation Loss: 0.321572
Validation loss decreased (0.32167

In [73]:
#Load the model with the lowest validation loss
model_rnn.load_state_dict(torch.load('model_rnn.pt'))

<All keys matched successfully>

In [74]:
#Evaluating the RNN Model

correct = 0
total = 0

with torch.no_grad():
  for data in test_loader_rnn:
    embeddings, labels = data
    embeddings, labels = embeddings.to(device), labels.to(device)
    model_rnn.to(device)
    outputs = model_rnn(embeddings.float())
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
    
print('Accuracy of RNN Model: %d %%' % (100 * correct / total))

Accuracy of RNN Model: 44 %


RNN works better when the dataset provided is large, but since we are working on smaller batches and data, the accuracy of FNN is much better as compared to RNN.

In [88]:
#Using the function word2vec_sequence to get values for training and testing data, and removing any NaN values

X_train_gru_val = X_train.apply(lambda x: word2vec_sequence(x, model_pre_trained))
X_train_gru = numpy.array(X_train_gru_val.values.tolist())

X_test_gru_val = X_test.apply(lambda x: word2vec_sequence(x, model_pre_trained))
X_test_gru = numpy.array(X_test_gru_val.values.tolist())

idx_nan_train = idx_nan(X_train_gru)
if idx_nan_train != None:
  X_train_gru_pre_trained = numpy.delete(X_train_gru, idx_nan_train, 0)
  y_train_gru_pre_trained = numpy.delete(y_train, idx_nan_train)
else:
  X_train_gru_pre_trained = X_train_gru
  y_train_gru_pre_trained = y_train

idx_nan_test = idx_nan(X_test_gru)
if idx_nan_test != None:
  X_test_gru_pre_trained = numpy.delete(X_test_gru, idx_nan_test, 0)
  y_test_gru_pre_trained = numpy.delete(y_test, idx_nan_test)
else:
  X_test_gru_pre_trained = X_test_gru
  y_test_gru_pre_trained = y_test

In [89]:
#Defining the data classes

class Train(Dataset):
  def __init__(self, xtrain, ytrain):
    'Initialization'
    self.data = xtrain
    self.labels = ytrain

  def __len__(self):
    'Denotes the total number of samples'
    return len(self.data)

  def __getitem__(self, index):
    'Generates one sample of data'
    X = self.data[index]
    y = self.labels[index]

    return X, y
class Test(Dataset):
  def __init__(self, xtest, ytest):
    'Initialization'
    self.data = xtest
    self.labels = ytest

  def __len__(self):
    'Denotes the total number of samples'
    return len(self.data)

  def __getitem__(self, index):
    'Generates one sample of data'
    X = self.data[index]
    y = self.labels[index]

    return X, y

In [90]:
#Creating the training and testing dataset

train_gru_pre_trained = Train(X_train_gru_pre_trained, y_train_gru_pre_trained - 1) 
test_gru_pre_trained = Test(X_test_gru_pre_trained, y_test_gru_pre_trained - 1)

In [91]:
#Batching and Loading the Data for the GRU Model

num_workers = 0
batch_size = 100
valid_size = 0.2

num_train = len(train_gru_pre_trained)
indices = list(range(num_train))
numpy.random.shuffle(indices)
split = int(numpy.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader_gru = torch.utils.data.DataLoader(train_gru_pre_trained, batch_size=batch_size,
                                           sampler=train_sampler, num_workers
                                           =num_workers)
valid_loader_gru = torch.utils.data.DataLoader(train_gru_pre_trained, batch_size=batch_size,
                                           sampler=valid_sampler, num_workers
                                           =num_workers)
test_loader_gru = torch.utils.data.DataLoader(test_gru_pre_trained, batch_size=batch_size,
                                           num_workers=num_workers)

In [92]:
#Defining the GRU Architecture

class GRUModel(nn.Module):
  def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
    super().__init__()
    self.hidden_dim = hidden_dim
    self.layer_dim = layer_dim
    self.gru = nn.GRU(input_dim, hidden_dim, layer_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(device)
    out, hn = self.gru(x, h0)
    out = self.fc(out[:, -1, :])
    return out

In [93]:
#Initializing the GRU Model

model_gru = GRUModel(300, 20, 1, 5)
model_gru.cuda()
print(model_gru)

GRUModel(
  (gru): GRU(300, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=5, bias=True)
)


In [94]:
#Setting the GRU Parameters

criterion = nn.CrossEntropyLoss()
optimizer_gru = torch.optim.SGD(model_gru.parameters(), lr=0.0075)

In [95]:
#Training the GRU Model

n_epochs = 100

valid_loss_min = numpy.Inf 

for epoch in range(n_epochs):

  train_loss = 0.0
  valid_loss = 0.0

  model_gru.train() 

  for data, target in train_loader_gru:
    target = target.type(torch.LongTensor)
    data, target = data.to(device), target.to(device)
    optimizer_gru.zero_grad()
    output = model_gru(data.float())
    loss = criterion(output, target)
    loss.backward()
    optimizer_gru.step()
    train_loss += loss.item()*data.size(0)

  model_gru.eval() 

  for data, target in valid_loader_gru:
    target = target.type(torch.LongTensor)
    data, target = data.to(device), target.to(device)
    output = model_gru(data.float())
    loss = criterion(output, target)
    valid_loss += loss.item()*data.size(0)

  train_loss = train_loss/len(train_loader_gru.dataset)
  valid_loss = valid_loss/len(valid_loader_gru.dataset)

  print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, train_loss, valid_loss))
    
  if valid_loss <= valid_loss_min:
    print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'
        .format(valid_loss_min, valid_loss))
    torch.save(model_gru.state_dict(), 'model_gru.pt')
    valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 1.291488 	Validation Loss: 0.321927
Validation loss decreased (inf --> 0.321927).  Saving model ...
Epoch: 2 	Training Loss: 1.287266 	Validation Loss: 0.321792
Validation loss decreased (0.321927 --> 0.321792).  Saving model ...
Epoch: 3 	Training Loss: 1.287033 	Validation Loss: 0.321759
Validation loss decreased (0.321792 --> 0.321759).  Saving model ...
Epoch: 4 	Training Loss: 1.286880 	Validation Loss: 0.321720
Validation loss decreased (0.321759 --> 0.321720).  Saving model ...
Epoch: 5 	Training Loss: 1.286723 	Validation Loss: 0.321686
Validation loss decreased (0.321720 --> 0.321686).  Saving model ...
Epoch: 6 	Training Loss: 1.286538 	Validation Loss: 0.321642
Validation loss decreased (0.321686 --> 0.321642).  Saving model ...
Epoch: 7 	Training Loss: 1.286359 	Validation Loss: 0.321599
Validation loss decreased (0.321642 --> 0.321599).  Saving model ...
Epoch: 8 	Training Loss: 1.286143 	Validation Loss: 0.321549
Validation loss decreased (0.32159

In [96]:
#Load the model with the lowest validation loss

model_gru.load_state_dict(torch.load('model_gru.pt'))

<All keys matched successfully>

In [97]:
#Evaluating the GRU Model

correct = 0
total = 0

with torch.no_grad():
  for data in test_loader_gru:
    embeddings, labels = data
    embeddings, labels = embeddings.to(device), labels.to(device)
    model_gru.to(device)
    outputs = model_gru(embeddings.float())
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
print('GRU Accuracy: %d %%' % (100 * correct / total))

GRU Accuracy: 43 %


The flow of information is controlled and the problem of long delays is completely eliminated in the Gated RNN's, thus, the accuracy is lower as compared to RNN.