In [6]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('punkt')
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import os
import contractions

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeevithagc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeevithagc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
os.chdir("/Users/jeevithagc/Documents/Knowledge/CSCI544")


## 1. Dataset Generation

In [8]:
data = pd.read_csv("amazon_reviews_us_Jewelry_v1_00.tsv", sep='\t+')

In [9]:
# Keep Reviews and Ratings
data1 = data[['star_rating', 'review_body']]

In [10]:
# Select 20000 reviews randomly from each rating class
data_sampled = data1.groupby('star_rating', group_keys=False).apply(lambda x: x.sample(20000))
data_sampled.groupby('star_rating').agg({'review_body':'count'})

Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,19999
2,20000
3,19999
4,20000
5,19999


#### Data Cleaning

Refer to https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial/notebook for more steps

In [11]:
# eliminate missing values
data_sampled = data_sampled.dropna().reset_index(drop=True)
data_sampled.isnull().sum()

star_rating    0
review_body    0
dtype: int64

In [12]:
# removing NaN
data_sampled = data_sampled.dropna()
data_sampled = data_sampled.drop_duplicates()

# lower case
data_sampled['review_body'] = data_sampled['review_body'].str.lower()

# remove the HTML and URLs from the reviews
data_sampled['review_body'] = data_sampled['review_body'].map(lambda x: re.sub(r'http\S+', '', x)) 
data_sampled['review_body'] = data_sampled['review_body'].map(lambda x: re.sub(r'<.*?>', '', x)) 

# remove non-alphabetical characters
data_sampled['review_body'] = data_sampled['review_body'].str.replace('[^A-Za-z ]+', '')

# remove whitespaces
# data_sampled['review_body']=data_sampled['review_body'].str.strip()
data_sampled['review_body']=data_sampled['review_body'].str.split()
data_sampled['review_body']=data_sampled['review_body'].map(lambda x:' '.join(x))

# perform contractions on the reviews
data_sampled['review_body']=data_sampled['review_body'].map(lambda x:contractions.fix(x))

# remove missing values and duplicates
from nltk.tokenize import word_tokenize
data_sampled = data_sampled.dropna().drop_duplicates()
data_sampled['review_tokens'] = data_sampled['review_body'].apply(lambda text:word_tokenize(text))
data_sampled = data_sampled.drop((data_sampled.loc[data_sampled['review_tokens'].str.len()<3]).index)
data_sampled

Unnamed: 0,star_rating,review_body,review_tokens
0,1,i did not like them at allbecause when i was r...,"[i, did, not, like, them, at, allbecause, when..."
1,1,not impressed with this bead it is just a milk...,"[not, impressed, with, this, bead, it, is, jus..."
2,1,warningthis locket is ridiculously small liter...,"[warningthis, locket, is, ridiculously, small,..."
3,1,to hard to open,"[to, hard, to, open]"
4,1,chains were not like in description cheap look...,"[chains, were, not, like, in, description, che..."
...,...,...,...
99991,5,i have bought maybe a dozen eternity rings on ...,"[i, have, bought, maybe, a, dozen, eternity, r..."
99992,5,very different and unique love them,"[very, different, and, unique, love, them]"
99994,5,simple yet elegant for the non girly girl manl...,"[simple, yet, elegant, for, the, non, girly, g..."
99995,5,it is an excellent article comply with the spe...,"[it, is, an, excellent, article, comply, with,..."


In [13]:
# data_sampled_x = data_sampled.copy()
# data_sampled_x['review_body_list'] = data_sampled_x['review_body'].str.split()
# data_sampled_x['len_words'] = data_sampled_x['review_body_list'].map(lambda x:len(x))
# data_sampled_x[data_sampled_x['len_words']==1]

## 2. Word embedding

In [14]:
# (a)

import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [45]:
import difflib

# (b) Training Word2Vec on dataset

import multiprocessing

from gensim.models import Word2Vec
cores = multiprocessing.cpu_count() # count number of cores

from gensim.models.phrases import Phrases, Phraser

sent = [row.split() for row in data_sampled['review_body']]
phrases = Phrases(sent)
bigram = Phraser(phrases)
sentences = bigram[sent]
# data_sampled['sent'] = None
# for i in range(len(sentences)):
#   data_sampled['sent'][i] = sentences[i]

0.006666666666666667
0.01
0.01


In [69]:
# Eg 1
print(wv.most_similar('computer', topn=3))

# Eg 2
print(wv.similarity('beautiful', 'horrible'))

# Eg 3
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'chair']))

[('computers', 0.7979379296302795), ('laptop', 0.6640492677688599), ('laptop_computer', 0.6548868417739868)]
0.3883018
chair


In [33]:
from time import time
w2v_model = Word2Vec(min_count=10, window=11, vector_size=300, workers=cores-1)
# w2v_model.save("w2v_model.model")
# model = Word2Vec.load("w2v_model.model")

# Building the vocab tree
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)

# model training
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
# print(w2v_model.get_latest_training_loss())

Time to train the model: 1.53 mins


In [72]:
# Eg 1
print(w2v_model.wv.most_similar('computer', topn=3))

# Eg 2
print(w2v_model.wv.similarity('beautiful', 'horrible'))

# Eg 3
print(w2v_model.wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'chair']))

[('screen', 0.6647399067878723), ('website', 0.5683073401451111), ('site', 0.48910003900527954)]
0.2534125
fire


## Question 3

In [15]:
y_output = data_sampled['star_rating'].map(lambda x: x-1)
X_input = data_sampled['review_tokens']

# train test split
X_train, X_test, y_train, y_test = train_test_split(X_input, y_output, test_size=0.2, random_state=42)
words = set(wv.index_to_key)
X_train_vect = np.array([np.array([wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([wv[i] for i in ls if i in words])
                         for ls in X_test])

Generating average vector

In [16]:
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(300, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(300, dtype=float))
    

Perceptron

In [87]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
clf = Perceptron(tol = 1e-8, random_state=0)
clf.fit(X_train_vect_avg, y_train)
y_train_pred = clf.predict(X_train_vect_avg)
y_test_pred = clf.predict(X_test_vect_avg)

# classification metrics
cls_report = classification_report(y_test, y_test_pred, output_dict=True)
print(f"{cls_report['0']['precision']},{cls_report['0']['recall']},{cls_report['0']['f1-score']}")
print(f"{cls_report['1']['precision']},{cls_report['1']['recall']},{cls_report['1']['f1-score']}")
print(f"{cls_report['2']['precision']},{cls_report['2']['recall']},{cls_report['2']['f1-score']}")
print(f"{cls_report['3']['precision']},{cls_report['3']['recall']},{cls_report['3']['f1-score']}")
print(f"{cls_report['4']['precision']},{cls_report['4']['recall']},{cls_report['4']['f1-score']}")
print(f"{cls_report['macro avg']['precision']},{cls_report['macro avg']['recall']},{cls_report['macro avg']['f1-score']}")

print(f'Mean accuracy on test input data and predicted labels: {sum(y_test_pred == y_test)/y_test.shape[0]}')

0.2806679511881824,0.9538881309686221,0.43371999255629307
0.2727272727272727,0.012483745123537062,0.023874658045262374
0.38245412844036697,0.1781517094017094,0.24307580174927115
0.43112701252236135,0.06672203765227021,0.11555981778949892
0.5916149068322981,0.5792034052903619,0.5853433707174682
0.3917182543420963,0.35808980568730014,0.2803147281715587
Mean accuracy on test input data and predicted labels: 0.3501514734232994


SVM

In [31]:
from sklearn import svm
from sklearn.metrics import classification_report

svm_clf = svm.LinearSVC(random_state=0)
svm_clf.fit(X_train_vect_avg, y_train)
y_test_pred_svm = svm_clf.predict(X_test_vect_avg)

# classification metrics
cls_report = classification_report(y_test, y_test_pred_svm, output_dict=True)
print(f"{cls_report['0']['precision']},{cls_report['0']['recall']},{cls_report['0']['f1-score']}")
print(f"{cls_report['1']['precision']},{cls_report['1']['recall']},{cls_report['1']['f1-score']}")
print(f"{cls_report['2']['precision']},{cls_report['2']['recall']},{cls_report['2']['f1-score']}")
print(f"{cls_report['3']['precision']},{cls_report['3']['recall']},{cls_report['3']['f1-score']}")
print(f"{cls_report['4']['precision']},{cls_report['4']['recall']},{cls_report['4']['f1-score']}")
print(f"{cls_report['macro avg']['precision']},{cls_report['macro avg']['recall']},{cls_report['macro avg']['f1-score']}")

print(f'Mean accuracy on test input data and predicted labels: {sum(y_test_pred_svm == y_test)/y_test.shape[0]}')

0.4969430645777608,0.70949263502455,0.584494382022472
0.3881158330199323,0.27093725387240747,0.31910946196660483
0.39165009940357853,0.366365568544102,0.378586135895676
0.44811504080839487,0.3207232267037552,0.37386511024643315
0.6005271986580398,0.7534576067348165,0.6683557807707694
0.46507024729354124,0.48419525817592624,0.4648821741803911
Mean accuracy on test input data and predicted labels: 0.4774779735682819


## Question 4

### Perceptron with PyTorch

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

import torchvision.transforms as transforms
from torch.utils.data import TensorDataset, DataLoader

In [18]:
BATCH_SIZE = 64
X_train_tensor = torch.Tensor(X_train_vect_avg)

class review_dataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.labels = torch.LongTensor([i for i in y])
        self.texts = torch.Tensor(X)

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = np.array(self.labels[idx])

        return batch_texts, batch_y
    
train_data, test_data = review_dataset(X_train_vect_avg, y_train), review_dataset(X_test_vect_avg, y_test)
train_dataloader = DataLoader(train_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
class MLP_mod(nn.Module):
    def __init__(self, input_dim, output_dim, activation_func = "relu"):
        super().__init__()
        
        self.input_fc = nn.Linear(input_dim, 50)
        self.hidden_fc = nn.Linear(50, 10)
        self.output_fc = nn.Linear(10, output_dim)
        self.activation_function = activation_func
        

    def forward(self, x):

        batch_size = x.shape[0]
        x = x.view(batch_size, -1)
        if self.activation_function == "relu":
            h_1 = F.relu(self.input_fc(x))
            h_2 = F.relu(self.hidden_fc(h_1))
        if self.activation_function == "leaky_relu":
            h_1 = F.leaky_relu(self.input_fc(x))
            h_2 = F.leaky_relu(self.hidden_fc(h_1))
        y_pred = self.output_fc(h_2)
        return y_pred

# class MLP(nn.Module):
#     def __init__(self, input_dim, output_dim):
#         super().__init__()

#         self.input_fc = nn.Linear(input_dim, 50)
#         self.hidden_fc = nn.Linear(50, 10)
#         self.output_fc = nn.Linear(10, output_dim)

#     def forward(self, x):

#         # x = [batch size, height, width]

#         batch_size = x.shape[0]

#         x = x.view(batch_size, -1)

#         # x = [batch size, height * width]

#         h_1 = F.relu(self.input_fc(x))

#         # h_1 = [batch size, 250]

#         h_2 = F.relu(self.hidden_fc(h_1))

#         # h_2 = [batch size, 100]

#         y_pred = self.output_fc(h_2)

#         # y_pred = [batch size, output dim]

#         return y_pred, h_2

class MLP(nn.Module):
    def __init__(self,xshape):
        super(MLP, self).__init__()
        hidden_1 = 50
        hidden_2 = 10
        self.fc1 = nn.Linear(xshape, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 5)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):

       
        x = F.relu(self.fc1(x))
      
        x = self.dropout(x)
       
        x = F.relu(self.fc2(x))
       
        x = self.dropout(x)
     
        x = self.fc3(x)
        return x

model = MLP(300).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [20]:
# def calculate_accuracy(y_pred, y):
#     top_pred = y_pred.argmax(1, keepdim=True)
#     correct = top_pred.eq(y.view_as(top_pred)).sum()
#     acc = correct.float() / y.shape[0]
#     return acc

from tqdm.notebook import trange

def train(model, iterator, optimizer, criterion, device, n_epochs):

    for epoch in range(n_epochs):
        print("epoch",epoch)
        epoch_loss = 0.0
        # epoch_acc = 0
        model.train()

        for x, y in iterator:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)

            y_pred = model(x)
            loss = criterion(y_pred, y)

            # acc = calculate_accuracy(y_pred, y)

            loss.backward()

            optimizer.step()
            
            epoch_loss += loss.item()*x.size(0)
            # epoch_loss += loss.item()
            # epoch_acc += acc.item()
        
        epoch_loss = epoch_loss/len(iterator.dataset)
        print('Epoch: {} \tTraining Loss: {:.6f}'.format(
                epoch+1, 
                epoch_loss,
                ))
    return model

In [None]:
mlp_model = train(model,train_dataloader, optimizer, criterion, device, 50)

epoch 0
Epoch: 1 	Training Loss: 1.611035
epoch 1
Epoch: 2 	Training Loss: 1.606735
epoch 2
Epoch: 3 	Training Loss: 1.603359
epoch 3
Epoch: 4 	Training Loss: 1.596334
epoch 4
Epoch: 5 	Training Loss: 1.577726
epoch 5
Epoch: 6 	Training Loss: 1.529847
epoch 6
Epoch: 7 	Training Loss: 1.458246
epoch 7
Epoch: 8 	Training Loss: 1.406392
epoch 8
Epoch: 9 	Training Loss: 1.373051
epoch 9
Epoch: 10 	Training Loss: 1.349990
epoch 10
Epoch: 11 	Training Loss: 1.333961
epoch 11
Epoch: 12 	Training Loss: 1.321677
epoch 12
Epoch: 13 	Training Loss: 1.313083
epoch 13
Epoch: 14 	Training Loss: 1.303674
epoch 14
Epoch: 15 	Training Loss: 1.298315
epoch 15
Epoch: 16 	Training Loss: 1.291237
epoch 16
Epoch: 17 	Training Loss: 1.285353
epoch 17
Epoch: 18 	Training Loss: 1.280743
epoch 18
Epoch: 19 	Training Loss: 1.275397
epoch 19
Epoch: 20 	Training Loss: 1.270757
epoch 20
Epoch: 21 	Training Loss: 1.268415
epoch 21
Epoch: 22 	Training Loss: 1.264425
epoch 22
Epoch: 23 	Training Loss: 1.260338
epoch 2

In [None]:
prediction_list = []
for i, batch in enumerate(test_dataloader):   
    outputs = model(batch[0])
    _, predicted = torch.max(outputs.data, 1) 
    prediction_list.append(predicted)

In [None]:
# predictions = predict(mlp_model, test_dataloader)
predictions = np.array(prediction_list)
tp=0
tp += (predictions == y_test).sum()
accuracy = 100 * tp / len(predictions)
print(accuracy)

45.6182869732856


In [100]:
N_EPOCHS = [100]
activation_funcs = ["relu", "leaky_relu"]
from itertools import product
output_df = pd.DataFrame(product(activation_funcs, N_EPOCHS), columns = ['Activation_Function', 'N_Epochs'])
train_losses = []
train_accs = []
accuracies = []
for activation_func in activation_funcs:
    for i, epochs in enumerate(N_EPOCHS):
        print("Running for Combination :", activation_func, epochs)
        INPUT_DIM = 300
        OUTPUT_DIM = 5

        model = MLP_mod(INPUT_DIM, OUTPUT_DIM, activation_func)
        optimizer = optim.Adam(model.parameters())
        criterion = nn.CrossEntropyLoss()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)
        criterion = criterion.to(device)

        model_updated = train(model, train_dataloader, optimizer, criterion, device, epochs)
            
        prediction_list = []
        for i, batch in enumerate(test_dataloader):   
            outputs = model_updated(batch[0])
            _, predicted = torch.max(outputs.data, 1) 
            prediction_list.append(predicted)    
            
        predictions = np.array(prediction_list)
        tp=0
        tp += (predictions == y_test).sum()
        accuracy = 100 * tp / len(predictions)
        accuracies.append(accuracy)
    
output_df['Test_Accuracy'] = accuracies



Running for Combination : relu 100
epoch 0
Epoch: 1 	Training Loss: 1.291615
epoch 1
Epoch: 2 	Training Loss: 1.192348
epoch 2
Epoch: 3 	Training Loss: 1.172078
epoch 3
Epoch: 4 	Training Loss: 1.160785
epoch 4
Epoch: 5 	Training Loss: 1.152192
epoch 5
Epoch: 6 	Training Loss: 1.144928
epoch 6
Epoch: 7 	Training Loss: 1.138464
epoch 7
Epoch: 8 	Training Loss: 1.132707
epoch 8
Epoch: 9 	Training Loss: 1.127519
epoch 9
Epoch: 10 	Training Loss: 1.122516
epoch 10
Epoch: 11 	Training Loss: 1.117953
epoch 11
Epoch: 12 	Training Loss: 1.113587
epoch 12
Epoch: 13 	Training Loss: 1.109814
epoch 13
Epoch: 14 	Training Loss: 1.105957
epoch 14
Epoch: 15 	Training Loss: 1.102448
epoch 15
Epoch: 16 	Training Loss: 1.098978
epoch 16
Epoch: 17 	Training Loss: 1.095686
epoch 17
Epoch: 18 	Training Loss: 1.092565
epoch 18
Epoch: 19 	Training Loss: 1.089473
epoch 19
Epoch: 20 	Training Loss: 1.086778
epoch 20
Epoch: 21 	Training Loss: 1.083965
epoch 21
Epoch: 22 	Training Loss: 1.081440
epoch 22
Epoch: 

In [101]:
# obtain best params
# output_df.loc[output_df['Test_Accuracy'].idxmax()]
output_df

Unnamed: 0,Activation_Function,N_Epochs,Test_Accuracy
0,relu,100,48.256679
1,leaky_relu,100,48.477004


### Q4 (b)

In [21]:
X_train_vect_concat = []
for i in X_train_vect:
    subset_list = []
    for v in range(10):
        try:
            if i[v].size:
                subset_list.extend(i[v])
        except:
            subset_list.extend(np.zeros(300, dtype=float))
    X_train_vect_concat.append(subset_list)
X_train_vect_concat = np.array(X_train_vect_concat) 
        
X_test_vect_concat = []
for i in X_test_vect:
    subset_list = []
    for v in range(10):
        try:
            if i[v].size:
                subset_list.extend(i[v])
        except:
            subset_list.extend(np.zeros(300, dtype=float))
    X_test_vect_concat.append(subset_list)
X_test_vect_concat = np.array(X_test_vect_concat) 

In [105]:
print(len(X_test_vect_concat))
print(len(X_test_vect_concat[0]))

18155
3000


In [22]:
train_data_f10, test_data_f10 = review_dataset(X_train_vect_concat, y_train), review_dataset(X_test_vect_concat, y_test)
train_dataloader_f10 = DataLoader(train_data_f10, batch_size=64)
test_dataloader_f10 = DataLoader(test_data_f10, batch_size=1)

N_EPOCHS = [100]
activation_funcs = ["relu", "leaky_relu"]
from itertools import product
output_df2 = pd.DataFrame(product(activation_funcs, N_EPOCHS), columns = ['Activation_Function', 'N_Epochs'])
test_accuracies = []
for activation_func in activation_funcs:
    for i, epochs in enumerate(N_EPOCHS):
        print("Running for Combination :", activation_func, epochs)
        INPUT_DIM = 3000
        OUTPUT_DIM = 5

        model = MLP_mod(INPUT_DIM, OUTPUT_DIM, activation_func)
        optimizer = optim.Adam(model.parameters())
        criterion = nn.CrossEntropyLoss()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)
        criterion = criterion.to(device)

        updated_model = train(model, train_dataloader_f10, optimizer, criterion, device, epochs)
  
        prediction_list = []
        for i, batch in enumerate(test_dataloader_f10):   
            outputs = updated_model(batch[0])
            _, predicted = torch.max(outputs.data, 1) 
            prediction_list.append(predicted)    
            
        predictions = np.array(prediction_list)
        tp=0
        tp += (predictions == y_test).sum()
        accuracy = 100 * tp / len(predictions)
        test_accuracies.append(accuracy)

output_df2['Test_Accuracy'] = test_accuracies

Running for Combination : relu 100
epoch 0
Epoch: 1 	Training Loss: 1.325984
epoch 1
Epoch: 2 	Training Loss: 1.243667
epoch 2
Epoch: 3 	Training Loss: 1.203100
epoch 3
Epoch: 4 	Training Loss: 1.162756
epoch 4
Epoch: 5 	Training Loss: 1.122003
epoch 5
Epoch: 6 	Training Loss: 1.081444
epoch 6
Epoch: 7 	Training Loss: 1.040732
epoch 7
Epoch: 8 	Training Loss: 1.000380
epoch 8
Epoch: 9 	Training Loss: 0.961321
epoch 9
Epoch: 10 	Training Loss: 0.922671
epoch 10
Epoch: 11 	Training Loss: 0.885373
epoch 11
Epoch: 12 	Training Loss: 0.849837
epoch 12
Epoch: 13 	Training Loss: 0.817757
epoch 13
Epoch: 14 	Training Loss: 0.785639
epoch 14
Epoch: 15 	Training Loss: 0.755246
epoch 15
Epoch: 16 	Training Loss: 0.726884
epoch 16
Epoch: 17 	Training Loss: 0.702159
epoch 17
Epoch: 18 	Training Loss: 0.679145
epoch 18
Epoch: 19 	Training Loss: 0.657398
epoch 19
Epoch: 20 	Training Loss: 0.635994
epoch 20
Epoch: 21 	Training Loss: 0.618694
epoch 21
Epoch: 22 	Training Loss: 0.608478
epoch 22
Epoch: 

In [23]:
output_df2

Unnamed: 0,Activation_Function,N_Epochs,Test_Accuracy
0,relu,100,36.987885
1,leaky_relu,100,36.894273


## Question 5

In [15]:
# Data preparation
X_train_vect_rnn = []
for i in X_train_vect:
    subset_list = []
    for v in range(20):
        try:
            if i[v].size:
                subset_list.append(i[v])
        except:
            subset_list.append(np.zeros(300, dtype=float))
    X_train_vect_rnn.append(np.array(subset_list))
X_train_vect_rnn = np.array(X_train_vect_rnn)

X_test_vect_rnn = []
for i in X_test_vect:
    subset_list = []
    for v in range(20):
        try:
            if i[v].size:
                subset_list.append(i[v])
        except:
            subset_list.append(np.zeros(300, dtype=float))
    X_test_vect_rnn.append(np.array(subset_list))
X_test_vect_rnn = np.array(X_test_vect_rnn)

In [111]:
print((X_train_vect_rnn).shape)

(72619, 20, 300)


In [112]:
# RNN Model

n_categories = 5

import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        # input = input.reshape(1, -1)
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

n_words = 300
n_hidden = 20

rnn = RNN(n_words, n_hidden, n_categories)

In [22]:
from tqdm.notebook import trange, tqdm

batch_size = 64
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

def train(rnn, iterator, optimizer, criterion, device):

    epoch_loss = 0
    epoch_acc = 0

    rnn = rnn.to(device)
    rnn.train()

    for x, y in iterator:
        if x.shape[0] != batch_size:
                continue
        hidden = torch.zeros(batch_size, n_hidden,requires_grad=False)
        hidden = hidden.to(device)
        x = x.to(device)
        y = y.to(device)
        
        rnn.zero_grad()
        loss = 0
        
        for i in range(x.shape[1]):
            output, hidden = rnn(x[:, i,:], hidden)
            l = criterion(output, y)
            loss += l
            
        optimizer.zero_grad()

        loss = criterion(output, y)

        acc = calculate_accuracy(output, y)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(rnn, iterator, criterion, device):
    epoch_acc = 0

    # rnn = rnn.to(device)
    rnn.eval()

    with torch.no_grad():

        # for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):
        for x, y in iterator:
            # if x.shape[0] != batch_size:
            #     continue
            hidden = torch.zeros(1, n_hidden,requires_grad=False)
            hidden = hidden.to(device)
            x = x.to(device)
            y = y.to(device)
            
            for i in range(x.shape[1]):
                output, hidden = rnn(x[:, i, :], hidden)

            # loss = criterion(output, y)

            acc = calculate_accuracy(output, y)

            # epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_acc / len(iterator)

In [114]:
EPOCHS = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# optimizer = optim.SGD(rnn.parameters(), lr = 0.001)
optimizer = optim.Adam(rnn.parameters())
criterion = nn.NLLLoss()
# criterion = nn.CrossEntropyLoss()
criterion.to(device)

train_data_rnn, test_data_rnn = review_dataset(X_train_vect_rnn, y_train), review_dataset(X_test_vect_rnn, y_test)
train_dataloader_rnn = DataLoader(train_data_rnn, batch_size=64, shuffle=True)


for epoch in trange(EPOCHS):

    train_loss, train_acc = train(rnn, train_dataloader_rnn, optimizer, criterion, device)
    # test_loss, test_acc = evaluate(rnn, test_dataloader, criterion, device)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    # print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 01
	Train Loss: 1.366 | Train Acc: 36.69%
Epoch: 02
	Train Loss: 1.303 | Train Acc: 40.61%
Epoch: 03
	Train Loss: 1.298 | Train Acc: 41.00%
Epoch: 04
	Train Loss: 1.274 | Train Acc: 42.93%
Epoch: 05
	Train Loss: 1.234 | Train Acc: 45.57%
Epoch: 06
	Train Loss: 1.230 | Train Acc: 46.18%
Epoch: 07
	Train Loss: 1.225 | Train Acc: 46.29%
Epoch: 08
	Train Loss: 1.223 | Train Acc: 46.57%
Epoch: 09
	Train Loss: 1.222 | Train Acc: 46.51%
Epoch: 10
	Train Loss: 1.220 | Train Acc: 46.65%
Epoch: 11
	Train Loss: 1.217 | Train Acc: 46.60%
Epoch: 12
	Train Loss: 1.218 | Train Acc: 46.69%
Epoch: 13
	Train Loss: 1.216 | Train Acc: 46.97%
Epoch: 14
	Train Loss: 1.214 | Train Acc: 46.94%
Epoch: 15
	Train Loss: 1.213 | Train Acc: 47.04%
Epoch: 16
	Train Loss: 1.212 | Train Acc: 47.23%
Epoch: 17
	Train Loss: 1.209 | Train Acc: 47.33%
Epoch: 18
	Train Loss: 1.208 | Train Acc: 47.37%
Epoch: 19
	Train Loss: 1.206 | Train Acc: 47.61%
Epoch: 20
	Train Loss: 1.205 | Train Acc: 47.64%
Epoch: 21
	Train Los

In [115]:
test_dataloader_rnn = DataLoader(test_data_rnn, batch_size=1)
test_acc = evaluate(rnn, test_dataloader_rnn, criterion, device)

In [116]:
test_acc

0.4422473147893142

GRU

In [16]:
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, bias=True):
        super(GRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        self.layer1 = nn.Linear(input_size, 3 * hidden_size, bias=bias)
        self.layer2 = nn.Linear(hidden_size, 3 * hidden_size, bias=bias)
        

        self.reset_parameters()


    def reset_parameters(self):
        std = 1.0 / np.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, input, hidden=None):
        
        inp = self.layer1(input)
        h = self.layer2(hidden)

        inp_reset, inp_upd, inp_new = inp.chunk(3, 1)
        hidden_reset, hidden_upd, hidden_new = h.chunk(3, 1)

        reset_gate = torch.sigmoid(inp_reset + hidden_reset)
        update_gate = torch.sigmoid(inp_upd + hidden_upd)
        new_gate = torch.tanh(inp_new + (reset_gate * hidden_new))

        out = update_gate * hidden + (1 - update_gate) * new_gate

        return out

In [17]:
class GRUNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, drop_prob=0):
        super(GRUNet, self).__init__()
        self.hidden_dim = hidden_dim
        # self.n_layers = n_layers
        
        # self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.gru = GRU(input_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x, h):
        h = self.gru(x, h)
        out = self.fc(self.relu(h))
        out = self.softmax(out)
        return out, h
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(batch_size, self.hidden_dim).zero_().to(device)
        return hidden

In [18]:
model_gru = GRUNet(300, 20, 5)
model_gru

GRUNet(
  (gru): GRU(
    (layer1): Linear(in_features=300, out_features=60, bias=True)
    (layer2): Linear(in_features=20, out_features=60, bias=True)
  )
  (fc): Linear(in_features=20, out_features=5, bias=True)
  (relu): ReLU()
  (softmax): LogSoftmax(dim=1)
)

In [25]:
EPOCHS = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# optimizer = optim.SGD(rnn.parameters(), lr = 0.001)
optimizer = optim.Adam(model_gru.parameters())
criterion = nn.NLLLoss()
# criterion = nn.CrossEntropyLoss()
criterion.to(device)
n_words = 300
n_hidden = 20

# train_data_rnn, test_data_rnn = review_dataset(X_train_vect_rnn, y_train), review_dataset(X_test_vect_rnn, y_test)
# train_dataloader_rnn = DataLoader(train_data_rnn, batch_size=64, shuffle=True)

from tqdm.notebook import trange
train_data_rnn, test_data_rnn = review_dataset(X_train_vect_rnn, y_train), review_dataset(X_test_vect_rnn, y_test)
train_dataloader_rnn = DataLoader(train_data_rnn, batch_size=64, shuffle=True)
for epoch in trange(EPOCHS):

    train_loss, train_acc = train(model_gru, train_dataloader_rnn, optimizer, criterion, device)
    # test_loss, test_acc = evaluate(rnn, test_dataloader, criterion, device)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.6f} | Train Acc: {train_acc*100:.2f}%')
    # print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 01
	Train Loss: 1.352750 | Train Acc: 37.90%
Epoch: 02
	Train Loss: 1.178736 | Train Acc: 47.89%
Epoch: 03
	Train Loss: 1.137225 | Train Acc: 49.67%
Epoch: 04
	Train Loss: 1.114993 | Train Acc: 50.74%
Epoch: 05
	Train Loss: 1.100405 | Train Acc: 51.41%
Epoch: 06
	Train Loss: 1.089523 | Train Acc: 51.96%
Epoch: 07
	Train Loss: 1.080737 | Train Acc: 52.37%
Epoch: 08
	Train Loss: 1.072892 | Train Acc: 52.77%
Epoch: 09
	Train Loss: 1.065436 | Train Acc: 53.10%
Epoch: 10
	Train Loss: 1.059081 | Train Acc: 53.23%
Epoch: 11
	Train Loss: 1.053607 | Train Acc: 53.58%
Epoch: 12
	Train Loss: 1.049011 | Train Acc: 53.81%
Epoch: 13
	Train Loss: 1.043446 | Train Acc: 53.95%
Epoch: 14
	Train Loss: 1.040088 | Train Acc: 54.19%
Epoch: 15
	Train Loss: 1.035862 | Train Acc: 54.32%
Epoch: 16
	Train Loss: 1.031963 | Train Acc: 54.44%
Epoch: 17
	Train Loss: 1.026862 | Train Acc: 55.05%
Epoch: 18
	Train Loss: 1.023113 | Train Acc: 55.06%
Epoch: 19
	Train Loss: 1.021010 | Train Acc: 55.07%
Epoch: 20
	T

In [28]:
test_dataloader_rnn = DataLoader(test_data_rnn, batch_size=1)
test_acc = evaluate(model_gru, test_dataloader_rnn, criterion, device)

In [29]:
test_acc

0.4845814977973568