In [1]:
#!/usr/bin/env python
# coding: utf-8
# # CSCI544_HW2_JingyanPeng
# - 09/26/2022

#version python3.9

In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup
from sklearn.metrics import accuracy_score
import torch
CUDA_LAUNCH_BLOCKING = "1"

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# 1.Dataset Generation

- Build a Balanced Dataset Through Random Selection

  Load the dataset and build a balanced dataset of 100K reviews along with their ratings to create labels through random selection.

In [4]:
df = pd.read_csv('data.tsv', sep='\t', on_bad_lines='skip')
df['reviews'] = df['review_headline'] + ' ' + df['review_body']
df['ratings'] = df['star_rating']
df = df[['ratings','reviews']]
df = df.dropna()

s1=df[df.ratings == 1]
s2=df[df.ratings == 2]
s3=df[df.ratings == 3]
s4=df[df.ratings == 4]
s5=df[df.ratings == 5]
s1 = s1.sample(n = 20000, random_state = None)
s2 = s2.sample(n = 20000, random_state = None)
s3 = s3.sample(n = 20000, random_state = None)
s4 = s4.sample(n = 20000, random_state = None)
s5 = s5.sample(n = 20000, random_state = None)
dataset = pd.concat([s1, s2, s3, s4, s5])
dataset = dataset.reset_index(drop = True)

- Simple Data Cleaning without Preprocessing

In [5]:
dataset['reviews'] = dataset['reviews'].str.lower()

dataset['reviews'] = dataset['reviews'].map(lambda x: re.sub(re.compile(r'[http|https]*://[a-zA-Z0-9.?/&=:]*', re.S), " ", x))
dataset['reviews'] = dataset['reviews'].map(lambda x: BeautifulSoup(x,"html.parser").get_text())

dataset['reviews'] = dataset['reviews'].map(lambda x: re.sub("[^a-zA-Z]+", " ", x))

dataset['reviews'] = dataset['reviews'].map(lambda x: re.sub(r'\s\s+', ' ', x))
#dataset['reviews'] = dataset['reviews'].map(lambda x: x.strip())

import contractions
def contractionFunc(s):
    s = contractions.fix(s)
    s = re.sub("[^a-zA-Z]+", " ", s)
    return s
dataset['reviews'] = dataset['reviews'].map(lambda x: contractionFunc(x))

print(dataset.head(5))

  ratings                                            reviews
0       1  really low quality crap product turned orange ...
1       1  waste of money unless you re buying them for a...
2       1  one star not really what i wanted it s way to ...
3       1  twice the price for half the amount this cost ...
4     1.0  can t believe it i didn t like anything about ...


# 2. Word Embedding
- Reference:
    - Gensim > Documentation > Word2Vec Model
    - https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

- (a) Pretrained Word2Vec Model

Load the pretrained “word2vec-google-news-300” Word2Vec model.

In [6]:
import gensim.downloader as api
google_wv = api.load('word2vec-google-news-300')

Check semantic similarities of the generated vectors using wv.most_similar() & wv.similarity().

My own three examples:

In [7]:
print(google_wv.most_similar(positive=['uncle', 'woman'], negative=['man'], topn=1))
print(google_wv.similarity('bike', 'bicycle'))
print(google_wv.similarity('crucial', 'vital'))

[('aunt', 0.8022665977478027)]
0.85213083
0.82077205


Given two examples in assignment doc:

In [8]:
print(google_wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))
print(google_wv.similarity('excellent', 'outstanding'))

[('queen', 0.7118193507194519)]
0.55674857


- (b) My Word2Vec Model

Train a Word2Vec model using my own dataset.

In [9]:
from gensim.test.utils import datapath
from gensim import utils
import gensim.models

class MyCorpus:
    def __iter__(self):
        for line in dataset['reviews']:
            yield utils.simple_preprocess(line)

model = gensim.models.Word2Vec(sentences=MyCorpus(), vector_size=300, window=11, min_count=10)

Check semantic similarities of the generated vectors using wv.most_similar() & wv.similarity().

Given two examples in assignment doc:

In [10]:
print(model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))
print(model.wv.similarity('excellent', 'outstanding'))

[('poem', 0.5422888994216919)]
0.81494856


- Conclusion
    - It shows that for the example of 'King − Man + Woman = Queen', the “word2vec-google-news-300” Word2Vec model works better. The model trained with my own dataset cannot give the answer, 'queen'. But for the example of 'excellent ∼ outstanding'. The model trained with my own dataset works better (0.81 > 0.56).
     
    - This may be related to characteristics of different datasets. If the words appears often in the dataset, as 'excellent' and 'outstanding' is more commonly used in my own dataset than in google news dataset, the corresponding vectors of these commonly used words trained with this kind of dataset can be more accurate.

# Train / Test split

80%/20% training/testing

In [11]:
train = dataset.sample(frac = 0.8, random_state = 1)
test = dataset.drop(train.index)
train = train.reset_index(drop = True)
test = test.reset_index(drop = True)

X_train = train['reviews']
X_test = test['reviews']
Y_train = train['ratings']
Y_test = test['ratings']

# Function Definition for Word2Vec -> Input
Here I define 5 functions to process the Word2Vec to input data.
1. Delete the corresponding labels of the NaN training vectors.
2. Delete the NaN training vectors.

3. Calculate the average vector for each review.
4. Concatenate the first 10 vectors for each review. Truncate the longer one and pad the shorter one with 0.
5. VStack the first 20 vectors for each review. Truncate the longer one and pad the shorter one with 0.

In [12]:
# 1) processing NaN
def process_nanY(x_mtx, y_mtx):
    idx = []
    if np.any(np.isnan(x_mtx)):
        arr_nan = np.argwhere(np.isnan(x_mtx))
        num_nan = arr_nan.shape[0]
        arr = np.arange(0, num_nan, 300)
        for i in arr:
            idx.append(arr_nan[i][0])
    if idx != None:
        mtx = np.delete(y_mtx, idx)
    return mtx;

def process_nanX(x_mtx):
    idx = []
    if np.any(np.isnan(x_mtx)):
        arr_nan = np.argwhere(np.isnan(x_mtx))
        num_nan = arr_nan.shape[0]
        arr = np.arange(0, num_nan, 300)
        for i in arr:
            idx.append(arr_nan[i][0])
    if idx != None:
        mtx = np.delete(x_mtx, idx, 0)
    return mtx;


# 2) the average Word2Vec vectors
def w2v_average(wv_model, input_words):
    wordlist = input_words.split(' ')
    embed_sum = np.zeros(shape = (300,))
    count = 0
    for word in wordlist:
        if word in wv_model:
            embed_sum += wv_model[word]
            count += 1
    return embed_sum / count

# 3) concatenate the first 10 Word2Vec vectors
def w2v_first10(wv_model, input_words):
    wordlist = input_words.split(' ')
    idx = 0;
    coun = 0;
    # go through the reviews to find 10 words in W2V model
    while(idx < len(wordlist)) & (coun <10):
        if wordlist[idx] in wv_model:
            wv_current = wv_model[wordlist[idx]]
            if coun == 0:
                result = wv_current
            else:
                result = np.concatenate((result, wv_current))
            idx += 1
            coun += 1
        else:
            idx +=1
    # if reviews length < 10:
    if coun == 0:
        return np.zeros(shape = 3000, )
    if coun < 10:
        zeros = np.zeros(shape = (300 * (10 - coun), ))
        return np.concatenate((result, zeros))
    else:
        return result
    
# 4) vStack the first 20 Word2Vec vectors
#    limit the review length to 20 by truncating and padding
def w2v_seq20(wv_model, input_words):
    wordlist = input_words.split(' ')
    idx = 0;
    coun = 0;
    # go through the reviews to find 10 words in W2V model
    while(idx < len(wordlist)) & (coun < 20):
        if wordlist[idx] in wv_model:
            wv_current = wv_model[wordlist[idx]]
            if coun == 0:
                result = wv_current
            else:
                result = np.vstack((result, wv_current))
            idx += 1
            coun += 1
        else:
            idx +=1
    # if reviews length < 20:
    if coun == 0:
        return np.zeros(shape = (20, 300) )
    if coun < 20:
        zeros = np.zeros(shape = (20 - coun, 300))
        return np.vstack((result, zeros))
    else:
        return result

# 3. Simple Model

In [13]:
#Calculate the average Word2Vec vectors of Google-pretrained Word2Vec Model. 
X_train_preW2Vave =np.array(X_train.apply(lambda x: w2v_average(google_wv, x)).values.tolist())
X_test_preW2Vave =np.array(X_test.apply(lambda x: w2v_average(google_wv, x)).values.tolist())
Y_train_preW2Vave = np.array(Y_train.values.tolist())
Y_test_preW2Vave = np.array(Y_test.values.tolist())
#Delete NaN Vectors with their Labels.
Y_train_preW2Vave = process_nanY(X_train_preW2Vave, Y_train_preW2Vave)
X_train_preW2Vave = process_nanX(X_train_preW2Vave)
Y_test_preW2Vave = process_nanY(X_test_preW2Vave, Y_test_preW2Vave)
X_test_preW2Vave = process_nanX(X_test_preW2Vave)
## print(X_train_preW2Vave.shape)

- Perceptron

Use sklearn.linear_model.Perceptron().fit() to train a Perceptron model and sklearn.linear_model.Perceptron().predict() to get the accurancy.

In [14]:
from sklearn.linear_model import Perceptron
perceptron_pre = Perceptron(max_iter = 1000, tol = 0, random_state = 0, eta0 = 0.01)
perceptron_pre.fit(X_train_preW2Vave, Y_train_preW2Vave)
perceptron_pre_test = perceptron_pre.predict(X_test_preW2Vave)
perceptron_pre_test_accuracy = accuracy_score(Y_test_preW2Vave, perceptron_pre_test)
print(perceptron_pre_test_accuracy)

0.4594959495949595


- SVM

Use sklearn.svm.LinearSVC().fit() to train a SVM model and sklearn.svm.LinearSVC().predict() to get the accurancy.

In [15]:
from sklearn.svm import LinearSVC
svm_pre = LinearSVC(max_iter = 5000, random_state = 0)
svm_pre.fit(X_train_preW2Vave, Y_train_preW2Vave)
svm_pre_test = svm_pre.predict(X_test_preW2Vave)
svm_pre_test_accurancy = accuracy_score(Y_test_preW2Vave, svm_pre_test)
print(svm_pre_test_accurancy)

0.5621562156215621


- Conclusion
    - In HW1, I got the accurancy values of these simple models with TF-IDF as input——0.42 for Perceptron model and 0.44 for SVM model. They are all worse than the results using pretrained Word2Vec as input here——0.46 for Perceptron model and 0.56 for SVM model.
    - So, in this case, the Word2Vec model works better than TF-IDF for words embedding.

# Custom Dataset
Define the dataset for Neural Network training.

In [16]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
import time
class Train(Dataset):
    def __init__(self, xtrain, ytrain):
        self.data = xtrain
        self.labels = ytrain

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        X = self.data[index]
        y = self.labels[index]
        return X, y

    
class Test(Dataset):
    def __init__(self, xtest, ytest):
        self.data = xtest
        self.labels = ytest

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        X = self.data[index]
        y = self.labels[index]
        return X, y

# 4. Feedforward Neural Networks
- Reference:
    - Pytorch Multi-Layer Perceptron, MNIST | Author: BHARAT BUSHAN MISHRA
    - https://www.kaggle.com/code/mishra1993/pytorch-multi-layer-perceptron-mnist/notebook

Define the Network Architecture (MLP)

In [17]:
# define the NN architecture 
class MLP(nn.Module):
    def __init__(self, D_input, D_output):
        super(MLP, self).__init__()
        # number of hidden nodes in each layer 
        # layer1: 50 nodes; layer2: 10 nodes
        hidden_1 = 50
        hidden_2 = 10
        # linear layer (300 -> hidden_1)
        self.fc1 = nn.Linear(D_input, hidden_1)
        # linear layer (n_hidden -> hidden_2)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        # linear layer (n_hidden -> 5)
        self.fc3 = nn.Linear(hidden_2, D_output)
        # dropout layer (p=0.2)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(0.2)
    def forward(self, x):
        # add hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer
        x = self.dropout(x)
        # add hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add dropout layer
        x = self.dropout(x)
        # add output layer
        x = self.fc3(x)
        return x

# initialize the NN
model_4a = MLP(300, 5)
model_4b = MLP(3000, 5)
model_4a.cuda()
model_4b.cuda()
print(model_4a)
print(model_4b)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer_4a = torch.optim.SGD(model_4a.parameters(), lr=0.01)
optimizer_4b = torch.optim.SGD(model_4b.parameters(), lr=0.01)

MLP(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
MLP(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


- (a) the average Word2Vec vectors as input

In [18]:
############################# Load the data_4a ############################# 
train_data_4a = Train(X_train_preW2Vave, Y_train_preW2Vave-1)
test_data_4a = Test(X_test_preW2Vave, Y_test_preW2Vave-1)

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 100
# percentage of training set to use as validation
valid_size = 0.2

# obtain training indices that will be used for validation
num_train = len(train_data_4a)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders
train_loader_4a = torch.utils.data.DataLoader(train_data_4a, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers,)
valid_loader_4a = torch.utils.data.DataLoader(train_data_4a, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers)
test_loader_4a = torch.utils.data.DataLoader(test_data_4a, batch_size=batch_size, num_workers=num_workers)

In [19]:
############################# Train the model_4a ############################# 
start = time.time()

# number of epochs to train the model
n_epochs = 150

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model_4a.train() # prep model for training
    for data, target in train_loader_4a:
        # transfer data and target to GPU
        data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer_4a.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_4a(data.float())
        # calculate the loss
        loss = criterion(output, target.to(torch.long))
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer_4a.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
    
    ######################    
    # validate the model #
    ######################
    model_4a.eval() # prep model for evaluation
    for data, target in valid_loader_4a:
        # transfer data and target to GPU
        data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_4a(data.float())
        # calculate the loss
        loss = criterion(output, target.to(torch.long))
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader_4a.dataset)
    valid_loss = valid_loss/len(valid_loader_4a.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model_4a.state_dict(), 'model.pt')
        valid_loss_min = valid_loss
        
end = time.time()
print('Time elapsed: %.2f s' % (end - start))

Epoch: 1 	Training Loss: 1.291888 	Validation Loss: 0.322190
Validation loss decreased (inf --> 0.322190).  Saving model ...
Epoch: 2 	Training Loss: 1.287869 	Validation Loss: 0.321915
Validation loss decreased (0.322190 --> 0.321915).  Saving model ...
Epoch: 3 	Training Loss: 1.287442 	Validation Loss: 0.321847
Validation loss decreased (0.321915 --> 0.321847).  Saving model ...
Epoch: 4 	Training Loss: 1.287303 	Validation Loss: 0.321805
Validation loss decreased (0.321847 --> 0.321805).  Saving model ...
Epoch: 5 	Training Loss: 1.287180 	Validation Loss: 0.321782
Validation loss decreased (0.321805 --> 0.321782).  Saving model ...
Epoch: 6 	Training Loss: 1.287018 	Validation Loss: 0.321735
Validation loss decreased (0.321782 --> 0.321735).  Saving model ...
Epoch: 7 	Training Loss: 1.286869 	Validation Loss: 0.321695
Validation loss decreased (0.321735 --> 0.321695).  Saving model ...
Epoch: 8 	Training Loss: 1.286668 	Validation Loss: 0.321644
Validation loss decreased (0.32169

In [20]:
#############################  Calculate & Print the Accurancy_4a on Test############################# 
# Load the model with the lowest validation loss
model_4a.load_state_dict(torch.load('model.pt'))
# Calculate the accurancy
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader_4a:
        embeddings, labels = data
        # calculating outputs by running embeddings through the network
        model_4a.to("cpu")
        outputs = model_4a(embeddings.float())
        # the class with the highest score is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(correct/total)

0.5713071307130713


- (b) concatenate the first 10 Word2Vec vectors as input

In [21]:
#Concatenate the first 10 Word2Vec vectors of Google-pretrained Word2Vec Model. 
X_train_preW2Vfirst10 =np.array(X_train.apply(lambda x: w2v_first10(google_wv, x)).values.tolist())
X_test_preW2Vfirst10 =np.array(X_test.apply(lambda x: w2v_first10(google_wv, x)).values.tolist())
Y_train_preW2Vfirst10 = np.array(Y_train.values.tolist())
Y_test_preW2Vfirst10 = np.array(Y_test.values.tolist())
#Delete NaN Vectors with their Labels
Y_train_preW2Vfirst10 = process_nanY(X_train_preW2Vfirst10, Y_train_preW2Vfirst10)
X_train_preW2Vfirst10 = process_nanX(X_train_preW2Vfirst10)
Y_test_preW2Vfirst10 = process_nanY(X_test_preW2Vfirst10, Y_test_preW2Vfirst10)
X_test_preW2Vfirst10 = process_nanX(X_test_preW2Vfirst10)

In [22]:
############################# Load the data_4b ############################# 
train_data_4b = Train(X_train_preW2Vfirst10, Y_train_preW2Vfirst10-1)
test_data_4b = Test(X_test_preW2Vfirst10, Y_test_preW2Vfirst10-1)

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 100
# percentage of training set to use as validation
valid_size = 0.2

# obtain training indices that will be used for validation
num_train = len(train_data_4b)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders
train_loader_4b = torch.utils.data.DataLoader(train_data_4b, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers,)
valid_loader_4b = torch.utils.data.DataLoader(train_data_4b, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers)
test_loader_4b = torch.utils.data.DataLoader(test_data_4b, batch_size=batch_size, num_workers=num_workers)

In [23]:
############################# Train the model_4b ############################# 
start = time.time()

# number of epochs to train the model
n_epochs = 60

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model_4b.train() # prep model for training
    for data, target in train_loader_4b:
        # transfer data and target to GPU
        data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer_4b.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_4b(data.float())
        # calculate the loss
        loss = criterion(output, target.to(torch.long))
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer_4b.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
    
    ######################    
    # validate the model #
    ######################
    model_4b.eval() # prep model for evaluation
    for data, target in valid_loader_4b:
        # transfer data and target to GPU
        data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_4b(data.float())
        # calculate the loss
        loss = criterion(output, target.to(torch.long))
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader_4b.dataset)
    valid_loss = valid_loss/len(valid_loader_4b.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model_4b.state_dict(), 'model.pt')
        valid_loss_min = valid_loss
        
end = time.time()
print('Time elapsed: %.2f s' % (end - start))

Epoch: 1 	Training Loss: 1.289511 	Validation Loss: 0.320628
Validation loss decreased (inf --> 0.320628).  Saving model ...
Epoch: 2 	Training Loss: 1.274435 	Validation Loss: 0.315348
Validation loss decreased (0.320628 --> 0.315348).  Saving model ...
Epoch: 3 	Training Loss: 1.234018 	Validation Loss: 0.296562
Validation loss decreased (0.315348 --> 0.296562).  Saving model ...
Epoch: 4 	Training Loss: 1.145433 	Validation Loss: 0.270334
Validation loss decreased (0.296562 --> 0.270334).  Saving model ...
Epoch: 5 	Training Loss: 1.074494 	Validation Loss: 0.256175
Validation loss decreased (0.270334 --> 0.256175).  Saving model ...
Epoch: 6 	Training Loss: 1.036164 	Validation Loss: 0.248015
Validation loss decreased (0.256175 --> 0.248015).  Saving model ...
Epoch: 7 	Training Loss: 1.010050 	Validation Loss: 0.242429
Validation loss decreased (0.248015 --> 0.242429).  Saving model ...
Epoch: 8 	Training Loss: 0.986314 	Validation Loss: 0.237203
Validation loss decreased (0.24242

In [24]:
#############################  Calculate & Print the Accurancy_4b on Test############################# 
# Load the model with the lowest validation loss
model_4b.load_state_dict(torch.load('model.pt'))
# Calculate the accurancy
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader_4b:
        embeddings, labels = data
        # calculating outputs by running embeddings through the network
        model_4b.to("cpu")
        outputs = model_4b(embeddings.float())
        # the class with the highest score is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(correct/total)

0.5851


- Conclusion
    - FNN model obviously works better than simple models with average vectors as training data.
    - Using the first 10 concatenated vectors as input is better than using the average vectors. The possible reason may be the loss of information when using average vectors. In addition, in most reviews, the first 10 words can correctly determine the classification result of this review.

# Recurrent Neural Networks

- Define the Network Architecture (RNN)

In [25]:
# define the RNN/GRN architecture 
class RNN(nn.Module):
    def __init__(self, model_type = "rnn"):
        super(RNN, self).__init__()
        
        # define the RNN's parameters
        self.hidden_dim = 20
        self.n_layers = 1
        self.model_type = model_type
        
        #RNN
        if self.model_type == "gru":
            self.rnn = nn.GRU(300, 20, 1, batch_first=True)
        else:
            self.rnn = nn.RNN(300, 20, 1, batch_first=True, nonlinearity='relu')

        #Outpur layer
        self.fc = nn.Linear(20, 5)
    
    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(1, x.size(0), 20).to(device)
    
        # One time step
        out, hn = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

# initialize the NN
model_5a = RNN(model_type = "rnn")
model_5b = RNN(model_type = "gru")
model_5a.cuda()
model_5b.cuda()
print(model_5a)
print(model_5b)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer_5a = torch.optim.SGD(model_5a.parameters(), lr=0.01)
optimizer_5b = torch.optim.SGD(model_5b.parameters(), lr=0.01)

RNN(
  (rnn): RNN(300, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=5, bias=True)
)
RNN(
  (rnn): GRU(300, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=5, bias=True)
)


In [26]:
#vStack the first 20 Word2Vec vectors of Google-pretrained Word2Vec Model with trancating & padding 
X_train_preW2Vseq20 =np.array(X_train.apply(lambda x: w2v_seq20(google_wv, x)).values.tolist())
X_test_preW2Vseq20 =np.array(X_test.apply(lambda x: w2v_seq20(google_wv, x)).values.tolist())
Y_train_preW2Vseq20 = np.array(Y_train.values.tolist())
Y_test_preW2Vseq20 = np.array(Y_test.values.tolist())
#Delete NaN Vectors with their Labels
Y_train_preW2Vseq20 = process_nanY(X_train_preW2Vseq20, Y_train_preW2Vseq20)
X_train_preW2Vseq20 = process_nanX(X_train_preW2Vseq20)
Y_test_preW2Vseq20 = process_nanY(X_test_preW2Vseq20, Y_test_preW2Vseq20)
X_test_preW2Vseq20 = process_nanX(X_test_preW2Vseq20)
#print(X_train_preW2Vseq20.shape)

- (a) Train a simple RNN for sentiment analysis, limiting the review length to 20

In [27]:
############################# Load the data_5a ############################# 
train_data_5a = Train(X_train_preW2Vseq20, Y_train_preW2Vseq20-1)
test_data_5a = Test(X_test_preW2Vseq20, Y_test_preW2Vseq20-1)

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 500
# percentage of training set to use as validation
valid_size = 0.2

# obtain training indices that will be used for validation
num_train = len(train_data_5a)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders
train_loader_5a = torch.utils.data.DataLoader(train_data_5a, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers,)
valid_loader_5a = torch.utils.data.DataLoader(train_data_5a, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers)
test_loader_5a = torch.utils.data.DataLoader(test_data_5a, batch_size=batch_size, num_workers=num_workers)

In [28]:
############################# Train the model_5a ############################# 
start = time.time()

# number of epochs to train the model
n_epochs = 100

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model_5a.train() # prep model for training
    for data, target in train_loader_5a:
        # transfer data and target to GPU
        data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer_5a.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_5a(data.float())
        # calculate the loss
        loss = criterion(output, target.to(torch.long))
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer_5a.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
    
    ######################    
    # validate the model #
    ######################
    model_5a.eval() # prep model for evaluation
    for data, target in valid_loader_5a:
        # transfer data and target to GPU
        data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_5a(data.float())
        # calculate the loss
        loss = criterion(output, target.to(torch.long))
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader_5a.dataset)
    valid_loss = valid_loss/len(valid_loader_5a.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model_5a.state_dict(), 'model.pt')
        valid_loss_min = valid_loss
        
end = time.time()
print('Time elapsed: %.2f s' % (end - start))

Epoch: 1 	Training Loss: 1.292324 	Validation Loss: 0.322846
Validation loss decreased (inf --> 0.322846).  Saving model ...
Epoch: 2 	Training Loss: 1.290458 	Validation Loss: 0.322543
Validation loss decreased (0.322846 --> 0.322543).  Saving model ...
Epoch: 3 	Training Loss: 1.289515 	Validation Loss: 0.322386
Validation loss decreased (0.322543 --> 0.322386).  Saving model ...
Epoch: 4 	Training Loss: 1.288990 	Validation Loss: 0.322294
Validation loss decreased (0.322386 --> 0.322294).  Saving model ...
Epoch: 5 	Training Loss: 1.288644 	Validation Loss: 0.322227
Validation loss decreased (0.322294 --> 0.322227).  Saving model ...
Epoch: 6 	Training Loss: 1.288379 	Validation Loss: 0.322172
Validation loss decreased (0.322227 --> 0.322172).  Saving model ...
Epoch: 7 	Training Loss: 1.288153 	Validation Loss: 0.322122
Validation loss decreased (0.322172 --> 0.322122).  Saving model ...
Epoch: 8 	Training Loss: 1.287949 	Validation Loss: 0.322075
Validation loss decreased (0.32212

In [29]:
#############################  Calculate & Print the Accurancy_5a on Test############################# 
# Load the model with the lowest validation loss
model_5a.load_state_dict(torch.load('model.pt'))
# Calculate the accurancy
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader_5a:
        embeddings, labels = data
        # transfer data and target to GPU
        embeddings, labels = embeddings.to(device), labels.to(device)
        # calculating outputs by running embeddings through the network
        model_5a.to(device)
        outputs = model_5a(embeddings.float())
        # the class with the highest score is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(correct/total)

0.5171


- (b) Repeat part (a) by considering a gated recurrent unit cell

In [30]:
############################# Load the data_5b ############################# 
train_data_5b = Train(X_train_preW2Vseq20, Y_train_preW2Vseq20-1)
test_data_5b = Test(X_test_preW2Vseq20, Y_test_preW2Vseq20-1)

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 500
# percentage of training set to use as validation
valid_size = 0.2

# obtain training indices that will be used for validation
num_train = len(train_data_5b)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders
train_loader_5b = torch.utils.data.DataLoader(train_data_5b, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers,)
valid_loader_5b = torch.utils.data.DataLoader(train_data_5b, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers)
test_loader_5b = torch.utils.data.DataLoader(test_data_5b, batch_size=batch_size, num_workers=num_workers)

In [31]:
############################# Train the model_5b ############################# 
start = time.time()

# number of epochs to train the model
n_epochs = 100

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model_5b.train() # prep model for training
    for data, target in train_loader_5b:
        # transfer data and target to GPU
        data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer_5b.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_5b(data.float())
        # calculate the loss
        loss = criterion(output, target.to(torch.long))
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer_5b.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
    
    ######################    
    # validate the model #
    ######################
    model_5b.eval() # prep model for evaluation
    for data, target in valid_loader_5b:
        # transfer data and target to GPU
        data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_5b(data.float())
        # calculate the loss
        loss = criterion(output, target.to(torch.long))
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader_5b.dataset)
    valid_loss = valid_loss/len(valid_loader_5b.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model_5b.state_dict(), 'model.pt')
        valid_loss_min = valid_loss
        
end = time.time()
print('Time elapsed: %.2f s' % (end - start))

Epoch: 1 	Training Loss: 1.297767 	Validation Loss: 0.323643
Validation loss decreased (inf --> 0.323643).  Saving model ...
Epoch: 2 	Training Loss: 1.292575 	Validation Loss: 0.322852
Validation loss decreased (0.323643 --> 0.322852).  Saving model ...
Epoch: 3 	Training Loss: 1.290530 	Validation Loss: 0.322515
Validation loss decreased (0.322852 --> 0.322515).  Saving model ...
Epoch: 4 	Training Loss: 1.289610 	Validation Loss: 0.322346
Validation loss decreased (0.322515 --> 0.322346).  Saving model ...
Epoch: 5 	Training Loss: 1.289103 	Validation Loss: 0.322238
Validation loss decreased (0.322346 --> 0.322238).  Saving model ...
Epoch: 6 	Training Loss: 1.288743 	Validation Loss: 0.322155
Validation loss decreased (0.322238 --> 0.322155).  Saving model ...
Epoch: 7 	Training Loss: 1.288443 	Validation Loss: 0.322082
Validation loss decreased (0.322155 --> 0.322082).  Saving model ...
Epoch: 8 	Training Loss: 1.288166 	Validation Loss: 0.322015
Validation loss decreased (0.32208

In [32]:
#############################  Calculate & Print the Accurancy_5b on Test############################# 
# Load the model with the lowest validation loss
model_5b.load_state_dict(torch.load('model.pt'))
# Calculate the accurancy
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader_5b:
        embeddings, labels = data
        # transfer data and target to GPU
        embeddings, labels = embeddings.to(device), labels.to(device)
        # calculating outputs by running embeddings through the network
        model_5b.to(device)
        outputs = model_5b(embeddings.float())
        # the class with the highest score is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(correct/total)

0.4332


- Conclusion
    - RNN model(0.52) works worse than FNN models(0.57 / 0.59) here. 
    - GRU model(0.43) (Gated Recurrent Neural Network) even work worse than simple RNN model(0.52).
    - The poor results of RNN may also be related to the network settings of RNN.

# Accuracy values for 6 cases: 
1. Perceptron -> 0.46
2. SVM -> 0.56
3. FNN(average Word2Vec vectors) -> 0.57
4. FNN(first 10 Word2Vec vectors) -> 0.59
5. RNN -> 0.52
6. GRN -> 0.43