In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data

from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from torch.nn.utils.rnn import pad_sequence

# python version: 3.8.2

[nltk_data] Downloading package wordnet to /Users/amy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/amy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/amy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/amy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 1.Dataset Generation

## Read Data and keep Reviews and Ratings

In [3]:
# Load only the rating and review-text columns; malformed rows are skipped.
# Both columns are cast to str so the later labeling / cleaning steps see text.
df = pd.read_csv(
    "amazon_reviews_us_Beauty_v1_00.tsv",
    sep="\t",
    usecols=['star_rating', 'review_body'],
    on_bad_lines='skip',
    low_memory=False,
).astype(str)

 ## We form three classes and select 20000 reviews randomly from each class.



In [4]:
def fun(x):
    """Map a star rating to a sentiment class label.

    The rating is parsed as a number instead of being compared as a string,
    so values such as "10" or "5.0" are classified by their numeric value
    rather than by lexicographic order.

    Parameters
    ----------
    x : str or number
        The raw star rating.

    Returns
    -------
    str
        "3" for ratings in [4, 5], "2" for a rating of 3, "1" for ratings in
        [1, 2], and "-1" for anything else (out-of-range, NaN or unparsable).
    """
    try:
        rating = float(x)
    except (TypeError, ValueError):
        # Not a number at all (e.g. a stray date or text in the column).
        return "-1"
    if 4 <= rating <= 5:
        return "3"
    if rating == 3:
        return "2"
    if 1 <= rating <= 2:
        return "1"
    # NaN fails every comparison above and ends up here as well.
    return "-1"

# Attach the class label, then draw a fixed-seed random sample of 20000 reviews
# from each of the three classes and stack them into one balanced frame.
df['label'] = df['star_rating'].apply(fun)
class1, class2, class3 = (
    df[df.label == class_label].sample(n=20000, random_state=1)
    for class_label in ("1", "2", "3")
)
dataset = pd.concat([class1, class2, class3], ignore_index=True)

## Data Cleaning



Remove HTML tags and URLs from the reviews, since they mostly consist of tokens that carry no useful information for classification.

In [5]:
# Replace HTML tags with a space.
# regex=True is passed explicitly: without it, newer pandas versions treat the
# pattern as a literal string and nothing would be removed. Raw strings keep
# the backslash escapes intact for the regex engine.
dataset['review_body'] = dataset['review_body'].str.replace(
    r'</?\w+[^>]*>', ' ', regex=True)
# Replace URLs with a space.
dataset['review_body'] = dataset['review_body'].str.replace(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    ' ', regex=True)

# Split dataset into 80% training dataset and 20% testing dataset

In [6]:
from sklearn import model_selection

# Targets are the class labels converted to integers; inputs are the cleaned review texts.
y = dataset['label'].map(int)
x = dataset['review_body']

# 80% / 20% split with a fixed seed.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.2, random_state=2
)


# 2. Word Embedding
## reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

## （a）Load the pretrained “word2vec-google-news-300” Word2Vec model

In [7]:
import gensim.downloader as api
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
# `wv` is used below for the similarity checks and as the embedding lookup for every feature set.
wv = api.load('word2vec-google-news-300')

In [8]:
# Word pairs whose similarity is inspected under the pretrained vectors.
pairs = [('excellent', 'outstanding'), ('man', 'king'), ('woman', 'queen')]
for w1, w2 in pairs:
    score = wv.similarity(w1, w2)
    print('%r\t%r\t%.2f' % (w1, w2, score))

'excellent'	'outstanding'	0.56
'man'	'king'	0.23
'woman'	'queen'	0.32


## (b) Train a Word2Vec model using my own dataset

### First, generate corpus based on our dataset

In [9]:
from gensim import utils

# One token list per review, produced by gensim's simple_preprocess.
myCorpus = [utils.simple_preprocess(d) for d in dataset['review_body']]

In [10]:
import gensim.models

# Train a Word2Vec model on the tokenized reviews built above.
sentences = myCorpus
myModel = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=300,
    window=13,
    min_count=9,
)

In [11]:
# Same word pairs as before, now scored with the model trained on our own reviews.
pairs = [('excellent', 'outstanding'), ('man', 'king'), ('woman', 'queen')]
for w1, w2 in pairs:
    score = myModel.wv.similarity(w1, w2)
    print('%r\t%r\t%.2f' % (w1, w2, score))

'excellent'	'outstanding'	0.66
'man'	'king'	0.36
'woman'	'queen'	0.29


Answer: The model trained on our own dataset appears to encode these examples somewhat better, since two of the three word pairs receive higher similarity scores (the 'woman'/'queen' pair scores slightly lower). The corpus used for word2vec-google-news-300 is much larger than our dataset, so differences in corpus content and size lead to different learned parameters and therefore to different similarity scores.

# 3. Simple models

## First, generate the average Word2Vec vectors for each review as the input feature 

In [12]:
def getAverageWV(dataset, wv, dim=None):
    """Build one averaged word-vector feature per review.

    Each review is split on whitespace; words found in ``wv`` contribute their
    vector, all other words are skipped.

    Parameters
    ----------
    dataset : iterable of str
        The review texts.
    wv : mapping-like
        Anything supporting ``word in wv`` and ``wv[word]``.
    dim : int, optional
        Dimensionality of the vectors. When omitted it is taken from
        ``wv.vector_size`` if that attribute exists, otherwise 300.

    Returns
    -------
    list of numpy.ndarray
        One ``(dim,)`` array per review: the mean of the known word vectors,
        or an all-zero vector when no word of the review is in ``wv``.
    """
    if dim is None:
        dim = getattr(wv, 'vector_size', 300)
    averageWV = []
    for review in dataset:
        known = [wv[word] for word in review.split() if word in wv]
        if not known:
            averageWV.append(np.zeros(dim))
            continue
        # Accumulate in the order the words appear.
        reviewVec = np.zeros(dim)
        for vec in known:
            reviewVec = reviewVec + vec
        averageWV.append(reviewVec / len(known))
    return averageWV


In [13]:
# Averaged-embedding features for the train and test splits (train first, then test);
# the label series are carried over unchanged.
x_vec_train, x_vec_test = (getAverageWV(split, wv) for split in (x_train, x_test))
y_vec_train, y_vec_test = y_train, y_test

In [14]:
# Shift the labels down by one (1, 2, 3 -> 0, 1, 2).
# They are derived from y_train / y_test rather than from y_vec_* themselves,
# so re-running this cell does not shift the labels a second time.
y_vec_train = y_train - 1
y_vec_test = y_test - 1

## Perceptron
### here I use the same perceptron model as HW1 does

In [15]:
from sklearn.linear_model import Perceptron

# Perceptron on the averaged Word2Vec features.
perceptron = Perceptron(
    penalty='elasticnet',
    eta0=0.01,
    max_iter=80,
    tol=1e-5,
)
y_vec_test_pred = perceptron.fit(x_vec_train, y_vec_train).predict(x_vec_test)

In [16]:
from sklearn.metrics import accuracy_score

# Accuracy of the perceptron predictions on the test split.
accuracy = accuracy_score(y_true=y_vec_test, y_pred=y_vec_test_pred)
print(accuracy)

0.5753333333333334


Note : The perceptron accuracy based on TF-IDF features is : 0.6246666666666667

## SVM

In [17]:
from sklearn.svm import LinearSVC

# Linear SVM on the averaged Word2Vec features.
svm = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    multi_class='ovr',
    intercept_scaling=1,
    max_iter=1000,
)
y_vec_test_pred = svm.fit(x_vec_train, y_vec_train).predict(x_vec_test)

In [18]:
# Accuracy of the SVM predictions on the test split.
accuracy = accuracy_score(y_true=y_vec_test, y_pred=y_vec_test_pred)
print(accuracy)

0.6300833333333333


Note : The SVM accuracy based on TF-IDF features is : 0.694

Answer: Both the perceptron and the SVM achieve higher accuracy with TF-IDF features than with averaged Word2Vec features, which suggests that TF-IDF features are better suited to our dataset.

# 4. Feedforward Neural Networks
### (a) Using the Word2Vec features generated from Task3, train a feedforward multilayer perceptron network for classification
#### Reference: https://www.kaggle.com/code/mishra1993/pytorch-multi-layer-perceptron-mnist/notebook

input size = 300, 
hidden1_size = 100, 
hidden2_size = 10, 
output_size = 3 -> 3-class classification problem; the labels 1, 2, 3 are shifted to 0, 1, 2 before training

In [19]:
class Net(nn.Module):
    """Feedforward classifier with layer sizes 300 -> 100 -> 10 -> 3."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(300, 100)
        self.fc2 = nn.Linear(100, 10)
        self.fc3 = nn.Linear(10, 3)

    def forward(self, x):
        """Return the raw 3-way class scores for a batch of 300-d inputs."""
        hidden1 = F.relu(self.fc1(x))
        hidden2 = F.relu(self.fc2(hidden1))
        # No activation on the last layer: the scores go to CrossEntropyLoss.
        return self.fc3(hidden2)

model = Net()

#### Using the Word2Vec features generated from Task3.  
Here I convert x_vec_train and x_vec_test into float type and then convert them into tensor. For y_vec_train and y_vec_test, I convert them into float type and then Long type for future computing and then convert them into tensor. The tensor type is what I needed to feed model.

In [20]:
# Features become float32 tensors.
x_FNN_train = torch.tensor(np.asarray(x_vec_train), dtype=torch.float32)
x_FNN_test = torch.tensor(np.asarray(x_vec_test), dtype=torch.float32)

# Labels become int64 (long) tensors, the target type used with CrossEntropyLoss below.
y_FNN_train = torch.tensor(np.asarray(y_vec_train), dtype=torch.long)
y_FNN_test = torch.tensor(np.asarray(y_vec_test), dtype=torch.long)


In [21]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
n_epochs = 100

# Full-batch training: every epoch is a single optimizer step on the whole training set.
model.train()
for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad()
    output = model(x_FNN_train)
    loss = criterion(output, y_FNN_train)
    loss.backward()
    optimizer.step()
    # Report the loss every 10th epoch.
    if epoch % 10 == 0:
        print('Epoch: {} \t Traning Loss: {:.6f}'.format(epoch, loss.item()))
        

Epoch: 10 	 Traning Loss: 0.977401
Epoch: 20 	 Traning Loss: 0.911542
Epoch: 30 	 Traning Loss: 0.866037
Epoch: 40 	 Traning Loss: 0.837069
Epoch: 50 	 Traning Loss: 0.814971
Epoch: 60 	 Traning Loss: 0.796916
Epoch: 70 	 Traning Loss: 0.790416
Epoch: 80 	 Traning Loss: 0.773514
Epoch: 90 	 Traning Loss: 0.761992
Epoch: 100 	 Traning Loss: 0.751186


In [22]:
# Inference only: switch to eval mode and skip autograd bookkeeping.
# Note that the model is left in eval mode afterwards.
model.eval()
with torch.no_grad():
    y_FNN_test_pred = model(x_FNN_test)
# Predicted class = index of the largest score in each row.
y_FNN_test_pred = y_FNN_test_pred.detach().numpy().argmax(axis = 1)

# Vectorized count of correct predictions instead of an element-wise Python loop.
totalCount = len(y_FNN_test_pred)
currentCount = int((y_FNN_test_pred == y_FNN_test.numpy()).sum())

if(totalCount > 0):
    print('Accuracy is : ', currentCount / totalCount)

Accuracy is :  0.6405


#### (b) To generate the input features, concatenate the first 10 Word2Vec vectors for each review as the input feature
We process each review word by word. A word is skipped if it is not in wv; otherwise its word vector is appended to reviewVec, until reviewVec holds 10 vectors or the review has no more words. If reviewVec ends up with fewer than 10 vectors, we append np.zeros(300) until its size equals 10.

In [23]:
def getAverageWV_10(dataset, wv, n_words=10, dim=None):
    """Collect the first ``n_words`` known word vectors of every review.

    Despite the name, nothing is averaged here: the vectors are kept as a
    sequence so they can be concatenated by the model.

    Parameters
    ----------
    dataset : iterable of str
        The review texts; each one is split on whitespace.
    wv : mapping-like
        Anything supporting ``word in wv`` and ``wv[word]``.
    n_words : int, optional
        Number of vectors kept per review (default 10).
    dim : int, optional
        Dimensionality of the zero padding vectors. When omitted it is taken
        from ``wv.vector_size`` if that attribute exists, otherwise 300.

    Returns
    -------
    list of list of numpy.ndarray
        For each review exactly ``n_words`` vectors: the vectors of the first
        words found in ``wv``, followed by all-zero vectors if the review has
        fewer known words.
    """
    if dim is None:
        dim = getattr(wv, 'vector_size', 300)
    features = []
    for review in dataset:
        reviewVec = []
        for word in review.split():
            # Stop scanning as soon as the quota is filled.
            if len(reviewVec) >= n_words:
                break
            if word in wv:
                reviewVec.append(wv[word])
        # Pad short reviews with zero vectors up to n_words entries.
        reviewVec.extend(np.zeros(dim) for _ in range(n_words - len(reviewVec)))
        features.append(reviewVec)
    return features

In [24]:
# First-10-word-vector features for the train and test splits (train first, then test);
# the label series are carried over unchanged.
x_vec_10_train, x_vec_10_test = (getAverageWV_10(split, wv) for split in (x_train, x_test))
y_vec_10_train, y_vec_10_test = y_train, y_test

In [25]:
# Shift the labels down by one (1, 2, 3 -> 0, 1, 2).
# They are derived from y_train / y_test rather than from y_vec_10_* themselves,
# so re-running this cell does not shift the labels a second time.
y_vec_10_train = y_train - 1
y_vec_10_test = y_test - 1

#### Using the Word2Vec features generated from Task3.  
Here I convert x_vec_10_train and x_vec_10_test into float type and then convert them into tensor. For y_vec_10_train and y_vec_10_test, I convert them into float type and then Long type for future computing and then convert them into tensor. The tensor type is what I needed to feed model.

In [26]:
# Features become float32 tensors.
x_FNN_10_train = torch.tensor(np.asarray(x_vec_10_train), dtype=torch.float32)
x_FNN_10_test = torch.tensor(np.asarray(x_vec_10_test), dtype=torch.float32)

# Labels become int64 (long) tensors, the target type used with CrossEntropyLoss below.
y_FNN_10_train = torch.tensor(np.asarray(y_vec_10_train), dtype=torch.long)
y_FNN_10_test = torch.tensor(np.asarray(y_vec_10_test), dtype=torch.long)


Here the input_size is 300 * 10 as first 10 Word2Vec are collected.

hidden1_size = 100, hidden2_size = 10, output_size = 3 -> 3-classification problem

In [27]:
class ConcatNet(nn.Module):
    """Feedforward classifier over 10 concatenated 300-d vectors: 3000 -> 100 -> 10 -> 3."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(300 * 10, 100)
        self.fc2 = nn.Linear(100, 10)
        self.fc3 = nn.Linear(10, 3)
        # NOTE(review): this dropout layer is created but never applied in forward().
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Flatten the input to rows of 3000 values and return raw 3-way class scores."""
        flat = x.view(-1, 300 * 10)
        hidden1 = F.relu(self.fc1(flat))
        hidden2 = F.relu(self.fc2(hidden1))
        return self.fc3(hidden2)

concatFNNModel = ConcatNet()

In [28]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(concatFNNModel.parameters(), lr=0.01)
n_epochs = 30

# Full-batch training: every epoch is a single optimizer step on the whole training set.
concatFNNModel.train()
for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad()
    output = concatFNNModel(x_FNN_10_train)
    loss = criterion(output, y_FNN_10_train)
    loss.backward()
    optimizer.step()
    # Report the loss every 10th epoch.
    if epoch % 10 == 0:
        print('Epoch: {} \t Traning Loss: {:.6f}'.format(epoch, loss.item()))

Epoch: 10 	 Traning Loss: 1.000331
Epoch: 20 	 Traning Loss: 0.914064
Epoch: 30 	 Traning Loss: 0.874733


In [29]:
# Inference only: switch to eval mode and skip autograd bookkeeping.
# Note that the model is left in eval mode afterwards.
concatFNNModel.eval()
with torch.no_grad():
    y_FNN_10_test_pred = concatFNNModel(x_FNN_10_test)
# Predicted class = index of the largest score in each row.
y_FNN_10_test_pred = y_FNN_10_test_pred.detach().numpy().argmax(axis = 1)
print(y_FNN_10_test_pred)

# Vectorized count of correct predictions instead of an element-wise Python loop.
totalCount = len(y_FNN_10_test_pred)
currentCount = int((y_FNN_10_test_pred == y_FNN_10_test.numpy()).sum())

if(totalCount > 0):
    print('Accuracy is : ', currentCount / totalCount)

[0 2 0 ... 1 0 2]
Accuracy is :  0.5529166666666666


Answer: The FNN based on averaged Word2Vec vectors achieves higher accuracy. I checked our dataset: the average review is longer than 10 words, which means the first 10 word vectors may not be enough for this 3-class classification problem. However, because only the first 10 word vectors of each review are used, the computation is much faster.

# 5. Recurrent Neural Networks
### Reference: https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

#### To generate the input features, concatenate the first 20 Word2Vec vectors for each review as the input feature
We process each review word by word. A word is skipped if it is not in wv; otherwise its word vector is appended to reviewVec, until reviewVec holds 20 vectors or the review has no more words. If reviewVec ends up with fewer than 20 vectors, we append np.zeros(300) until its size equals 20.

In [30]:
def getAverageWV_20(dataset, wv, n_words=20, dim=None):
    """Collect the first ``n_words`` known word vectors of every review.

    Despite the name, nothing is averaged here: the vectors are kept as a
    sequence for the recurrent models.

    Parameters
    ----------
    dataset : iterable of str
        The review texts; each one is split on whitespace.
    wv : mapping-like
        Anything supporting ``word in wv`` and ``wv[word]``.
    n_words : int, optional
        Number of vectors kept per review (default 20).
    dim : int, optional
        Dimensionality of the zero padding vectors. When omitted it is taken
        from ``wv.vector_size`` if that attribute exists, otherwise 300.

    Returns
    -------
    list of list of numpy.ndarray
        For each review exactly ``n_words`` vectors: the vectors of the first
        words found in ``wv``, followed by all-zero vectors if the review has
        fewer known words.
    """
    if dim is None:
        dim = getattr(wv, 'vector_size', 300)
    features = []
    for review in dataset:
        reviewVec = []
        for word in review.split():
            # Stop scanning as soon as the quota is filled.
            if len(reviewVec) >= n_words:
                break
            if word in wv:
                reviewVec.append(wv[word])
        # Pad short reviews with zero vectors up to n_words entries.
        reviewVec.extend(np.zeros(dim) for _ in range(n_words - len(reviewVec)))
        features.append(reviewVec)
    return features

This is a helper function for DataLoader since we need to split our dataset into specific batch size and get (padded_data, label, length) to feed into model.

In [31]:
def collate_fn(batch):
    """Collate (features, label) samples into a padded batch for the DataLoader.

    Parameters
    ----------
    batch : sequence
        Samples whose element 0 is the per-step feature matrix (tensor or
        array-like) and whose element 1 is the label.

    Returns
    -------
    tuple
        ``(padded, label, length)`` where ``padded`` is a float32 tensor with
        the batch dimension first and shorter samples zero-padded, ``label``
        is the list of labels as given, and ``length`` is the list of
        ``len(features)`` for every sample.

    Notes
    -----
    ``length`` counts all rows of a sample. If the features were already
    zero-padded to a fixed size before reaching this function, that padded
    size is what gets reported.
    """
    X = []
    label = []
    length = []
    for x in batch:
        features, target = x[0], x[1]
        # as_tensor converts arrays and reuses tensors that are already float32,
        # avoiding the copy-construct warning torch.tensor() emits for tensor input.
        x_vec_tensor = torch.as_tensor(features, dtype=torch.float32)
        X.append(x_vec_tensor)
        label.append(target)
        length.append(len(features))
    res = pad_sequence(X, batch_first = True, padding_value = 0)
    return res, label, length

In [32]:
# First-20-word-vector features for the train and test splits (train first, then test);
# the label series are carried over unchanged.
x_vec_20_train, x_vec_20_test = (getAverageWV_20(split, wv) for split in (x_train, x_test))
y_vec_20_train, y_vec_20_test = y_train, y_test

In [33]:
# Shift the labels down by one (1, 2, 3 -> 0, 1, 2).
# They are derived from y_train / y_test rather than from y_vec_20_* themselves,
# so re-running this cell does not shift the labels a second time.
y_vec_20_train = y_train - 1
y_vec_20_test = y_test - 1

Here I convert x_vec_20_train and x_vec_20_test into float type and then convert them into tensor. For y_vec_20_train and y_vec_20_test, I convert them into float type and then Long type for future computing and then convert them into tensor. The tensor type is what I needed to feed model.

In [34]:
# Features become float32 tensors.
x_RNN_20_train = torch.tensor(np.asarray(x_vec_20_train), dtype=torch.float32)
x_RNN_20_test = torch.tensor(np.asarray(x_vec_20_test), dtype=torch.float32)

# Labels become int64 (long) tensors, the target type used with CrossEntropyLoss below.
y_RNN_20_train = torch.tensor(np.asarray(y_vec_20_train), dtype=torch.long)
y_RNN_20_test = torch.tensor(np.asarray(y_vec_20_test), dtype=torch.long)

## (a) simple RNN

consider an RNN cell with the hidden state size of 20 for this 3-classification problem

For this part, in order to perform batch processing more efficiently, it is necessary to fill the sample sequence to ensure that each sample has the same length. Although the length of the filled sample sequence is the same, the sequence may be filled with a lot of invalid value 0. Feeding the filled value 0 to the RNN for forward calculation will not only waste computing resources, but may also have errors in the final value. Therefore, before sending the sequence to RNN for processing, pack_padded_sequence needs to be used for compression to compress invalid padding values. After the sequence is processed by RNN, the output is still a compressed sequence. It is necessary to use pad_packed_sequence to fill the compressed sequence back for subsequent processing.

In [36]:
input_size = 300
hidden_size = 20
output_size = 3
num_layers = 1
sequence_length = 20 #length of review as required

class RNN(nn.Module):
    """Simple RNN classifier that feeds every time step's hidden state to a linear layer.

    Parameters
    ----------
    input_size : int
        Size of each input vector.
    hidden_size : int
        Size of the RNN hidden state.
    num_layers : int
        Number of stacked RNN layers.
    output_size : int
        Number of classes.
    seq_len : int, optional
        Number of time steps the linear layer expects; defaults to the
        ``sequence_length`` defined above. Batches are padded back to this
        length, so it must not be smaller than the longest sequence.
    """
    def __init__(self, input_size, hidden_size, num_layers, output_size,
                 seq_len = sequence_length):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.layer = nn.RNN(input_size, hidden_size, num_layers, batch_first = True)
        self.fc = nn.Linear(seq_len * hidden_size, output_size)

    def forward(self, input, length):
        """Return class scores for a padded batch ``input`` with per-sample lengths ``length``."""
        batch_size = input.size(0)
        hidden = self.initHidden(batch_size, device = input.device)
        x_packed = pack_padded_sequence(input, length, batch_first = True, enforce_sorted = False)
        output, _ = self.layer(x_packed, hidden)
        # total_length pads back to a fixed number of steps, so the flattened size
        # always matches self.fc even when every sequence in the batch is shorter.
        output, output_length = pad_packed_sequence(output, batch_first = True,
                                                    total_length = self.seq_len)
        output = self.fc(output.reshape(output.shape[0], -1))
        return output

    def initHidden(self, batch_size, device = None):
        """Return an all-zero initial hidden state, optionally on ``device``."""
        return torch.zeros(self.num_layers, batch_size, self.hidden_size, device = device)

rnn = RNN(input_size, hidden_size, num_layers, output_size)


In [37]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.002)

n_epochs = 40
BATCH_SIZE = 64
rnn_dataset = Data.TensorDataset(x_RNN_20_train, y_RNN_20_train)
rnn_loader = Data.DataLoader(
    dataset=rnn_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

# Mini-batch training. The value printed per epoch is the loss of the last batch only.
rnn.train()
for epoch in range(1, n_epochs + 1):
    for data, target, length in rnn_loader:
        # collate_fn returns the labels as a list; turn them into a long tensor.
        target = torch.tensor(target, dtype=torch.long)
        optimizer.zero_grad()
        output = rnn(data, length)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print('Epoch: {} \t Traning Loss: {:.6f}'.format(epoch, loss.item()))

Epoch: 1 	 Traning Loss: 0.942759
Epoch: 2 	 Traning Loss: 0.911457
Epoch: 3 	 Traning Loss: 0.905028
Epoch: 4 	 Traning Loss: 0.879520
Epoch: 5 	 Traning Loss: 0.865990
Epoch: 6 	 Traning Loss: 0.862684
Epoch: 7 	 Traning Loss: 0.860867
Epoch: 8 	 Traning Loss: 0.859361
Epoch: 9 	 Traning Loss: 0.857098
Epoch: 10 	 Traning Loss: 0.852774
Epoch: 11 	 Traning Loss: 0.846582
Epoch: 12 	 Traning Loss: 0.840210
Epoch: 13 	 Traning Loss: 0.834388
Epoch: 14 	 Traning Loss: 0.828894
Epoch: 15 	 Traning Loss: 0.823316
Epoch: 16 	 Traning Loss: 0.817630
Epoch: 17 	 Traning Loss: 0.812294
Epoch: 18 	 Traning Loss: 0.807734
Epoch: 19 	 Traning Loss: 0.803968
Epoch: 20 	 Traning Loss: 0.801633
Epoch: 21 	 Traning Loss: 0.800215
Epoch: 22 	 Traning Loss: 0.799657
Epoch: 23 	 Traning Loss: 0.799701
Epoch: 24 	 Traning Loss: 0.799466
Epoch: 25 	 Traning Loss: 0.799051
Epoch: 26 	 Traning Loss: 0.798524
Epoch: 27 	 Traning Loss: 0.798798
Epoch: 28 	 Traning Loss: 0.798449
Epoch: 29 	 Traning Loss: 0.7

In [38]:
# Batched loader over the test split, using the same collate function as training.
rnn_test_dataset = Data.TensorDataset(x_RNN_20_test, y_RNN_20_test)
rnn_test_loader = Data.DataLoader(
    dataset=rnn_test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)


In [39]:
# Evaluate in eval mode without gradient tracking; training mode is restored afterwards.
rnn.eval()
totalCount = 0
currentCount = 0
with torch.no_grad():
    for data, target, length in rnn_test_loader:
        output = rnn(data, length)
        _, pred = torch.max(output, 1)
        # The loader yields the labels as a list; compare them as a tensor
        # so the correct predictions are counted in one vectorized step.
        target = torch.tensor(target, dtype = torch.long)
        currentCount += (pred == target).sum().item()
        totalCount += pred.size(0)
rnn.train()
print('accuracy is :', currentCount / totalCount)

accuracy is : 0.5863333333333334


Answer: The accuracy of the simple RNN lies between that of the FNN based on averaged word vectors and that of the first-10-word-vector FNN. The FNN based on averaged word vectors still gives better accuracy, since the average covers every word of a review and therefore delivers more information to the model for this classification problem. Compared to the first-10-word-vector FNN, the model in Task 5 learns from the first 20 vectors, so it sees more information, which makes its higher accuracy plausible.

## (b) GRU

consider an RNN cell with the hidden state size of 20 for this 3-classification problem

For this part, in order to perform batch processing more efficiently, it is necessary to fill the sample sequence to ensure that each sample has the same length. Although the length of the filled sample sequence is the same, the sequence may be filled with a lot of invalid value 0. Feeding the filled value 0 to the RNN for forward calculation will not only waste computing resources, but may also have errors in the final value. Therefore, before sending the sequence to RNN for processing, pack_padded_sequence needs to be used for compression to compress invalid padding values. After the sequence is processed by RNN, the output is still a compressed sequence. It is necessary to use pad_packed_sequence to fill the compressed sequence back for subsequent processing.

In [40]:
input_size = 300
hidden_size = 20
output_size = 3
num_layers = 1
sequence_length = 20

class GRUModel(nn.Module):
    """GRU classifier that feeds every time step's hidden state to a linear layer.

    Parameters
    ----------
    input_size : int
        Size of each input vector.
    hidden_size : int
        Size of the GRU hidden state.
    num_layers : int
        Number of stacked GRU layers.
    output_size : int
        Number of classes.
    seq_len : int, optional
        Number of time steps the linear layer expects; defaults to the
        ``sequence_length`` defined above. Batches are padded back to this
        length, so it must not be smaller than the longest sequence.
    """
    def __init__(self, input_size, hidden_size, num_layers, output_size,
                 seq_len = sequence_length):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.layer = nn.GRU(input_size, hidden_size, num_layers, batch_first = True)
        self.fc = nn.Linear(seq_len * hidden_size, output_size)

    def forward(self, input, length):
        """Return class scores for a padded batch ``input`` with per-sample lengths ``length``."""
        batch_size = input.size(0)
        hidden = self.initHidden(batch_size, device = input.device)
        x_packed = pack_padded_sequence(input, length, batch_first = True, enforce_sorted = False)
        output, _ = self.layer(x_packed, hidden)
        # total_length pads back to a fixed number of steps, so the flattened size
        # always matches self.fc even when every sequence in the batch is shorter.
        output, output_length = pad_packed_sequence(output, batch_first = True,
                                                    total_length = self.seq_len)
        output = self.fc(output.reshape(output.shape[0], -1))
        return output

    def initHidden(self, batch_size, device = None):
        """Return an all-zero initial hidden state, optionally on ``device``."""
        return torch.zeros(self.num_layers, batch_size, self.hidden_size, device = device)

GRU = GRUModel(input_size, hidden_size, num_layers, output_size)


In [41]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(GRU.parameters(), lr=0.008)

n_epochs = 20
BATCH_SIZE = 64

GRU_dataset = Data.TensorDataset(x_RNN_20_train, y_RNN_20_train)
GRU_loader = Data.DataLoader(
    dataset=GRU_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

# Mini-batch training. The value printed per epoch is the loss of the last batch only.
GRU.train()
for epoch in range(1, n_epochs + 1):
    for data, target, length in GRU_loader:
        # collate_fn returns the labels as a list; turn them into a long tensor.
        target = torch.tensor(target, dtype=torch.long)
        optimizer.zero_grad()
        output = GRU(data, length)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print('Epoch: {} \t Traning Loss: {:.6f}'.format(epoch, loss.item()))

Epoch: 1 	 Traning Loss: 0.954589
Epoch: 2 	 Traning Loss: 0.933131
Epoch: 3 	 Traning Loss: 0.936841
Epoch: 4 	 Traning Loss: 0.921497
Epoch: 5 	 Traning Loss: 0.862357
Epoch: 6 	 Traning Loss: 0.805082
Epoch: 7 	 Traning Loss: 0.741512
Epoch: 8 	 Traning Loss: 0.674689
Epoch: 9 	 Traning Loss: 0.656668
Epoch: 10 	 Traning Loss: 0.680654
Epoch: 11 	 Traning Loss: 0.554102
Epoch: 12 	 Traning Loss: 0.597045
Epoch: 13 	 Traning Loss: 0.556302
Epoch: 14 	 Traning Loss: 0.546781
Epoch: 15 	 Traning Loss: 0.534088
Epoch: 16 	 Traning Loss: 0.481103
Epoch: 17 	 Traning Loss: 0.521146
Epoch: 18 	 Traning Loss: 0.488755
Epoch: 19 	 Traning Loss: 0.515605
Epoch: 20 	 Traning Loss: 0.485159


In [42]:
# Batched loader over the test split, using the same collate function as training.
GRU_test_dataset = Data.TensorDataset(x_RNN_20_test, y_RNN_20_test)
GRU_test_loader = Data.DataLoader(
    dataset=GRU_test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)


In [43]:
# Evaluate the GRU in eval mode without gradient tracking, iterating over the
# GRU test loader, and restore training mode on the GRU itself afterwards
# (not on the simple RNN from the previous section).
GRU.eval()
totalCount = 0
currentCount = 0
with torch.no_grad():
    for data, target, length in GRU_test_loader:
        output = GRU(data, length)
        _, pred = torch.max(output, 1)
        # The loader yields the labels as a list; compare them as a tensor
        # so the correct predictions are counted in one vectorized step.
        target = torch.tensor(target, dtype = torch.long)
        currentCount += (pred == target).sum().item()
        totalCount += pred.size(0)
GRU.train()
print('accuracy is :', currentCount / totalCount)

accuracy is : 0.5969166666666667


The accuracy of the GRU lies between that of the FNN based on averaged word vectors and that of the first-10-word-vector FNN. The FNN based on averaged word vectors still gives better accuracy, since the average covers every word of a review and therefore delivers more information to the model for this classification problem. Compared to the first-10-word-vector FNN, the model in Task 5 learns from the first 20 vectors, so it sees more information, which makes its higher accuracy plausible.

## （c）LSTM

For this part, in order to perform batch processing more efficiently, it is necessary to fill the sample sequence to ensure that each sample has the same length. Although the length of the filled sample sequence is the same, the sequence may be filled with a lot of invalid value 0. Feeding the filled value 0 to the RNN for forward calculation will not only waste computing resources, but may also have errors in the final value. Therefore, before sending the sequence to RNN for processing, pack_padded_sequence needs to be used for compression to compress invalid padding values. After the sequence is processed by RNN, the output is still a compressed sequence. It is necessary to use pad_packed_sequence to fill the compressed sequence back for subsequent processing.

Compared to RNN and GRU, LSTM has one more cell state needed to feed into output gate, here I add one more thing in initHidden().

In [44]:
input_size = 300
hidden_size = 20
output_size = 3
num_layers = 1
sequence_length = 20

class LSTMModel(nn.Module):
    """LSTM classifier that feeds every time step's hidden state to a linear layer.

    Parameters
    ----------
    input_size : int
        Size of each input vector.
    hidden_size : int
        Size of the LSTM hidden and cell states.
    num_layers : int
        Number of stacked LSTM layers.
    output_size : int
        Number of classes.
    seq_len : int, optional
        Number of time steps the linear layer expects; defaults to the
        ``sequence_length`` defined above. Batches are padded back to this
        length, so it must not be smaller than the longest sequence.
    """
    def __init__(self, input_size, hidden_size, num_layers, output_size,
                 seq_len = sequence_length):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.layer = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True)
        self.fc = nn.Linear(seq_len * hidden_size, output_size)

    def forward(self, input, length):
        """Return class scores for a padded batch ``input`` with per-sample lengths ``length``."""
        batch_size = input.size(0)
        hidden = self.initHidden(batch_size, device = input.device)
        x_packed = pack_padded_sequence(input, length, batch_first = True, enforce_sorted = False)
        output, hidden = self.layer(x_packed, hidden)
        # total_length pads back to a fixed number of steps, so the flattened size
        # always matches self.fc even when every sequence in the batch is shorter.
        output, output_length = pad_packed_sequence(output, batch_first = True,
                                                    total_length = self.seq_len)
        output = self.fc(output.reshape(output.shape[0], -1))
        return output

    def initHidden(self, batch_size, device = None):
        """Return all-zero initial (hidden state, cell state), optionally on ``device``."""
        return (torch.zeros(self.num_layers, batch_size, self.hidden_size, device = device),
                torch.zeros(self.num_layers, batch_size, self.hidden_size, device = device))

LSTM = LSTMModel(input_size, hidden_size, num_layers, output_size)


In [45]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(LSTM.parameters(), lr=0.005)

n_epochs = 20
BATCH_SIZE = 64

LSTM_dataset = Data.TensorDataset(x_RNN_20_train, y_RNN_20_train)
LSTM_loader = Data.DataLoader(
    dataset=LSTM_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

# Mini-batch training. The value printed per epoch is the loss of the last batch only.
LSTM.train()
for epoch in range(1, n_epochs + 1):
    for data, target, length in LSTM_loader:
        # collate_fn returns the labels as a list; turn them into a long tensor.
        target = torch.tensor(target, dtype=torch.long)
        optimizer.zero_grad()
        output = LSTM(data, length)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print('Epoch: {} \t Traning Loss: {:.6f}'.format(epoch, loss.item()))

Epoch: 1 	 Traning Loss: 0.941118
Epoch: 2 	 Traning Loss: 0.878589
Epoch: 3 	 Traning Loss: 0.830294
Epoch: 4 	 Traning Loss: 0.787551
Epoch: 5 	 Traning Loss: 0.763168
Epoch: 6 	 Traning Loss: 0.739457
Epoch: 7 	 Traning Loss: 0.721872
Epoch: 8 	 Traning Loss: 0.716214
Epoch: 9 	 Traning Loss: 0.701400
Epoch: 10 	 Traning Loss: 0.698533
Epoch: 11 	 Traning Loss: 0.645968
Epoch: 12 	 Traning Loss: 0.613846
Epoch: 13 	 Traning Loss: 0.590450
Epoch: 14 	 Traning Loss: 0.565637
Epoch: 15 	 Traning Loss: 0.531812
Epoch: 16 	 Traning Loss: 0.519008
Epoch: 17 	 Traning Loss: 0.529143
Epoch: 18 	 Traning Loss: 0.508652
Epoch: 19 	 Traning Loss: 0.492074
Epoch: 20 	 Traning Loss: 0.503269


In [46]:
# Batched loader over the test split, using the same collate function as training.
LSTM_test_dataset = Data.TensorDataset(x_RNN_20_test, y_RNN_20_test)
LSTM_test_loader = Data.DataLoader(
    dataset=LSTM_test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)


In [47]:
# Evaluate the LSTM in eval mode without gradient tracking and restore training
# mode on the LSTM itself afterwards (not on the simple RNN from an earlier section).
LSTM.eval()
totalCount = 0
currentCount = 0
with torch.no_grad():
    for data, target, length in LSTM_test_loader:
        output = LSTM(data, length)
        _, pred = torch.max(output, 1)
        # The loader yields the labels as a list; compare them as a tensor
        # so the correct predictions are counted in one vectorized step.
        target = torch.tensor(target, dtype = torch.long)
        currentCount += (pred == target).sum().item()
        totalCount += pred.size(0)
LSTM.train()
print('accuracy is :', currentCount / totalCount)
    

accuracy is : 0.6000833333333333


Answer: The accuracy of the LSTM lies between that of the FNN based on averaged word vectors and that of the first-10-word-vector FNN. The FNN based on averaged word vectors still gives better accuracy, since the average covers every word of a review and therefore delivers more information to the model for this classification problem. Compared to the first-10-word-vector FNN, the model in Task 5 learns from the first 20 vectors, so it sees more information, which makes its higher accuracy plausible.

Answer: For this part, with the review length fixed at 20, the LSTM gives the best accuracy and the simple RNN gives the lowest. Since the LSTM is designed to handle long-range dependencies, this may indicate that our 20-word reviews contain longer dependencies than we expected, which the simple RNN fails to capture. The GRU's gates clearly produce better results than the simple RNN, which shows that gating is useful and improves performance.