# Sentiment Analysis Using Neural Networks

- Libraries Used:  __Pandas, gensim, contractions, Numpy, nltk, re, sklearn, torch__
- Time to Run the code: __Approximately 17 minutes__

In [None]:
import pandas as pd
import gensim.downloader as api
import contractions
import gensim
from gensim.test.utils import datapath
from gensim import utils
import numpy as np
import nltk
from numpy.linalg import norm
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn import preprocessing

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hetvishah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hetvishah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hetvishah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hetvishah/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Task 1 - Dataset Generation

In the snippet below, I load the dataset (available at [Amazon Reviews Dataset](https://www.kaggle.com/datasets/beaglelee/amazon-reviews-us-beauty-v1-00-tsv-zip)), assign each review to a class based on its star rating, and randomly sample an equal number of reviews from each class.

In [None]:
# Load the raw reviews from a tab-separated file; malformed rows are skipped
# instead of aborting the load.
dataFile = pd.read_csv(r'data.tsv', sep='\t', on_bad_lines = 'skip',low_memory=False)
# Keep only the three columns this notebook works with.
reducedDataFile = dataFile.filter(['review_body','review_headline','star_rating'])
# Drop rows whose star_rating cannot be parsed as a number.
reducedDataFile = reducedDataFile[pd.to_numeric(reducedDataFile['star_rating'], errors='coerce').notnull()]
# Normalise the dtypes: integer ratings and string text columns.
# NOTE(review): casting to 'str' presumably turns missing text into the
# literal string 'nan' - confirm whether such rows should be dropped instead.
reducedDataFile = reducedDataFile.astype({'star_rating':'int'})
reducedDataFile = reducedDataFile.astype({'review_body':'str'})
reducedDataFile = reducedDataFile.astype({'review_headline':'str'})
def classes(x):
    """Map a star rating to a sentiment class label.

    Ratings 1 and 2 map to class 1, rating 3 maps to class 2, and every
    other value maps to class 3.
    """
    if x in (1, 2):
        return 1
    if x == 3:
        return 2
    return 3
tempClassList = []
# Translate every star rating into its sentiment class with classes().
tempClassList = reducedDataFile['star_rating'].apply(classes)
# Attach the labels as a new 'classes' column.
reviewDataFrame = reducedDataFile.assign(classes = tempClassList)
# Draw 20000 random reviews from each class. No random_state is passed, so a
# different sample is drawn on every run.
# NOTE(review): presumably this raises if a class has fewer than 20000 rows - confirm.
reviewDataFrame = reviewDataFrame.groupby(['classes']).sample(n=20000)
#reviewDataFrame.to_csv('storedDataset.csv',index=False)

### Task 2 - Word Embedding

In [None]:
# Load the Google News pre-trained Word2Vec vectors through gensim.downloader.
# NOTE(review): presumably downloaded on first use and cached afterwards - confirm.
loadPretrainedModel = api.load('word2vec-google-news-300')


### Task 2A - Check the Semantic Understanding of the Pretrained Word2Vec Model

To check the semantic understanding of the model, I use the following 3 examples:
- 1) husband - man + woman = wife
- 2) brilliant = superb
- 3) beautiful = gorgeous

To measure the similarity, I use cosine similarity.

In [None]:
# Semantic similarity checks on the pre-trained vectors

def _cosineSimilarity(vecA, vecB):
    """Return the cosine of the angle between two vectors."""
    return np.dot(vecA, vecB)/(norm(vecA)*norm(vecB))

# Example 1: husband - man + woman compared with wife
ex1_RHS = loadPretrainedModel['wife']
ex1_LHS = loadPretrainedModel['husband'] - loadPretrainedModel['man'] + loadPretrainedModel['woman']
sim1 = _cosineSimilarity(ex1_LHS, ex1_RHS)

# Example 2: brilliant compared with superb
ex2_LHS, ex2_RHS = loadPretrainedModel['brilliant'], loadPretrainedModel['superb']
sim2 = _cosineSimilarity(ex2_LHS, ex2_RHS)

# Example 3: beautiful compared with gorgeous
ex3_LHS, ex3_RHS = loadPretrainedModel['beautiful'], loadPretrainedModel['gorgeous']
sim3 = _cosineSimilarity(ex3_LHS, ex3_RHS)

print('Example 1 Similarity: ', sim1)
print('Example 2 Similarity: ', sim2)
print('Example 3 Similarity: ', sim3)

Example 1 Similarity:  0.7332699
Example 2 Similarity:  0.7657863
Example 3 Similarity:  0.8353004


### Task 2B - Building My Custom Word2Vec Model and Comparing Its Performance with the Pretrained Model


In [None]:
class MyCorpus:
    """Iterable that yields one tokenised review at a time.

    Each iteration reads the `review_body` column of the module-level
    `reviewDataFrame` and tokenises every entry with gensim's
    `simple_preprocess`.
    """

    def __iter__(self):
        for review in reviewDataFrame['review_body'].tolist():
            yield utils.simple_preprocess(str(review))


# Train a custom Word2Vec model on the sampled review bodies.
sentences = MyCorpus()
# 300-dimensional vectors, context window of 13, and words seen fewer than
# 9 times are ignored.
CustomModel = gensim.models.Word2Vec(sentences=sentences, vector_size=300,window=13, min_count=9)
# Persist the trained model to disk.
CustomModel.save("myCustomModel.model")

In [None]:
# Semantic similarity checks on the custom-trained vectors

def _cosineSimilarity(vecA, vecB):
    """Return the cosine of the angle between two vectors."""
    return np.dot(vecA, vecB)/(norm(vecA)*norm(vecB))

customVectors = CustomModel.wv

# Example 1: husband - man + woman compared with wife
ex1_RHS = customVectors['wife']
ex1_LHS = customVectors['husband'] - customVectors['man'] + customVectors['woman']
sim1 = _cosineSimilarity(ex1_LHS, ex1_RHS)

# Example 2: brilliant compared with superb
ex2_LHS, ex2_RHS = customVectors['brilliant'], customVectors['superb']
sim2 = _cosineSimilarity(ex2_LHS, ex2_RHS)

# Example 3: beautiful compared with gorgeous
ex3_LHS, ex3_RHS = customVectors['beautiful'], customVectors['gorgeous']
sim3 = _cosineSimilarity(ex3_LHS, ex3_RHS)

print('Example 1 Similarity: ', sim1)
print('Example 2 Similarity: ', sim2)
print('Example 3 Similarity: ', sim3)

Example 1 Similarity:  0.83443147
Example 2 Similarity:  0.3848225
Example 3 Similarity:  0.8107748


### Conclusion
To conclude, the pre-trained model performs better than the custom-trained model for the last two examples. However, for the first example, the custom-trained model shows better performance. I believe the performance depends on the dataset used. There might be cases where the words used in my examples are not present in the randomly sampled dataset, which could result in errors. The similarity performance can also vary with different datasets. I will use the pre-trained model further in my code.

### Task 3 - Comparing Sentiment Analysis Accuracy Using Simple Models: Perceptron and Linear SVC for Word2Vec and TF-IDF Vectorization

Before creating models, I will first clean the dataset.

In [None]:
def cleanReviewText(text):
    """Normalise one raw review into lowercase, space-separated alphabetic words.

    The steps run in a fixed order: lowercase, drop bracketed spans, drop
    URLs, drop HTML tags, drop words containing digits, expand contractions,
    replace every non-letter with a space, and collapse whitespace.

    Args:
        text: the raw review body as a string.

    Returns:
        The cleaned review; an empty string if nothing survives.
    """
    text = text.lower()
    # Remove any data inside brackets. Raw strings keep the regex escapes
    # from being read as (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\{.*?\}', '', text)
    # Remove web URLs (http and https). Only the URL itself is removed, not
    # the remainder of the line that follows it.
    text = re.sub(r'https?://\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<[^<]+?>', '', text)
    # Remove words containing digits
    text = re.sub(r'\w*\d\w*', '', text)
    # Turn newlines into spaces so the words on either side are not glued together
    text = text.replace('\n', ' ')
    # Fix words like couldn't -> could not using the contractions library
    text = contractions.fix(text)
    # Replace every non-alphabetic character with a space
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Collapse any extra whitespace
    return " ".join(text.split())


reviewDataFrame['review_body'] = reviewDataFrame['review_body'].apply(cleanReviewText)
#FOR REVIEW Headline



In [None]:
#Lemmatization
def getPosTag(treebank_tag):
    """Translate a Penn Treebank tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to the WordNet adjective, verb, noun
    and adverb constants respectively; any other tag yields an empty string.
    """
    wordnetAttrByInitial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attrName = wordnetAttrByInitial.get(treebank_tag[:1])
    if attrName is None:
        return ''
    return getattr(wordnet, attrName)

def lemmatize(token):
    """Lemmatize every word of a text using its part-of-speech tag.

    The text is tokenised and POS-tagged with NLTK. Words whose tag maps to
    a WordNet category (see getPosTag) are lemmatized with that category;
    all other words are kept unchanged.

    Args:
        token: the text to lemmatize (a whole review, despite the name).

    Returns:
        The processed words joined by single spaces, with no leading or
        trailing space; an empty string for empty input.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, treebankTag in pos_tag(word_tokenize(token)):
        wordnetTag = getPosTag(treebankTag)
        if wordnetTag != '':
            lemmas.append(lemmatizer.lemmatize(word, wordnetTag))
        else:
            lemmas.append(word)
    # Join once instead of growing a string with repeated +=
    return ' '.join(lemmas)
# Lemmatize every review body
reviewDataFrame['review_body'] = reviewDataFrame['review_body'].apply(lemmatize)


# Keep only tokens that are at least 3 characters long
def _dropShortTokens(text):
    """Return `text` without its 1- and 2-character tokens."""
    return ' '.join(word for word in text.split() if len(word) > 2)

reviewDataFrame['review_body'] = reviewDataFrame['review_body'].apply(_dropShortTokens)



In [None]:
# Creating TF-IDF vectors
# 80/20 train/test split with a fixed seed; stratify keeps the class
# proportions equal in both parts.
trainingData, testingData , trainY, testY = train_test_split(reviewDataFrame['review_body'].values,reviewDataFrame['classes'].values,test_size=0.2,random_state=123, stratify = reviewDataFrame['classes'].values )
vector = TfidfVectorizer()
# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that fitted vectorizer.
trainX = vector.fit_transform(trainingData)
testX = vector.transform(testingData)


To create the Word2Vec vectors, I first had to handle reviews that were empty after cleaning. I replace every empty review with an `<UNK>` tag; since this tag is not present in the Google pre-trained Word2Vec model, it contributes a zero vector. Once these rows are handled, I sum the Word2Vec vectors of all words in a review and divide the sum by the number of words in that review.

In [None]:
# Replace reviews that are empty after cleaning with an <UNK> placeholder.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is a chained in-place update, which pandas does not
# guarantee will reach the parent DataFrame.
reviewDataFrame['review_body'] = reviewDataFrame['review_body'].replace('', '<UNK>')

# word2Vec average vector: one 300-dimensional vector per review
word2Vec = []
for ele in reviewDataFrame['review_body'].values:
    stuff = ele.split(" ")
    temp = np.zeros(300)
    for item in stuff:
        # Words missing from the pre-trained model contribute nothing to the sum
        if item in loadPretrainedModel:
            temp = np.add(temp,np.array(loadPretrainedModel[item]))
    # Divide by the total number of tokens, including out-of-vocabulary ones
    word2Vec.append(temp/len(stuff))





Below, I split the data into training and testing sets with an 80% and 20% split, respectively. Then, I convert the input data (i.e., trainX and testX) into numpy arrays.


In [None]:
# 80/20 train/test split of the averaged Word2Vec features with a fixed seed.
# NOTE(review): unlike the TF-IDF split, no stratify argument is passed here,
# so the two feature sets are presumably not evaluated on the same rows - confirm.
word2Vec_trainX, word2Vec_testX , word2Vec_trainY, word2Vec_testY = train_test_split(word2Vec,reviewDataFrame['classes'].values,test_size=0.2,random_state=123)
# train_test_split returned lists of vectors; convert them to 2-D numpy arrays.
word2Vec_trainX = np.array(word2Vec_trainX)
word2Vec_testX = np.array(word2Vec_testX)

In [None]:
#Accuracy on the Perceptron Model using TF-IDF
# NOTE(review): penalty is left at its default, so alpha presumably has no
# effect here - confirm against the scikit-learn Perceptron documentation.
modelPerceptron = Perceptron(random_state=0,alpha = 0.2,eta0 = 10,n_iter_no_change = 10,early_stopping=True)

modelPerceptron.fit(trainX, trainY)
# score() returns the mean accuracy on the given test data and labels.
accuracyPerceptron = modelPerceptron.score(testX,testY)
print('Accuracy: ',accuracyPerceptron*100,'%')



Accuracy:  60.34166666666667 %


In [None]:
#Accuracy on the Perceptron Model using Word2Vec
# Default hyper-parameters, trained on the averaged Word2Vec features.

modelPerceptron_word2Vec = Perceptron()
modelPerceptron_word2Vec.fit(word2Vec_trainX, word2Vec_trainY)
accuracyPerceptron_word2Vec = modelPerceptron_word2Vec.score(word2Vec_testX,word2Vec_testY)

print('Accuracy: ',accuracyPerceptron_word2Vec*100,'%')

Accuracy:  47.333333333333336 %


In [None]:
#Accuracy on the SVM Model using TF-IDF
# Linear SVM with hinge loss, trained on the TF-IDF features.
modelSVM = LinearSVC(loss  = 'hinge',tol = 1e-4, C = 0.7,intercept_scaling = 0.1, max_iter = 5000)

modelSVM.fit(trainX, trainY)
accuracySVM = modelSVM.score(testX,testY)
print('Accuracy: ',accuracySVM*100,'%')


Accuracy:  70.43333333333334 %


In [None]:
#Accuracy on the SVM Model using word2Vec
# Default hyper-parameters, trained on the averaged Word2Vec features.
modelSVM_word2Vec = LinearSVC()

modelSVM_word2Vec.fit(word2Vec_trainX, word2Vec_trainY)
accuracySVM_word2Vec = modelSVM_word2Vec.score(word2Vec_testX,word2Vec_testY)
print('Accuracy: ',accuracySVM_word2Vec*100,'%')

Accuracy:  65.06666666666666 %


### Conclusion
In conclusion TFIDF performs significantly better than the word2Vec model for Perceptron. For SVM, the difference is relatively less, but TFIDF still performs better.


### Task 4 - Feedforward Neural Network for Sentiment Analysis Using Pytorch

The Data class converts the inputs and outputs for the model into tensors.


In [None]:
from torch.utils.data import DataLoader, Dataset
class Data(Dataset):
    """Dataset wrapping a feature array and a label array as tensors.

    The features are cast to float32 before conversion; the labels are
    converted as they are, keeping their dtype.
    """

    def __init__(self, X_train, y_train):
        features = X_train.astype(np.float32)
        self.X = torch.from_numpy(features)
        self.y = torch.from_numpy(y_train)
        # Number of samples is the size of the first feature dimension
        self.len = self.X.size(0)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        sample = (self.X[index], self.y[index])
        return sample


The following function first converts the class labels into one-hot encoded vectors and then wraps the training and testing data in the `Data` class defined above. It returns two datasets: one for the training data and one for the testing data.


In [None]:

def ConvertIntoTensor(word2Vec_trainX,word2Vec_trainY,word2Vec_testX,word2Vec_testY):
    """Wrap the train and test splits in `Data` datasets with one-hot labels.

    Args:
        word2Vec_trainX: numpy array of training features.
        word2Vec_trainY: numpy array of training class labels.
        word2Vec_testX: numpy array of test features.
        word2Vec_testY: numpy array of test class labels.

    Returns:
        A `(train_dataset, test_dataset)` tuple of `Data` instances whose
        targets are one-hot encoded.
    """
    encoding = preprocessing.OneHotEncoder()
    # Fit the encoder on the training labels only and reuse it for the test
    # labels, so both splits share the same label-to-column mapping even if
    # the test split happens to miss a class.
    MLP_trainY = encoding.fit_transform(word2Vec_trainY.reshape(-1,1)).toarray()
    MLP_testY = encoding.transform(word2Vec_testY.reshape(-1,1)).toarray()
    MLPTrainData = Data(word2Vec_trainX,MLP_trainY)
    MLPTestData = Data(word2Vec_testX,MLP_testY)
    return MLPTrainData,MLPTestData


The following function first shuffles the dataset and converts a given percentage of the training data into validation data. It then returns the `train_loader` and `valid_loader`, which are passed to the models for training. The batch size, validation data size, and training data are passed as parameters to the function.



In [None]:
def Dataloaders(batch_size,valid_size,MLPTrainData):
    """Split a training dataset into train and validation loaders.

    The sample indices are shuffled with numpy's global random state; the
    first `valid_size` fraction (rounded down) becomes the validation set
    and the rest the training set. Both loaders draw from `MLPTrainData`
    through a `SubsetRandomSampler`.

    Args:
        batch_size: batch size used by both loaders.
        valid_size: fraction of the samples reserved for validation.
        MLPTrainData: the dataset to split.

    Returns:
        A `(train_loader, valid_loader)` tuple.
    """
    total = len(MLPTrainData)
    shuffled = list(range(total))
    np.random.shuffle(shuffled)

    cut = int(np.floor(valid_size * total))
    valid_part = shuffled[:cut]
    train_part = shuffled[cut:]

    train_loader = torch.utils.data.DataLoader(
        MLPTrainData,
        batch_size=batch_size,
        sampler=SubsetRandomSampler(train_part),
    )
    valid_loader = torch.utils.data.DataLoader(
        MLPTrainData,
        batch_size=batch_size,
        sampler=SubsetRandomSampler(valid_part),
    )
    return train_loader, valid_loader





This is the main model. It has two hidden layers with 100 and 10 nodes, respectively.


In [None]:
class Net(nn.Module):
    """Feedforward classifier with two hidden ReLU layers.

    Args:
        in_features: size of each input vector. When omitted, the
            module-level `input_dim` is used.
        out_features: number of output classes. When omitted, the
            module-level `output_dim` is used.
        hidden_1: width of the first hidden layer.
        hidden_2: width of the second hidden layer.
    """

    def __init__(self, in_features=None, out_features=None, hidden_1=100, hidden_2=10):
        super().__init__()
        # Fall back to the notebook-level settings so `Net()` keeps working
        if in_features is None:
            in_features = input_dim
        if out_features is None:
            out_features = output_dim
        self.fc1 = nn.Linear(in_features, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, out_features)

    def forward(self, x):
        """Return the raw class scores (no softmax is applied) for a batch `x`."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

Here, I am training the model using PyTorch functionalities. For every batch in the `train_loader`, we first run a forward pass, calculate the loss, and run a backward pass for weight optimization. Once done, the model is evaluated on the validation data. If the validation loss for the current epoch is less than or equal to the lowest validation loss seen so far, the model is saved.


In [None]:
def trainEpoch(n_epochs,model,train_loader,valid_loader):
    """Train `model` and checkpoint the epoch with the lowest validation loss.

    Uses the module-level `criterion` and `optimizer`, which must be defined
    before this function is called. After every epoch the average training
    and validation losses are printed; whenever the validation loss is not
    higher than the best one so far, the model weights are written to
    'model.pt'.

    Args:
        n_epochs: number of passes over `train_loader`.
        model: the network to train (updated in place).
        train_loader: iterable of `(data, target)` training batches.
        valid_loader: iterable of `(data, target)` validation batches.

    Returns:
        The same `model` object, in its state after the final epoch.
    """
    # float('inf') instead of np.Inf, which was removed in NumPy 2.0
    valid_loss_min = float('inf')

    for epoch in range(n_epochs):
        train_loss_sum, train_count = 0.0, 0
        valid_loss_sum, valid_count = 0.0, 0

        model.train()
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            # Weight each batch by its size so the epoch figure is a
            # per-sample average (assumes criterion returns a batch mean).
            train_loss_sum += loss.item() * data.size(0)
            train_count += data.size(0)

        model.eval()
        # No gradients are needed for validation
        with torch.no_grad():
            for data, target in valid_loader:
                output = model(data)
                loss = criterion(output, target)
                valid_loss_sum += loss.item() * data.size(0)
                valid_count += data.size(0)

        # Average over the whole epoch instead of reporting only the last
        # batch; max(..., 1) guards against an empty loader.
        train_loss = train_loss_sum / max(train_count, 1)
        valid_loss = valid_loss_sum / max(valid_count, 1)

        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch+1,
            train_loss,
            valid_loss
            ))

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
            torch.save(model.state_dict(), 'model.pt')
            valid_loss_min = valid_loss
    return model

Here, to get the predicted values, I pass the test data (i.e., the dataloader) into the model, which returns one score per class. Since the output dimension is 3, I take the index of the maximum score for each output and return the `prediction_list`.


In [None]:
def predict(model, dataloader):
    """Return the predicted class index for every batch of `dataloader`.

    Args:
        model: callable mapping a batch of inputs to per-class scores. This
            function does not change its train/eval mode.
        dataloader: iterable of batches whose first element is the input
            tensor; any further elements (e.g. targets) are ignored.

    Returns:
        A list with one CPU tensor per batch, each holding the index of the
        highest-scoring class for every sample in that batch.
    """
    prediction_list = []
    # Inference only: skip gradient tracking instead of enabling it on the inputs
    with torch.no_grad():
        for batch in dataloader:
            outputs = model(batch[0])
            _, predicted = torch.max(outputs, 1)
            prediction_list.append(predicted.cpu())
    return prediction_list

In [None]:
# Defining the model parameters
# input_dim and output_dim are read by Net() when the model is constructed.
input_dim = 300
output_dim = 3
batchSize = 32
# Fraction of the training data held out for validation
validationSize = 0.2
n_epochs = 25
learningRate = 0.003


In [None]:
# Converting the test and train Data into Tensors
MLP_trainData_4A,MLP_testData_4A = ConvertIntoTensor(word2Vec_trainX,word2Vec_trainY,word2Vec_testX,word2Vec_testY)
# Using dataloader to get the train and validation data
train_loader,valid_loader = Dataloaders(batchSize,validationSize,MLP_trainData_4A)
# Initializing the model
model = Net()
# Defining the loss criteria
# criterion and optimizer are module-level names that trainEpoch reads directly.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr= learningRate)
# Training the model on train data and testing it on the validation data
model = trainEpoch(n_epochs,model,train_loader,valid_loader)
# Loading the model parameters
# Restores the checkpoint that trainEpoch wrote to 'model.pt'.
model.load_state_dict(torch.load('model.pt'))

Epoch: 1 	Training Loss: 0.843965 	Validation Loss: 0.950183
Validation loss decreased (inf --> 0.950183).  Saving model ...
Epoch: 2 	Training Loss: 0.565988 	Validation Loss: 0.850802
Validation loss decreased (0.950183 --> 0.850802).  Saving model ...
Epoch: 3 	Training Loss: 0.903665 	Validation Loss: 0.936189
Epoch: 4 	Training Loss: 0.816334 	Validation Loss: 0.714347
Validation loss decreased (0.850802 --> 0.714347).  Saving model ...
Epoch: 5 	Training Loss: 0.832102 	Validation Loss: 0.877351
Epoch: 6 	Training Loss: 0.670143 	Validation Loss: 0.836814
Epoch: 7 	Training Loss: 0.685039 	Validation Loss: 0.635168
Validation loss decreased (0.714347 --> 0.635168).  Saving model ...
Epoch: 8 	Training Loss: 0.845585 	Validation Loss: 0.696973
Epoch: 9 	Training Loss: 0.922382 	Validation Loss: 0.667591
Epoch: 10 	Training Loss: 0.734397 	Validation Loss: 0.779530
Epoch: 11 	Training Loss: 0.714184 	Validation Loss: 0.794143
Epoch: 12 	Training Loss: 0.634325 	Validation Loss: 0.6

<All keys matched successfully>

In [None]:
# Using DataLoaders to get the test_loader to test upon
# batch_size=1 gives one prediction tensor per review, which the accuracy
# cell converts with int().
test_loader = torch.utils.data.DataLoader(MLP_testData_4A, batch_size=1)

In [None]:
# predicting values on the test data
# predictions is a list with one predicted-class tensor per test batch.
predictions = predict(model,test_loader)

In [None]:
# Each prediction tensor holds a single class index (batch size 1)
predictionsFinal = [int(pred) for pred in predictions]
# Predicted indices start at 0 while the class labels start at 1, hence the +1
right = sum(
    1
    for actual, predicted in zip(word2Vec_testY, predictionsFinal)
    if actual == (predicted + 1)
)
Accuracy_4A = (right/len(predictionsFinal))*100
print('Accuracy: ',Accuracy_4A ,'%')

Accuracy:  66.60833333333333 %


### Task 4B - Concatenate the first 10 Word2Vec vectors for each review as the input feature

Here, I am creating the Word2Vec data as mentioned in the assignment. For every review, I take the first 10 words and concatenate their vectors. For any word not in the pre-trained Google model, I append zeros. For reviews shorter than 10 words, I pad with zeros so that the total array length becomes 3000.


In [None]:
# One 3000-long feature vector per review: the vectors of its first 10 words,
# concatenated.
word2Vec_4B = []

for review in reviewDataFrame['review_body'].values:
    pieces = []
    for word in review.split(" ")[:10]:
        if word in loadPretrainedModel:
            pieces.append(np.array(loadPretrainedModel[word]))
        else:
            # Words unknown to the pre-trained model become zero vectors
            pieces.append(np.zeros(300))
    # padding with zeroes if the length of review is less than 10.
    pieces.extend(np.zeros(300) for _ in range(10 - len(pieces)))
    word2Vec_4B.append(np.concatenate(pieces))




In [None]:
#Splitting into train and test data - 80% and 20% respectively
# Same test_size and random_state as the averaged-Word2Vec split.
word2Vec_trainX_4B, word2Vec_testX_4B , word2Vec_trainY_4B, word2Vec_testY_4B = train_test_split(word2Vec_4B,reviewDataFrame['classes'].values,test_size=0.2,random_state=123)
# Convert the lists of feature vectors into 2-D numpy arrays.
word2Vec_trainX_4B = np.array(word2Vec_trainX_4B)
word2Vec_testX_4B = np.array(word2Vec_testX_4B)

In [None]:
# Defining model parameters
# input_dim is 3000 because each review is 10 concatenated 300-long vectors;
# Net() reads input_dim and output_dim when it is constructed.
input_dim = 3000
output_dim = 3
batchSize = 25
# Fraction of the training data held out for validation
validationSize = 0.2
n_epochs = 25
learningRate = 0.008


In [None]:
# Converting the test and train Data into Tensors
MLP_trainData_4B,MLP_testData_4B = ConvertIntoTensor(word2Vec_trainX_4B, word2Vec_trainY_4B , word2Vec_testX_4B, word2Vec_testY_4B)
# Using dataloader to get the train and validation data
train_loader_4B,valid_loader_4B = Dataloaders(batchSize,validationSize,MLP_trainData_4B)
# Initializing the model
model_4B = Net()
# Loss Criteria
# criterion and optimizer are module-level names that trainEpoch reads directly.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_4B.parameters(), lr= learningRate)
# Training epochs
model_4B = trainEpoch(n_epochs,model_4B,train_loader_4B,valid_loader_4B)
# Loading the model parameters
# Restores the checkpoint that trainEpoch wrote to 'model.pt'; the same file
# name is used by every model in this notebook.
model_4B.load_state_dict(torch.load('model.pt'))

Epoch: 1 	Training Loss: 0.983894 	Validation Loss: 0.928744
Validation loss decreased (inf --> 0.928744).  Saving model ...
Epoch: 2 	Training Loss: 0.863392 	Validation Loss: 0.867216
Validation loss decreased (0.928744 --> 0.867216).  Saving model ...
Epoch: 3 	Training Loss: 0.835467 	Validation Loss: 1.093449
Epoch: 4 	Training Loss: 0.720671 	Validation Loss: 1.274513
Epoch: 5 	Training Loss: 0.588887 	Validation Loss: 0.954057
Epoch: 6 	Training Loss: 0.381003 	Validation Loss: 0.963526
Epoch: 7 	Training Loss: 0.617316 	Validation Loss: 0.970150
Epoch: 8 	Training Loss: 0.349621 	Validation Loss: 1.275130
Epoch: 9 	Training Loss: 0.341491 	Validation Loss: 1.347012
Epoch: 10 	Training Loss: 0.329319 	Validation Loss: 1.154507
Epoch: 11 	Training Loss: 0.269605 	Validation Loss: 2.722850
Epoch: 12 	Training Loss: 0.570581 	Validation Loss: 1.789865
Epoch: 13 	Training Loss: 0.211871 	Validation Loss: 3.322661
Epoch: 14 	Training Loss: 0.551105 	Validation Loss: 2.264518
Epoch: 1

<All keys matched successfully>

In [None]:
# Predicting values from the test data
# batch_size=1 gives one prediction tensor per review, which the accuracy
# cell converts with int().
test_loader = torch.utils.data.DataLoader(MLP_testData_4B, batch_size=1)
predictions = predict(model_4B,test_loader)

In [None]:
# Checking the model Accuracy
# Each prediction tensor holds a single class index (batch size 1)
predictionsFinal = [int(ele) for ele in predictions]
right = 0
# Compare against the labels of the 4B split itself rather than the labels of
# the averaged-Word2Vec split. Predicted indices start at 0 while the class
# labels start at 1, hence the +1.
for ele1,ele2 in zip(word2Vec_testY_4B,predictionsFinal):
    if ele1 == (ele2+1):
        right+=1
Accuracy_4B = right/len(predictionsFinal)*100
print('Accuracy: ',Accuracy_4B ,'%')

Accuracy:  57.01666666666667 %


### Conclusion
Considering the accuracy, the first model (which uses the average Word2Vec vector of each review) performs better than the latter model (which concatenates the first 10 word vectors of each review). This is likely because the first model can handle context better.

### Task 5 - Recurrent Neural Networks for Sentiment Analysis Using Pytorch

### 5A - RNN

In [None]:
# Defining an RNN module
# Defining an RNN module
class RNN(nn.Module):
    """Recurrent classifier that feeds every time step's output to a linear layer.

    Args:
        input_size: size of each input vector in the sequence.
        hidden_size: size of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        num_classes: number of output classes.
        seq_len: number of time steps in every input sequence. When omitted,
            the module-level `sequence_length` is used.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super().__init__()
        # Fall back to the notebook-level setting so existing calls keep working
        if seq_len is None:
            seq_len = sequence_length

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first = True)
        # The classifier sees the outputs of all time steps, flattened
        self.fc = nn.Linear(hidden_size*seq_len, num_classes)

    def forward(self, x):
        """Return class scores for `x`, indexed as (batch, time step, feature)."""
        # Create the initial hidden state on the same device and with the same
        # dtype as the input
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size,
                         dtype=x.dtype, device=x.device)

        out, _ = self.rnn(x,h0)
        # Flatten the per-step outputs into one vector per sample
        out = out.reshape(out.shape[0],-1)

        return self.fc(out)

To create the Word2Vec input features, I take the first 20 words from each review and add their vectors to `word2Vec_5A`. I append zeros for any word not in the pre-trained Google model. For reviews shorter than 20 words, I pad them with zeros. Each row will have 20 vectors, with each vector having a length of 300. Since an RNN is passed the review word by word, it incorporates the context better and hence gives better accuracy than simply concatenating the vectors.


In [None]:
def trainEpochforTask5(n_epochs,model,train_loader,valid_loader):
    """Train a recurrent `model` and checkpoint the best validation epoch.

    Uses the module-level `criterion` and `optimizer`, which must be defined
    before this function is called. Every batch has a singleton dimension 1
    squeezed away (if present) before it is fed to the model. After every
    epoch the average training and validation losses are printed; whenever
    the validation loss is not higher than the best one so far, the model
    weights are written to 'model.pt'.

    Args:
        n_epochs: number of passes over `train_loader`.
        model: the network to train (updated in place).
        train_loader: iterable of `(data, target)` training batches.
        valid_loader: iterable of `(data, target)` validation batches.

    Returns:
        The same `model` object, in its state after the final epoch.
    """
    # float('inf') instead of np.Inf, which was removed in NumPy 2.0
    valid_loss_min = float('inf')

    for epoch in range(n_epochs):
        train_loss_sum, train_count = 0.0, 0
        valid_loss_sum, valid_count = 0.0, 0

        model.train()
        for data, target in train_loader:
            data = data.squeeze(1)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            # Weight each batch by its size so the epoch figure is a
            # per-sample average (assumes criterion returns a batch mean).
            train_loss_sum += loss.item() * data.size(0)
            train_count += data.size(0)

        model.eval()
        # No gradients are needed for validation
        with torch.no_grad():
            for data, target in valid_loader:
                data = data.squeeze(1)
                output = model(data)
                loss = criterion(output, target)
                valid_loss_sum += loss.item() * data.size(0)
                valid_count += data.size(0)

        # Average over the whole epoch instead of reporting only the last
        # batch; max(..., 1) guards against an empty loader.
        train_loss = train_loss_sum / max(train_count, 1)
        valid_loss = valid_loss_sum / max(valid_count, 1)

        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch+1,
            train_loss,
            valid_loss
            ))

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
            torch.save(model.state_dict(), 'model.pt')
            valid_loss_min = valid_loss
    return model

In [None]:
# One sequence of 20 word vectors (each 300 long) per review
word2Vec_5A = []
for review in reviewDataFrame['review_body'].values:
    vectors = []
    for word in review.split(" ")[:20]:
        if word in loadPretrainedModel:
            vectors.append(np.array(loadPretrainedModel[word]))
        else:
            # Words unknown to the pre-trained model become zero vectors
            vectors.append(np.zeros(300))
    # Padding with zeros
    vectors.extend(np.zeros(300) for _ in range(20 - len(vectors)))
    word2Vec_5A.append(vectors)



In [None]:
#Splitting into train and test data - 80% and 20% respectively
# Same test_size and random_state as the earlier Word2Vec splits.
word2Vec_trainX_5A, word2Vec_testX_5A , word2Vec_trainY_5A, word2Vec_testY_5A = train_test_split(word2Vec_5A,reviewDataFrame['classes'].values,test_size=0.2,random_state=123)
# Convert the nested lists of word vectors into numpy arrays.
word2Vec_trainX_5A = np.array(word2Vec_trainX_5A)
word2Vec_testX_5A = np.array(word2Vec_testX_5A)

In [None]:
# Initializing the model parameters
batchSize = 32
# Fraction of the training data held out for validation
validationSize = 0.2
n_epochs = 25
learningRate = 0.01
# Converting the test and training data into tensors
RNN_trainData_5A,RNN_testData_5A = ConvertIntoTensor(word2Vec_trainX_5A, word2Vec_trainY_5A , word2Vec_testX_5A, word2Vec_testY_5A)
# Using dataloader to get the train and validation data
train_loader_5A,valid_loader_5A = Dataloaders(batchSize,validationSize,RNN_trainData_5A)


Here, since each row has 20 different vectors of length 300, the input size is defined as 300. Additionally, because I convert the output data into one-hot encoded vectors, the output dimension for the RNN model will be 3.


In [None]:
hiddenStateSize = 20
# sequence_length is read by the RNN constructor to size its final linear layer.
sequence_length = 20
hiddenLayers = 1
# Initializing the model
# Arguments: input size 300, hidden state size, number of layers, 3 classes.
model_5A = RNN(300,hiddenStateSize,hiddenLayers,3)
# Defining the loss Criteria
# criterion and optimizer are module-level names that trainEpochforTask5 reads directly.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_5A.parameters(), lr= learningRate)
#training the model
model_5A = trainEpochforTask5(n_epochs,model_5A,train_loader_5A,valid_loader_5A)
# Loading the model parameters
# Restores the checkpoint that trainEpochforTask5 wrote to 'model.pt'.
model_5A.load_state_dict(torch.load('model.pt'))

Epoch: 1 	Training Loss: 1.002603 	Validation Loss: 1.114888
Validation loss decreased (inf --> 1.114888).  Saving model ...
Epoch: 2 	Training Loss: 1.008980 	Validation Loss: 1.113441
Validation loss decreased (1.114888 --> 1.113441).  Saving model ...
Epoch: 3 	Training Loss: 0.748540 	Validation Loss: 0.999444
Validation loss decreased (1.113441 --> 0.999444).  Saving model ...
Epoch: 4 	Training Loss: 0.953780 	Validation Loss: 0.981410
Validation loss decreased (0.999444 --> 0.981410).  Saving model ...
Epoch: 5 	Training Loss: 0.830897 	Validation Loss: 0.907053
Validation loss decreased (0.981410 --> 0.907053).  Saving model ...
Epoch: 6 	Training Loss: 0.939435 	Validation Loss: 0.888999
Validation loss decreased (0.907053 --> 0.888999).  Saving model ...
Epoch: 7 	Training Loss: 1.037449 	Validation Loss: 0.722463
Validation loss decreased (0.888999 --> 0.722463).  Saving model ...
Epoch: 8 	Training Loss: 0.780343 	Validation Loss: 0.756940
Epoch: 9 	Training Loss: 0.956208 

<All keys matched successfully>

In [None]:
# Loading and predicting on the test data
# batch_size=1 gives one prediction tensor per review, which the accuracy
# cell converts with int().
test_loader = torch.utils.data.DataLoader(RNN_testData_5A, batch_size=1)
predictions = predict(model_5A,test_loader)


In [None]:
#Checking model accuracy
# Each prediction tensor holds a single class index (batch size 1)
predictionsFinal = [int(ele) for ele in predictions]
right = 0
# Compare against the labels of the 5A split itself rather than the labels of
# the averaged-Word2Vec split. Predicted indices start at 0 while the class
# labels start at 1, hence the +1.
for ele1,ele2 in zip(word2Vec_testY_5A,predictionsFinal):
    if ele1 == (ele2+1):
        right+=1
Accuracy_RNN = (right/len(predictionsFinal))*100
print('Accuracy: ', Accuracy_RNN,'%')

Accuracy:  61.00833333333333 %


### Conclusion
The simple feedforward model performs better than the RNN; since it uses the average of all word vectors, it can handle context better. On the other hand, the RNN gives better accuracy than the model in 4B (concatenating the first 10 word vectors). Since in 4B we simply concatenate the first 10 vectors, the model might not be able to handle context well, whereas in the RNN we feed those words into the model one by one.

### 5B GRU

In [None]:
# Defining a GRU model
class GRU(nn.Module):
    """GRU classifier that feeds every time step's output to a linear layer.

    Args:
        input_size: size of each input vector in the sequence.
        hidden_size: size of the recurrent hidden state.
        num_layers: number of stacked GRU layers.
        num_classes: number of output classes.
        seq_len: number of time steps in every input sequence. When omitted,
            the module-level `sequence_length` is used.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super().__init__()
        # Fall back to the notebook-level setting so existing calls keep working
        if seq_len is None:
            seq_len = sequence_length

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.GRU = nn.GRU(input_size, hidden_size, num_layers, batch_first = True)
        # The classifier sees the outputs of all time steps, flattened
        self.fc = nn.Linear(hidden_size*seq_len, num_classes)

    def forward(self, x):
        """Return class scores for `x`, indexed as (batch, time step, feature)."""
        # Create the initial hidden state on the same device and with the same
        # dtype as the input
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size,
                         dtype=x.dtype, device=x.device)

        out, _ = self.GRU(x,h0)
        # Flatten the per-step outputs into one vector per sample
        out = out.reshape(out.shape[0],-1)

        return self.fc(out)

In [None]:
#Splitting into train and test data - 80% and 20% respectively
# Reuses the word2Vec_5A sequences with the same arguments as the 5A split.
word2Vec_trainX_5B, word2Vec_testX_5B , word2Vec_trainY_5B, word2Vec_testY_5B = train_test_split(word2Vec_5A,reviewDataFrame['classes'].values,test_size=0.2,random_state=123)
# Convert the nested lists of word vectors into numpy arrays.
word2Vec_trainX_5B = np.array(word2Vec_trainX_5B)
word2Vec_testX_5B = np.array(word2Vec_testX_5B)

In [None]:
#defining model Parameters
batchSize = 32
# Fraction of the training data held out for validation
validationSize = 0.2
n_epochs = 25
learningRate = 0.008
#Converting into Tensors and loading data
GRU_trainData_5B,GRU_testData_5B = ConvertIntoTensor(word2Vec_trainX_5B, word2Vec_trainY_5B , word2Vec_testX_5B, word2Vec_testY_5B)
# Split the training dataset into train and validation loaders
train_loader_5B,valid_loader_5B = Dataloaders(batchSize,validationSize,GRU_trainData_5B)


In [None]:
# Model dimensions; `sequence_length` is read by the GRU class to size its linear layer
hiddenStateSize = sequence_length = 20
hiddenLayers = 1
# Build the GRU: 300 input features per step, 3 output classes
model_5B = GRU(
    input_size=300,
    hidden_size=hiddenStateSize,
    num_layers=hiddenLayers,
    num_classes=3,
)
# Loss and optimizer are not passed to trainEpochforTask5 below
# (presumably it reads them as globals -- TODO confirm)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model_5B.parameters(), lr=learningRate)
# Train, then reload the parameters stored in 'model.pt'
model_5B = trainEpochforTask5(n_epochs, model_5B, train_loader_5B, valid_loader_5B)
model_5B.load_state_dict(torch.load('model.pt'))

Epoch: 1 	Training Loss: 1.046179 	Validation Loss: 1.062644
Validation loss decreased (inf --> 1.062644).  Saving model ...
Epoch: 2 	Training Loss: 0.977733 	Validation Loss: 1.026263
Validation loss decreased (1.062644 --> 1.026263).  Saving model ...
Epoch: 3 	Training Loss: 0.819245 	Validation Loss: 0.993745
Validation loss decreased (1.026263 --> 0.993745).  Saving model ...
Epoch: 4 	Training Loss: 0.815836 	Validation Loss: 0.864283
Validation loss decreased (0.993745 --> 0.864283).  Saving model ...
Epoch: 5 	Training Loss: 0.720248 	Validation Loss: 0.942974
Epoch: 6 	Training Loss: 0.798292 	Validation Loss: 0.890896
Epoch: 7 	Training Loss: 1.124406 	Validation Loss: 0.865597
Epoch: 8 	Training Loss: 0.789420 	Validation Loss: 0.857848
Validation loss decreased (0.864283 --> 0.857848).  Saving model ...
Epoch: 9 	Training Loss: 0.858342 	Validation Loss: 0.856926
Validation loss decreased (0.857848 --> 0.856926).  Saving model ...
Epoch: 10 	Training Loss: 0.775309 	Valida

<All keys matched successfully>

In [None]:
# Run model_5B over the held-out test set, one sample per batch
test_loader = torch.utils.data.DataLoader(dataset=GRU_testData_5B, batch_size=1)
predictions = predict(model_5B, test_loader)

In [None]:
#Checking the model accuracy
# Cast every prediction to a plain Python int
predictionsFinal = [int(pred) for pred in predictions]
# Score against this task's own test labels (word2Vec_testY_5B is the array
# GRU_testData_5B was built from) rather than a label array left over from an
# earlier task, so the result no longer relies on the splits happening to match.
# A prediction is correct when label == prediction + 1
# (predictions appear to be 0-based class indices while labels start at 1 -- confirm against `predict`)
right = sum(
    1
    for actual, predicted in zip(word2Vec_testY_5B, predictionsFinal)
    if actual == (predicted + 1)
)
Accuracy_GRU = (right / len(predictionsFinal)) * 100
print('Accuracy: ',Accuracy_GRU , '%')

Accuracy:  62.6 %


### 5C LSTM

In [None]:
# Defining an LSTM model
class LSTM(nn.Module):
    """LSTM classifier that feeds every time step's hidden output into one linear layer.

    The LSTM output for all time steps is flattened to
    ``hidden_size * seq_len`` features per sample before the final linear
    layer, so every input must have exactly ``seq_len`` time steps.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        seq_len: Number of time steps per input. When omitted, the
            module-level ``sequence_length`` variable is used, which keeps
            existing four-argument calls working.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super().__init__()

        if seq_len is None:
            # Backward-compatible fallback to the notebook-level global.
            seq_len = sequence_length

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.seq_len = seq_len

        # Attribute names are kept as-is so previously saved state_dicts still load.
        self.LSTM = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for a batch-first input ``x``."""
        # Initial hidden state (h0) and cell state (c0), created on the same
        # device/dtype as the input so the model also works after .to(device) or .double().
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        out, _ = self.LSTM(x, (h0, c0))
        # Flatten (batch, seq, hidden) -> (batch, seq * hidden) for the linear layer.
        out = out.reshape(out.shape[0], -1)

        return self.fc(out)

In [None]:
# 80% / 20% train/test split for the LSTM experiment (fixed random_state)
(
    word2Vec_trainX_5C,
    word2Vec_testX_5C,
    word2Vec_trainY_5C,
    word2Vec_testY_5C,
) = train_test_split(
    word2Vec_5A,
    reviewDataFrame['classes'].values,
    test_size=0.2,
    random_state=123,
)
# Turn both feature splits into NumPy arrays
word2Vec_trainX_5C, word2Vec_testX_5C = (
    np.array(word2Vec_trainX_5C),
    np.array(word2Vec_testX_5C),
)

In [None]:
# Hyperparameters for the LSTM run
batchSize, validationSize = 20, 0.2
n_epochs, learningRate = 25, 0.01
# Convert the splits into tensors, then build the train/validation loaders
LSTM_trainData_5C, LSTM_testData_5C = ConvertIntoTensor(
    word2Vec_trainX_5C,
    word2Vec_trainY_5C,
    word2Vec_testX_5C,
    word2Vec_testY_5C,
)
train_loader_5C, valid_loader_5C = Dataloaders(
    batchSize, validationSize, LSTM_trainData_5C
)


In [None]:
# Model dimensions; `sequence_length` is read by the LSTM class to size its linear layer
hiddenStateSize = sequence_length = 20
hiddenLayers = 1
# Build the LSTM: 300 input features per step, 3 output classes
model_5C = LSTM(
    input_size=300,
    hidden_size=hiddenStateSize,
    num_layers=hiddenLayers,
    num_classes=3,
)
# Loss and optimizer are not passed to trainEpochforTask5 below
# (presumably it reads them as globals -- TODO confirm)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model_5C.parameters(), lr=learningRate)
# Train, then reload the parameters stored in 'model.pt'
model_5C = trainEpochforTask5(n_epochs, model_5C, train_loader_5C, valid_loader_5C)
model_5C.load_state_dict(torch.load('model.pt'))

Epoch: 1 	Training Loss: 1.129570 	Validation Loss: 1.011557
Validation loss decreased (inf --> 1.011557).  Saving model ...
Epoch: 2 	Training Loss: 0.923504 	Validation Loss: 0.930789
Validation loss decreased (1.011557 --> 0.930789).  Saving model ...
Epoch: 3 	Training Loss: 0.820379 	Validation Loss: 0.930147
Validation loss decreased (0.930789 --> 0.930147).  Saving model ...
Epoch: 4 	Training Loss: 0.773549 	Validation Loss: 0.959385
Epoch: 5 	Training Loss: 1.040869 	Validation Loss: 0.955542
Epoch: 6 	Training Loss: 0.997374 	Validation Loss: 0.749450
Validation loss decreased (0.930147 --> 0.749450).  Saving model ...
Epoch: 7 	Training Loss: 0.804810 	Validation Loss: 0.834420
Epoch: 8 	Training Loss: 0.877667 	Validation Loss: 0.968348
Epoch: 9 	Training Loss: 0.759487 	Validation Loss: 1.031708
Epoch: 10 	Training Loss: 0.930988 	Validation Loss: 0.552397
Validation loss decreased (0.749450 --> 0.552397).  Saving model ...
Epoch: 11 	Training Loss: 0.786059 	Validation Lo

<All keys matched successfully>

In [None]:
# Run model_5C over the held-out test set, one sample per batch
test_loader = torch.utils.data.DataLoader(dataset=LSTM_testData_5C, batch_size=1)
predictions = predict(model_5C, test_loader)

In [None]:
# Checking model accuracy
# Cast every prediction to a plain Python int
predictionsFinal = [int(pred) for pred in predictions]
# Score against this task's own test labels (word2Vec_testY_5C is the array
# LSTM_testData_5C was built from) rather than a label array left over from an
# earlier task, so the result no longer relies on the splits happening to match.
# A prediction is correct when label == prediction + 1
# (predictions appear to be 0-based class indices while labels start at 1 -- confirm against `predict`)
right = sum(
    1
    for actual, predicted in zip(word2Vec_testY_5C, predictionsFinal)
    if actual == (predicted + 1)
)
Accuracy_LSTM = (right / len(predictionsFinal)) * 100
print('Accuracy: ',Accuracy_LSTM ,'%')

Accuracy:  63.05 %


### Conclusion
The performance of the RNN, GRU and LSTM models is almost on par. The LSTM performs slightly better than the other two, likely because an LSTM unit is better able to handle the vanishing gradient problem. Overall, however, since the dataset is small, the three models perform at a comparable level.

In [None]:
# Summary: print the accuracy of every model side by side
for _label, _accuracy in (
    ('Simple Models: ', Accuracy_4A),
    ('4B: ', Accuracy_4B),
    ('RNN: ', Accuracy_RNN),
    ('GRU: ', Accuracy_GRU),
    ('LSTM: ', Accuracy_LSTM),
):
    print(_label, _accuracy, '%')

Simple Models:  66.60833333333333 %
4B:  57.01666666666667 %
RNN:  61.00833333333333 %
GRU:  62.6 %
LSTM:  63.05 %
