# Section 0: Importing Dataset and Libraries

In [2]:
#This entire script is meant to be run in Google Colab

#We clone the github to easily grab the dataset information without having to manually import it to Google Colab each time. Additionally, we install necessary libraries here.
#Libraries are: Pytorch, NLTK, and Gensim.
!git clone https://github.com/JoshuaWidjaja/IMDB_ClassificationAndPrediction
#!pip install torch===1.7.1 torchvision===0.8.2 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
#!pip install --user -U nltk
#!pip install gensim

#Imports, generally we isolate each import to it's proper block to easier read/organize the code.
import os


Cloning into 'IMDB_ClassificationAndPrediction'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 98548 (delta 9), reused 28 (delta 5), pack-reused 98507[K
Receiving objects: 100% (98548/98548), 121.25 MiB | 19.50 MiB/s, done.
Resolving deltas: 100% (16/16), done.
Checking out files: 100% (100030/100030), done.


# Section 1: Preprocessing Code 

In [3]:
### NOTE: The output of this block has already been saved in a text file. It does not need to be run for the model to work, but the code here is kept for documentation purposes.
### The local version we use of this file is located in the /src section of our project turn in.
### The primary output of this file is a text file called unalteredWordWeightMap.txt. This file contains a map from WORD : WEIGHT and is utilized in the rest of the preprocessing.

#Setting up file paths in relation to Google colab.
basePath = os.getcwd() + "/IMDB_ClassificationAndPrediction/IMDB_Data"
vocabFile = os.getcwd() + "/IMDB_ClassificationAndPrediction/IMDB_Data/imdb.vocab"
weightFile = os.getcwd() + "/IMDB_ClassificationAndPrediction/IMDB_Data/imdbEr.txt"

vocabList = []
counter = 0
#Below code retrieves every word from the vocabFile and adds it to the vocabList.
vocabFile = open(vocabFile, "r", encoding= "utf-8")
try:
  for lines in vocabFile:
    vocabList.append(lines.strip("\n"))
    counter += 1
except UnicodeDecodeError:
  print("ERROR AT: " + str(counter) + " " + str(lines))

#Below code retrieves the weight of each word from the weightFile.
weightFile = open(weightFile, "r", encoding = "utf-8")
fileNum = 0

#This dictionary maps each word with its perceived weight value
vocabWeightDict = dict()

#Assigning values to the dictionary
for weight in weightFile:
  vocabWeightDict[vocabList[fileNum]] = (float(weight.strip("\n")))
  fileNum += 1

#Two dictionaries that have the vocabWeightDict sorted from greatest to lowest, and lowest to greatest respectively. Used for information gathering
greatestWeightDict = sorted(vocabWeightDict.items(), key = lambda x: x[1], reverse=True)
lowestWeightDict = sorted(vocabWeightDict.items(), key = lambda x: x[1])

#Uncomment the below line to see what the vocabWeightDict looks like
#print(vocabWeightDict)


## Primary Preprocessing is performed in the block below. We use the data retrieved to train our models.

In [4]:
#In this block, we do our primary preprocessing and organization into the data we need to train our models. 

#Imports needed for proper preprocessing.
import nltk
import os
import sys
import codecs
from nltk.corpus import stopwords
from collections import defaultdict
nltk.download('stopwords')

#Needed to adjust encoding to avoid errors when using vocabWeightDict. 
try:
    sys.stdin = codecs.getreader("utf-8")(sys.stdin.detach())
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
except:
    pass

#Paths relative to Google Colab setup
trainDataLabeledBOWFile = os.getcwd() + "/IMDB_ClassificationAndPrediction/IMDB_Data/train/labeledBow.feat"
testDataLabeledBOWFile = os.getcwd() + "/IMDB_ClassificationAndPrediction/IMDB_Data/test/labeledBow.feat"
trainDataUnlabeledBOWFile = os.getcwd() + "/IMDB_ClassificationAndPrediction/IMDB_Data/train/unsupBow.feat"

vocabWeightFile = os.getcwd() + "/IMDB_ClassificationAndPrediction/unalteredWordWeightMap.txt"
positiveTrainSetPath = os.getcwd() + "/IMDB_ClassificationAndPrediction/IMDB_Data/train/pos"
negativeTrainSetPath = os.getcwd() + "/IMDB_ClassificationAndPrediction/IMDB_Data/train/neg"


#Open and creates a dictionary from existing vocabWeightFile created earlier. We also create a list of all the words in order. This is used to create and organize data regarding the reviews.
#Output is the following: vocabWeightDict is a Dictionary with structure of WORD: WEIGHT
#vocabList is a List that contains a different word at each index, [WORD1, WORD2, ... ]
vocabWeightDict = dict()
vocabWeightFile = open(vocabWeightFile, "r", encoding= "utf-8")
for lines in vocabWeightFile:
    word, spacer, weight = lines.split()
    vocabWeightDict[word] = weight

vocabList = list(vocabWeightDict)
vocabWeightFile.close()

'''
Function for creating an InfoList
Below syntax may be confusing. I will explain it here.
vocabWeightDict is the dictionary that maps WORD : WEIGHT.
vocabList is the List that contains a list of all words that occur in the reviews, in the same order as given in the files. For examples vocabList[0] is "the" 
BOWList is the mapping from the BOW that associated Word Index : Word Occurrence. For example 0 : 9 means the word at index 0 of the vocabList (in this case "the") occurs 9 times in that review.

Therefore, when we do vocabWeightDict[vocabList[int(BOWList[0])]] we are doing the following:
Assuming BOWList is [0, 9]. Then vocabList[int(BOWList[0])] is simply vocabList[0] which is the word "the".
Then vocabWeightDict[vocabList[int(BOWList)]] is just doing vocabWeightDict["the"] to get the corresponding weight. We then multiply this by BOWList[1] which is the occurence count.
'''
def createInfoList(infoFile: str) -> list:
  #Here, we use NLTK's stopword set.
  stopWords = set(stopwords.words("english"))
  fileNum = 0
  documentInfoList = []
  BOWContents = open(infoFile, "r", encoding="UTF-8")
  for lines in BOWContents:
    totalWeight = 0
    excludeStopWordsWeight = 0
    fileLength = 0
    uniqueWords = set()
    splitText = lines.split()
    for i in range(len(splitText)):
      if i == 0:
        pass
      else:
        BOWList = splitText[i].split(":")
        totalWeight += float(vocabWeightDict[vocabList[int(BOWList[0])]]) * float(BOWList[1])
        fileLength += int(BOWList[1])
        uniqueWords.add(vocabList[int(BOWList[0])])
        if vocabList[int(BOWList[0])] not in stopWords:
          excludeStopWordsWeight += float(vocabWeightDict[vocabList[int(BOWList[0])]]) * float(BOWList[1])
    documentInfoList.append( (fileNum, eval(splitText[0]), totalWeight, excludeStopWordsWeight, fileLength, len(uniqueWords)))
    fileNum +=1
  BOWContents.close()
  return documentInfoList

#documentInfoList contains the following information in the following order: 
#(Number of File being used, Actual Rating of the File, Weight when counting all words, Weight when excluding stop words.)

'''
Function removes entries in the documentInfoList that surpass the bounds given within lowerBound and upperBound.
We utilized this to see how removing outliers would effect the accuracy of our models.
We do this so that we can remove any significant outliers from the data, and then train using this dataset.
'''
def optimizeInfoList(infoList: list, lowerBound: float, upperBound: float) -> list:
  optimizedInfoList = []
  for i in range(len(infoList)):
    if infoList[i][2] > lowerBound and infoList[i][2] < upperBound:
      optimizedInfoList.append(infoList[i])
  return optimizedInfoList

#Creating the three infoLists, one for each type of data.
trainingDocumentInfoList = createInfoList(trainDataLabeledBOWFile)
testDocumentInfoList = createInfoList(testDataLabeledBOWFile)
unlabeledReviewDocumentInfoList = createInfoList(trainDataUnlabeledBOWFile)

#Creating the optimized infoLists, only for labeled training and testing data.
optimizedTrainingList = optimizeInfoList(trainingDocumentInfoList, -30.0, 30.0)
optimizedTestingList = optimizeInfoList(testDocumentInfoList, -30.0, 30.0)

#Uncomment to view what the document info list looks like.
#print("Training InfoList: " + str(trainingDocumentInfoList))
#print("Testing InfoList: " + str(testDocumentInfoList))

#Print lengths of the infoLists to confirm completion of the script.
print(len(trainingDocumentInfoList), len(testDocumentInfoList))
print(len(optimizedTrainingList), len(optimizedTestingList))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
25000 25000
19328 19975


##  The code below is primarily used to gather information that we will be using to make tables that will be put into reports.

In [5]:
#Helper Function - Not used directly
#Opens files from specified file path (positive or negative) training data. Specify the directory in filePath, and the name of the file in fileName. Example function call below.
#checkReviewFile(positiveTrainSetPath, "0_9.txt") --- Will travel to the directory set in variable positiveTrainSetPath, and open file named 0_9.txt
def checkReviewFile(filePath: str, fileName: str) -> None:
    with open(os.path.join(filePath, fileName), "r") as reviewFile:
        fileContents = reviewFile.read()

#Everything Below this line is used to check specifics of data. We will be using this for the report or for specific checkups.

#Variables to store results
weightGreaterWithStopWords = 0
weightLesserWithStopWords = 0
RatingWeightDictWithStopWords = defaultdict(list)

#Can change the infoList being iterated over to any of the three. Primarily used on (trainingDocumentInfoList) or (testDocumentInfoList)
#Uncomment the lines below if you wish to assign values to the variables above.

for entry in trainingDocumentInfoList:
 RatingWeightDictWithStopWords[entry[1]].append(entry[2])
 if entry[2] > entry[3]:
  weightGreaterWithStopWords += 1
 else:
  weightLesserWithStopWords += 1

#This prints the number of reviews that end up with a greater weight when stopwords are included, and the number that end up with a lesser weight when stopwords are included.
print(weightGreaterWithStopWords, weightLesserWithStopWords)

#Prints the number of reviews with a specific rating and the average weight of the reviews with that rating
for keys in RatingWeightDictWithStopWords.keys():
 print("Number of Reviews With Rating " + str(keys) + ": " + str(len(RatingWeightDictWithStopWords[keys])))
 print("Average Weight of Reviews With Rating " + str(keys) +": " + str(sum(RatingWeightDictWithStopWords[keys])/len(RatingWeightDictWithStopWords[keys])))




9952 15048
Number of Reviews With Rating 9: 2263
Average Weight of Reviews With Rating 9: 25.538025335039823
Number of Reviews With Rating 7: 2496
Average Weight of Reviews With Rating 7: 18.572582186983873
Number of Reviews With Rating 10: 4732
Average Weight of Reviews With Rating 10: 19.255829873995065
Number of Reviews With Rating 8: 3009
Average Weight of Reviews With Rating 8: 23.778929775391873
Number of Reviews With Rating 3: 2420
Average Weight of Reviews With Rating 3: -11.771561434340828
Number of Reviews With Rating 1: 5100
Average Weight of Reviews With Rating 1: -22.543467375622267
Number of Reviews With Rating 4: 2696
Average Weight of Reviews With Rating 4: -4.803665397910479
Number of Reviews With Rating 2: 2284
Average Weight of Reviews With Rating 2: -16.923281383868098


#Section 2: Perceptron/Linear Classifier and Logistic Classfier Setup Code

In [6]:
#Some resources we used to undestand the creation of the model.
#brief overview of activation/threshold functions: https://www.analyticsvidhya.com/blog/2020/01/fundamentals-deep-learning-activation-functions-when-to-use-them/"
#loss functions: https://medium.com/udacity-pytorch-challengers/a-brief-overview-of-loss-functions-in-pytorch-c0ddb78068f7"
#The book "Natural Language Processing with Pytorch" 
#and the website https://medium.com/biaslyai/pytorch-introduction-to-neural-network-feedforward-neural-network-model-e7231cff47cb 

In [7]:
#Pytorch Perceptron Classifier class
#This is one of our first models we created, but we eventually transitioned into using different classifiers with the help of SKLearn.
import torch.nn as nn
import torch as torch

#Class defined here
class PerceptronClassifier(nn.Module):
  def __init__(self): 
    super(PerceptronClassifier, self).__init__()
    self.fcl = nn.Linear(1,1) 
    self.relu = nn.ReLU() 
    
  def forward(self, tensor ):
    out = self.fcl(tensor)
    out = torch.sign(out)
    out = self.relu(out) 
    return out



In [8]:
#Sources for Pytorch Logistic Classifier
# kaggle tutorial: https://www.kaggle.com/negation/pytorch-logistic-regression-tutorial
# website: https://towardsdatascience.com/logistic-regression-on-mnist-with-pytorch-b048327f8d19
# pytorch example on bag of words: https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html
import torch.nn.functional as functional
import torch as torch

#Logistic Classifier class defined here
class logisiticClassifier(nn.Module):
  def __init__(self):
    super(logisiticClassifier, self).__init__()
    self.linear = nn.Linear(1, 1)
 
  def forward(self, vector):
    output = functional.log_softmax(self.linear(vector), dim = 0)
    return output

# Section 3:  Tensor/Feature set up and Splitting data into both Training and Testing.

In [9]:
#Using this resource: https://towardsdatascience.com/sentiment-analysis-using-lstm-step-by-step-50d074f09948

#Imports 
import math
import sklearn
import torch.nn as nn
import torch as torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import * 
from sklearn.model_selection import train_test_split 
from sklearn import linear_model 
from sklearn import metrics 


#Creating a tensor of size 25000 using information from documentInfoList, and the optimized versions.
#Contains the same information as their lists version, just in tensor form, which rounds off the floats for the weights.
#Substitute these tensors to use on different data
tensorTrainingList = torch.FloatTensor(trainingDocumentInfoList) 
tensorTestingList = torch.FloatTensor(testDocumentInfoList)
tensorOptimizedTrainingList = torch.FloatTensor(optimizedTrainingList)
tensorOptimizedTestList = torch.FloatTensor(optimizedTestingList)

##### FEATURES LOCATED HERE #####
#Separates the "big" tensor smaller ones: Ratings, weightWithStopWords, weightWithoutStopWords, reviewLength, and uniqueWords
#Cast into Numpy Array at first for what I did below, may need to change depending on what you need to do.
#If you want to use a different tensor, make sure to change it for the first 3 features.

#UNOPTIMIZED FEATURES
ratingFeature = np.asarray([tensorTrainingList[i][1] for i in range(len(tensorTrainingList))])
withStopWordsFeature = np.asarray([tensorTrainingList[i][2] for i in range(len(tensorTrainingList))])
removeStopWordsFeature =np.asarray([tensorTrainingList[i][3] for i in range(len(tensorTrainingList))])
reviewLengthFeature = np.asarray([tensorTrainingList[i][4] for i in range(len(tensorTrainingList))])
uniqueWordsFeature = np.asarray([tensorTrainingList[i][5] for i in range(len(tensorTrainingList))])

sentimentRatingFeature = np.zeros(ratingFeature.shape)
sentimentRatingFeatureWithNeutral = np.zeros(ratingFeature.shape)

#OPTIMIZED FEATURES
ratingFeatureOptimized = np.asarray([tensorOptimizedTrainingList[i][1] for i in range(len(tensorOptimizedTrainingList))])
withStopWordsFeatureOptimized = np.asarray([tensorOptimizedTrainingList[i][2] for i in range(len(tensorOptimizedTrainingList))])
removeStopWordsFeatureOptimized =np.asarray([tensorOptimizedTrainingList[i][3] for i in range(len(tensorOptimizedTrainingList))])

sentimentRatingFeatureOptimized = np.zeros(ratingFeatureOptimized.shape)
sentimentRatingFeatureWithNeutralOptimized = np.zeros(ratingFeatureOptimized.shape)

#Functions below just help in creating the sentimentFeatures when both including neutral or not including it.
'''
Creates the sentiment feature array for only positive/negative classification.
In the array, 1 is classified as positive and 0 is classified as negative
'''
def createRatingFeatureBinary(ratingFeature: list, bound: int) -> list:
  sentimentFeature = np.zeros(ratingFeature.shape)
  for i in range(len(ratingFeature)):
    if ratingFeature[i] > bound:
      sentimentFeature[i] = 1
    else:
      sentimentFeature[i] = 0
  return sentimentFeature

'''
Creates the sentiment feature array for positive, negative, and neutral classification.
With this function, 2 is positive, 1 is neutral, and 0 is negative
'''
def createRatingFeatureWithNeutral(ratingFeature: list, lowerBound: int, upperBound: int) -> list:
  sentimentFeature = np.zeros(ratingFeature.shape)
  for i in range(len(ratingFeature)):
    if ratingFeature[i] > upperBound:
      sentimentFeature[i] = 2
    elif ratingFeature[i] < lowerBound:
      sentimentFeature[i] = 0
    elif ratingFeature[i] == lowerBound or ratingFeature[i] == upperBound:
      sentimentFeature[i] = 1
  return sentimentFeature

sentimentRatingFeature = createRatingFeatureBinary(ratingFeature, 5)
sentimentRatingFeatureWithNeutral = createRatingFeatureWithNeutral(ratingFeature, 4, 7)

#Variable to count amount of positive/negative/neutral reviews.
positives = 0
negatives = 0
neutral = 0
for i in sentimentRatingFeatureWithNeutral:
  if i == 0:
    negatives+=1
  if i == 1:
    neutral+=1
  if i == 2:
    positives+=1


sentimentRatingFeatureOptimized = createRatingFeatureBinary(ratingFeatureOptimized, 5)
sentimentRatingFeatureWithNeutralOptimized = createRatingFeatureWithNeutral(ratingFeatureOptimized, 4, 7)

#Right now , using 80% of training data as train and the other 20% to validate.
dataSplitFrac = 0.8
#batchSize affects how DataLoaders organize results. Example: Batchsize of 50 means will report results 50 at a time. Keep to 1 as default here.
#dataSize is just the size of the numpy array.
batchSize = 1
dataSize = len(withStopWordsFeature)

#Using SKLearn method of splitting training/validation/testing data.
#Documention of parameters located here: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
trainX, validX, trainY, validY = train_test_split(withStopWordsFeature, ratingFeature, train_size = dataSplitFrac, random_state = 15)
print('Number of training examples: ', trainX.shape[0])
print('Number of testing examples: ',validX.shape[0])   

# "Bundles" both WeightWithStopWords and Ratings into tensors 
trainData = TensorDataset(torch.from_numpy(trainX), torch.from_numpy(trainY))
validData = TensorDataset(torch.from_numpy(validX), torch.from_numpy(validY))

# Uses the trainData, which are now tensors, and shuffles them. Stores them in the DataLoader class.
trainLoader = DataLoader(trainData, shuffle=True, batch_size=batchSize)
validLoader = DataLoader(validData, shuffle=True, batch_size=batchSize)

#This method call saves the data into an NPZ file. We use this is our other demonstration colab.
#np.savez("demonstrationData.npz", ratingFeature, withStopWordsFeature, removeStopWordsFeature, reviewLengthFeature, 
#         uniqueWordsFeature, sentimentRatingFeature, sentimentRatingFeatureWithNeutral)


Number of training examples:  20000
Number of testing examples:  5000


# Section 4: Training and Testing with Perceptron Classifier

In [10]:
# Perceptron Classifier

#Setting up parameters and model
device = torch.device('cpu')
classifier = PerceptronClassifier()
classifier.to(device)
learning_rate = .001
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr = learning_rate)


#Training model using data
# training_input = torch.unsqueeze(torch.FloatTensor(trainX), 1)
# training_labels = torch.unsqueeze(torch.FloatTensor(trainY), 1)

# test_input = torch.unsqueeze(torch.FloatTensor(validX), 1)
# test_labels = torch.unsqueeze(torch.FloatTensor(validY), 1)

trainX, validX, trainY, validY = train_test_split(withStopWordsFeature, sentimentRatingFeature,
                                                  train_size = dataSplitFrac, random_state = 15)

trainData = TensorDataset(torch.from_numpy(trainX), torch.from_numpy(trainY))
validData = TensorDataset(torch.from_numpy(validX), torch.from_numpy(validY))

trainLoader = DataLoader(trainData, shuffle=True, batch_size=batchSize)
validLoader = DataLoader(validData, shuffle=True, batch_size=batchSize)

classifier.train()

for epoch in range(2):
  for batch_index,(key, value) in enumerate(trainLoader):
      # forward
    key = key.to(device)
    value = value.to(device)
    #print(key, value)
    optimizer.zero_grad()
    output = classifier(key)
    #print(output.data, value)
    loss = criterion(output, value)
    #loss.requires_grad = True
    # backwards pass 
  
    loss.backward()
    optimizer.step()
    
    # to see for improvement of loss
    
    # if (i + 1) % 100 == 0:
    #   print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
    #         .format(epoch + 1, 2, batch_index + 1, len(trainLoader), loss.item()))

classifier.eval()

with torch.no_grad(): 
  correct = 0
  total = 0
  for images, labels in validLoader:
    labels = labels.to(device)
    outputs = classifier(images)
    predicted = output.data
    total+= labels.size(0)

    # to see predicted vs labels, should show only one label
    print(predicted, labels)

    correct += (predicted == labels).sum().item()

  print('Accuracy: {} %'.format(100 * correct / total))



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tensor([0.]) tensor([1.], dtype=torch.float64)
tensor([0.]) tensor([1.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([1.], dtype=torch.float64)
tensor([0.]) tensor([1.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([1.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([1.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.float64)
tensor([0.]) tensor([1.], dtype=torch.float64)
tensor([0.]) tensor([0.], dtype=torch.floa

# Section 5: Training and Testing with Logistic Classifier

In [11]:
#Training and testing with logistic classifier
#Resources used: https://medium.com/biaslyai/pytorch-linear-and-logistic-regression-models-5c5f0da2cb9
#https://towardsdatascience.com/logistic-regression-explained-593e9ddb7c6c

#Setting up parameters and model
#Much like the initial perceptron classifier, our initial logistic classifier was only predicting one label
#and the loss was not improving
device = torch.device('cpu')
classifier = logisiticClassifier()
classifier.to(device)
learning_rate = .001
criterion = nn.L1Loss()
optimizer = torch.optim.SGD(classifier.parameters(), lr = learning_rate)
trainX, validX, trainY, validY = train_test_split(withStopWordsFeature, sentimentRatingFeature,
                                                  train_size = dataSplitFrac, random_state = 15)

trainData = TensorDataset(torch.from_numpy(trainX), torch.from_numpy(trainY))
validData = TensorDataset(torch.from_numpy(validX), torch.from_numpy(validY))

trainLoader = DataLoader(trainData, shuffle=True, batch_size=batchSize)
validLoader = DataLoader(validData, shuffle=True, batch_size=batchSize)

#Training the classifier using the data
classifier.train()
for epoch in range(2):
  for batch_index,(key, value) in enumerate(trainLoader):
      # forward
    key = key.to(device)
    value = value.to(device)

    optimizer.zero_grad()
    output = classifier(key)
    #print(output.data, value)
    loss = criterion(output, value)
    
    # backwards pass 
    loss.backward()
    optimizer.step()
    
    # to see if loss is improving

    # if (i + 1) % 100 == 0:
    #   print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
    #         .format(epoch + 1, 2, batch_index + 1, len(trainLoader), loss.item()))

classifier.eval()


with torch.no_grad(): 
  correct = 0
  total = 0
  for images, labels in validLoader:
    labels = labels.to(device)
    outputs = classifier(images)
    predicted = output.data
    total+= labels.size(0)

    #To see predicted vs labels, should show only one label
    #print(predicted, labels)
    
    correct += (predicted == labels).sum().item()

  print('Accuracy: {} %'.format(100 * correct / total))

Accuracy: 52.14 %


# Section 6: Classifiers done using Scipy with only Positive and Negative

In [12]:
#Imports
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors
from joblib import dump, load
# model is better without stop words
trainX, validX, trainY, validY = train_test_split(removeStopWordsFeature, sentimentRatingFeature, train_size = dataSplitFrac, random_state = 15)
print('Number of training examples: ', trainX.shape[0])
print('Number of testing examples: ',validX.shape[0]) 

trainX = trainX.reshape(-1,1)
validX = validX.reshape(-1,1)

Number of training examples:  20000
Number of testing examples:  5000


In [13]:
# Logistic with only positive and negative

print("Logistic Classifier")
logisticClassifier = linear_model.LogisticRegression(fit_intercept=False)

fitLogisticClassifier = logisticClassifier.fit(trainX, trainY)
trainingPredictions = fitLogisticClassifier.predict(trainX)
trainingAccuracy =  fitLogisticClassifier.score(trainX, trainY)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

testPredictions = fitLogisticClassifier.predict(validX)	 
testAccuracy = fitLogisticClassifier.score(validX,validY)

print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )

Logistic Classifier
Training accuracy: 85.19
Testing: accuracy: 85.10


In [14]:
# Perceptron with only positive and negative
print("Perceptron Classifer")
perceptron = linear_model.Perceptron(fit_intercept= False)

trainX = trainX.reshape(-1,1)
validX = validX.reshape(-1,1)

#Training the model using data
fitPerceptronClassifier = perceptron.fit(trainX, trainY)
trainingPredictions = fitPerceptronClassifier.predict(trainX)
trainingAccuracy =  fitPerceptronClassifier.score(trainX, trainY)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitPerceptronClassifier.predict(validX)	 
testAccuracy = fitPerceptronClassifier.score(validX,validY)
print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )

# sample_weight = [[12]] # predicts sample weight
# sample = fitPerceptronClassifier.predict(sample_weight)
# print(sample)

Perceptron Classifer
Training accuracy: 85.19
Testing: accuracy: 85.10


In [15]:
#Decision tree with only positive and negative
decision = tree.DecisionTreeClassifier()
print("Decision Tree")

#Training the model using the data
fitdecision = decision.fit(trainX, trainY)
trainingPredictions = fitdecision.predict(trainX)
trainingAccuracy =  fitdecision.score(trainX, trainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitdecision.predict(validX)	 
testAccuracy = fitdecision.score(validX,validY)
print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )

Decision Tree
Training accuracy Decision: 100.00
Testing accuracy Decision: 78.40


In [16]:
#Gradient boosting with only positive and negative
gradientboosting = ensemble.GradientBoostingClassifier(learning_rate= .001)
print("Gradient Boosting")

#Training the model using data
fitGradientBoosting = gradientboosting.fit(trainX, trainY)
trainingPredictions = fitGradientBoosting.predict(trainX)
trainingAccuracy =  fitGradientBoosting.score(trainX, trainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitGradientBoosting.predict(validX)	 
testAccuracy = fitGradientBoosting.score(validX,validY)

print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )

Gradient Boosting
Training accuracy Decision: 85.37
Testing accuracy Decision: 85.00


In [17]:
#KNN with positive and negative


# shift everything under for loop to check multiple K
# max_K = 0
max_predict = 0
# for i in range(5, 200):
knn = neighbors.KNeighborsClassifier(n_neighbors= 114)

print("KNN")
#Training the model using the data
fitknn = knn.fit(trainX, trainY)
trainingPredictions = fitknn.predict(trainX)
trainingAccuracy =  fitknn.score(trainX, trainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitknn.predict(validX)	 
testAccuracy = fitknn.score(validX,validY)
if testAccuracy > max_predict:
  max_predict = testAccuracy
  max_K = i
print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )

# for i in testPredictions:
#   if i == 1:
#     print(i)
# print(max_K)
# print(max_predict)


# for testing examples
test_positiveKNN = []
test_negativeKNN = []
x = 0
for i in testPredictions:
  if i == 0:
    test_negativeKNN.append(x)
  elif i == 1:
    test_positiveKNN.append(x)
  x+=1

KNN
Training accuracy Decision: 85.44
Testing accuracy Decision: 85.40


#Section 7: Classifiers done with Scipy using Positive, Neutral, and Negative

In [18]:
#Data split with neutral
#Needs neutral data
#Resource Used: https://scikit-learn.org/stable/modules/multiclass.html
neutraltrainX, neutralvalidX, neutraltrainY, neutralvalidY = train_test_split(withStopWordsFeature,sentimentRatingFeatureWithNeutral, train_size = dataSplitFrac, random_state = 15)
neutraltrainX = neutraltrainX.reshape(-1,1)
neutralvalidX = neutralvalidX.reshape(-1,1)


In [19]:
#Logistic classifier with positive and negative and neutral

print("With Neutral score - logisitic")
logisticClassifierMulti = linear_model.LogisticRegression(multi_class="multinomial")
fitLogisticClassifierMulti = logisticClassifierMulti.fit(neutraltrainX, neutraltrainY)
trainingPredictions = fitLogisticClassifierMulti.predict(neutraltrainX)
trainingAccuracy =  fitLogisticClassifierMulti.score(neutraltrainX, neutraltrainY)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitLogisticClassifierMulti.predict(neutralvalidX)
testAccuracy = fitLogisticClassifierMulti.score(neutralvalidX,neutralvalidY)
print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )

## not predicting neutral numbers

With Neutral score - logisitic
Training accuracy: 69.03
Testing: accuracy: 69.34


In [20]:
decisionMulti = tree.DecisionTreeClassifier()
print("With Neutral - Decision Tree")

#Training the model
fitdecisionMulti = decisionMulti.fit(neutraltrainX, neutraltrainY)
trainingPredictions = fitdecisionMulti.predict(neutraltrainX)
trainingAccuracy =  fitdecisionMulti.score(neutraltrainX, neutraltrainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitdecisionMulti.predict(neutralvalidX)	 
testAccuracy = fitdecisionMulti.score(neutralvalidX,neutralvalidY)

print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )

With Neutral - Decision Tree
Training accuracy Decision: 99.98
Testing accuracy Decision: 56.62


In [21]:
#KNN with Neutral
# gradient boost not predicting neutral
print("KNN - Neutral")

# tried 1,3,5,20,100,200,300,400,500, 600-900
# if too high won't predict any neutral
# weights = uniform give better result
# algorithm doesnt matter
# 78 most accurate for 5 - 100
# shift everything under for loop to check multiple K
# max_K = 0
max_predict = 0
# for i in range(5, 100):

knnMulti = neighbors.KNeighborsClassifier(n_neighbors= 78)

#Training the model
fitknnMulti = knnMulti.fit(neutraltrainX, neutraltrainY)
trainingPredictions = knnMulti.predict(neutraltrainX)
trainingAccuracy =  knnMulti.score(neutraltrainX, neutraltrainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = knnMulti.predict(neutralvalidX)	 
testAccuracy = knnMulti.score(neutralvalidX,neutralvalidY)
# if testAccuracy > max_predict:
#   max_predict = testAccuracy
#   max_K = i

print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )
# for i in testPredictions:
#   if i == 1:
#     print(i)
# print(max_K)
# print(max_predict)



# for testing examples
test_positiveMulti = []
test_negativeMulti = []
test_neutralMulti = []
x = 0
for i in testPredictions:
  if i == 0:
    test_negativeMulti.append(x)
  elif i == 1:
    test_neutralMulti.append(x)
  else:
    test_positiveMulti.append(x)
  x+=1


KNN - Neutral
Training accuracy Decision: 69.31
Testing accuracy Decision: 69.36


# Section 8: Classifiers done using Optimized Sets with Postive and Negative

In [22]:
#Using the optimized set seems to overall give much lower accuracy scores

#Imports
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors

#Model performs better without stop words
OpttrainX, OptvalidX, OpttrainY, OptvalidY = train_test_split(removeStopWordsFeatureOptimized, sentimentRatingFeatureOptimized, train_size = dataSplitFrac, random_state = 15)
print('Number of training examples: ', OpttrainX.shape[0])
print('Number of testing examples: ', OptvalidX.shape[0]) 

OpttrainX = OpttrainX.reshape(-1,1)
OptvalidX = OptvalidX.reshape(-1,1)

Number of training examples:  15462
Number of testing examples:  3866


In [23]:
#Using Logistic binary classifier
print("Logistic")
logisticClassifierOpt = linear_model.LogisticRegression(fit_intercept=False)

fitLogisticClassifierOpt = logisticClassifierOpt.fit(OpttrainX, OpttrainY)
trainingPredictions = fitLogisticClassifierOpt.predict(OpttrainX)
trainingAccuracy =  fitLogisticClassifierOpt.score(OpttrainX, OpttrainY)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

testPredictions = fitLogisticClassifierOpt.predict(OptvalidX)	 
testAccuracy = fitLogisticClassifierOpt.score(OptvalidX,OptvalidY)

print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )


Logistic
Training accuracy: 82.01
Testing: accuracy: 82.44


In [24]:
#Decision tree binary classifier

decisionOpt = tree.DecisionTreeClassifier()
print("Decision Tree")

fitdecisionOpt = decisionOpt.fit(OpttrainX, OpttrainY)
trainingPredictions = fitdecisionOpt.predict(OpttrainX)
trainingAccuracy =  fitdecisionOpt.score(OpttrainX, OpttrainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitdecisionOpt.predict(OptvalidX)	 
testAccuracy = fitdecisionOpt.score(OptvalidX,OptvalidY)

print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )

Decision Tree
Training accuracy Decision: 100.00
Testing accuracy Decision: 74.19


In [25]:
#Gradient boosting binary classifier

gradientboostingOpt = ensemble.GradientBoostingClassifier(learning_rate= .001)
print("Gradient Boosting")

#train the data
fitGradientBoostingOpt = gradientboostingOpt.fit(OpttrainX, OpttrainY)
trainingPredictions = fitGradientBoostingOpt.predict(OpttrainX)
trainingAccuracy =  fitGradientBoostingOpt.score(OpttrainX, OpttrainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#evaluate
testPredictions = fitGradientBoostingOpt.predict(OptvalidX)	 
testAccuracy = fitGradientBoostingOpt.score(OptvalidX,OptvalidY)

print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )

Gradient Boosting
Training accuracy Decision: 81.29
Testing accuracy Decision: 81.14


In [26]:
# max_K = 0
# max_predict = 0
# for i in range(5, 100):

# Knn optimized
knnMultiOpt = neighbors.KNeighborsClassifier(n_neighbors= 114)

  #train the data
fitknnMultiOpt = knnMultiOpt.fit(OpttrainX, OpttrainY)
trainingPredictions = fitknnMultiOpt.predict(OpttrainX)
trainingAccuracy =  fitknnMultiOpt.score(OpttrainX, OpttrainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#evaluate
testPredictions = fitknnMultiOpt.predict(OptvalidX)	 
testAccuracy = fitknnMultiOpt.score(OptvalidX,OptvalidY)
  # if testAccuracy > max_predict:
  #   max_predict = testAccuracy
  #   max_K = i

print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )
# for i in testPredictions:
#   if i == 0:
#     pass
    #print(i)
# print(max_K)
# print(max_predict)

Training accuracy Decision: 82.31
Testing accuracy Decision: 82.54


#Section 9: Classifiers done using Optimized Sets for Positive, Negative, and Neutral

In [27]:
#Setting up features for positive/negative/neutral classification
OptneutraltrainX, OptneutralvalidX, OptneutraltrainY, OptneutralvalidY = train_test_split(withStopWordsFeatureOptimized,sentimentRatingFeatureWithNeutralOptimized,
                                                                                          train_size = dataSplitFrac, random_state = 15)
OptneutraltrainX = OptneutraltrainX.reshape(-1,1)
OptneutralvalidX = OptneutralvalidX.reshape(-1,1)

In [28]:
decisionMultiOpt = tree.DecisionTreeClassifier()
print("With Neutral - Decision Tree")

#Training the model
fitdecisionMultiOpt = decisionMultiOpt.fit(OptneutraltrainX, OptneutraltrainY)
trainingPredictions = fitdecisionMultiOpt.predict(OptneutraltrainX)
trainingAccuracy =  fitdecisionMultiOpt.score(OptneutraltrainX, OptneutraltrainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitdecisionMultiOpt.predict(OptneutralvalidX)	 
testAccuracy = fitdecisionMultiOpt.score(OptneutralvalidX,OptneutralvalidY)

print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )

With Neutral - Decision Tree
Training accuracy Decision: 99.99
Testing accuracy Decision: 53.83


In [29]:
print("With Neutral score - logisitic")
logisticClassifierMultiOpt = linear_model.LogisticRegression(multi_class="multinomial")

# logistic Classifier with Neutral rating
fitLogisticClassifierMultiOpt = logisticClassifierMultiOpt.fit(OptneutraltrainX, OptneutraltrainY)
trainingPredictions = fitLogisticClassifierMultiOpt.predict(OptneutraltrainX)
trainingAccuracy =  fitLogisticClassifierMultiOpt.score(OptneutraltrainX, OptneutraltrainY)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitLogisticClassifierMultiOpt.predict(OptneutralvalidX)
testAccuracy = fitLogisticClassifierMultiOpt.score(OptneutralvalidX,OptneutralvalidY)

print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )
# also not predicting neutral

With Neutral score - logisitic
Training accuracy: 65.61
Testing: accuracy: 65.29


In [30]:
# max_K = 0
# max_predict = 0
# for i in range(5, 100):
print("KNN - With Neutral")
knnMultiOptNeutral = neighbors.KNeighborsClassifier(n_neighbors= 78)

#Training the model
fitknnMultiOptNeutral = knnMultiOptNeutral.fit(OptneutraltrainX, OptneutraltrainY)
trainingPredictions = fitknnMultiOptNeutral.predict(OptneutraltrainX)
trainingAccuracy =  fitknnMultiOptNeutral.score(OptneutraltrainX, OptneutraltrainY)
print('Training accuracy Decision:',format( 100*trainingAccuracy , '.2f') ) 

#Obtaining results
testPredictions = fitknnMultiOptNeutral.predict(OptneutralvalidX)	 
testAccuracy = fitknnMultiOptNeutral.score(OptneutralvalidX,OptneutralvalidY)
  # if testAccuracy > max_predict:
  #   max_predict = testAccuracy
  #   max_K = i
print('Testing accuracy Decision:', format( 100*testAccuracy , '.2f') )

# for i in testPredictions:
#   if i == 0:
#     pass
    #print(i)
# print(max_K)
# print(max_predict)

KNN - With Neutral
Training accuracy Decision: 65.68
Testing accuracy Decision: 65.49


#Section 10: Alternate version of Logistic Classification for Positive

In [31]:
#This block does a variation of Logistic Classification similar to what was performed in Assignment 1.
#Here we utilize the LogisticRegression provided by SKLearn.

#Splitting into train/validation data.
trainX, validX, trainY, validY = train_test_split(withStopWordsFeature, sentimentRatingFeature, train_size = dataSplitFrac, random_state = 15)
print('Number of training examples: ', trainX.shape[0])
print('Number of testing examples: ',validX.shape[0]) 

#Creating the classifier here.
logisticClassifier = linear_model.LogisticRegression(penalty = "l2", fit_intercept= True)

trainX = trainX.reshape(-1,1)
validX = validX.reshape(-1,1)

#Fitting the classifier, then making predictions and calculating accuracy of training data.
fitLogisticClassifier = logisticClassifier.fit(trainX, trainY)
trainingPredictions = fitLogisticClassifier.predict(trainX)
trainingAccuracy =  fitLogisticClassifier.score(trainX, trainY)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

#Making predictions and calculating accuracy of validation (testing) data.
testPredictions = fitLogisticClassifier.predict(validX)	 
testAccuracy = fitLogisticClassifier.score(validX,validY)

print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )

Number of training examples:  20000
Number of testing examples:  5000
Training accuracy: 83.89
Testing: accuracy: 83.90


#Section 11: Analyzing Errors



In [32]:
#We locate errors in this block based on the results of the above logistic classification. We then manually go in and analyze the wrongly classified
#reviews to get a better understanding as to why our model classifies wrongly.

'''
Simple function that retrieves the weight from the specified word
'''
def getVocabWeight(word: str) -> None:
  print("The weight of word:", word, "is" , vocabWeightDict[word])

#Initializing Variables
totalwrongCount = 0
falsePositives = 0
falseNegatives = 0
totalReviewCount = 0
errorList = []

#Fills errorList with errors from training data. Each entry in errorList is a tuple with (FileNumber, Weight, Error Made)
for i in range(len(trainX)):
  trainWeight = trainX[i]
  totalReviewCount += 1
  indexLocation = np.where(withStopWordsFeature == trainWeight)[0][0]
  if sentimentRatingFeature[indexLocation] != trainingPredictions[i]:
    errorTuple = (indexLocation, trainWeight)
    totalwrongCount += 1
    if trainingPredictions[i] == 1:
      falsePositives += 1
      errorTuple += ("False Positive",)
    else:
      falseNegatives += 1
      errorTuple += ("False Negative",)
    errorList.append(errorTuple)

#Fills the errorList with the errors from test data. Each entry in errorList is a tuple with (FileNumber, Weight, Error Made)
for i in range(len(validX)):
  testWeight = validX[i][0]
  totalReviewCount += 1
  #print(np.where(withStopWordsFeature == testWeight),testWeight)
  indexLocation = np.where(withStopWordsFeature == testWeight)[0][0]
  if sentimentRatingFeature[indexLocation] != testPredictions[i]:
    errorTuple = (indexLocation, testWeight)
    totalwrongCount += 1
    if testPredictions[i] == 1:
      falsePositives += 1
      errorTuple += ("False Positive",)
    else:
      falseNegatives += 1
      errorTuple += ("False Negative",)
    errorList.append(errorTuple)

#Print results for user to see.
print(totalwrongCount, "False Positives: ", falsePositives, "False Negatives: ", falseNegatives, "Total Amount: ", totalReviewCount)
# print("Error List in order of occurrence when shuffled")
# print(errorList)
# print("Error List sorted with respect to file it occurs in")
# print(sorted(errorList, key = lambda x: x[0]))

# print("\n + Error List sorted with respect to the weight (from smallest to largest")
# print(sorted(errorList, key = lambda x: x[1]))

# print("\n + Error List sorted with respect to the weight (from largest to smallest")
# print(sorted(errorList, key = lambda x: x[1], reverse= True))



  

4024 False Positives:  2056 False Negatives:  1968 Total Amount:  25000


# Section 12: Neural Network/Linear Regression for Predicting Ratings


In [33]:
#Imports
import torch.nn
import torch.nn.functional

#Train/test tensors
train_doc = torch.FloatTensor(trainingDocumentInfoList)
test_doc = torch.FloatTensor(testDocumentInfoList)

#train_doc = torch.FloatTensor(optimizedTrainingList)
#test_doc = torch.FloatTensor(optimizedTestingList)

trainX = np.asarray([train_doc[i][3] for i in range(len(train_doc))])
trainY = np.asarray([train_doc[i][1] for i in range(len(train_doc))])

testX = np.asarray([test_doc[i][3] for i in range(len(test_doc))])
testY = np.asarray([test_doc[i][1] for i in range(len(test_doc))])

train_length = np.asarray([train_doc[i][4] for i in range(len(train_doc))])
test_length = np.asarray([test_doc[i][4] for i in range(len(test_doc))])

train_unique = np.asarray([train_doc[i][5] for i in range(len(train_doc))])
test_unique = np.asarray([test_doc[i][5] for i in range(len(test_doc))])

#Grab total review weights for the training/validation datasets and their associated review scores
training_input = torch.unsqueeze(torch.FloatTensor(trainX), 1)
training_labels = torch.unsqueeze(torch.FloatTensor(trainY), 1)

test_input = torch.unsqueeze(torch.FloatTensor(testX), 1)
test_labels = torch.unsqueeze(torch.FloatTensor(testY), 1)

#For multiple features
#training_input = torch.cat((torch.unsqueeze(torch.FloatTensor(trainX), 1), torch.unsqueeze(torch.FloatTensor(train_length), 1)), 1)
#test_input = torch.cat((torch.unsqueeze(torch.FloatTensor(testX), 1), torch.unsqueeze(torch.FloatTensor(test_length), 1)), 1)

#Linear regression
class LinReg(nn.Module):
  def __init__(self):
    super(LinReg, self).__init__()
    self.hid = nn.Linear(1, 3)
    self.lin = nn.Linear(3, 1)

  def forward(self, w):
    output = torch.nn.functional.relu(self.hid(w))
    output = self.lin(output)
    return output

#Setting up the model/parameters
linreg_model = LinReg()
mse_loss = torch.nn.MSELoss()
optim = torch.optim.SGD(linreg_model.parameters(), lr = 0.001)

#Running regression
most_accurate = (None, None, None, None)
best_train = 1000000.0

print("Running epochs...")
for i in range(30000):
  optim.zero_grad()
  predicted_labels = linreg_model(training_input)
  predicted_labels_test = linreg_model(test_input)

  test_mse_loss = torch.nn.MSELoss()(predicted_labels_test, test_labels)
  if most_accurate == (None, None, None, None):
    most_accurate = (linreg_model, i, float(test_mse_loss), float(test_mse_loss) ** 0.5)
  elif float(test_mse_loss) < most_accurate[2]:
    most_accurate = (linreg_model, i, float(test_mse_loss), float(test_mse_loss) ** 0.5)
  else:
    pass

  train_mse_loss = mse_loss(predicted_labels, training_labels)

  if float(train_mse_loss) < best_train:
    best_train = float(train_mse_loss)

  train_mse_loss.backward()
  optim.step()

  #print(i, float(train_mse_loss), float(test_mse_loss), most_accurate[2])
  
print("The most accurate model for the test data occurred at epoch # " + str(most_accurate[1]) + ".")
print("It had a MSE of " + str(most_accurate[2]) + ".")
print("This means that the average prediction was off by " + str(most_accurate[3]) + ".")
print("Normalized MSE: " + str(most_accurate[2] / 81.0))
print("Normalized RMSE: " + str(most_accurate[3] / 9.0))
#print(best_train / 81.0)

#Try to use as a classifier
correct = 0.0
incorrect = 0.0

for i, e in enumerate(test_input):
  model_output = float(most_accurate[0](test_input[i])[0])
  actual_score = float(test_labels[i][0])
  if (model_output >= 5.5 and actual_score >= 5.5):
    correct += 1.0
  elif (model_output < 5.5 and actual_score < 5.5):
    correct += 1.0
  else:
    incorrect += 1.0

print("Classification (positive/negative) test accuracy: ", ((correct / (correct + incorrect)) * 100.0), "%")

Running epochs...
The most accurate model for the test data occurred at epoch # 29978.
It had a MSE of 6.029042720794678.
This means that the average prediction was off by 2.455410906710866.
Normalized MSE: 0.07443262618265034
Normalized RMSE: 0.2728234340789851
Classification (positive/negative) test accuracy:  85.008 %


#Section 13: Implmenting Word2Vec


In [34]:
#We created a file locally and import it from GitHub that assigns ReviewNum: [Words in Review]
wordToVecFilePath = os.getcwd() + "/IMDB_ClassificationAndPrediction/reviewNumberWordMap.txt"

#Turn the imported file into proper dictionary form for our Word2Vec Model.
wordToVecDict = dict()

wordToVecFile = open(wordToVecFilePath, "r", encoding= "utf-8")
for lines in wordToVecFile:
  #print(lines)
  reviewNum, wordList = lines.split(":")
  wordToVecDict[reviewNum] = wordList


In [35]:
#Imports
from gensim.models import Word2Vec

#Setting parameters for Gensim's Word2Vec
SIZE = 100
WINDOW = 5
MIN_COUNT = 5
WORKERS = 3
SG = 1

tokenList = []
for value in wordToVecDict.values():
  tokenList.append(value)

print(len(tokenList))
print("Creating Model...")
wordToVecModel = Word2Vec(tokenList, min_count = MIN_COUNT, size = SIZE, workers = WORKERS, window = WINDOW, sg = SG)
print("Model creation complete")




25000
Creating Model...
Model creation complete


In [36]:
#Using Word2Vec 

from sklearn import tree

vecDecision =  tree.DecisionTreeClassifier()
vecDecisionNeutral =  tree.DecisionTreeClassifier()
# stored in keyedVectors
wordToVecModel.train(tokenList, total_examples= 1, epochs=1)

logisticClassifierVec = linear_model.LogisticRegression(fit_intercept=False)
logisticClassifierVecNeutral = linear_model.LogisticRegression(fit_intercept=False)

knnVec = neighbors.KNeighborsClassifier(n_neighbors= 114)
knnVecNeutral = neighbors.KNeighborsClassifier(n_neighbors= 78)

In [37]:
# since i can not use all the reviews i need to remake the tensor so tranform to list first
transform_rating_to_list = list(sentimentRatingFeature)
transform_rating_to_list_neutral = list(sentimentRatingFeatureWithNeutral)
data_that_will_be_split = []
rating_list = []
rating_list_neutral = []


# add the vector to a list and also the ratings 
# ignore the error vectors
# only 3 are ignored so it is minimal
for i in range(len(tokenList)):
  try:
    vectors = (np.mean([wordToVecModel[x] for x in tokenList[i]], axis=0)).tolist()
    data_that_will_be_split.append(vectors)
    rating_list.append(transform_rating_to_list[i])
    rating_list_neutral.append(transform_rating_to_list_neutral[i])
  except:
    pass

# transform back into correct data structure
rating_list_final = torch.FloatTensor(rating_list)
ratingListArray = np.asarray([rating_list_final[i] for i in range(len(rating_list_final))])

rating_list_final_neutral = torch.FloatTensor(rating_list_neutral)
ratingListArrayNeutral = np.asarray([rating_list_final_neutral[i] for i in range(len(rating_list_final_neutral))])


tensorVectors = torch.FloatTensor(data_that_will_be_split)

# splitting, training, and testing
VectrainX, VecvalidX, VectrainY, VecvalidY = train_test_split(tensorVectors, ratingListArray,
                                                  train_size = dataSplitFrac, random_state = 15)
# splitting, training, testing, Neutral
VectrainXNeutral, VecvalidXNeutral, VectrainYNeutral, VecvalidYNeutral = train_test_split(tensorVectors, ratingListArrayNeutral,
                                                                          train_size = dataSplitFrac, random_state = 15)



  


In [38]:
# decision tree
fitword2vec = vecDecision.fit(VectrainX, VectrainY)
trainingPredictions = fitword2vec.predict(VectrainX)
trainingAccuracy =  fitword2vec.score(VectrainX, VectrainY)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

testPredictions = fitword2vec.predict(VecvalidX)	 
testAccuracy = fitword2vec.score(VecvalidX,VecvalidY)

print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )

Training accuracy: 100.00
Testing: accuracy: 54.86


In [39]:
# with logistic
fitword2vecLog = logisticClassifierVec.fit(VectrainX, VectrainY)
trainingPredictions = fitword2vecLog.predict(VectrainX)
trainingAccuracy =  fitword2vecLog.score(VectrainX, VectrainY)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

testPredictions = fitword2vecLog.predict(VecvalidX)	 
testAccuracy = fitword2vecLog.score(VecvalidX,VecvalidY)

print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )

Training accuracy: 57.08
Testing: accuracy: 56.62


In [40]:
dump(fitword2vecLog, "classification_wordToVecLogistic.joblib")

['classification_wordToVecLogistic.joblib']

In [41]:
# with knn
fitword2vecKnn = knnVec.fit(VectrainX, VectrainY)
trainingPredictions = fitword2vecKnn.predict(VectrainX)
trainingAccuracy =  fitword2vecKnn.score(VectrainX, VectrainY)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

testPredictions = fitword2vecKnn.predict(VecvalidX)	 
testAccuracy = fitword2vecKnn.score(VecvalidX,VecvalidY)

print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )


# for our test examples at the bottom
test_positiveWord2Vec = []
test_negativeWord2Vec = []
x = 0
for i in testPredictions:
  if i == 0:
    test_negativeWord2Vec.append(x)
  elif i == 1:
    test_positiveWord2Vec.append(x)
  x+=1

Training accuracy: 60.25
Testing: accuracy: 59.34


In [42]:
# with neutral, decision tree

fitword2vecNeutral = vecDecisionNeutral.fit(VectrainXNeutral, VectrainYNeutral)
trainingPredictions = fitword2vecNeutral.predict(VectrainXNeutral)
trainingAccuracy =  fitword2vecNeutral.score(VectrainXNeutral, VectrainYNeutral)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

testPredictions = fitword2vecNeutral.predict(VecvalidXNeutral)	 
testAccuracy = fitword2vecNeutral.score(VecvalidXNeutral,VecvalidYNeutral)

print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )

Training accuracy: 99.99
Testing: accuracy: 39.88


In [43]:
# with neutral, logistic 
# not reporting neutral

# fitword2vecLogNeutral = logisticClassifierVecNeutral.fit(VectrainXNeutral, VectrainYNeutral)
# trainingPredictions = fitword2vecLogNeutral.predict(VectrainXNeutral)
# trainingAccuracy =  fitword2vecLogNeutral.score(VectrainXNeutral, VectrainYNeutral)
# print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

# testPredictions = fitword2vecLogNeutral.predict(VecvalidXNeutral)	 
# testAccuracy = fitword2vecLogNeutral.score(VecvalidXNeutral,VecvalidYNeutral)

# print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )

# for i in testPredictions:
#   if i == 2:
#     print(i)

In [44]:
# with neutral, knn

fitword2vecKnnNeutral = knnVecNeutral.fit(VectrainXNeutral, VectrainYNeutral)
trainingPredictions = fitword2vecKnnNeutral.predict(VectrainXNeutral)
trainingAccuracy =  fitword2vecKnnNeutral.score(VectrainXNeutral, VectrainYNeutral)
print('Training accuracy:',format( 100*trainingAccuracy , '.2f') ) 

testPredictions = fitword2vecKnnNeutral.predict(VecvalidXNeutral)	 
testAccuracy = fitword2vecKnnNeutral.score(VecvalidXNeutral,VecvalidYNeutral)

print('Testing: accuracy:', format( 100*testAccuracy , '.2f') )

Training accuracy: 49.43
Testing: accuracy: 47.98


# Section 14: Sentiment Prediction and Examples

In [45]:

'''This function takes in a review weight and use that weight to classify as either positive or negative.
If the neutral parameter is False, then we assume there is only positive or negative.
If the neutral parameter is True, then positive, negative, and neutral are all possible classifications.
'''
def predict_sentiment(review_weight, classifier, neutral = False):
  # assumes review_weight is a float so will reshape into numpy
  reshaped_weight = np.array(review_weight)
  reshaped_weight = reshaped_weight.reshape(1,-1)
  score = classifier.predict(reshaped_weight)

  # for positive and negative only
  if neutral == False:
    if score == 0:
      return "Negative"
    elif score == 1:
      return "Positive"
      
  elif neutral == True:
    if score == 0:
      return "Negative"
    elif score == 1:
      return "Neutral"
    elif score == 2:
      return "Positive"
'''
for bag of words all you need is a float or int(a review weight)
for word2vec you need a tensor
You can use any classifier. They are the variables that start with fit: fitword2vecKnn, fitPerceptronClassifier, fitknn, fitknnMulti(multi for multiclass so includes neutral)
Test examples 
'''
# to test for KNN, input the correct index
# example: predict_sentiment(validX[test_positiveKNN[0]],fitKnn, False)
'''
the lists below contains indexes that you can input into our test data such as validX to get a weight or vector
you can input the same index for the labels such as validY to check if it is the correct label


test indexes for KNN- Positive and negative:
test_positiveKNN
test_negativeKNN

test indexes for KNN- Positive, Negative, Neutral:
test_positiveMulti
test_neutralMulti
test_negativeMulti

test indexes for KNN- Positive and negative for word2vec:
test_positiveWord2Vec
test_negativeWord2Vec
'''

" Out model is not 100 percent accurate so it may be wrong at times"


# Examples of Positive and Negative using KNN on Bag of Words
print(predict_sentiment(validX[test_positiveKNN[0]],fitknn, False))
print(predict_sentiment(validX[test_positiveKNN[100]],fitknn, False))
print(predict_sentiment(validX[test_negativeKNN[0]],fitknn, False))
print(predict_sentiment(validX[test_negativeKNN[1]],fitknn, False))
print(predict_sentiment(validX[test_negativeKNN[100]],fitknn, False)) # an example of our model being wrong

print()

# Examples of Positive, Negative, and Neutral using KNN on Bag of Words
print(predict_sentiment(validX[test_positiveMulti[0]],fitknnMulti, True))
print(predict_sentiment(validX[test_positiveMulti[100]],fitknnMulti, True))
print(predict_sentiment(validX[test_negativeMulti[0]],fitknnMulti, True)) 
print(predict_sentiment(validX[test_negativeMulti[1]],fitknnMulti, True))
print(predict_sentiment(validX[test_neutralMulti[2]],fitknnMulti, True)) 

print()

# Examples of Positive and Negative using KNN on Word2Vec
print(predict_sentiment(VecvalidX[test_positiveWord2Vec[0]],fitword2vecKnn, False))
print(predict_sentiment(VecvalidX[test_positiveWord2Vec[100]],fitword2vecKnn, False))
print(predict_sentiment(VecvalidX[test_negativeWord2Vec[0]],fitword2vecKnn, False)) 
print(predict_sentiment(VecvalidX[test_negativeWord2Vec[1]],fitword2vecKnn, False))
print(predict_sentiment(VecvalidX[test_negativeWord2Vec[2]],fitword2vecKnn, False)) 



Positive
Positive
Negative
Negative
Positive

Positive
Positive
Negative
Negative
Neutral

Positive
Positive
Negative
Negative
Negative
