# This Google Colab notebook is a simplified version of the original code. Instead of training from scratch, it loads already-trained models and previously created data to give a very quick demonstration of what our models do and what they are capable of.

# Section 0: Setup and Importing Libraries



In [None]:
#This entire script is meant to be run in Google Colab

#We clone the github to easily grab the dataset information without having to manually import it to Google Colab each time. Additionally, we install necessary libraries here.
#Libraries are: Pytorch, NLTK, and Gensim.
!git clone https://github.com/JoshuaWidjaja/IMDB_ClassificationAndPrediction
!pip install torch===1.8.1 torchvision===0.8.2 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
!pip install --user -U nltk
!pip install gensim

#Imports. Generally we keep each import in the block that uses it, which makes the code easier to read and organize.
#For the purpose of this shorter demonstration file, we import everything at the top.
import os
import math
import sklearn
import torch.nn as nn
import torch as torch
import numpy as np
import torch.nn
import torch.nn.functional

from joblib import dump, load
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors
from sklearn import linear_model 
from sklearn import metrics 
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import * 
from sklearn.model_selection import train_test_split 





Cloning into 'IMDB_ClassificationAndPrediction'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 98548 (delta 9), reused 28 (delta 5), pack-reused 98507[K
Receiving objects: 100% (98548/98548), 121.25 MiB | 18.37 MiB/s, done.
Resolving deltas: 100% (16/16), done.
Checking out files: 100% (100030/100030), done.
Looking in links: https://download.pytorch.org/whl/torch_stable.html
[31mERROR: Could not find a version that satisfies the requirement torch===1.8.1 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2, 0.4.1, 0.4.1.post2, 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.2.0+cpu, 1.2.0+cu92, 1.3.0, 1.3.0+cpu, 1.3.0+cu100, 1.3.0+cu92, 1.3.1, 1.3.1+cpu, 1.3.1+cu100, 1.3.1+cu92, 1.4.0, 1.4.0+cpu, 1.4.0+cu100, 1.4.0+cu92, 1.5.0, 1.5.0+cpu, 1.5.0+cu101, 1.5.0+cu92, 1.5.1, 1.5.1+cpu, 1.5.1+cu101, 1.5.1+cu92, 1.6.0, 1.6.0+cpu, 1.6.0+cu101, 1.6.0+cu92, 1.7.0, 1.7.0+cpu, 1.7.0+cu101, 1.7

# Section 1: Positive and Negative Classifiers


In [None]:
#Load the demonstration data and the saved logistic classifier from the cloned repository.
#This cell covers the logistic classifier, which classifies each review as Positive or Negative.
dataFile = np.load(os.getcwd() + "/IMDB_ClassificationAndPrediction/Demonstration_Files/demonstrationData.npz")
logisticClassifier = load(os.getcwd() + "/IMDB_ClassificationAndPrediction/Demonstration_Files/classification_LogisticClassifier.joblib")

#The arrays were exported positionally, so the npz file exposes them under the keys
#['arr_0', 'arr_1', 'arr_2', 'arr_3', 'arr_4', 'arr_5', 'arr_6'].
ratingFeature, withStopWordsFeature = dataFile["arr_0"], dataFile["arr_1"]
removeStopWordsFeature, reviewLengthFeature = dataFile["arr_2"], dataFile["arr_3"]
uniqueWordsFeature, sentimentRatingFeature = dataFile["arr_4"], dataFile["arr_5"]
sentimentRatingFeatureWithNeutral = dataFile["arr_6"]

#Split the data 50/50. These examples use the testing data from our dataset to obtain results.
#The model has already been trained on our training data; the split is only here so that the
#demonstration follows the same steps as our original Colab.
dataSplitFrac = .5
trainX, validX, trainY, validY = train_test_split(
    removeStopWordsFeature,
    sentimentRatingFeature,
    train_size=dataSplitFrac,
    random_state=15,
)
#The classifier expects a 2-D input, so turn each split into a single-column matrix.
trainX, validX = (split.reshape(-1, 1) for split in (trainX, validX))

print("Logistic - Without Neutral")
#The classifier is used exactly as it was loaded; it is not fitted again in this cell.
fitLogisticClassifier = logisticClassifier

#Accuracy on the first 50% of the data.
trainingPredictions = fitLogisticClassifier.predict(trainX)
trainingAccuracy = fitLogisticClassifier.score(trainX, trainY)
print('Accuracy of first 50%:', f"{100*trainingAccuracy:.2f}")

#Accuracy on the remaining 50% of the data.
testPredictions = fitLogisticClassifier.predict(validX)
testAccuracy = fitLogisticClassifier.score(validX, validY)
print('Accuracy of last 50%:', f"{100*testAccuracy:.2f}")

Logistic - Without Neutral
Accuracy of first 50%: 85.20
Accuracy of last 50%: 84.70


In [None]:
#Load the saved KNN classifier from the cloned repository.
#This cell covers the KNN classifier, which classifies each review as Positive or Negative.
knn = load(os.getcwd() + "/IMDB_ClassificationAndPrediction/Demonstration_Files/classification_KNN.joblib")

print("KNN - Without Neutral")

#NOTE(review): fit() is called on the loaded estimator with the first half of the split, so the
#scores below describe the re-fitted model rather than the model as it was saved - confirm intended.
fitknn = knn.fit(trainX, trainY)

#Accuracy on the first 50% of the data.
trainingPredictions = fitknn.predict(trainX)
trainingAccuracy = fitknn.score(trainX, trainY)
print('Accuracy of first 50%:', f"{100*trainingAccuracy:.2f}")

#Accuracy on the remaining 50% of the data.
testPredictions = fitknn.predict(validX)
testAccuracy = fitknn.score(validX, validY)
print('Accuracy of last 50%:', f"{100*testAccuracy:.2f}")

KNN - Without Neutral
Accuracy of first 50%: 85.29
Accuracy of last 50%: 84.70


# Section 2: Positive, Negative, and Neutral Classifiers


In [None]:
#Load the saved three-class KNN classifier from the cloned repository.
#This cell covers the KNN classifier that classifies each review as Positive, Negative, or Neutral.
knnMulti = load(os.getcwd() + "/IMDB_ClassificationAndPrediction/Demonstration_Files/classification_KNN_Neutral.joblib")

#Split the with-stop-words feature and the labels that include the neutral class 50/50,
#then turn each feature split into a single-column matrix.
neutraltrainX, neutralvalidX, neutraltrainY, neutralvalidY = train_test_split(
    withStopWordsFeature,
    sentimentRatingFeatureWithNeutral,
    train_size=dataSplitFrac,
    random_state=15,
)
neutraltrainX, neutralvalidX = (split.reshape(-1, 1) for split in (neutraltrainX, neutralvalidX))

print("KNN - With Neutral")

#NOTE(review): fit() is called on the loaded estimator with the first half of the split, so the
#scores below describe the re-fitted model rather than the model as it was saved - confirm intended.
fitknnMulti = knnMulti.fit(neutraltrainX, neutraltrainY)

#Accuracy on the first 50% of the data.
trainingPredictions = knnMulti.predict(neutraltrainX)
trainingAccuracy = knnMulti.score(neutraltrainX, neutraltrainY)
print('Accuracy of first 50%:', f"{100*trainingAccuracy:.2f}")

#Accuracy on the remaining 50% of the data.
testPredictions = knnMulti.predict(neutralvalidX)
testAccuracy = knnMulti.score(neutralvalidX, neutralvalidY)
print('Accuracy of last 50%:', f"{100*testAccuracy:.2f}")


KNN - With Neutral
Accuracy of first 50%: 70.62
Accuracy of last 50%: 69.33


# Section 3: WordToVec Implementation

In [None]:
#Load the data and the model needed for our WordToVec implementation.
#This cell and the next one show one of our classifiers that uses the Word2Vec features instead
#of our other features: a logistic classifier that classifies reviews as positive or negative.
word2VecData = np.load(os.getcwd() + "/IMDB_ClassificationAndPrediction/Demonstration_Files/wordToVecDemonstration.npz")
wordToVecLogisticClassifier = load(os.getcwd() + "/IMDB_ClassificationAndPrediction/Demonstration_Files/classification_wordToVecLogistic.joblib")

#The arrays were exported positionally, so the npz file exposes them under the keys
#['arr_0', 'arr_1', 'arr_2'].
tensorVectors, ratingListArray, ratingListArrayNeutral = (
    word2VecData["arr_0"],
    word2VecData["arr_1"],
    word2VecData["arr_2"],
)

#50/50 split of the vectors against the labels without the neutral class.
VectrainX, VecvalidX, VectrainY, VecvalidY = train_test_split(
    tensorVectors, ratingListArray, train_size=dataSplitFrac, random_state=15
)

#Same split (same seed and fraction) against the labels that include the neutral class.
VectrainXNeutral, VecvalidXNeutral, VectrainYNeutral, VecvalidYNeutral = train_test_split(
    tensorVectors, ratingListArrayNeutral, train_size=dataSplitFrac, random_state=15
)


In [None]:
#Evaluate the Word2Vec logistic classifier (positive/negative labels only).
print("WordToVec Logistic - Without Neutral")

#NOTE(review): fit() is called on the loaded classifier with the first half of the split, so the
#scores below describe the re-fitted model rather than the model as it was saved - confirm intended.
fitword2vecLog = wordToVecLogisticClassifier.fit(VectrainX, VectrainY)

#Accuracy on the first 50% of the data.
trainingPredictions = fitword2vecLog.predict(VectrainX)
trainingAccuracy = fitword2vecLog.score(VectrainX, VectrainY)
print('Accuracy of first 50%:', format(100*trainingAccuracy, '.2f'))

#Accuracy on the remaining 50% of the data.
testPredictions = fitword2vecLog.predict(VecvalidX)
testAccuracy = fitword2vecLog.score(VecvalidX, VecvalidY)

#The label previously repeated the word "accuracy:"; it now uses the same wording as the
#other classifier cells in this notebook.
print('Accuracy of last 50%:', format(100*testAccuracy, '.2f'))

WordToVec Logistic - Without Neutral
Accuracy of first 50%: 57.21
Accuracy of last 50%: accuracy: 56.76


# Section 4: Prediction Model

In [None]:
#This block is a demonstration of our prediction model, copied directly from the original Google Colab.

#Required class for our prediction model
class LinReg(nn.Module):
  """Small feed-forward regressor: 1 input feature -> 3 hidden units (ReLU) -> 1 output."""

  def __init__(self):
    super().__init__()
    #Hidden layer first, then the output layer.
    self.hid = nn.Linear(in_features=1, out_features=3)
    self.lin = nn.Linear(in_features=3, out_features=1)

  def forward(self, w):
    """Apply the hidden layer with ReLU, then the output layer, to the input tensor w."""
    activated = torch.relu(self.hid(w))
    return self.lin(activated)


#Assigning features
testX = removeStopWordsFeature
testY = ratingFeature

#Grab total review weights for the test dataset and their associated review scores.
#unsqueeze(..., 1) adds a second dimension of size 1 to each tensor.
test_input = torch.unsqueeze(torch.FloatTensor(testX), 1)
test_labels = torch.unsqueeze(torch.FloatTensor(testY), 1)

#Load model. The loaded object is used directly as a module (it is .eval()'d and called below),
#so presumably the file stores the whole model and needs the LinReg class to be defined first.
#NOTE(review): torch.load unpickles the file, which can run arbitrary code - only load trusted files.
score_prediction_model = torch.load(os.getcwd() + "/IMDB_ClassificationAndPrediction/Demonstration_Files/score_prediction.pt")
score_prediction_model.eval()

#Test score prediction. Gradients are not needed for evaluation, so autograd is switched off.
with torch.no_grad():
  predicted_test_output = score_prediction_model(test_input)
  #The MSE is divided by (10 - 1)^2, presumably the squared width of the 1-10 rating scale.
  normalized_mse = float(torch.nn.MSELoss()(predicted_test_output, test_labels)) / ((10.0 - 1.0) ** 2.0)

print("Normalized test MSE: ", normalized_mse)

#Test classification. A prediction counts as correct when the predicted score and the actual
#score fall on the same side of the 5.5 threshold. The predictions computed above are reused
#instead of running the model again for every single review.
score_threshold = 5.5
flat_predictions = predicted_test_output.reshape(-1)
flat_labels = test_labels.reshape(-1)
both_positive = (flat_predictions >= score_threshold) & (flat_labels >= score_threshold)
both_negative = (flat_predictions < score_threshold) & (flat_labels < score_threshold)

correct = float((both_positive | both_negative).sum())
incorrect = float(len(test_input)) - correct

#Guard against an empty test set, which would otherwise raise ZeroDivisionError.
total_reviews = correct + incorrect
classification_accuracy = ((correct / total_reviews) * 100.0) if total_reviews else float("nan")

print("Classification (positive/negative) test accuracy: ", classification_accuracy, "%")
print("This model was trained on a three-layer feedforward neural network with linear regression over 30,000 epochs. It uses one feature (aggregate review weight).")

Normalized test MSE:  0.06518174395149137
Classification (positive/negative) test accuracy:  84.944 %
This model was trained on a three-layer feedforward neural network with linear regression over 30,000 epochs. It uses one feature (aggregate review weight).
