In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("IMDB_dataset_preprocessed.csv")
dataset.drop(dataset.tail(45000).index, inplace = True)
reviews = [w for w in dataset["review"]]

In [3]:
pickle_in = open("word2vec_cbow_vectors_preprocessed.pickle", "rb")
word_vectors_CBOW = pickle.load(pickle_in)
del word_vectors_CBOW[-45000:]

In [4]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [5]:
# CBOW
x = []
for i in range(len(word_vectors_CBOW)):
    x.append(getReviewVector(word_vectors_CBOW[i]))

In [6]:
y = pd.get_dummies(dataset["sentiment"])
y = y.iloc[:,1].values

In [7]:
rfc = RandomForestClassifier(n_estimators = 100)

In [8]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.81  0.814 0.82  0.83  0.788 0.828 0.814 0.816 0.838 0.836]
Average CV accuracy score:  0.8193999999999999


In [9]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.8199454  0.8344247  0.82445769 0.82795167 0.79831882 0.82387869
 0.8142     0.84089265 0.81801086 0.80628696]
Average CV precision score:  0.8208367448553091


In [10]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.808 0.852 0.826 0.826 0.788 0.826 0.796 0.824 0.822 0.828]
Average CV recall score:  0.8196


In [11]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.80975942 0.82373093 0.82218177 0.81603245 0.79932322 0.81383255
 0.81401563 0.81849541 0.8319139  0.82393793]
Average CV f1 score:  0.8173223197917249


In [12]:
pickle_in = open("word2vec_skip-gram_vectors_preprocessed.pickle", "rb")
word_vectors_Skip_Gram = pickle.load(pickle_in)
del word_vectors_Skip_Gram[-45000:]

In [13]:
# Skip-Gram
x = []
for i in range(len(word_vectors_Skip_Gram)):
    x.append(getReviewVector(word_vectors_Skip_Gram[i]))

In [14]:
rfc = RandomForestClassifier(n_estimators = 100)

In [15]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.824 0.828 0.85  0.846 0.828 0.834 0.828 0.844 0.848 0.844]
Average CV accuracy score:  0.8373999999999999


In [16]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.84203833 0.82795749 0.858048   0.84837577 0.83263354 0.84436423
 0.83435484 0.83816464 0.85009212 0.83000963]
Average CV precision score:  0.8406038572812417


In [17]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.81  0.85  0.84  0.848 0.83  0.848 0.84  0.838 0.844 0.836]
Average CV recall score:  0.8384


In [18]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.82791714 0.83388996 0.86212774 0.85401346 0.82152505 0.83793195
 0.83001428 0.85835014 0.84996097 0.83990763]
Average CV f1 score:  0.8415638333260353
