In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("IMDB_dataset_untouched.csv")
reviews = [w for w in dataset["review"]]

In [3]:
pickle_in = open("word2vec_cbow_vectors_untouched.pickle", "rb")
word_vectors_CBOW = pickle.load(pickle_in)

In [4]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [5]:
# CBOW
x = []
for i in range(len(word_vectors_CBOW)):
    x.append(getReviewVector(word_vectors_CBOW[i]))

In [6]:
y = pd.get_dummies(dataset["sentiment"])
y = y.iloc[:,1].values

In [7]:
rfc = RandomForestClassifier(n_estimators = 100)

In [8]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.8222 0.8206 0.8226 0.8234 0.8122 0.8202 0.8128 0.811  0.815  0.8146]
Average CV accuracy score:  0.81746


In [9]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.82056226 0.82078596 0.82120841 0.81900101 0.81144337 0.81688303
 0.8134073  0.80835506 0.82239638 0.81441419]
Average CV precision score:  0.8168456972862721


In [10]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.8178 0.8162 0.8212 0.8236 0.8128 0.8174 0.8078 0.8126 0.8112 0.8134]
Average CV recall score:  0.8154


In [11]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.81940288 0.81857367 0.82412551 0.81979722 0.81001186 0.81937468
 0.81636757 0.81258279 0.81377157 0.81419027]
Average CV f1 score:  0.8168198042268727


In [12]:
pickle_in = open("word2vec_skip-gram_vectors_untouched.pickle", "rb")
word_vectors_Skip_Gram = pickle.load(pickle_in)

In [13]:
# Skip-Gram
x = []
for i in range(len(word_vectors_Skip_Gram)):
    x.append(getReviewVector(word_vectors_Skip_Gram[i]))

In [14]:
rfc = RandomForestClassifier(n_estimators = 100)

In [15]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.8418 0.844  0.8408 0.8484 0.8346 0.8424 0.8352 0.8416 0.8392 0.839 ]
Average CV accuracy score:  0.8407


In [16]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.84681097 0.83729105 0.83883264 0.84968658 0.83901119 0.84528742
 0.83434074 0.84083487 0.84072344 0.8467342 ]
Average CV precision score:  0.841955310247128


In [17]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.8462 0.8438 0.837  0.8466 0.8382 0.8458 0.834  0.839  0.8402 0.845 ]
Average CV recall score:  0.8415800000000001


In [18]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.84640214 0.84074581 0.83652285 0.85038341 0.8393574  0.84136151
 0.83633732 0.83696239 0.84886853 0.84440359]
Average CV f1 score:  0.8421344948311973
