In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("IMDB_dataset_preprocessed.csv")
reviews = [w for w in dataset["review"]]

In [3]:
pickle_in = open("word2vec_cbow_vectors_preprocessed.pickle", "rb")
word_vectors_CBOW = pickle.load(pickle_in)

In [4]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [5]:
# CBOW
x = []
for i in range(len(word_vectors_CBOW)):
    x.append(getReviewVector(word_vectors_CBOW[i]))

In [6]:
y = pd.get_dummies(dataset["sentiment"])
y = y.iloc[:,1].values

In [7]:
rfc = RandomForestClassifier(n_estimators = 100)

In [8]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.8366 0.8432 0.8412 0.8422 0.8286 0.8312 0.8362 0.8364 0.8388 0.8346]
Average CV accuracy score:  0.8369


In [9]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.84135877 0.84226528 0.83767082 0.84014952 0.82699457 0.83486358
 0.83431906 0.83595499 0.84239667 0.83465499]
Average CV precision score:  0.8370628250042398


In [10]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.8376 0.84   0.8344 0.8366 0.8248 0.8354 0.834  0.8334 0.839  0.8338]
Average CV recall score:  0.8349


In [11]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.83980117 0.84209707 0.83832147 0.83854745 0.83016897 0.83472057
 0.83652783 0.83115126 0.84230033 0.83839214]
Average CV f1 score:  0.8372028262455047


In [12]:
pickle_in = open("word2vec_skip-gram_vectors_preprocessed.pickle", "rb")
word_vectors_Skip_Gram = pickle.load(pickle_in)

In [13]:
# Skip-Gram
x = []
for i in range(len(word_vectors_Skip_Gram)):
    x.append(getReviewVector(word_vectors_Skip_Gram[i]))

In [14]:
rfc = RandomForestClassifier(n_estimators = 100)

In [15]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.857  0.8504 0.8492 0.8586 0.8436 0.8538 0.8512 0.8476 0.8546 0.8508]
Average CV accuracy score:  0.85168


In [16]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.8547843  0.85130167 0.85160088 0.85984699 0.84787343 0.85502617
 0.85034962 0.84633139 0.86045256 0.85018749]
Average CV precision score:  0.8527754492994918


In [17]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.8586 0.85   0.8482 0.8592 0.8482 0.8548 0.8454 0.8462 0.85   0.8554]
Average CV recall score:  0.8515999999999998


In [18]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.8562023  0.85110793 0.85131106 0.8609773  0.84315704 0.85234228
 0.85032999 0.84636549 0.85611935 0.84678765]
Average CV f1 score:  0.8514700389777451
