In [4]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [5]:
dataset = pd.read_csv("IMDB_dataset_untouched.csv")
dataset.drop(dataset.tail(45000).index, inplace = True)
reviews = [w for w in dataset["review"]]

In [6]:
pickle_in = open("word2vec_cbow_vectors_untouched.pickle", "rb")
word_vectors_CBOW = pickle.load(pickle_in)
del word_vectors_CBOW[-45000:]

In [7]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [8]:
# CBOW
x = []
for i in range(len(word_vectors_CBOW)):
    x.append(getReviewVector(word_vectors_CBOW[i]))

In [9]:
y = pd.get_dummies(dataset["sentiment"])
y = y.iloc[:,1].values

In [10]:
rfc = RandomForestClassifier(n_estimators = 100)

In [11]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.792 0.804 0.796 0.798 0.778 0.788 0.782 0.808 0.808 0.802]
Average CV accuracy score:  0.7956


In [12]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.79390286 0.802343   0.79947826 0.79428364 0.77020574 0.78956127
 0.79243219 0.7897689  0.804128   0.81071216]
Average CV precision score:  0.7946816015909205


In [13]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.804 0.802 0.822 0.82  0.752 0.802 0.782 0.802 0.808 0.802]
Average CV recall score:  0.7996000000000001


In [14]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.79405698 0.82208205 0.82218177 0.78566882 0.76981261 0.80182174
 0.78999076 0.80258327 0.81400521 0.81770548]
Average CV f1 score:  0.8019908687079706


In [15]:
pickle_in = open("word2vec_skip-gram_vectors_untouched.pickle", "rb")
word_vectors_Skip_Gram = pickle.load(pickle_in)
del word_vectors_Skip_Gram[-45000:]

In [16]:
# Skip-Gram
x = []
for i in range(len(word_vectors_Skip_Gram)):
    x.append(getReviewVector(word_vectors_Skip_Gram[i]))

In [17]:
rfc = RandomForestClassifier(n_estimators = 100)

In [18]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.818 0.808 0.838 0.828 0.8   0.82  0.824 0.844 0.828 0.818]
Average CV accuracy score:  0.8226000000000001


In [19]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.8183168  0.82809827 0.84310748 0.83814217 0.82328312 0.81949181
 0.81613978 0.82607155 0.83204203 0.83798205]
Average CV precision score:  0.8282675058678866


In [20]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.826 0.838 0.83  0.83  0.802 0.822 0.832 0.84  0.844 0.836]
Average CV recall score:  0.8300000000000001


In [21]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.83203233 0.8379395  0.83813442 0.82       0.79770541 0.82579319
 0.8240169  0.84249566 0.84800973 0.82598676]
Average CV f1 score:  0.8292113898459709
