In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", category=FutureWarning)

# Splitter shared by every cross-validation cell: 10 folds, default settings.
k_folds = KFold(n_splits=10)

In [2]:
# Read the review table from disk and materialise its "review" column as a
# plain Python list.
dataset = pd.read_csv("IMDB_dataset_untouched.csv")
reviews = list(dataset["review"])

In [3]:
# Unpickle the stored word vectors. The context manager guarantees the file
# handle is closed even if unpickling raises (the bare open() leaked it).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# Presumably the object holds, per review, a sequence of GloVe word vectors
# (it is indexed by position in the feature-building cell) - confirm.
with open("glove_vectors_untouched.pickle", "rb") as pickle_in:
    word_vectors_GloVe = pickle.load(pickle_in)

In [4]:
def getReviewVector(review):
    """Average the items of ``review`` element-wise into a single vector.

    Parameters
    ----------
    review : iterable
        Items to average; presumably one embedding vector per word of a
        review (verify against the caller). All items must share a shape.

    Returns
    -------
    numpy.ndarray or numpy scalar
        The mean taken along the first axis (``axis=0``) of the stacked items.
    """
    # Materialise the iterable first so np.mean sees a concrete sequence.
    stacked = list(review)
    return np.mean(stacked, axis=0)

In [5]:
# Feature list: one averaged vector per entry of word_vectors_GloVe, visited
# by position 0 .. len-1 so the order lines up with the label array.
x = [
    getReviewVector(word_vectors_GloVe[idx])
    for idx in range(len(word_vectors_GloVe))
]

In [6]:
# One-hot encode the sentiment labels and keep only the second dummy column as
# a NumPy array, giving one binary target per row.
# NOTE(review): which label column index 1 corresponds to depends on the label
# values in the CSV (presumably "positive") - confirm.
y = pd.get_dummies(dataset["sentiment"]).iloc[:, 1].to_numpy()

In [7]:
# Random forest with 100 trees. The fixed random_state makes tree construction
# repeatable: without it every cross_val_score call trains differently-seeded
# forests, so the accuracy/precision/recall/f1 cells would not describe the
# same models and a re-run would not reproduce the printed scores.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)

In [8]:
# Accuracy of the forest on each of the 10 cross-validation folds, then the
# mean over folds.
scores = cross_val_score(estimator=rfc, X=x, y=y, scoring='accuracy', cv=k_folds)
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", np.mean(scores))

CV accuracy scores:  [0.7634 0.7574 0.7484 0.7512 0.7454 0.7522 0.748  0.7456 0.7518 0.7502]
Average CV accuracy score:  0.75136


In [9]:
# Support-weighted precision of the forest on each of the 10 cross-validation
# folds, then the mean over folds.
scores = cross_val_score(estimator=rfc, X=x, y=y, scoring='precision_weighted', cv=k_folds)
print("CV precision scores: ", scores)
print("Average CV precision score: ", np.mean(scores))

CV precision scores:  [0.76241636 0.75597777 0.74721786 0.75859878 0.75074831 0.75423032
 0.7525406  0.74620159 0.75018804 0.75221661]
Average CV precision score:  0.7530336252457002


In [10]:
# Support-weighted recall of the forest on each of the 10 cross-validation
# folds, then the mean over folds.
scores = cross_val_score(estimator=rfc, X=x, y=y, scoring='recall_weighted', cv=k_folds)
print("CV recall scores: ", scores)
print("Average CV recall score: ", np.mean(scores))

CV recall scores:  [0.7686 0.7504 0.7486 0.7562 0.7422 0.753  0.7518 0.7506 0.7536 0.7496]
Average CV recall score:  0.75246


In [11]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(estimator=rfc, X=x, y=y, scoring='f1_weighted', cv=k_folds)
# Report the support-weighted F1 for each fold, then the mean over folds.
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", np.mean(scores))

CV f1 scores:  [0.76120883 0.75537101 0.74978763 0.75719266 0.74663323 0.7552
 0.75099332 0.74779682 0.74696704 0.74980459]
Average CV f1 score:  0.7520955126986517
