In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("IMDB_dataset_preprocessed.csv")
dataset.drop(dataset.tail(45000).index, inplace = True)
reviews = [w for w in dataset["review"]]

In [3]:
pickle_in = open("bert_vectors_preprocessed1.pickle", "rb")
word_vectors1 = pickle.load(pickle_in)

In [4]:
pickle_in = open("bert_vectors_preprocessed2.pickle", "rb")
word_vectors2 = pickle.load(pickle_in)

In [5]:
pickle_in = open("bert_vectors_preprocessed3.pickle", "rb")
word_vectors3 = pickle.load(pickle_in)

In [6]:
pickle_in = open("bert_vectors_preprocessed4.pickle", "rb")
word_vectors4 = pickle.load(pickle_in)

In [7]:
pickle_in = open("bert_vectors_preprocessed5.pickle", "rb")
word_vectors5 = pickle.load(pickle_in)

In [8]:
word_vectors_concatenated = np.concatenate((word_vectors1, word_vectors2, word_vectors3, word_vectors4, word_vectors5))

In [9]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [10]:
x = []
for i in range(len(word_vectors_concatenated)):
    x.append(getReviewVector(word_vectors_concatenated[i]))

In [11]:
y = pd.get_dummies(dataset["sentiment"])
y = y.iloc[:,1].values

In [12]:
rfc = RandomForestClassifier(n_estimators = 100)

In [13]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.682 0.72  0.668 0.688 0.706 0.694 0.696 0.68  0.686 0.69 ]
Average CV accuracy score:  0.6910000000000001


In [14]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.6977639  0.72740901 0.6463871  0.66986664 0.69818816 0.71026964
 0.70796657 0.69822214 0.69582449 0.69886175]
Average CV precision score:  0.6950759390415627


In [15]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.712 0.7   0.658 0.686 0.7   0.694 0.716 0.692 0.704 0.706]
Average CV recall score:  0.6968


In [16]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.69396686 0.72021526 0.6781739  0.68367509 0.67546878 0.70193196
 0.69997118 0.67117956 0.71731183 0.67397522]
Average CV f1 score:  0.6915869632654459
