In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report as cr
import pickle

# Silence FutureWarning messages for the rest of the notebook.
warnings.simplefilter("ignore", category = FutureWarning)
# Shared 10-fold splitter reused by every cross-validation cell; folds are
# consecutive slices of the data (shuffle = False is KFold's default).
k_folds = KFold(n_splits = 10, shuffle = False)

In [2]:
# Read the preprocessed IMDB reviews, then discard the last 45,000 rows so
# that only the leading part of the file is used below.
dataset = pd.read_csv("IMDB_dataset_preprocessed.csv")
dataset.drop(index = dataset.tail(45000).index, inplace = True)
# Plain Python list of the remaining review texts.
reviews = list(dataset["review"])

In [3]:
# Load the pickled CBOW word vectors. The context manager guarantees the file
# handle is closed even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open("word2vec_cbow_vectors_preprocessed.pickle", "rb") as pickle_in:
    word_vectors_CBOW = pickle.load(pickle_in)
# Remove the last 45,000 entries, the same count dropped from `dataset`.
del word_vectors_CBOW[-45000:]

In [4]:
def getReviewVector(review):
    """Return the element-wise mean of the vectors contained in `review`.

    `review` is iterated once and its items are stacked along axis 0; the
    result is the average taken over that axis.
    """
    stacked = np.array(list(review))
    return stacked.mean(axis = 0)

In [5]:
# CBOW
# CBOW
# One averaged vector per review, in the same order as word_vectors_CBOW.
x = [getReviewVector(review_vectors) for review_vectors in word_vectors_CBOW]

In [7]:
# One-hot encode the sentiment column and keep its second indicator column
# as a 1-D array of targets.
y = pd.get_dummies(dataset["sentiment"]).iloc[:, 1].values

In [8]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [94]:
# Fit a 100-tree random forest on the training split and predict the held-out
# split. random_state is pinned (same seed as the train/test split) so the
# forest's randomness -- and therefore the reported scores -- is
# reproducible from one run to the next.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [95]:
# Hold-out accuracy on its own line, followed by the per-class report.
score = accuracy_score(y_test, y_pred)
print(score, cr(y_test, y_pred), sep = "\n")

0.813
              precision    recall  f1-score   support

           0       0.84      0.79      0.82       525
           1       0.78      0.84      0.81       475

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.82      0.81      0.81      1000



In [96]:
# 10-fold cross-validation - CBOW
scores = cross_val_score(estimator = rfc, X = x, y = y, cv = k_folds)

In [97]:
# 10-fold cross-validation results - CBOW
# Per-fold scores first, then their arithmetic mean.
print("CV scores: ", scores)
print("Avg CV score: ", np.mean(scores))

CV scores:  [0.814 0.836 0.84  0.81  0.814 0.842 0.826 0.796 0.816 0.818]
Avg CV score:  0.8211999999999999


In [98]:
# Load the pickled Skip-Gram word vectors. The context manager guarantees the
# file handle is closed even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open("word2vec_skip-gram_vectors_preprocessed.pickle", "rb") as pickle_in:
    word_vectors_Skip_Gram = pickle.load(pickle_in)
# Remove the last 45,000 entries, the same count dropped from `dataset`.
del word_vectors_Skip_Gram[-45000:]

In [99]:
# Skip-Gram
# One averaged vector per review, in the same order as word_vectors_Skip_Gram.
x = [getReviewVector(review_vectors) for review_vectors in word_vectors_Skip_Gram]

In [100]:
# Same 80/20 split and seed as before, now over the Skip-Gram features.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [101]:
# Fit a fresh 100-tree random forest on the Skip-Gram training split and
# predict the held-out split. random_state is pinned (same seed as the
# train/test split) so the forest's randomness -- and therefore the reported
# scores -- is reproducible from one run to the next.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [102]:
# Hold-out accuracy on its own line, followed by the per-class report.
score = accuracy_score(y_test, y_pred)
print(score, cr(y_test, y_pred), sep = "\n")

0.821
              precision    recall  f1-score   support

           0       0.85      0.79      0.82       525
           1       0.79      0.85      0.82       475

    accuracy                           0.82      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.82      0.82      0.82      1000



In [103]:
# 10-fold cross-validation - Skip-Gram
scores = cross_val_score(estimator = rfc, X = x, y = y, cv = k_folds)

In [104]:
# 10-fold cross-validation results - Skip-Gram
# Per-fold scores first, then their arithmetic mean.
print("CV scores: ", scores)
print("Avg CV score: ", np.mean(scores))

CV scores:  [0.822 0.834 0.84  0.85  0.812 0.842 0.846 0.854 0.846 0.84 ]
Avg CV score:  0.8386000000000001
