In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("Hotel_dataset.csv")
reviews = [w for w in dataset["Review"]]

In [3]:
dataset.loc[dataset.Rating == 1, 'Rating'] = 0
dataset.loc[dataset.Rating == 2, 'Rating'] = 0
dataset.loc[dataset.Rating == 3, 'Rating'] = 0
dataset.loc[dataset.Rating == 4, 'Rating'] = 1
dataset.loc[dataset.Rating == 5, 'Rating'] = 1

In [4]:
pickle_in = open("word2vec_cbow_vectors.pickle", "rb")
word_vectors_CBOW = pickle.load(pickle_in)

In [5]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [6]:
# CBOW
x = []
for i in range(len(word_vectors_CBOW)):
    x.append(getReviewVector(word_vectors_CBOW[i]))

In [7]:
y = pd.get_dummies(dataset["Rating"])
y = y.iloc[:,1].values

In [8]:
rfc = RandomForestClassifier(n_estimators = 100)

In [9]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.85463415 0.85065886 0.89019034 0.86530015 0.86627623 0.85651537
 0.88140556 0.86969253 0.87798926 0.87701318]
Average CV accuracy score:  0.8689675629992024


In [10]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.85037642 0.84955844 0.88268525 0.86100236 0.85545024 0.8572582
 0.88316813 0.86549693 0.8693992  0.86789928]
Average CV precision score:  0.8642294441957464


In [11]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.85512195 0.84138604 0.88677404 0.86578819 0.86627623 0.85846755
 0.88433382 0.86676428 0.87896535 0.87408492]
Average CV recall score:  0.8677962361175589


In [12]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.84972468 0.84067701 0.87881933 0.86410951 0.8528812  0.85325678
 0.8741048  0.86432637 0.87142289 0.86829291]
Average CV f1 score:  0.8617615481585712


In [13]:
pickle_in = open("word2vec_skip-gram_vectors.pickle", "rb")
word_vectors_Skip_Gram = pickle.load(pickle_in)

In [14]:
# Skip-Gram
x = []
for i in range(len(word_vectors_Skip_Gram)):
    x.append(getReviewVector(word_vectors_Skip_Gram[i]))

In [15]:
rfc = RandomForestClassifier(n_estimators = 100)

In [16]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.86731707 0.8716447  0.88921425 0.88140556 0.87066862 0.86578819
 0.89263055 0.87701318 0.88530991 0.88726208]
Average CV accuracy score:  0.8788254115630467


In [17]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.86483483 0.86678966 0.88803671 0.87682961 0.86698585 0.86585701
 0.89117283 0.87121482 0.87753964 0.87782828]
Average CV precision score:  0.8747089244977586


In [18]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.86878049 0.86090776 0.89116642 0.88042948 0.86969253 0.86530015
 0.88970229 0.87310883 0.88384578 0.88872621]
Average CV recall score:  0.8771659941196777


In [19]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.8599893  0.8570028  0.8849139  0.87388864 0.85946744 0.85946915
 0.89044422 0.86496773 0.88141422 0.88108914]
Average CV f1 score:  0.8712646533255137
