In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("Hotel_dataset.csv")
dataset.drop(dataset.tail(15491).index, inplace = True)
reviews = [w for w in dataset["Review"]]

In [3]:
dataset.loc[dataset.Rating == 1, 'Rating'] = 0
dataset.loc[dataset.Rating == 2, 'Rating'] = 0
dataset.loc[dataset.Rating == 3, 'Rating'] = 0
dataset.loc[dataset.Rating == 4, 'Rating'] = 1
dataset.loc[dataset.Rating == 5, 'Rating'] = 1

In [4]:
pickle_in = open("word2vec_cbow_vectors.pickle", "rb")
word_vectors_CBOW = pickle.load(pickle_in)
del word_vectors_CBOW[-15491:]

In [5]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [6]:
# CBOW
x = []
for i in range(len(word_vectors_CBOW)):
    x.append(getReviewVector(word_vectors_CBOW[i]))

In [7]:
y = pd.get_dummies(dataset["Rating"])
y = y.iloc[:,1].values

In [8]:
rfc = RandomForestClassifier(n_estimators = 100)

In [9]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.822 0.872 0.84  0.864 0.88  0.818 0.798 0.868 0.872 0.91 ]
Average CV accuracy score:  0.8543999999999998


In [10]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.84067195 0.85118904 0.86180923 0.86272281 0.86910958 0.81212836
 0.82285194 0.84443917 0.87089969 0.90829539]
Average CV precision score:  0.8544117152963591


In [11]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.834 0.852 0.864 0.87  0.89  0.806 0.812 0.882 0.886 0.916]
Average CV recall score:  0.8612


In [12]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.83208029 0.84635996 0.86151755 0.87463889 0.89106743 0.7972809
 0.79888549 0.8633378  0.86614422 0.92151155]
Average CV f1 score:  0.8552824082417345


In [13]:
pickle_in = open("word2vec_skip-gram_vectors.pickle", "rb")
word_vectors_Skip_Gram = pickle.load(pickle_in)
del word_vectors_Skip_Gram[-15491:]

In [14]:
# Skip-Gram
x = []
for i in range(len(word_vectors_Skip_Gram)):
    x.append(getReviewVector(word_vectors_Skip_Gram[i]))

In [15]:
rfc = RandomForestClassifier(n_estimators = 100)

In [16]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.812 0.858 0.87  0.876 0.884 0.824 0.834 0.876 0.882 0.912]
Average CV accuracy score:  0.8628


In [17]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.82156207 0.85965808 0.87906066 0.87890821 0.87453714 0.84117947
 0.83300261 0.86288684 0.86884199 0.90871895]
Average CV precision score:  0.8628356035815375


In [18]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.822 0.862 0.886 0.878 0.88  0.85  0.842 0.874 0.878 0.904]
Average CV recall score:  0.8676


In [19]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.80688929 0.85710518 0.88069298 0.87361888 0.8887387  0.82214068
 0.81572025 0.87260391 0.86531137 0.90057415]
Average CV f1 score:  0.8583395408923599
