In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("Hotel_dataset.csv")
dataset.drop(dataset.tail(15491).index, inplace = True)
reviews = [w for w in dataset["Review"]]

In [3]:
dataset.loc[dataset.Rating == 1, 'Rating'] = 0
dataset.loc[dataset.Rating == 2, 'Rating'] = 0
dataset.loc[dataset.Rating == 3, 'Rating'] = 0
dataset.loc[dataset.Rating == 4, 'Rating'] = 1
dataset.loc[dataset.Rating == 5, 'Rating'] = 1

In [4]:
pickle_in = open("glove_vectors.pickle", "rb")
word_vectors_GloVe = pickle.load(pickle_in)
del word_vectors_GloVe[-15491:]

In [5]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [6]:
x = []
for i in range(len(word_vectors_GloVe)):
    x.append(getReviewVector(word_vectors_GloVe[i]))

In [7]:
y = pd.get_dummies(dataset["Rating"])
y = y.iloc[:,1].values

In [8]:
rfc = RandomForestClassifier(n_estimators = 100)

In [9]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.782 0.836 0.838 0.856 0.828 0.774 0.784 0.812 0.838 0.884]
Average CV accuracy score:  0.8231999999999999


In [10]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.77629365 0.84289823 0.84476933 0.83954    0.8350813  0.78893171
 0.79758663 0.79829811 0.83253482 0.87000286]
Average CV precision score:  0.8225936633731319


In [11]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.78  0.832 0.846 0.854 0.836 0.782 0.788 0.812 0.834 0.878]
Average CV recall score:  0.8241999999999999


In [12]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.77949986 0.84449558 0.83569184 0.84586748 0.8426499  0.7682366
 0.75568425 0.81079472 0.80623907 0.87148491]
Average CV f1 score:  0.8160644209848069
