In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("Hotel_dataset.csv")
dataset.drop(dataset.tail(15491).index, inplace = True)
reviews = [w for w in dataset["Review"]]

In [3]:
dataset.loc[dataset.Rating == 1, 'Rating'] = 0
dataset.loc[dataset.Rating == 2, 'Rating'] = 0
dataset.loc[dataset.Rating == 3, 'Rating'] = 0
dataset.loc[dataset.Rating == 4, 'Rating'] = 1
dataset.loc[dataset.Rating == 5, 'Rating'] = 1

In [4]:
pickle_in = open("fasttext_vectors.pickle", "rb")
word_vectors_fastText = pickle.load(pickle_in)
del word_vectors_fastText[-15491:]

In [5]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [6]:
x = []
for i in range(len(word_vectors_fastText)):
    x.append(getReviewVector(word_vectors_fastText[i]))

In [7]:
y = pd.get_dummies(dataset["Rating"])
y = y.iloc[:,1].values

In [8]:
rfc = RandomForestClassifier(n_estimators = 100)

In [9]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.822 0.872 0.868 0.882 0.868 0.816 0.826 0.86  0.878 0.898]
Average CV accuracy score:  0.859


In [10]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.81939636 0.85777218 0.86190751 0.88275448 0.86942843 0.82832858
 0.84241069 0.8587386  0.87245009 0.89533267]
Average CV precision score:  0.8588519588461876


In [11]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.818 0.868 0.874 0.884 0.872 0.814 0.824 0.866 0.878 0.898]
Average CV recall score:  0.8596


In [12]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.82225011 0.86652434 0.86650266 0.8778727  0.8614251  0.80943645
 0.81383799 0.86027526 0.86427688 0.89328775]
Average CV f1 score:  0.8535689234873644
