In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

# Suppress FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter("ignore", FutureWarning)

# 10-fold splitter, passed as `cv` to every cross_val_score call further down.
k_folds = KFold(n_splits=10)

In [2]:
# Read the hotel review table from the CSV file next to the notebook.
dataset = pd.read_csv("Hotel_dataset.csv")

# Plain Python list holding the values of the "Review" column, in row order.
reviews = list(dataset["Review"])

In [3]:
# Collapse the ratings into a binary label in a single pass:
# 1, 2 and 3 become 0, while 4 and 5 become 1; any other value is left as is.
# NOTE(review): "Rating" is overwritten in place, so re-running this cell on
# already-binarised data maps the 1s to 0 - reload `dataset` before re-running.
dataset["Rating"] = dataset["Rating"].replace({1: 0, 2: 0, 3: 0, 4: 1, 5: 1})

In [4]:
# Load the pickled object stored in "tfidf_vectors.pickle" (presumably the
# TF-IDF feature matrix built in another notebook - TODO confirm).
# The context manager closes the file handle as soon as loading finishes,
# instead of leaving it open for the lifetime of the kernel.
# NOTE(security): pickle.load can execute arbitrary code; only load a pickle
# file that comes from a trusted source.
with open("tfidf_vectors.pickle", "rb") as pickle_in:
    word_vectors_TFIDF = pickle.load(pickle_in)

In [5]:
# Feature input handed to cross_val_score in the cells below. This is an alias
# for the unpickled object, not a copy.
x = word_vectors_TFIDF

In [6]:
# Binary target vector: 1 where the (already binarised) rating equals 1, else 0.
# A direct comparison replaces one-hot encoding the column and then selecting a
# column by position; that approach raises IndexError when only one class is
# present and returns a dtype that depends on the installed pandas version.
y = (dataset["Rating"] == 1).astype(int).values

In [7]:
# Random forest with 100 trees. A fixed random_state makes the fitted forests
# repeatable, so the cross-validation scores can be reproduced after a kernel
# restart and every metric cell below evaluates identically seeded models.
# The value 42 is arbitrary; any fixed integer works.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

In [8]:
# Accuracy of the random forest on each fold of `k_folds`, then the mean over folds.
scores = cross_val_score(
    estimator=rfc,
    X=x,
    y=y,
    cv=k_folds,
    scoring="accuracy",
)
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.84682927 0.84187408 0.88384578 0.86334797 0.85212299 0.84285017
 0.86969253 0.8648121  0.87262079 0.87506101]
Average CV accuracy score:  0.8613056696306348


In [9]:
# Weighted precision of the random forest on each fold of `k_folds`, then the mean over folds.
scores = cross_val_score(
    estimator=rfc,
    X=x,
    y=y,
    cv=k_folds,
    scoring="precision_weighted",
)
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.85913902 0.86207286 0.88813492 0.86471401 0.86541324 0.85742582
 0.88194569 0.87256508 0.8721932  0.87859951]
Average CV precision score:  0.8702203346629357


In [10]:
# Weighted recall of the random forest on each fold of `k_folds`, then the mean over folds.
scores = cross_val_score(
    estimator=rfc,
    X=x,
    y=y,
    cv=k_folds,
    scoring="recall_weighted",
)
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.84829268 0.84382626 0.88238165 0.8613958  0.85553929 0.84675451
 0.87457296 0.8716447  0.86774036 0.86969253]
Average CV recall score:  0.8621840755157185


In [11]:
# Weighted F1 score of the random forest on each fold of `k_folds`, then the mean over folds.
scores = cross_val_score(
    estimator=rfc,
    X=x,
    y=y,
    cv=k_folds,
    scoring="f1_weighted",
)
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.83317523 0.83446197 0.86483675 0.84538517 0.82925858 0.83086578
 0.86469607 0.8511169  0.85908735 0.87035791]
Average CV f1 score:  0.8483241706534453
