In [64]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter("ignore", category = FutureWarning)

# Shared 10-fold splitter reused by every cross-validation cell.
# NOTE(review): KFold does not shuffle by default, so folds follow row order.
k_folds = KFold(n_splits = 10)

In [65]:
# Read the hotel review table from disk.
dataset = pd.read_csv("Hotel_dataset.csv")

# Discard the last 15491 rows so only the leading part of the file is kept.
# NOTE(review): the row count is hard-coded - presumably chosen to match the
# number of pre-computed vectors loaded later; confirm.
trailing_index = dataset.tail(15491).index
dataset.drop(index = trailing_index, inplace = True)

# Plain Python list of the remaining review texts.
reviews = list(dataset["Review"])

In [66]:
# Collapse the star ratings into a binary label, in place:
# ratings 1-3 become 0 and ratings 4-5 become 1.
# Both masks are computed before any write, so the freshly written 0/1 values
# cannot be re-matched within this cell.
# NOTE(review): this cell is not idempotent - running it a second time turns
# every label 1 back into 0, because 1 is also one of the "low" ratings.
low_rating_mask = dataset["Rating"].isin([1, 2, 3])
high_rating_mask = dataset["Rating"].isin([4, 5])
dataset.loc[low_rating_mask, 'Rating'] = 0
dataset.loc[high_rating_mask, 'Rating'] = 1

In [67]:
# Load the object stored in bert_vectors1.pickle (presumably pre-computed BERT
# vectors for one batch of reviews - confirm against the script that wrote it).
# The context manager guarantees the file handle is closed once loading ends.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("bert_vectors1.pickle", "rb") as pickle_in:
    word_vectors1 = pickle.load(pickle_in)

In [68]:
# Load the object stored in bert_vectors2.pickle (presumably pre-computed BERT
# vectors for one batch of reviews - confirm against the script that wrote it).
# The context manager guarantees the file handle is closed once loading ends.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("bert_vectors2.pickle", "rb") as pickle_in:
    word_vectors2 = pickle.load(pickle_in)

In [69]:
# Load the object stored in bert_vectors3.pickle (presumably pre-computed BERT
# vectors for one batch of reviews - confirm against the script that wrote it).
# The context manager guarantees the file handle is closed once loading ends.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("bert_vectors3.pickle", "rb") as pickle_in:
    word_vectors3 = pickle.load(pickle_in)

In [70]:
# Load the object stored in bert_vectors4.pickle (presumably pre-computed BERT
# vectors for one batch of reviews - confirm against the script that wrote it).
# The context manager guarantees the file handle is closed once loading ends.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("bert_vectors4.pickle", "rb") as pickle_in:
    word_vectors4 = pickle.load(pickle_in)

In [71]:
# Load the object stored in bert_vectors5.pickle (presumably pre-computed BERT
# vectors for one batch of reviews - confirm against the script that wrote it).
# The context manager guarantees the file handle is closed once loading ends.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("bert_vectors5.pickle", "rb") as pickle_in:
    word_vectors5 = pickle.load(pickle_in)

In [72]:
# Join the five loaded batches end to end along the first axis, in file order.
# NOTE(review): assumes the result lines up row-for-row with `dataset` - confirm.
vector_batches = [word_vectors1, word_vectors2, word_vectors3, word_vectors4, word_vectors5]
word_vectors_concatenated = np.concatenate(vector_batches, axis = 0)

In [73]:
def getReviewVector(review):
    """Return the element-wise mean of the items in `review`.

    The items are collected into a list and averaged along axis 0, so a
    sequence of equal-length vectors yields one vector of that length, and a
    flat sequence of numbers yields a single scalar.

    Presumably each item is the embedding of one word of a review - confirm
    against the code that produced the pickled vectors.
    """
    items = list(review)
    return np.mean(items, axis = 0)

In [74]:
# Feature list: one averaged vector per entry of the concatenated array.
x = [getReviewVector(entry) for entry in word_vectors_concatenated]

In [75]:
# Target array: one-hot encode the Rating column and keep only the second
# indicator column as a NumPy array.
# NOTE(review): presumably that column marks label 1 once the ratings have
# been binarised to 0/1 - confirm Rating holds exactly those two values here.
y = pd.get_dummies(dataset["Rating"]).iloc[:, 1].values

In [77]:
# Random forest with 100 trees, shared by every cross-validation cell below.
# random_state is pinned so each metric cell trains identical forests and the
# reported scores can be reproduced on a re-run; without a seed every call to
# cross_val_score fits differently randomised trees.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)

In [78]:
# Accuracy of the forest on each of the 10 folds, followed by the fold average.
scores = cross_val_score(estimator = rfc, X = x, y = y, scoring = 'accuracy', cv = k_folds)
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", np.mean(scores))

CV accuracy scores:  [0.728 0.808 0.812 0.812 0.808 0.756 0.72  0.786 0.812 0.868]
Average CV accuracy score:  0.7910000000000001


In [79]:
# Support-weighted precision on each of the 10 folds, followed by the fold average.
scores = cross_val_score(estimator = rfc, X = x, y = y, scoring = 'precision_weighted', cv = k_folds)
print("CV precision scores: ", scores)
print("Average CV precision score: ", np.mean(scores))

CV precision scores:  [0.78148509 0.78804306 0.82252079 0.84414865 0.8288     0.77079007
 0.74391369 0.79651945 0.81546833 0.89321052]
Average CV precision score:  0.8084899655000972


In [80]:
# Support-weighted recall on each of the 10 folds, followed by the fold average.
scores = cross_val_score(estimator = rfc, X = x, y = y, scoring = 'recall_weighted', cv = k_folds)
print("CV recall scores: ", scores)
print("Average CV recall score: ", np.mean(scores))

CV recall scores:  [0.73  0.812 0.828 0.836 0.808 0.726 0.742 0.796 0.8   0.88 ]
Average CV recall score:  0.7958000000000001


In [81]:
# Support-weighted F1 on each of the 10 folds, followed by the fold average.
scores = cross_val_score(estimator = rfc, X = x, y = y, scoring = 'f1_weighted', cv = k_folds)
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", np.mean(scores))

CV f1 scores:  [0.67808824 0.77332474 0.79569926 0.81988076 0.80180466 0.70663914
 0.68421199 0.78437974 0.76468241 0.85409443]
Average CV f1 score:  0.7662805368337886
