In [15]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report as cr
import pickle

# Silence FutureWarning messages for the remainder of the session.
warnings.simplefilter("ignore", category=FutureWarning)
# Ten-fold splitter (all other KFold settings left at their defaults); it is
# handed to cross_val_score further down.
k_folds = KFold(n_splits=10)

In [16]:
# Read the raw table from disk.
dataset = pd.read_csv("Hotel_dataset.csv")
# Discard the final 15491 rows so only the leading part of the file remains.
trailing_rows = dataset.tail(15491).index
dataset.drop(index = trailing_rows, inplace = True)
# Plain Python list holding the entries of the "Review" column that survived the cut.
reviews = list(dataset["Review"])

In [17]:
# Collapse the Rating column into a binary label: values 1, 2, 3 -> 0 and 4, 5 -> 1.
# The low group is rewritten first; the 1s written afterwards are never re-mapped
# because no later step touches the value 1.
low_ratings = dataset.Rating.isin([1, 2, 3])
dataset.loc[low_ratings, 'Rating'] = 0
high_ratings = dataset.Rating.isin([4, 5])
dataset.loc[high_ratings, 'Rating'] = 1

In [18]:
# Load the first chunk of pickled vectors (presumably precomputed BERT embeddings,
# going by the file name — confirm against whatever produced the file).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle as soon as the object has been read.
with open("bert_vectors1.pickle", "rb") as pickle_in:
    word_vectors1 = pickle.load(pickle_in)

In [19]:
# Load the second chunk of pickled vectors (presumably precomputed BERT embeddings,
# going by the file name — confirm against whatever produced the file).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle as soon as the object has been read.
with open("bert_vectors2.pickle", "rb") as pickle_in:
    word_vectors2 = pickle.load(pickle_in)

In [20]:
# Load the third chunk of pickled vectors (presumably precomputed BERT embeddings,
# going by the file name — confirm against whatever produced the file).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle as soon as the object has been read.
with open("bert_vectors3.pickle", "rb") as pickle_in:
    word_vectors3 = pickle.load(pickle_in)

In [21]:
# Load the fourth chunk of pickled vectors (presumably precomputed BERT embeddings,
# going by the file name — confirm against whatever produced the file).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle as soon as the object has been read.
with open("bert_vectors4.pickle", "rb") as pickle_in:
    word_vectors4 = pickle.load(pickle_in)

In [22]:
# Load the fifth chunk of pickled vectors (presumably precomputed BERT embeddings,
# going by the file name — confirm against whatever produced the file).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle as soon as the object has been read.
with open("bert_vectors5.pickle", "rb") as pickle_in:
    word_vectors5 = pickle.load(pickle_in)

In [23]:
# Join the five loaded chunks end to end along the first axis, in file order.
vector_chunks = [word_vectors1, word_vectors2, word_vectors3, word_vectors4, word_vectors5]
word_vectors_concatenated = np.concatenate(vector_chunks)

In [24]:
def getReviewVector(review):
    """Return the mean of the items in ``review``, taken along axis 0.

    ``review`` is iterated once and its items are collected into a list
    before averaging.
    NOTE(review): assumes the items are equally shaped numeric vectors
    (e.g. one vector per word) — confirm against the pickled data.
    """
    items = list(review)
    return np.mean(items, axis = 0)

In [25]:
# Build the feature list: one averaged vector per entry of the concatenated array.
x = [getReviewVector(review_vectors) for review_vectors in word_vectors_concatenated]

In [26]:
# One-hot encode the Rating column and keep the second indicator column as a
# NumPy array; with the 0/1 labels produced earlier this is the indicator for 1.
y = pd.get_dummies(dataset["Rating"]).iloc[:, 1].values

In [27]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [28]:
# Train a 100-tree random forest on the training split.
# random_state is pinned (same seed value as the train/test split) so the fitted
# model, and every metric derived from it, is reproducible across runs. Without it
# each execution grows a different forest and the reported scores drift.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 0)
rfc.fit(X_train, y_train)
# Predicted labels for the held-out split.
y_pred = rfc.predict(X_test)

In [29]:
# Accuracy on the held-out split, followed by the per-class text report.
score = accuracy_score(y_test, y_pred)
report = cr(y_test, y_pred)
print(score)
print(report)

0.776
              precision    recall  f1-score   support

           0       0.88      0.31      0.46       305
           1       0.76      0.98      0.86       695

    accuracy                           0.78      1000
   macro avg       0.82      0.64      0.66      1000
weighted avg       0.80      0.78      0.74      1000



In [30]:
# 10-fold cross-validation of the classifier over the full feature/label set,
# using the KFold splitter configured at the top; one score is returned per fold.
scores = cross_val_score(
    rfc,
    x,
    y,
    cv=k_folds,
)

In [31]:
# Report the per-fold scores and their average.
mean_cv_score = scores.mean()
print("CV scores: ", scores)
print("Avg CV score: ", mean_cv_score)

CV scores:  [0.72  0.8   0.812 0.82  0.806 0.734 0.726 0.78  0.8   0.864]
Avg CV score:  0.7862
