In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("Hotel_dataset.csv")
reviews = [w for w in dataset["Review"]]

In [3]:
dataset.loc[dataset.Rating == 1, 'Rating'] = 0
dataset.loc[dataset.Rating == 2, 'Rating'] = 0
dataset.loc[dataset.Rating == 3, 'Rating'] = 0
dataset.loc[dataset.Rating == 4, 'Rating'] = 1
dataset.loc[dataset.Rating == 5, 'Rating'] = 1

In [4]:
pickle_in = open("fasttext_vectors.pickle", "rb")
word_vectors_fastText = pickle.load(pickle_in)

In [5]:
def getReviewVector(review):
    return np.mean([word for word in review], axis = 0)

In [6]:
x = []
for i in range(len(word_vectors_fastText)):
    x.append(getReviewVector(word_vectors_fastText[i]))

In [7]:
y = pd.get_dummies(dataset["Rating"])
y = y.iloc[:,1].values

In [8]:
rfc = RandomForestClassifier(n_estimators = 100)

In [9]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.85853659 0.8443143  0.88238165 0.86432406 0.86090776 0.86871645
 0.88579795 0.8682284  0.87798926 0.87359688]
Average CV accuracy score:  0.8684793295956386


In [10]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.8580549  0.84744906 0.87751386 0.86717757 0.86383581 0.86405357
 0.88432804 0.86636792 0.87374889 0.87096017]
Average CV precision score:  0.8673489781848863


In [11]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.86341463 0.84529039 0.88433382 0.86383602 0.86530015 0.86578819
 0.88482186 0.87310883 0.88384578 0.87847731]
Average CV recall score:  0.8708216976752491


In [12]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.85805873 0.84226102 0.87511584 0.86015823 0.86043606 0.86161349
 0.87608526 0.86799046 0.86910949 0.86642659]
Average CV f1 score:  0.8637255164420357
