In [112]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report as cr
import pickle

# Suppress every FutureWarning for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

# One shared 10-fold splitter (no shuffling), reused by the cross-validation cells.
N_SPLITS = 10
k_folds = KFold(n_splits=N_SPLITS)

In [113]:
# Read the hotel reviews table; the "Review" and "Rating" columns are used below.
dataset = pd.read_csv("Hotel_dataset.csv")
# Plain Python list holding every entry of the "Review" column, in row order.
reviews = list(dataset["Review"])

In [None]:
# Collapse the 1-5 star rating into a binary label: 1-3 -> 0, 4-5 -> 1.
#
# The raw stars are stashed once in "Rating_raw" and the label is always
# derived from that copy, so re-running this cell gives the same result.
# Remapping "Rating" in place is not idempotent: on a second run the positive
# label 1 matches the `Rating == 1` rule and every row collapses to 0.
#
# NOTE(review): any value that is not 4 or 5 (including NaN or out-of-range
# ratings) becomes 0 here; the in-place remap left such values untouched.
if "Rating_raw" not in dataset.columns:
    dataset["Rating_raw"] = dataset["Rating"]
dataset["Rating"] = dataset["Rating_raw"].isin([4, 5]).astype(int)

In [114]:
# Load the pickled CBOW word vectors. The context manager closes the file
# handle as soon as loading finishes; a bare open() left it open for the
# lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle
# files that come from a trusted source.
with open("word2vec_cbow_vectors.pickle", "rb") as pickle_in:
    word_vectors_CBOW = pickle.load(pickle_in)

In [115]:
def getReviewVector(review):
    """Return the element-wise mean (along axis 0) of the items in `review`.

    `review` is iterated once and its items are averaged with `np.mean`.
    Presumably each item is the embedding of one word in the review, which
    would make the result a single fixed-length vector for the whole
    review - confirm against the contents of the pickled vectors.
    """
    word_vectors = list(review)
    review_vector = np.mean(word_vectors, axis=0)
    return review_vector

In [116]:
# CBOW: one averaged vector per entry of word_vectors_CBOW, in index order.
# Entries are fetched by position (0 .. len-1) because the container type
# stored in the pickle is not visible here.
x = [
    getReviewVector(word_vectors_CBOW[idx])
    for idx in range(len(word_vectors_CBOW))
]

In [118]:
# One-hot encode "Rating" and keep only the second indicator column as the
# target array. With a 0/1 rating that column marks the rows whose rating is 1.
rating_dummies = pd.get_dummies(dataset["Rating"])
y = rating_dummies.iloc[:, 1].values

In [119]:
# Hold out 20% of the CBOW samples for testing; the seed fixes the split.
split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [120]:
# Fit a 100-tree random forest on the training split and predict the held-out
# samples. random_state is pinned (same seed as the train/test split) so the
# forest's internal randomness repeats from run to run; without it every
# execution reports slightly different scores and the saved output cannot be
# reproduced.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [121]:
# Hold-out evaluation for CBOW: overall accuracy, then the per-class report.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)
report = cr(y_test, y_pred)
print(report)

0.8638692363991217
              precision    recall  f1-score   support

           0       0.82      0.64      0.72      1111
           1       0.88      0.95      0.91      2988

    accuracy                           0.86      4099
   macro avg       0.85      0.79      0.81      4099
weighted avg       0.86      0.86      0.86      4099



In [122]:
# CBOW: score the classifier on each of the 10 folds defined by k_folds,
# using the full feature list and target array.
scores = cross_val_score(estimator=rfc, X=x, y=y, cv=k_folds)

In [123]:
# CBOW: show every fold's score, then their average.
print("CV scores: ", scores)
mean_cv_score = scores.mean()
print("Avg CV score: ", mean_cv_score)

CV scores:  [0.85512195 0.84236213 0.89263055 0.86578819 0.86627623 0.86285993
 0.88384578 0.86920449 0.87896535 0.87262079]
Avg CV score:  0.8689675391922294


In [124]:
# Load the pickled Skip-Gram word vectors. The context manager closes the
# file handle as soon as loading finishes; a bare open() left it open for the
# lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle
# files that come from a trusted source.
with open("word2vec_skip-gram_vectors.pickle", "rb") as pickle_in:
    word_vectors_Skip_Gram = pickle.load(pickle_in)

In [125]:
# Skip-Gram: one averaged vector per entry of word_vectors_Skip_Gram, in
# index order. This rebinds `x`, replacing the CBOW features built earlier.
# Entries are fetched by position (0 .. len-1) because the container type
# stored in the pickle is not visible here.
x = [
    getReviewVector(word_vectors_Skip_Gram[idx])
    for idx in range(len(word_vectors_Skip_Gram))
]

In [126]:
# Hold out 20% of the Skip-Gram samples for testing; the seed fixes the split.
split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [127]:
# Fit a fresh 100-tree random forest on the Skip-Gram training split and
# predict the held-out samples. random_state is pinned (same seed as the
# train/test split) so the forest's internal randomness repeats from run to
# run; without it every execution reports slightly different scores and the
# saved output cannot be reproduced.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [128]:
# Hold-out evaluation for Skip-Gram: overall accuracy, then the per-class report.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)
report = cr(y_test, y_pred)
print(report)

0.8638692363991217
              precision    recall  f1-score   support

           0       0.83      0.62      0.71      1111
           1       0.87      0.95      0.91      2988

    accuracy                           0.86      4099
   macro avg       0.85      0.79      0.81      4099
weighted avg       0.86      0.86      0.86      4099



In [129]:
# Skip-Gram: score the classifier on each of the 10 folds defined by k_folds,
# using the full feature list and target array.
scores = cross_val_score(estimator=rfc, X=x, y=y, cv=k_folds)

In [130]:
# Skip-Gram: show every fold's score, then their average.
print("CV scores: ", scores)
mean_cv_score = scores.mean()
print("Avg CV score: ", mean_cv_score)

CV scores:  [0.86341463 0.86530015 0.88823816 0.87896535 0.86774036 0.86627623
 0.89019034 0.87310883 0.88384578 0.88482186]
Avg CV score:  0.8761901701008226
