In [11]:
import pandas as pd
import os, sys
import nltk
import re

sys.path.append(".")
import preprocessing.feature_engineering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot

# import torch


cwd = os.getcwd()
data_path = os.path.join(cwd, "data")

## Tripadvisor data

In [8]:
tripadvisor_df = pd.read_csv(os.path.join(data_path,"tripadvisor_hotel_reviews.csv"))
tripadvisor_df["Rating"].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

We can see that the dataset is unbalanced, with much higher numbers for the higher grades. To make this a binary classification problem, i choose to count the numbers above 3 as positive (1) and the numbers 3 and below negative (0).

In [9]:
tripadvisor_df["sentiment"] = pd.cut(x = tripadvisor_df["Rating"], bins = [0, 4, 6], labels = [0, 1])
value_counts = tripadvisor_df["sentiment"].value_counts()
tripadvisor_df = tripadvisor_df.rename(columns={"Review":"text", "Rating":"rating"})

In [10]:
value_counts

0    11437
1     9054
Name: sentiment, dtype: int64

In [8]:
fig = px.histogram(tripadvisor_df, x="sentiment", width=500, height=250)
fig.update_layout(bargap=0.3)
fig.show()

This plot illustrates that the data is unbalanced. By looking into some of the reviews it looks like the reviews are not complete sentences.

In [9]:
X = tripadvisor_df["text"].copy()
y = tripadvisor_df["sentiment"].copy()

In [2]:
import preprocessing.feature_engineering as feature_engineering
import preprocessing.dataframe_manipulation as df_manipulation
import pandas as pd

DATA_FILENAME = "tripadvisor_hotel_reviews.csv"
# df = get_dataframe("tripadvisor_hotel_reviews.csv")
# print(df.head())
df = df_manipulation.get_and_process_df(DATA_FILENAME)
feature_df = feature_engineering.textual_features(df, lem = True)
feature_df

Unnamed: 0,text,num_unique_words,word_count
0,nice hotel expensive parking got good deal sta...,77,87
1,ok nothing special charge diamond member hilto...,188,250
2,nice rooms not 4* experience hotel monaco seat...,161,217
3,"unique, great stay, wonderful time hotel monac...",75,89
4,"great stay great stay, went seahawk game aweso...",160,191
...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",96,109
20487,great location price view hotel great quick pl...,33,39
20488,"ok just looks nice modern outside, desk staff ...",59,63
20489,hotel theft ruined vacation hotel opened sept ...,529,781


In [12]:
feature_engineering.lemmatize("theft")

'theft'

In [None]:
X = feature_engineering.tfidf_vectorize(feature_df)
y = df["sentiment"].copy()

X_train, X_val, X_test, y_train, y_val, y_test = df_manipulation.train_validate_test_split(
    X, y, test_size=0.1, val_size=0.1
)

In [22]:
vectorizer = TfidfVectorizer(stop_words="english", min_df=0.005)
vectorizer.fit(feature_df["text"])
# X = vectorizer.fit_transform(feature_df["text"])

TfidfVectorizer(min_df=0.005, stop_words='english')

In [12]:
from models.model_manipulation import save_model, load_model

svm_classifier = load_model("svm")
knn_classifier = load_model("knn")

In [13]:
import models.model_manipulation

new_df = df_manipulation.get_and_process_df("new_tripadvisor_data.csv")
models.model_manipulation.test_model_from_df(new_df, svm_classifier, "tfidf")

Testing SVMClassifier at 2022-04-19 12:42:07.399013
Classification report for SVMClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [9]:
from models.models import SVMClassifier
from models.models import KNNClassifier
from models.model_manipulation import train_model

svm_classifier = train_model(X_train, y_train, SVMClassifier())
knn_classifier = train_model(X_train, y_train, KNNClassifier(neighbors=5))

Training <class 'models.SVMClassifier'> at 2022-04-17 20:55:34.345728
Done training <class 'models.SVMClassifier'> at 2022-04-17 21:00:45.011002
Training <class 'models.KNNClassifier'> at 2022-04-17 21:00:45.011963
Done training <class 'models.KNNClassifier'> at 2022-04-17 21:00:45.023935


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

In [8]:
from models.models import EnsembleClassifier

ec = EnsembleClassifier()

ec.classifier.fit(X_train, y_train)

VotingClassifier(estimators=[('SVC',
                              Pipeline(steps=[('standardscaler',
                                               StandardScaler(with_mean=False)),
                                              ('svc', SVC())])),
                             ('KNN', KNeighborsClassifier())])

In [17]:
from models.model_manipulation import test_model

test_model(X_val, y_val, ec.classifier)

Testing VotingClassifier at 2022-04-17 22:03:22.740622
Classification report for VotingClassifier
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       475
           1       0.93      0.93      0.93      1370

    accuracy                           0.90      1845
   macro avg       0.87      0.87      0.87      1845
weighted avg       0.90      0.90      0.90      1845



## Crossvalidation



In [3]:
from sklearn.model_selection import GridSearchCV

In [15]:
from models.models import SVMClassifier, KNNClassifier

In [10]:
params = [{'C': [1, 1.5, 2],
         'kernel': ['linear', 'rbf', "poly"],
         }]

In [6]:
model = SVMClassifier()

In [11]:
gs = GridSearchCV(model.classifier,
    param_grid=params,
    scoring="accuracy",
    cv=5
)

In [12]:
gs.fit(X_val, y_val)

GridSearchCV(cv=5, estimator=SVC(C=1),
             param_grid=[{'C': [1, 1.5, 2],
                          'kernel': ['linear', 'rbf', 'poly']}],
             scoring='accuracy')

In [13]:
gs.best_params_

{'C': 1.5, 'kernel': 'rbf'}

In [17]:
model = KNNClassifier()

params = [{'n_neighbors': [3, 5, 7],
         'weights': ['uniform', 'distance'],
         }]

In [18]:
gs = GridSearchCV(model.classifier,
    param_grid=params,
    scoring="accuracy",
    cv=5
)

In [20]:
gs.fit(X_val, y_val)
gs.best_params_

{'n_neighbors': 7, 'weights': 'uniform'}

In [21]:
import sys
sys.path.append(".")

In [22]:
import helpers.parameters as params

In [None]:
params.