In [52]:
# Load dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2, f_classif


#Tokenized & Lemmatized data
data = pd.read_csv('../Trip Advisor Preprocessing/data_tokenized.csv')



In [53]:
#Replacing all values with sentiment labels as discussed in the paper

data['ratings.overall'] = data['ratings.overall'].replace(range(0, 3), 'aNegative')
data['ratings.overall'] = data['ratings.overall'].replace(3, 'bNeutral')
data['ratings.overall'] = data['ratings.overall'].replace(range(4, 6), 'cPositive')

result = data.groupby('ratings.overall').size()

result

ratings.overall
aNegative     1577
bNeutral      1945
cPositive    19959
dtype: int64

In [54]:
# Extract dataframing and turning input & outvariable to model-readable list. 
preprocessed_data = data['text'].tolist()
labels = data['ratings.overall'].tolist()

In [55]:
#Creating train split and subsequent validation / test splits. 
X_train, X_rem, y_train, y_rem = train_test_split(preprocessed_data,labels,train_size=0.75,stratify=labels) #75% train data
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5) #12.5% in test and 12.5% in validation

In [56]:
# Define pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('nb', MultinomialNB()),
])


# Define parameters for grid search
parameters = {
    'vect__ngram_range': [(1,1), (1,2), (2,2)],
    'vect__max_df': [0.9, 0.95, 1.0],
    'vect__min_df': [1, 2, 3],
    'nb__alpha': [0.1, 0.5, 1.0],  # Smoothing parameter (Laplace smoothing)
    'nb__fit_prior': [True, False],  # Whether to learn class prior probabilities or not
    'nb__class_prior': [None, [0.25, 0.75], [0.5, 0.5]],  # Prior probabilities of the classes
}

# Create grid search object
grid_search = GridSearchCV(pipeline, parameters, scoring='f1_weighted', cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

1620 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1620 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/johanisak/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/johanisak/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/johanisak/opt/anaconda3/lib/python3.9/site-packages/sklearn/naive_bayes.py", line 729, in fit
    self._update_class_log_prior(class_prior=class_prior)
  File "/Users/johanisak/opt/anaconda3/lib/python3.9/site-pa

Best parameters:  {'nb__alpha': 0.5, 'nb__class_prior': None, 'nb__fit_prior': True, 'vect__max_df': 0.9, 'vect__min_df': 2, 'vect__ngram_range': (1, 2)}
Best accuracy score:  0.880499064297329


In [57]:
# Evaluate the best model on the val data
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_valid)
print("Best model used: ", grid_search.best_params_)

target_names = ['Negative', 'Netural', 'Positive']
print('\nValidation set Classification Report: \n',classification_report(y_valid, y_pred_val, target_names=target_names))

Best model used:  {'nb__alpha': 0.5, 'nb__class_prior': None, 'nb__fit_prior': True, 'vect__max_df': 0.9, 'vect__min_df': 2, 'vect__ngram_range': (1, 2)}

Validation set Classification Report: 
               precision    recall  f1-score   support

    Negative       0.69      0.68      0.68       201
     Netural       0.41      0.39      0.40       254
    Positive       0.94      0.94      0.94      2480

    accuracy                           0.88      2935
   macro avg       0.68      0.67      0.67      2935
weighted avg       0.87      0.88      0.88      2935



In [58]:
#FINAL SCORING ON TEST DATA

# Evaluate the best model on the val data
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

target_names = ['Negative', 'Netural', 'Positive']
print('\nTest set Classification Report: \n',classification_report(y_test, y_pred_test, target_names=target_names))


Test set Classification Report: 
               precision    recall  f1-score   support

    Negative       0.71      0.70      0.71       193
     Netural       0.41      0.38      0.40       232
    Positive       0.95      0.95      0.95      2511

    accuracy                           0.89      2936
   macro avg       0.69      0.68      0.68      2936
weighted avg       0.89      0.89      0.89      2936



In [64]:
#This part was developed in collaboration with ChatGPT based on the metric logarithmic probability chosen by the project group. 

# Extract top 10 keywords that have an implication on the model classifying it as 'bad'
feature_names = best_model.named_steps['vect'].get_feature_names()
top10_bad_idx = best_model.named_steps['nb'].feature_log_prob_[0, :].argsort()[::-1][:10]
top10_bad_keywords = [(feature_names[i], best_model.named_steps['nb'].feature_log_prob_[0, i]) for i in top10_bad_idx]


# Extract top 10 keywords that have an implication on the model classifying it as 'neutral'
top10_neutral_idx = best_model.named_steps['nb'].feature_log_prob_[1, :].argsort()[::-1][:10]
top10_neutral_keywords = [(feature_names[i], best_model.named_steps['nb'].feature_log_prob_[1, i]) for i in top10_neutral_idx]


# Extract top 10 keywords that have an implication on the model classifying it as 'good'
top10_good_idx = best_model.named_steps['nb'].feature_log_prob_[2, :].argsort()[::-1][:10]
top10_good_keywords = [(feature_names[i], best_model.named_steps['nb'].feature_log_prob_[2, i]) for i in top10_good_idx]


# Display top 10 keywords that have an implication on the model classifying it as a 5-star rating
print("Top 10 keywords for a Positive rating:")
for i, (keyword, value) in enumerate(top10_good_keywords):
    print(f"{i+1}. {keyword}: {value:.3f}")

# Display top 10 keywords that have an implication on the model classifying it as a 1-star rating
print("\nTop 10 keywords for a Neutral rating:")
for i, (keyword, value) in enumerate(top10_neutral_keywords):
    print(f"{i+1}. {keyword}: {value:.3f}")


# Display top 10 keywords that have an implication on the model classifying it as a 1-star rating
print("\nTop 10 keywords for a Negative rating:")
for i, (keyword, value) in enumerate(top10_bad_keywords):
    print(f"{i+1}. {keyword}: {value:.3f}")

Top 10 keywords for a Positive rating:
1. great: -3.572
2. hotel: -3.747
3. room: -3.772
4. staff: -3.982
5. location: -4.079
6. clean: -4.291
7. stay: -4.321
8. nice: -4.371
9. good: -4.470
10. service: -4.574

Top 10 keywords for a Neutral rating:
1. room: -3.755
2. hotel: -4.053
3. good: -4.298
4. location: -4.389
5. clean: -4.657
6. great: -4.697
7. nice: -4.868
8. staff: -4.969
9. stay: -5.116
10. small: -5.170

Top 10 keywords for a Negative rating:
1. room: -3.822
2. hotel: -4.131
3. stay: -4.846
4. dirty: -5.004
5. staff: -5.026
6. nt: -5.048
7. service: -5.136
8. location: -5.361
9. bed: -5.369
10. good: -5.492




## TESTING DC DATA ON MODEL

The following is a test on the above trained model on data from a city not included in the training data

In [60]:
#Importing the Washington DC only dataset for geographic test on unseen data. 
data_DC = pd.read_csv('../Trip Advisor Preprocessing/data_tokenized_DC.csv')

import_DC = data_DC.groupby('ratings.overall').size()
import_DC

ratings.overall
1.0     28
2.0     27
3.0     80
4.0    374
5.0    679
dtype: int64

In [61]:
#Replacing all values with sentiment labels as discussed in the paper
data_DC['ratings.overall'] = data_DC['ratings.overall'].replace(range(0, 3), 'aNegative')
data_DC['ratings.overall'] = data_DC['ratings.overall'].replace(3, 'bNeutral')
data_DC['ratings.overall'] = data_DC['ratings.overall'].replace(range(4, 6), 'cPositive')

result_DC = data_DC.groupby('ratings.overall').size()

result_DC

ratings.overall
aNegative      55
bNeutral       80
cPositive    1053
dtype: int64

In [62]:
# Extract preprocessed data and labels
preprocessed_data_DC = data_DC['text'].tolist()
labels_DC = data_DC['ratings.overall'].tolist()

In [63]:
# Evaluate the best model on the data
best_model = grid_search.best_estimator_
y_pred_DC = best_model.predict(preprocessed_data_DC)

print("Best model used: ", grid_search.best_params_)

target_names_DC = ['Negative', 'Netural', 'Positive']
print('\nDC set Classification Report: \n',classification_report(labels_DC, y_pred_DC, target_names=target_names))

Best model used:  {'nb__alpha': 0.5, 'nb__class_prior': None, 'nb__fit_prior': True, 'vect__max_df': 0.9, 'vect__min_df': 2, 'vect__ngram_range': (1, 2)}

DC set Classification Report: 
               precision    recall  f1-score   support

    Negative       0.66      0.56      0.61        55
     Netural       0.40      0.40      0.40        80
    Positive       0.95      0.96      0.96      1053

    accuracy                           0.90      1188
   macro avg       0.67      0.64      0.65      1188
weighted avg       0.90      0.90      0.90      1188

