In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2, f_classif


# Tokenized & Lemmatized data
#data = pd.read_csv('../Trip Advisor Preprocessing/data_tokenized_Isak.csv') #Smaller dataset
data = pd.read_csv('../Trip Advisor Preprocessing/data_tokenized.csv')

In [25]:
#Replacing all values with sentiment labels as discussed in the paper

data['ratings.overall'] = data['ratings.overall'].replace(range(0, 3), 'aNegative')
data['ratings.overall'] = data['ratings.overall'].replace(3, 'bNeutral')
data['ratings.overall'] = data['ratings.overall'].replace(range(4, 6), 'cPositive')

result = data.groupby('ratings.overall').size()

result

ratings.overall
aNegative     1577
bNeutral      1945
cPositive    19959
dtype: int64

In [26]:
# Extract dataframing and turning input & outvariable to model-readable list. 
preprocessed_data = data['text'].tolist()
labels = data['ratings.overall'].tolist()

In [27]:
#Creating train split and subsequent validation / test splits. 

X_train, X_rem, y_train, y_rem = train_test_split(preprocessed_data,labels,train_size=0.75,stratify=labels) #75% train data
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5) #12.5% in test and 12.5% in validation

In [28]:
# Define pipeline
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])


# Define parameters for grid search
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'vect__max_df': [0.9, 0.95, 1.0],
    'vect__min_df': [1, 2, 3],
    'logreg__C': [0.1, 1.0, 10.0],
    'logreg__max_iter': [10000],
    'logreg__solver': ['saga'],
}


# Create grid search object
grid_search = GridSearchCV(pipeline, parameters, scoring='f1_weighted', cv=5, n_jobs=-1)


# Fit the grid search to the training data
#grid_search.fit(X_train, y_train)
grid_search.fit(X_train, y_train)


# Print best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

Best parameters:  {'logreg__C': 10.0, 'logreg__max_iter': 10000, 'logreg__solver': 'saga', 'vect__max_df': 0.9, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}
Best accuracy score:  0.8773240432671511


In [29]:
# Evaluate the best model on the val data
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_valid)
print("Best model used: ", best_model)

#target_names = ['Rating 1', 'Rating 2', 'Rating 3', 'Rating 4', 'Rating 5']
target_names = ['Negative', 'Netural', 'Positive']
print('\nValidation set Classification Report: \n',classification_report(y_valid, y_pred_val, target_names=target_names))

Best model used:  Pipeline(steps=[('vect', TfidfVectorizer(max_df=0.9, ngram_range=(1, 2))),
                ('logreg',
                 LogisticRegression(C=10.0, max_iter=10000, solver='saga'))])

Validation set Classification Report: 
               precision    recall  f1-score   support

    Negative       0.72      0.56      0.63       196
     Netural       0.54      0.27      0.36       242
    Positive       0.92      0.98      0.95      2497

    accuracy                           0.89      2935
   macro avg       0.73      0.60      0.65      2935
weighted avg       0.88      0.89      0.88      2935



In [30]:
#FINAL SCORING ON TEST DATA

# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

print("Best model used: ", best_model)

target_names = ['Negative', 'Netural', 'Positive']
print('\nTest set Classification Report: \n',classification_report(y_test, y_pred_test, target_names=target_names))

Best model used:  Pipeline(steps=[('vect', TfidfVectorizer(max_df=0.9, ngram_range=(1, 2))),
                ('logreg',
                 LogisticRegression(C=10.0, max_iter=10000, solver='saga'))])

Test set Classification Report: 
               precision    recall  f1-score   support

    Negative       0.79      0.63      0.70       198
     Netural       0.48      0.24      0.32       244
    Positive       0.92      0.98      0.95      2494

    accuracy                           0.89      2936
   macro avg       0.73      0.62      0.66      2936
weighted avg       0.87      0.89      0.88      2936



In [36]:
#This part was developed in collaboration with ChatGPT based on the metric coefficient chosen by the project group. 

# Extract top 10 keywords that have an implication on the model classifying it as 'bad'
feature_names = best_model.named_steps['vect'].get_feature_names()
top10_bad_idx = best_model.named_steps['logreg'].coef_[0, :].argsort()[::-1][:10]
top10_bad_keywords = [(feature_names[i], best_model.named_steps['logreg'].coef_[0, i]) for i in top10_bad_idx]


# Extract top 10 keywords that have an implication on the model classifying it as 'neutral'
top10_neutral_idx = best_model.named_steps['logreg'].coef_[1, :].argsort()[::-1][:10]
top10_neutral_keywords = [(feature_names[i], best_model.named_steps['logreg'].coef_[1, i]) for i in top10_neutral_idx]


# Extract top 10 keywords that have an implication on the model classifying it as 'good'
top10_good_idx = best_model.named_steps['logreg'].coef_[2, :].argsort()[::-1][:10]
top10_good_keywords = [(feature_names[i], best_model.named_steps['logreg'].coef_[2, i]) for i in top10_good_idx]


# Display top 10 keywords that have an implication on the model classifying it as a good rating
print("Top 10 keywords for a Positive rating:")
for i, (keyword, value) in enumerate(top10_good_keywords):
    print(f"{i+1}. {keyword}: {value:.3f}")

# Display top 10 keywords that have an implication on the model classifying it as a neutral rating
print("\nTop 10 keywords for a Neutral rating:")
for i, (keyword, value) in enumerate(top10_neutral_keywords):
    print(f"{i+1}. {keyword}: {value:.3f}")


# Display top 10 keywords that have an implication on the model classifying it as a bad rating
print("\nTop 10 keywords for a Negative rating:")
for i, (keyword, value) in enumerate(top10_bad_keywords):
    print(f"{i+1}. {keyword}: {value:.3f}")

Top 10 keywords for a Positive rating:
1. great: 9.119
2. excellent: 7.633
3. loved: 6.368
4. wonderful: 5.235
5. beautiful: 5.099
6. perfect: 4.564
7. everything: 4.561
8. nice: 4.445
9. best: 4.274
10. fantastic: 4.119

Top 10 keywords for a Neutral rating:
1. average: 4.735
2. okay: 4.474
3. ok: 4.310
4. basic: 3.583
5. location stay: 3.484
6. nothing: 3.428
7. frill: 3.371
8. breakfast staff: 3.251
9. small: 3.229
10. dated: 3.223

Top 10 keywords for a Negative rating:
1. dirty: 8.305
2. rude: 6.376
3. worst: 5.773
4. horrible: 5.607
5. poor: 5.514
6. terrible: 5.161
7. bug: 4.764
8. never: 4.631
9. awful: 3.917
10. cockroach: 3.768




## TESTING DC DATA ON MODEL

The following is a test on the above trained model on data from a city not included in the training data

In [32]:
#Importing the Washington DC only dataset for geographic test on unseen data. 
data_DC = pd.read_csv('../Trip Advisor Preprocessing/data_tokenized_DC.csv')

import_DC = data_DC.groupby('ratings.overall').size()
import_DC

ratings.overall
1.0     28
2.0     27
3.0     80
4.0    374
5.0    679
dtype: int64

In [33]:
#Replacing all values with sentiment labels as discussed in the paper
data_DC['ratings.overall'] = data_DC['ratings.overall'].replace(range(0, 3), 'aNegative')
data_DC['ratings.overall'] = data_DC['ratings.overall'].replace(3, 'bNeutral')
data_DC['ratings.overall'] = data_DC['ratings.overall'].replace(range(4, 6), 'cPositive')

result_DC = data_DC.groupby('ratings.overall').size()

result_DC

ratings.overall
aNegative      55
bNeutral       80
cPositive    1053
dtype: int64

In [34]:
# Extract dataframing and turning input & outvariable to model-readable list. 
preprocessed_data_DC = data_DC['text'].tolist()
labels_DC = data_DC['ratings.overall'].tolist()

In [35]:
# Evaluate the best model on the data
best_model = grid_search.best_estimator_
y_pred_DC = best_model.predict(preprocessed_data_DC)

print("Best model used: ", grid_search.best_params_)

target_names_DC = ['Negative', 'Netural', 'Positive']
print('\nDC set Classification Report: \n',classification_report(labels_DC, y_pred_DC, target_names=target_names))

Best model used:  {'logreg__C': 10.0, 'logreg__max_iter': 10000, 'logreg__solver': 'saga', 'vect__max_df': 0.9, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}

DC set Classification Report: 
               precision    recall  f1-score   support

    Negative       0.70      0.47      0.57        55
     Netural       0.47      0.24      0.32        80
    Positive       0.93      0.98      0.96      1053

    accuracy                           0.91      1188
   macro avg       0.70      0.56      0.61      1188
weighted avg       0.89      0.91      0.89      1188

