In [77]:
# Load dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2, f_classif

#Tokenized & Lemmatized data
data = pd.read_csv('../Trip Advisor Preprocessing/data_tokenized.csv')



In [78]:
data['ratings.overall'] = data['ratings.overall'].replace(range(0, 3), 'aNegative')
data['ratings.overall'] = data['ratings.overall'].replace(3, 'bNeutral')
data['ratings.overall'] = data['ratings.overall'].replace(range(4, 6), 'cPositive')

result = data.groupby('ratings.overall').size()

result

ratings.overall
aNegative     1577
bNeutral      1945
cPositive    19959
dtype: int64

In [79]:
# Extract preprocessed data and labels
preprocessed_data = data['text'].tolist()
labels = data['ratings.overall'].tolist()

In [80]:
X_train, X_rem, y_train, y_rem = train_test_split(preprocessed_data,labels,train_size=0.75,stratify=labels,random_state=42) #75% train data
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5,random_state=42) #12.5% in test and 12.5% in validation

In [81]:
# Define pipeline
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('rf', RandomForestClassifier()),
])

# Define parameters for grid search
parameters = {
    'vect__ngram_range': [(1,1), (1,2), (2,2)],
    'vect__max_df': [0.9, 0.95, 1.0],
    'vect__min_df': [1, 2, 3],
    'rf__criterion': ['gini', 'entropy', 'log_loss'],
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [60,70], #[10, 20, 30],
    'rf__min_samples_leaf':[5],
    'rf__class_weight':['balanced'],
}

# Create grid search object
grid_search = GridSearchCV(pipeline, parameters, scoring='f1_weighted', cv=5, n_jobs=-1,verbose=1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)



Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Best parameters:  {'rf__class_weight': 'balanced', 'rf__criterion': 'gini', 'rf__max_depth': 70, 'rf__min_samples_leaf': 5, 'rf__n_estimators': 300, 'vect__max_df': 1.0, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}
Best accuracy score:  0.8536480045062792


In [82]:
# Evaluate the best model on the val data
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_valid)
print("Best model used: ", grid_search.best_params_)

#target_names = ['Rating 1', 'Rating 2', 'Rating 3', 'Rating 4', 'Rating 5']
target_names = ['Negative', 'Netural', 'Positive']
print('\nValidation set Classification Report: \n',classification_report(y_valid, y_pred_val, target_names=target_names))

Best model used:  {'rf__class_weight': 'balanced', 'rf__criterion': 'gini', 'rf__max_depth': 70, 'rf__min_samples_leaf': 5, 'rf__n_estimators': 300, 'vect__max_df': 1.0, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}

Validation set Classification Report: 
               precision    recall  f1-score   support

    Negative       0.52      0.68      0.59       189
     Netural       0.29      0.42      0.34       225
    Positive       0.95      0.89      0.92      2521

    accuracy                           0.84      2935
   macro avg       0.59      0.66      0.62      2935
weighted avg       0.87      0.84      0.85      2935



In [83]:
#FINAL SCORING ON TEST DATA

# Evaluate the best model on the val data
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

print("Best model used: ", best_model)

target_names = ['Negative', 'Netural', 'Positive']
print('\nTest set Classification Report: \n',classification_report(y_test, y_pred_test, target_names=target_names))

Best model used:  Pipeline(steps=[('vect', TfidfVectorizer(ngram_range=(1, 2))),
                ('rf',
                 RandomForestClassifier(class_weight='balanced', max_depth=70,
                                        min_samples_leaf=5,
                                        n_estimators=300))])

Test set Classification Report: 
               precision    recall  f1-score   support

    Negative       0.53      0.77      0.63       205
     Netural       0.32      0.40      0.36       261
    Positive       0.95      0.89      0.92      2470

    accuracy                           0.84      2936
   macro avg       0.60      0.69      0.63      2936
weighted avg       0.86      0.84      0.85      2936



In [84]:
# Extract top 10 keywords that have an implication on the model classifying it as 'bad'
feature_names = best_model.named_steps['vect'].get_feature_names()
top10_bad_idx = best_model.named_steps['rf'].feature_importances_.argsort()[::-1][:10]
top10_bad_keywords = [(feature_names[i], best_model.named_steps['rf'].feature_importances_[i]) for i in top10_bad_idx]

# Extract top 10 keywords that have an implication on the model classifying it as 'neutral'
top10_neutral_idx = best_model.named_steps['rf'].feature_importances_.argsort()[::-1][:10]
top10_neutral_keywords = [(feature_names[i], best_model.named_steps['rf'].feature_importances_[i]) for i in top10_neutral_idx]

# Extract top 10 keywords that have an implication on the model classifying it as 'good'
top10_good_idx = best_model.named_steps['rf'].feature_importances_.argsort()[::-1][:10]
top10_good_keywords = [(feature_names[i], best_model.named_steps['rf'].feature_importances_[i]) for i in top10_good_idx]

# Display top 10 keywords that have an implication on the model classifying it as a 5-star rating
print("Top 10 keywords for a Positive rating:")
for i, (keyword, importance) in enumerate(top10_good_keywords):
    print(f"{i+1}. {keyword}: {importance:.3f}")

# Display top 10 keywords that have an implication on the model classifying it as a 1-star rating
print("\nTop 10 keywords for a Neutral rating:")
for i, (keyword, importance) in enumerate(top10_neutral_keywords):
    print(f"{i+1}. {keyword}: {importance:.3f}")

# Display top 10 keywords that have an implication on the model classifying it as a 1-star rating
print("\nTop 10 keywords for a Negative rating:")
for i, (keyword, importance) in enumerate(top10_bad_keywords):
    print(f"{i+1}. {keyword}: {importance:.3f}")


AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'

## TESTING DC DATA ON MODEL

The following is a test on the above trained model on data from a city not included in the training data

In [None]:
# Tokenized & Lemmatized data
#data = pd.read_csv('../Trip Advisor Preprocessing/data_tokenized_Isak.csv') #Smaller dataset
data_DC = pd.read_csv('../Trip Advisor Preprocessing/data_tokenized_DC.csv')

import_DC = data_DC.groupby('ratings.overall').size()
import_DC

ratings.overall
1.0     28
2.0     27
3.0     80
4.0    374
5.0    679
dtype: int64

In [None]:
data_DC['ratings.overall'] = data_DC['ratings.overall'].replace(range(0, 3), 'aNegative')
data_DC['ratings.overall'] = data_DC['ratings.overall'].replace(3, 'bNeutral')
data_DC['ratings.overall'] = data_DC['ratings.overall'].replace(range(4, 6), 'cPositive')

result_DC = data_DC.groupby('ratings.overall').size()

result_DC

ratings.overall
aNegative      55
bNeutral       80
cPositive    1053
dtype: int64

In [None]:
# Extract preprocessed data and labels
preprocessed_data_DC = data_DC['text'].tolist()
labels_DC = data_DC['ratings.overall'].tolist()

In [None]:
best_model = grid_search.best_estimator_
y_pred_DC = best_model.predict(preprocessed_data_DC)

print("Best model used: ", grid_search.best_params_)

target_names_DC = ['Negative', 'Netural', 'Positive']
print('\nDC set Classification Report: \n',classification_report(labels_DC, y_pred_DC, target_names=target_names))

Best model used:  {'rf__criterion': 'gini', 'rf__max_depth': 70, 'rf__min_samples_leaf': 5, 'rf__n_estimators': 100, 'vect__max_df': 1.0, 'vect__min_df': 3, 'vect__ngram_range': (1, 1)}

DC set Classification Report: 
               precision    recall  f1-score   support

    Negative       0.93      0.25      0.40        55
     Netural       0.00      0.00      0.00        80
    Positive       0.90      1.00      0.95      1053

    accuracy                           0.90      1188
   macro avg       0.61      0.42      0.45      1188
weighted avg       0.84      0.90      0.86      1188



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
