In [2]:
## Import dependencies

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.linear_model import LogisticRegression


In [3]:
## Load the data
file_path = Path("Resources/disney_clean.csv")
disney_clean_df = pd.read_csv(file_path)
disney_clean_df

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Review_Words,Review_Letters,Year,Month,Tourist,Branch_Encoded,Location_Encoded
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,59,329,2019,4,1,1,0
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,171,970,2019,5,1,1,8
2,670623270,4,2019-04-01,Other,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,169,938,2019,4,1,1,7
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,91,485,2019,4,1,1,0
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,31,163,2019,4,1,1,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40018,92198076,4,2011-01-01,United Kingdom,Although our pick up was prompt the taxi drive...,Disneyland_Paris,316,1574,2011,1,1,2,10
40019,92061774,4,2011-01-01,Other,Just returned from a 4 days family trip to Dis...,Disneyland_Paris,647,3593,2011,1,1,2,7
40020,91995748,1,2010-12-01,United Kingdom,We spent the 20 Dec 2010 in the Disney park an...,Disneyland_Paris,440,2537,2010,12,1,2,10
40021,91984642,2,2010-12-01,United Kingdom,Well I was really looking forward to this trip...,Disneyland_Paris,314,1758,2010,12,1,2,10


In [None]:
disney_raw_df.Reviewer_Location.value_counts().plot.density()

In [None]:
location_counts = disney_raw_df.Reviewer_Location.value_counts()
# Determine which values to replace if counts are less than ..?
replace_location = list(location_counts[location_counts < 500].index)

# Replace in dataframe
for place in replace_location:
    disney_raw_df.Reviewer_Location = disney_raw_df.Reviewer_Location.replace(place,"Other")
    
# Check to make sure binning was successful
disney_raw_df.Reviewer_Location.value_counts()

In [5]:
# Create our features
X = disney_clean_df[["Rating","Review_Words", "Review_Letters", "Month", "Year", "Tourist", "Location_Encoded"]]

# Create our target
y = disney_clean_df['Branch_Encoded']

In [6]:
# Check counts, see if we need resampling
y.value_counts()

0    18196
2    12693
1     9134
Name: Branch_Encoded, dtype: int64

In [17]:
# Split 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [98]:
# Creating the scaler instance
scaler = StandardScaler()
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)
# Fitting the scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [107]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [52]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
X_train, y_train)

In [108]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

In [109]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.3333333333333333

In [18]:
# Train the EasyEnsembleClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=500, random_state=1)
brf_model = brf_model.fit(X_train, y_train)

In [19]:
# Calculated the balanced accuracy score
y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7660844959103454

In [11]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[3541,  629,  361],
       [ 145, 1699,  455],
       [  84,  596, 2496]], dtype=int64)

In [12]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.78      0.96      0.85      0.87      0.74      4531
          1       0.58      0.74      0.84      0.65      0.79      0.62      2299
          2       0.75      0.79      0.88      0.77      0.83      0.69      3176

avg / total       0.80      0.77      0.91      0.78      0.84      0.69     10006



In [20]:
# List the features sorted in descending order by feature importance
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.517441470511195, 'Location_Encoded'),
 (0.2585139879508482, 'Tourist'),
 (0.10176790220431653, 'Month'),
 (0.07506723368097394, 'Year'),
 (0.047209405652666306, 'Rating')]

In [21]:
brf_model.predict([[5,1,15,1,1,1,3]])

ValueError: X has 7 features, but DecisionTreeClassifier is expecting 5 features as input.

In [81]:
# Create our features
X = disney_clean_df[["Rating", "Month", "Year", "Tourist", "Location_Encoded"]]

# Create our target
y = disney_clean_df['Branch_Encoded']

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Train the EasyEnsembleClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=500, random_state=1)
brf_model = brf_model.fit(X_train, y_train)

In [83]:
# Calculated the balanced accuracy score
y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7660844959103454

In [84]:
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.517441470511195, 'Location_Encoded'),
 (0.2585139879508482, 'Tourist'),
 (0.10176790220431653, 'Month'),
 (0.07506723368097394, 'Year'),
 (0.047209405652666306, 'Rating')]

In [90]:
brf_model.predict([[5, 5, 2022, 1, 10]])

array([2], dtype=int64)

In [91]:
import pickle
filename = 'disney_branch.pkl'
pickle.dump(brf_model, open(filename, 'wb'))

In [57]:
pd.DataFrame(y_pred).value_counts()

0    3878
2    3430
1    2698
dtype: int64

In [93]:
list(pd.DataFrame(X_test).iloc[0])

[5, 1, 2012, 0, 11]

In [110]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
brf_model = BalancedRandomForestClassifier(n_estimators=500, random_state=1)

brf_model = brf_model.fit(X_resampled, y_resampled)

In [111]:
y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7572970364952737

In [112]:
confusion_matrix(y_test, y_pred)

array([[3644,  486,  401],
       [ 228, 1529,  542],
       [ 110,  517, 2549]], dtype=int64)

In [94]:
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.517441470511195, 'Location_Encoded'),
 (0.2585139879508482, 'Tourist'),
 (0.10176790220431653, 'Month'),
 (0.07506723368097394, 'Year'),
 (0.047209405652666306, 'Rating')]

In [97]:
brf_model.predict([[5, 1, 2022, 0, 11]])

array([0], dtype=int64)

In [115]:
pd.DataFrame(y_pred).value_counts()

0    3982
2    3492
1    2532
dtype: int64

In [52]:
# Create our features
X = disney_clean_df[["Month", "Tourist", "Location_Encoded"]]

# Create our target
y = disney_clean_df['Branch_Encoded']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
brf_model = BalancedRandomForestClassifier(n_estimators=500, random_state=1)
brf_model = brf_model.fit(X_train, y_train)
y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.768933864583135

In [54]:
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.6646600856221806, 'Location_Encoded'),
 (0.3113051385473205, 'Tourist'),
 (0.02403477583049888, 'Month')]

In [55]:
pd.DataFrame(y_pred).value_counts()

0    3785
2    3549
1    2672
dtype: int64

In [69]:
brf_model.predict([[7, 1, 11]])

array([2], dtype=int64)