In [1]:
## Import dependencies

import pickle
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
## Load the cleaned Disney reviews CSV into a DataFrame and preview it
file_path = Path("Resources/disney_clean.csv")
disney_clean_df = pd.read_csv(filepath_or_buffer=file_path)
disney_clean_df

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Review_Words,Review_Letters,Year,Month,Tourist,Branch_Encoded,Location_Encoded
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,59,329,2019,4,1,1,0
1,670682799,4,2019-05-01,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,171,970,2019,5,1,1,8
2,670623270,4,2019-04-01,Other,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,169,938,2019,4,1,1,7
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,91,485,2019,4,1,1,0
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,31,163,2019,4,1,1,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40018,92198076,4,2011-01-01,United Kingdom,Although our pick up was prompt the taxi drive...,Disneyland_Paris,316,1574,2011,1,1,2,10
40019,92061774,4,2011-01-01,Other,Just returned from a 4 days family trip to Dis...,Disneyland_Paris,647,3593,2011,1,1,2,7
40020,91995748,1,2010-12-01,United Kingdom,We spent the 20 Dec 2010 in the Disney park an...,Disneyland_Paris,440,2537,2010,12,1,2,10
40021,91984642,2,2010-12-01,United Kingdom,Well I was really looking forward to this trip...,Disneyland_Paris,314,1758,2010,12,1,2,10


In [3]:
# Feature matrix: the predictor columns fed to the classifier
feature_columns = ["Rating", "Month", "Year", "Tourist", "Location_Encoded"]
X = disney_clean_df[feature_columns]

# Target vector: the encoded park branch to be predicted
y = disney_clean_df["Branch_Encoded"]

In [4]:
# Inspect how many rows fall in each target class to judge whether
# resampling is needed
class_counts = y.value_counts()
class_counts

0    18196
2    12693
1     9134
Name: Branch_Encoded, dtype: int64

In [5]:
# Split features and target into training and test sets.
# test_size is left at train_test_split's default; random_state=1 fixes the
# shuffle so the split is reproducible.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=1,
)

In [6]:
# Train a BalancedRandomForestClassifier (500 trees, fixed seed) on the
# training split; fit() hands back the fitted estimator, which is kept as
# brf_model.
brf_model = BalancedRandomForestClassifier(n_estimators=500, random_state=1).fit(
    X_train, y_train
)

In [7]:
# Predict on the test split and compute the balanced accuracy score
y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.7660844959103454

In [9]:
# Pair each importance score with its feature name and list the pairs from
# most to least important
importance_pairs = zip(brf_model.feature_importances_, X.columns)
sorted(importance_pairs, reverse=True)

[(0.517441470511195, 'Location_Encoded'),
 (0.2585139879508482, 'Tourist'),
 (0.10176790220431653, 'Month'),
 (0.07506723368097394, 'Year'),
 (0.047209405652666306, 'Rating')]

In [10]:
# Predict the branch for a single hand-written review.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# labelled with the same training columns (Rating, Month, Year, Tourist,
# Location_Encoded). A bare nested list carries no feature names, which makes
# recent scikit-learn versions emit a feature-name warning and hides which
# value belongs to which feature.
sample_review = pd.DataFrame([[5, 5, 2022, 1, 10]], columns=X.columns)
brf_model.predict(sample_review)

array([2], dtype=int64)

In [91]:
# Persist the trained model to disk so it can be reloaded without retraining.
# The file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises; the original left the handle open.
# (`pickle` is imported with the other dependencies at the top of the notebook.)
filename = 'disney_branch.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(brf_model, model_file)

In [11]:
# Tally how many test rows were predicted for each class
predicted_labels = pd.DataFrame(y_pred)
predicted_labels.value_counts()

0    3879
2    3421
1    2706
dtype: int64

In [12]:
# Show the feature values of the first row of the test split
# (X_test is already a DataFrame, so it is indexed directly)
first_test_row = X_test.iloc[0]
list(first_test_row)

[5, 1, 2012, 0, 11]

In [15]:
# Predict the branch for a single hand-written review.
# The values are given in training-column order (Rating, Month, Year, Tourist,
# Location_Encoded) and wrapped in a DataFrame labelled with those columns:
# the model was fitted on a DataFrame, and a bare nested list carries no
# feature names, which makes recent scikit-learn versions emit a warning.
modified_review = pd.DataFrame([[5, 1, 2022, 0, 11]], columns=X.columns)
brf_model.predict(modified_review)

array([0], dtype=int64)