In [None]:
import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides pandas/sklearn deprecation and SettingWithCopy warnings raised by later cells.
warnings.simplefilter("ignore")

import json
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import confusion_matrix
# from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the coffee shop records and discard the 'id' column
coffee_df = pd.read_csv("Resources/coffee_shops.csv").drop(columns=['id'])
coffee_df

### Feature Engineering

In [None]:
# Pull the 'restaurantspricerange2' attribute out into its own 'price' column
is_price_row = coffee_df['attr_key'] == 'restaurantspricerange2'
price_df = (
    coffee_df.loc[is_price_row, ['business_id', 'attr_value']]
    .rename(columns={'attr_value': 'price'})
)
# Rows whose price is the literal string 'None' carry no information
price_df = price_df[price_df['price'] != 'None']
# Attach the price to every row of the same business, then remove exact duplicates
cp_df = coffee_df.join(price_df.set_index('business_id'), on='business_id').drop_duplicates()
cp_df

In [None]:
# Convert the 'bikeparking' attribute into its own 'bikeparking' column
is_bike_row = cp_df['attr_key'] == 'bikeparking'
bike_df = (
    cp_df.loc[is_bike_row, ['business_id', 'attr_value']]
    .rename(columns={'attr_value': 'bikeparking'})
)
# Drop rows whose value is the literal string 'None'; .copy() makes bike_df an
# independent frame so the assignments below are guaranteed to modify it.
bike_df = bike_df.loc[bike_df['bikeparking'] != 'None'].copy()
# Encode the string flags as 1/0 with a single .loc[rows, column] assignment.
# The previous chained form (df[col].loc[mask] = value) may write to a temporary
# object and is a silent no-op under pandas Copy-on-Write.
bike_df.loc[bike_df['bikeparking'] == 'True', 'bikeparking'] = 1
bike_df.loc[bike_df['bikeparking'] == 'False', 'bikeparking'] = 0
# Attach the flag to every row of the same business, then remove exact duplicates
cb_df = cp_df.join(bike_df.set_index('business_id'), on='business_id')
cb_df.drop_duplicates(inplace = True)
cb_df

In [None]:
# Convert the 'outdoorseating' attribute into its own 'outdoorseating' column
is_outdoor_row = cb_df['attr_key'] == 'outdoorseating'
out_df = (
    cb_df.loc[is_outdoor_row, ['business_id', 'attr_value']]
    .rename(columns={'attr_value': 'outdoorseating'})
)
# Drop rows whose value is the literal string 'None'; .copy() makes out_df an
# independent frame so the assignments below are guaranteed to modify it.
out_df = out_df.loc[out_df['outdoorseating'] != 'None'].copy()
# Encode the string flags as 1/0 with a single .loc[rows, column] assignment.
# The previous chained form (df[col].loc[mask] = value) may write to a temporary
# object and is a silent no-op under pandas Copy-on-Write.
out_df.loc[out_df['outdoorseating'] == 'True', 'outdoorseating'] = 1
out_df.loc[out_df['outdoorseating'] == 'False', 'outdoorseating'] = 0
# Attach the flag to every row of the same business, then remove exact duplicates
co_df = cb_df.join(out_df.set_index('business_id'), on='business_id')
co_df.drop_duplicates(inplace = True)
co_df

In [None]:
# The key/value attribute columns have been unpacked into dedicated columns,
# so drop them, then remove duplicate rows and rows with any missing value.
co_df = (
    co_df.drop(columns=['attr_key', 'attr_value'])
    .drop_duplicates()
    .dropna()
)
co_df

In [None]:
# Load the generated per-business features from CSV
add_data_df = pd.read_csv(filepath_or_buffer="Resources/mean_features.csv")
add_data_df

In [None]:
# Give the generated feature columns more descriptive names
column_renames = {"date_diff": "age_in_days", "month_rev": "monthly_reviews"}
add_data_df = add_data_df.rename(columns=column_renames)
add_data_df.head()

In [None]:
# Attach the generated features to the cleaned coffee shop rows:
# co_df.business_id is matched against add_data_df.b_id.
features_by_id = add_data_df.set_index('b_id')
new_df = co_df.join(features_by_id, on='business_id')
new_df

In [None]:
# Cast the engineered columns to numeric; values that cannot be parsed become NaN
for numeric_col in ('price', 'bikeparking', 'outdoorseating'):
    new_df[numeric_col] = pd.to_numeric(new_df[numeric_col], errors='coerce')

In [None]:
# Show the dtype of every column to check the numeric conversion above
new_df.dtypes

In [None]:
# export to csv
# new_df.to_csv("joined_data1.csv", index = False)

In [None]:
# Remove identifier and location columns that are not used as model features
non_feature_cols = ['business_id', 'name', 'address', 'city', 'state']
new_df = new_df.drop(columns=non_feature_cols)

In [None]:
# Display the frame after the identifier columns have been removed
new_df

In [None]:
# One-hot encode any remaining categorical columns
X_dummies = pd.get_dummies(data=new_df)
X_dummies

In [None]:
# Separate the target ('is_open') from the feature matrix
target_col = "is_open"
y = X_dummies[target_col]
X = X_dummies.drop(target_col, axis=1)

In [None]:
# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler itself

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Logistic regression: fit, report accuracy on both splits, then the confusion matrix
lr1 = LogisticRegression()
lr1.fit(X_train_scaled, y_train)
print('Training Score: {}'.format(lr1.score(X_train_scaled, y_train)))
print('Testing Score: {}'.format(lr1.score(X_test_scaled, y_test)))

y_predL = lr1.predict(X_test_scaled)
print('Confusion Matrix: \n', confusion_matrix(y_test.tolist(), y_predL))

In [None]:
# AdaBoost: fit, report accuracy on both splits, then the confusion matrix.
# random_state is fixed so repeated runs produce the same model and scores,
# consistent with the seeded decision tree and random forest cells.
ada = AdaBoostClassifier(random_state=1).fit(X_train_scaled, y_train)
print(f'Training Score: {ada.score(X_train_scaled, y_train)}')
print(f'Testing Score: {ada.score(X_test_scaled, y_test)}')

y_predA = ada.predict(X_test_scaled)
print('Confusion Matrix: \n',confusion_matrix(list(y_test.values), y_predA))

In [None]:
# Decision tree: fit, report accuracy on both splits, then the confusion matrix
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train_scaled, y_train)
print('Training Score: {}'.format(dt.score(X_train_scaled, y_train)))
print('Testing Score: {}'.format(dt.score(X_test_scaled, y_test)))

y_predD = dt.predict(X_test_scaled)
print('Confusion Matrix: \n', confusion_matrix(y_test.tolist(), y_predD))

In [None]:
# Random forest: fit, report accuracy on both splits, then the confusion matrix
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_scaled, y_train)
print('Training Score: {}'.format(rf.score(X_train_scaled, y_train)))
print('Testing Score: {}'.format(rf.score(X_test_scaled, y_test)))

y_predR = rf.predict(X_test_scaled)
print('Confusion Matrix: \n', confusion_matrix(y_test.tolist(), y_predR))

In [None]:
# Draw the random forest confusion matrix as an annotated heat map
import itertools

classes = ['Closed', 'Open']
cm = confusion_matrix(y_test, y_predR)

plt.imshow(cm, cmap=plt.cm.Reds, interpolation='nearest')
plt.title("Random Forest Confusion Matrix")
plt.colorbar()

tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Write each count into its cell: white text where the count exceeds half of
# the largest count, black text otherwise.
thresh = cm.max() / 2.
for row, col in itertools.product(*map(range, cm.shape)):
    text_color = "white" if cm[row, col] > thresh else "black"
    plt.text(col, row, f"{cm[row, col]:d}",
             horizontalalignment="center",
             color=text_color)

plt.tight_layout()
plt.ylabel('True')
plt.xlabel('Predicted')

In [None]:
# Draw the decision tree confusion matrix as an annotated heat map
cm = confusion_matrix(y_test, y_predD)

plt.imshow(cm, cmap=plt.cm.Reds, interpolation='nearest')
plt.title("Decision Tree Confusion Matrix")
plt.colorbar()

tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Write each count into its cell: white text where the count exceeds half of
# the largest count, black text otherwise.
thresh = cm.max() / 2.
for row, col in np.ndindex(*cm.shape):
    text_color = "white" if cm[row, col] > thresh else "black"
    plt.text(col, row, f"{cm[row, col]:d}",
             horizontalalignment="center",
             color=text_color)

plt.tight_layout()
plt.ylabel('True')
plt.xlabel('Predicted')

In [None]:
# features = rf.feature_importances_
# print(features)
# plt.bar(x = range(len(features)), height=features)
# plt.show()

In [None]:
# np.std(X_test_scaled,axis=0)*clf.feature_importances_[0]

In [None]:
# Rank the decision tree's features by Gini importance, plot them, and show the table
importances = (
    pd.DataFrame({'Features': X.columns, 'Gini-Importance': dt.feature_importances_})
    .sort_values(by='Gini-Importance', ascending=False)
    .reset_index(drop=True)
)

# A single theme call is enough: each sns.set() call replaces the previous settings
sns.set(style="whitegrid", color_codes=True, font_scale = 1.7)
fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(x='Gini-Importance', y='Features', data=importances, color='lightseagreen', ax=ax)
ax.set_xlabel('Importance', fontsize=25, weight = 'bold')
ax.set_ylabel('Features', fontsize=25, weight = 'bold')
ax.set_title('Decision Tree Feature Importance', fontsize=25, weight = 'bold')
# plt.show() returns None, so it must not be wrapped in display()
plt.show()
display(importances)

In [None]:
# Rank the random forest's features by Gini importance, plot them, and show the table
importances = (
    pd.DataFrame({'Features': X.columns, 'Gini-Importance': rf.feature_importances_})
    .sort_values(by='Gini-Importance', ascending=False)
    .reset_index(drop=True)
)

# A single theme call is enough: each sns.set() call replaces the previous settings
sns.set(style="whitegrid", color_codes=True, font_scale = 1.7)
fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(x='Gini-Importance', y='Features', data=importances, color='lightseagreen', ax=ax)
ax.set_xlabel('Importance', fontsize=25, weight = 'bold')
ax.set_ylabel('Features', fontsize=25, weight = 'bold')
ax.set_title('Random Forest Feature Importance', fontsize=25, weight = 'bold')
# plt.show() returns None, so it must not be wrapped in display()
plt.show()
display(importances)

### Random Forest did better at predicting true positives, and its feature selection looks better
### Optimize the Random Forest with hyperparameter tuning

In [None]:
# Candidate values for each random forest hyper-parameter.
# 5 * 5 * 5 * 4 = 500 combinations, each cross-validated on 3 folds.
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

hyperF = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

forest = RandomForestClassifier(random_state=1)

# n_jobs=-1 runs the search on all available cores
gridF = GridSearchCV(estimator=forest, param_grid=hyperF, cv=3, verbose=1, n_jobs=-1)
# fit() returns the fitted search object, so bestF and gridF are the same object
bestF = gridF.fit(X_train_scaled, y_train)

In [None]:
# Score the fitted grid search on the held-out test split
bestF.score(X_test_scaled, y_test)

In [None]:
# Hyper-parameter combination selected by the grid search
bestF.best_params_

In [None]:
# Keep the best estimator found by the grid search for the evaluation cells below
clf_optimized = bestF.best_estimator_

In [None]:
# Predict with the tuned forest and draw its confusion matrix as an annotated heat map
y_predBR = clf_optimized.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_predBR)

plt.imshow(cm, cmap=plt.cm.Reds, interpolation='nearest')
plt.title("Optimized Random Forest Confusion Matrix")
plt.colorbar()

tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Write each count into its cell: white text where the count exceeds half of
# the largest count, black text otherwise.
thresh = cm.max() / 2.
for row, col in np.ndindex(*cm.shape):
    text_color = "white" if cm[row, col] > thresh else "black"
    plt.text(col, row, f"{cm[row, col]:d}",
             horizontalalignment="center",
             color=text_color)

plt.tight_layout()
plt.ylabel('True')
plt.xlabel('Predicted')

In [None]:
# Rank the tuned random forest's features by Gini importance, plot them, and show the table
importances = (
    pd.DataFrame({'Features': X.columns, 'Gini-Importance': clf_optimized.feature_importances_})
    .sort_values(by='Gini-Importance', ascending=False)
    .reset_index(drop=True)
)

# A single theme call is enough: each sns.set() call replaces the previous settings
sns.set(style="whitegrid", color_codes=True, font_scale = 1.7)
fig, ax = plt.subplots(figsize=(30, 15))
sns.barplot(x='Gini-Importance', y='Features', data=importances, color='lightseagreen', ax=ax)
ax.set_xlabel('Importance', fontsize=25, weight = 'bold')
ax.set_ylabel('Features', fontsize=25, weight = 'bold')
ax.set_title('Optimized Random Forest Feature Importance', fontsize=25, weight = 'bold')
# plt.show() returns None, so it must not be wrapped in display()
plt.show()
display(importances)

In [None]:
# pip install shap

In [None]:
import shap

In [None]:
# Estimate feature importances with SHAP, which uses Shapley values from game theory
# to estimate how much each feature contributes to a prediction
explainer = shap.TreeExplainer(clf_optimized)
# SHAP values are computed for the scaled test rows only
shap_values = explainer.shap_values(X_test_scaled)

In [None]:
# The summary plot sorts features by the sum of SHAP value magnitudes over all samples
# and uses the SHAP values to show the distribution of each feature's impact on the
# model output. The color represents the feature value (red high, blue low).
# The SHAP values were computed on X_test_scaled, so the feature matrix passed here
# must hold the same rows: X_test (the unscaled test split), not the full dataset X.
shap.summary_plot(shap_values, features=X_test, feature_names=X.columns)

### Interestingly, Random Forest and SHAP show different distributions of feature importance

In [None]:
# Analyze features by their importance - Age
# The figure size must be set before drawing; changing it afterwards only affects
# later figures. The rcParams default is kept because later cells inherit it.
plt.rcParams["figure.figsize"] = (30,15)
plt.figure(figsize=(30,15))

is_open_mask = new_df['is_open'] == 1
is_closed_mask = new_df['is_open'] == 0
plt.hist([new_df.loc[is_open_mask, 'age_in_days'].values,
          new_df.loc[is_closed_mask, 'age_in_days'].values],
         label=['Open','Closed'], color=['green','red'], alpha = 0.6)

plt.legend()
plt.title('Distribution of Coffee Shops by Age')
plt.xlabel('Coffee Shops Age based on Reviews')
plt.ylabel('Counts of Coffee Shops')

# plt.show must be called; the bare name only displays the function object
plt.show()

In [None]:
# Analyze features by their importance - Coffee Shop Stars
# Create the figure before drawing; calling plt.figure() after plt.hist() would
# open a second, empty figure instead of resizing the histogram.
plt.figure(figsize=(30,15))

is_open_mask = new_df['is_open'] == 1
is_closed_mask = new_df['is_open'] == 0
plt.hist([new_df.loc[is_open_mask, 'stars'].values,
          new_df.loc[is_closed_mask, 'stars'].values],
         label=['Open','Closed'], color=['green','red'], alpha = 0.6)

plt.legend()
plt.title('Distribution of Coffee Shops by Stars')
plt.xlabel('Coffee Shops Stars')
plt.ylabel('Counts of Coffee Shops')

# plt.show must be called; the bare name only displays the function object
plt.show()

In [None]:
# Analyze features by their importance - Average Number of Reviews per Month for Coffee Shops
# Create the figure before drawing; calling plt.figure() after plt.hist() would
# open a second, empty figure instead of resizing the histogram.
plt.figure(figsize=(30,15))

is_open_mask = new_df['is_open'] == 1
is_closed_mask = new_df['is_open'] == 0
plt.hist([new_df.loc[is_open_mask, 'monthly_reviews'].values,
          new_df.loc[is_closed_mask, 'monthly_reviews'].values],
         label=['Open','Closed'], color=['green','red'], alpha = 0.6)

plt.legend()
plt.title('Distribution of Coffee Shops by monthly_reviews')
plt.xlabel('Coffee Shops average monthly reviews')
plt.ylabel('Counts of Coffee Shops')

# plt.show must be called; the bare name only displays the function object
plt.show()

In [None]:
# Analyze features by their importance - Is_chain
# The figure size must be set before drawing; changing it afterwards only affects
# later figures. The rcParams default is still updated, as before.
plt.rcParams["figure.figsize"] = (10,7)
plt.figure(figsize=(10,7))

is_open_mask = new_df['is_open'] == 1
is_closed_mask = new_df['is_open'] == 0
plt.hist([new_df.loc[is_open_mask, 'is_chain'].values,
          new_df.loc[is_closed_mask, 'is_chain'].values],
         label=['Open','Closed'], color=['green','red'], alpha = 0.6)

plt.legend()
plt.title('Distribution of Coffee Shops by Chain')
plt.xlabel('Coffee Shops Is in Chain')
plt.ylabel('Counts of Coffee Shops')

# plt.show must be called; the bare name only displays the function object
plt.show()