In [1]:
import warnings
warnings.simplefilter("ignore")

import json
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score

In [2]:
#  Import and read the coffee_shops.csv
coffee_df = pd.read_csv("Resources/coffee_shops.csv")
coffee_df

Unnamed: 0,business_id,name,address,city,state,latitude,longitude,stars,review_count,is_open,category,attr_key,attr_value,id
0,y2gFcAVBXmVxFXAugRe5ig,Scrumptious Crumpets,7414 SE Milwaukie Ave,Portland,OR,45.471070,-122.648270,5.0,23,0,Coffee & Tea,restaurantspricerange2,1,9
1,y2gFcAVBXmVxFXAugRe5ig,Scrumptious Crumpets,7414 SE Milwaukie Ave,Portland,OR,45.471070,-122.648270,5.0,23,0,Coffee & Tea,bikeparking,True,11
2,y2gFcAVBXmVxFXAugRe5ig,Scrumptious Crumpets,7414 SE Milwaukie Ave,Portland,OR,45.471070,-122.648270,5.0,23,0,Coffee & Tea,outdoorseating,True,12
3,XrrIr0HukWA5hvM62ir65g,Tapio Tea,1145-2551 No 6 Rd,Richmond,BC,49.194168,-123.069725,4.5,11,0,Coffee & Tea,restaurantspricerange2,2,21
4,XrrIr0HukWA5hvM62ir65g,Tapio Tea,1145-2551 No 6 Rd,Richmond,BC,49.194168,-123.069725,4.5,11,0,Coffee & Tea,outdoorseating,True,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30525,JnM3ClVTEKQr82GmicNGaQ,Oh My Tea,1441 Hancock St,Quincy,MA,42.249004,-71.002450,4.0,63,0,Coffee & Tea,bikeparking,False,190080
30526,JnM3ClVTEKQr82GmicNGaQ,Oh My Tea,1441 Hancock St,Quincy,MA,42.249004,-71.002450,4.0,63,0,Coffee & Tea,outdoorseating,False,190081
30527,lT9ZjFt1IvKgrnAzDSc5gQ,Blunch,2973 N High St,Columbus,OH,40.022823,-83.014140,4.0,276,0,Coffee & Tea,restaurantspricerange2,2,190104
30528,lT9ZjFt1IvKgrnAzDSc5gQ,Blunch,2973 N High St,Columbus,OH,40.022823,-83.014140,4.0,276,0,Coffee & Tea,bikeparking,True,190105


In [3]:
# Drop the non-beneficial ID columns
coffee_df.drop(['business_id', 'name', 'address', 'city', 'state', 'id'], axis = 1, inplace = True)

In [4]:
coffee_df

Unnamed: 0,latitude,longitude,stars,review_count,is_open,category,attr_key,attr_value
0,45.471070,-122.648270,5.0,23,0,Coffee & Tea,restaurantspricerange2,1
1,45.471070,-122.648270,5.0,23,0,Coffee & Tea,bikeparking,True
2,45.471070,-122.648270,5.0,23,0,Coffee & Tea,outdoorseating,True
3,49.194168,-123.069725,4.5,11,0,Coffee & Tea,restaurantspricerange2,2
4,49.194168,-123.069725,4.5,11,0,Coffee & Tea,outdoorseating,True
...,...,...,...,...,...,...,...,...
30525,42.249004,-71.002450,4.0,63,0,Coffee & Tea,bikeparking,False
30526,42.249004,-71.002450,4.0,63,0,Coffee & Tea,outdoorseating,False
30527,40.022823,-83.014140,4.0,276,0,Coffee & Tea,restaurantspricerange2,2
30528,40.022823,-83.014140,4.0,276,0,Coffee & Tea,bikeparking,True


In [5]:
# Convert categorical values to numeric
X_dummies = pd.get_dummies(coffee_df)
X_dummies

Unnamed: 0,latitude,longitude,stars,review_count,is_open,category_Bakeries,category_Cafes,category_Chocolatiers & Shops,category_Coffee & Tea,category_Coffee & Tea Supplies,...,attr_key_drivethru,attr_key_outdoorseating,attr_key_restaurantspricerange2,attr_value_1,attr_value_2,attr_value_3,attr_value_4,attr_value_False,attr_value_None,attr_value_True
0,45.471070,-122.648270,5.0,23,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
1,45.471070,-122.648270,5.0,23,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,45.471070,-122.648270,5.0,23,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
3,49.194168,-123.069725,4.5,11,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
4,49.194168,-123.069725,4.5,11,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30525,42.249004,-71.002450,4.0,63,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
30526,42.249004,-71.002450,4.0,63,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
30527,40.022823,-83.014140,4.0,276,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
30528,40.022823,-83.014140,4.0,276,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Split our preprocessed data into our features and target arrays
X = X_dummies.drop(columns=["is_open"])
y = X_dummies["is_open"]
X

Unnamed: 0,latitude,longitude,stars,review_count,category_Bakeries,category_Cafes,category_Chocolatiers & Shops,category_Coffee & Tea,category_Coffee & Tea Supplies,category_Coffee Roasteries,...,attr_key_drivethru,attr_key_outdoorseating,attr_key_restaurantspricerange2,attr_value_1,attr_value_2,attr_value_3,attr_value_4,attr_value_False,attr_value_None,attr_value_True
0,45.471070,-122.648270,5.0,23,0,0,0,1,0,0,...,0,0,1,1,0,0,0,0,0,0
1,45.471070,-122.648270,5.0,23,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,45.471070,-122.648270,5.0,23,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
3,49.194168,-123.069725,4.5,11,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
4,49.194168,-123.069725,4.5,11,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30525,42.249004,-71.002450,4.0,63,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
30526,42.249004,-71.002450,4.0,63,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
30527,40.022823,-83.014140,4.0,276,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
30528,40.022823,-83.014140,4.0,276,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Split data into training and testin data set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Compare the performance of Different Models

In [9]:
print('Logistic Regression scores:\n')
lr = LogisticRegression(random_state = 1).fit(X_train_scaled, y_train)
print(f'Training Score: {lr.score(X_train_scaled, y_train)}')
print(f'Testing Score: {lr.score(X_test_scaled, y_test)}')
y_pred = lr.predict(X_test_scaled)
print('Accuracy: ',lr.score(X_test_scaled,y_test))
print('Precision: ',precision_score(y_test,y_pred))
print('Confusion Matrix: \n',confusion_matrix(y_test, y_pred))

Logistic Regression scores:

Training Score: 0.7379569375900773
Testing Score: 0.7387658849731429
Accuracy:  0.7387658849731429
Precision:  0.7407896474316651
Confusion Matrix: 
 [[  29 1963]
 [  31 5610]]


In [10]:
print('Decision Tree Classifier scores:\n')
dt = DecisionTreeClassifier(random_state=1).fit(X_train_scaled, y_train)
print(f'Training Score: {dt.score(X_train_scaled, y_train)}')
print(f'Testing Score: {dt.score(X_test_scaled, y_test)}')
y_pred = dt.predict(X_test_scaled)
print('Accuracy: ',dt.score(X_test_scaled,y_test))
print('Precision: ',precision_score(y_test,y_pred))
print('Confusion Matrix: \n',confusion_matrix(y_test, y_pred))

Decision Tree Classifier scores:

Training Score: 1.0
Testing Score: 0.9337088955849601
Accuracy:  0.9337088955849601
Precision:  0.9592201752817028
Confusion Matrix: 
 [[1764  228]
 [ 278 5363]]


In [11]:
print('Random Forest Classifier scores:\n')
rf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
print(f'Training Score: {rf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {rf.score(X_test_scaled, y_test)}')
y_pred = rf.predict(X_test_scaled)
print('Accuracy: ',rf.score(X_test_scaled,y_test))
print('Precision: ',precision_score(y_test,y_pred))
print('Confusion Matrix: \n',confusion_matrix(y_test, y_pred))

Random Forest Classifier scores:

Training Score: 1.0
Testing Score: 0.7581553779641033
Accuracy:  0.7581553779641033
Precision:  0.8016213638531235
Confusion Matrix: 
 [[ 744 1248]
 [ 598 5043]]


In [12]:
print('Adaptive Boosting Classifier scores:\n')
ab = AdaBoostClassifier(random_state=1).fit(X_train_scaled, y_train)
print(f'Training Score: {ab.score(X_train_scaled, y_train)}')
print(f'Testing Score: {ab.score(X_test_scaled, y_test)}')
y_pred = ab.predict(X_test_scaled)
print('Accuracy: ',ab.score(X_test_scaled,y_test))
print('Precision: ',precision_score(y_test,y_pred))
print('Confusion Matrix: \n',confusion_matrix(y_test, y_pred))

Adaptive Boosting Classifier scores:

Training Score: 0.7557758658339521
Testing Score: 0.7513428533997117
Accuracy:  0.7513428533997117
Precision:  0.7701746787931283
Confusion Matrix: 
 [[ 400 1592]
 [ 306 5335]]


### Next step is to add more features and optimize prediction trying Logistic Regression, Random Forest and Desiccion Tree Classifiers