In [98]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [99]:
openRest = pd.read_csv('Open_Restaurant_Applications.csv')

#### Initial review / EDA of open restaurant application data

In [100]:
openRest.columns

Index(['objectid', 'globalid', 'Seating Interest (Sidewalk/Roadway/Both)',
       'Restaurant Name', 'Legal Business Name', 'Doing Business As (DBA)',
       'Building Number', 'Street', 'Borough', 'Postcode', 'Business Address',
       'Food Service Establishment Permit #', 'Sidewalk Dimensions (Length)',
       'Sidewalk Dimensions (Width)', 'Sidewalk Dimensions (Area)',
       'Roadway Dimensions (Length)', 'Roadway Dimensions (Width)',
       'Roadway Dimensions (Area)', 'Approved for Sidewalk Seating',
       'Approved for Roadway Seating', 'Qualify Alcohol', 'SLA Serial Number',
       'SLA License Type', 'Landmark District or Building',
       'landmarkDistrict_terms', 'healthCompliance_terms',
       'Time of Submission', 'Latitude', 'Longitude', 'Community Board',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA'],
      dtype='object')

In [101]:
# total number of applications in the dataset
len(openRest)

14107

In [102]:
openRest.describe()

Unnamed: 0,objectid,Postcode,Sidewalk Dimensions (Length),Sidewalk Dimensions (Width),Sidewalk Dimensions (Area),Roadway Dimensions (Length),Roadway Dimensions (Width),Roadway Dimensions (Area),Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL
count,14107.0,14107.0,11846.0,11846.0,11846.0,8921.0,8921.0,8921.0,12800.0,12800.0,12800.0,12800.0,12800.0,12705.0,12705.0
mean,7105.283193,10596.328419,33.079014,10.161067,375.400895,30.893061,8.213989,260.649927,40.733398,-73.953725,5.219062,17.240313,2921.94875,2254158.0,2199482000.0
std,4091.132364,928.416157,34.703304,6.970036,935.476247,26.872615,3.386939,404.064198,0.056207,0.059408,3.636028,15.28016,12793.37202,1310051.0,1275197000.0
min,32.0,7002.0,0.0,0.0,0.0,0.0,0.0,0.0,40.511019,-74.248014,1.0,1.0,1.0,1000000.0,0.0
25%,3561.5,10016.0,16.0,6.0,108.0,18.0,8.0,130.0,40.707434,-73.990246,2.0,3.0,69.0,1025043.0,1008430000.0
50%,7103.0,10305.0,24.0,8.0,208.0,24.0,8.0,192.0,40.734779,-73.97038,5.0,11.0,151.0,2005556.0,2025060000.0
75%,10639.5,11220.0,40.0,13.0,400.0,36.0,8.0,296.0,40.761872,-73.925257,7.0,33.0,470.25,3222229.0,3056580000.0
max,14194.0,91206.0,1000.0,50.0,50000.0,1000.0,50.0,25000.0,40.91119,-73.702668,81.0,51.0,157903.0,5169029.0,5080430000.0


In [103]:
temp = openRest.groupby(by='Postcode').count()

temp.describe()

Unnamed: 0,objectid,globalid,Seating Interest (Sidewalk/Roadway/Both),Restaurant Name,Legal Business Name,Doing Business As (DBA),Building Number,Street,Borough,Business Address,...,healthCompliance_terms,Time of Submission,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
count,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,...,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0
mean,57.579592,57.579592,57.579592,57.57551,57.57551,57.530612,52.146939,57.579592,57.579592,57.579592,...,57.579592,57.579592,52.244898,52.244898,52.244898,52.244898,52.244898,51.857143,51.857143,52.244898
std,94.017039,94.017039,94.017039,94.012378,94.018743,93.960513,84.090035,94.017039,94.017039,94.017039,...,94.017039,94.017039,85.598934,85.598934,85.598934,85.598934,85.598934,85.119636,85.119636,85.598934
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,...,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,17.0,17.0,17.0,17.0,17.0,17.0,16.0,17.0,17.0,17.0,...,17.0,17.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
75%,67.0,67.0,67.0,67.0,67.0,67.0,61.0,67.0,67.0,67.0,...,67.0,67.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0
max,595.0,595.0,595.0,595.0,595.0,595.0,529.0,595.0,595.0,595.0,...,595.0,595.0,546.0,546.0,546.0,546.0,546.0,545.0,545.0,546.0


#### Initial Cleaning of dataset
Initial cleaning of dataset consists of converting date columns to datetime, removing irreleviant features, and filtering dataset for rows with missing data. 

In [104]:
# convert date data to datetime
openRest['Time of Submission'] = pd.to_datetime(openRest['Time of Submission'])

In [105]:
openRest.dropna(subset=['Latitude'], inplace=True)

In [106]:
# if you qualify for alcohol you have a SLA license, so we don't need both columns
openRest[openRest['Qualify Alcohol'] == 'no']['SLA License Type'].unique()

array([nan], dtype=object)

In [107]:
# drop irrelevant columns
openRest = openRest.copy()
openRest = openRest.drop(['objectid', 'globalid', 'Food Service Establishment Permit #',
       'Restaurant Name', 'Legal Business Name', 'Doing Business As (DBA)', 'Seating Interest (Sidewalk/Roadway/Both)',
       'Building Number', 'Street', 'Borough', 'Business Address', 'healthCompliance_terms', 'SLA Serial Number',
       'SLA License Type', 'Landmark District or Building', 'landmarkDistrict_terms','Community Board',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA'], axis=1)
openRest

Unnamed: 0,Postcode,Sidewalk Dimensions (Length),Sidewalk Dimensions (Width),Sidewalk Dimensions (Area),Roadway Dimensions (Length),Roadway Dimensions (Width),Roadway Dimensions (Area),Approved for Sidewalk Seating,Approved for Roadway Seating,Qualify Alcohol,Time of Submission,Latitude,Longitude
0,10028,,,,10.0,8.0,80.0,no,yes,yes,2022-06-09 16:26:00,40.776277,-73.952051
1,10026,47.0,3.0,141.0,,,,yes,no,yes,2020-06-26 20:38:00,40.800500,-73.952507
2,10065,8.0,8.0,64.0,,,,yes,no,no,2021-10-22 11:01:00,40.766845,-73.962708
3,10011,,,,44.0,8.0,352.0,no,yes,yes,2020-12-14 19:54:00,40.744338,-73.996240
4,10002,,,,22.0,8.0,176.0,no,yes,yes,2021-12-22 12:46:00,40.722124,-73.988160
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14102,11356,22.0,4.0,88.0,22.0,8.0,176.0,yes,yes,yes,2020-08-04 15:27:00,40.784739,-73.845776
14103,11220,26.0,7.0,182.0,26.0,8.0,208.0,yes,yes,no,2020-07-13 15:53:00,40.635543,-74.011220
14104,10007,25.0,14.0,350.0,,,,yes,no,yes,2020-06-21 13:46:00,40.713298,-74.007773
14105,10003,10.0,10.0,100.0,,,,yes,no,no,2020-06-19 13:19:00,40.733916,-73.989872


In [108]:
# if you're approved for sidewalk seating, you might not have dimensions associated.
# therefore, the number if restaurants approved for sidewalk seating might not equal the number that have sidewalk seating

temp = openRest[openRest['Approved for Sidewalk Seating'] == 'yes']
print(len(temp[temp['Sidewalk Dimensions (Area)'] == 0]))
temp[temp['Sidewalk Dimensions (Area)'] == 0]

20


Unnamed: 0,Postcode,Sidewalk Dimensions (Length),Sidewalk Dimensions (Width),Sidewalk Dimensions (Area),Roadway Dimensions (Length),Roadway Dimensions (Width),Roadway Dimensions (Area),Approved for Sidewalk Seating,Approved for Roadway Seating,Qualify Alcohol,Time of Submission,Latitude,Longitude
315,11201,0.0,0.0,0.0,30.0,8.0,240.0,yes,yes,no,2020-06-30 14:17:00,40.702991,-73.989584
494,10003,0.0,0.0,0.0,50.0,8.0,400.0,yes,yes,yes,2020-08-21 10:56:00,40.729118,-73.988869
631,10018,0.0,0.0,0.0,40.0,8.0,320.0,yes,yes,no,2021-05-28 14:10:00,40.752072,-73.983466
850,10065,18.0,0.0,0.0,18.0,8.0,144.0,yes,yes,no,2021-03-12 10:45:00,40.763668,-73.967125
3239,11205,9.0,0.0,0.0,,,,yes,no,no,2021-03-25 14:17:00,40.693136,-73.969518
3329,10003,0.0,0.0,0.0,50.0,8.0,400.0,yes,yes,yes,2020-08-21 10:51:00,40.729156,-73.988956
4901,10013,0.0,0.0,0.0,35.0,8.0,280.0,yes,yes,no,2020-07-28 14:27:00,40.715087,-73.998802
6171,10040,18.0,0.0,0.0,14.0,4.0,56.0,yes,yes,yes,2020-07-08 19:19:00,40.854172,-73.930194
6334,10014,0.0,0.0,0.0,16.0,7.0,112.0,yes,yes,no,2020-07-02 12:11:00,40.734424,-74.001855
6626,11201,0.0,0.0,0.0,20.0,8.0,160.0,yes,yes,yes,2020-06-26 16:42:00,40.699587,-73.991994


In [109]:
temp = openRest[openRest['Approved for Roadway Seating'] == 'yes']
print(len(temp[temp['Roadway Dimensions (Area)'] == 0]))
temp[temp['Roadway Dimensions (Area)'] == 0]

23


Unnamed: 0,Postcode,Sidewalk Dimensions (Length),Sidewalk Dimensions (Width),Sidewalk Dimensions (Area),Roadway Dimensions (Length),Roadway Dimensions (Width),Roadway Dimensions (Area),Approved for Sidewalk Seating,Approved for Roadway Seating,Qualify Alcohol,Time of Submission,Latitude,Longitude
127,10024,30.0,10.0,300.0,0.0,0.0,0.0,yes,yes,yes,2021-03-11 17:19:00,40.790153,-73.973226
474,10012,15.0,3.0,45.0,0.0,0.0,0.0,yes,yes,no,2023-03-20 14:03:00,40.725858,-73.99531
1377,10025,50.0,6.0,300.0,0.0,0.0,0.0,yes,yes,yes,2020-06-19 20:23:00,40.802466,-73.964245
1575,11235,68.0,33.0,2244.0,0.0,0.0,0.0,yes,yes,yes,2020-06-19 13:50:00,40.587836,-73.955247
2554,10021,,,,0.0,0.0,0.0,no,yes,yes,2021-06-10 16:05:00,40.770876,-73.96189
3001,11473,45.0,10.0,450.0,0.0,0.0,0.0,yes,yes,yes,2020-06-21 14:54:00,40.745952,-73.890303
4277,11414,115.0,17.0,1955.0,0.0,0.0,0.0,yes,yes,no,2020-06-22 08:24:00,40.653238,-73.838418
4417,11372,17.0,15.0,255.0,0.0,0.0,0.0,yes,yes,no,2020-06-26 17:04:00,40.747259,-73.887439
5184,10014,12.0,6.0,72.0,0.0,0.0,0.0,yes,yes,yes,2020-06-19 13:33:00,40.737572,-74.003551
5313,11238,47.0,8.0,376.0,0.0,0.0,0.0,yes,yes,yes,2020-06-25 22:08:00,40.681602,-73.958544


#### Preprocess Data

In [110]:
processed_openRest = openRest.copy()

# change yes/no to 1/0
processed_openRest['Approved for Sidewalk Seating'] = processed_openRest['Approved for Sidewalk Seating'].map(dict(yes=1, no=0))
processed_openRest['Approved for Roadway Seating'] = processed_openRest['Approved for Roadway Seating'].map(dict(yes=1, no=0))
processed_openRest['Qualify Alcohol'] = processed_openRest['Qualify Alcohol'].map(dict(yes=1, no=0))
processed_openRest

Unnamed: 0,Postcode,Sidewalk Dimensions (Length),Sidewalk Dimensions (Width),Sidewalk Dimensions (Area),Roadway Dimensions (Length),Roadway Dimensions (Width),Roadway Dimensions (Area),Approved for Sidewalk Seating,Approved for Roadway Seating,Qualify Alcohol,Time of Submission,Latitude,Longitude
0,10028,,,,10.0,8.0,80.0,0,1,1,2022-06-09 16:26:00,40.776277,-73.952051
1,10026,47.0,3.0,141.0,,,,1,0,1,2020-06-26 20:38:00,40.800500,-73.952507
2,10065,8.0,8.0,64.0,,,,1,0,0,2021-10-22 11:01:00,40.766845,-73.962708
3,10011,,,,44.0,8.0,352.0,0,1,1,2020-12-14 19:54:00,40.744338,-73.996240
4,10002,,,,22.0,8.0,176.0,0,1,1,2021-12-22 12:46:00,40.722124,-73.988160
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14102,11356,22.0,4.0,88.0,22.0,8.0,176.0,1,1,1,2020-08-04 15:27:00,40.784739,-73.845776
14103,11220,26.0,7.0,182.0,26.0,8.0,208.0,1,1,0,2020-07-13 15:53:00,40.635543,-74.011220
14104,10007,25.0,14.0,350.0,,,,1,0,1,2020-06-21 13:46:00,40.713298,-74.007773
14105,10003,10.0,10.0,100.0,,,,1,0,0,2020-06-19 13:19:00,40.733916,-73.989872


In [111]:
processed_openRest[processed_openRest['Postcode'] == 7002]

Unnamed: 0,Postcode,Sidewalk Dimensions (Length),Sidewalk Dimensions (Width),Sidewalk Dimensions (Area),Roadway Dimensions (Length),Roadway Dimensions (Width),Roadway Dimensions (Area),Approved for Sidewalk Seating,Approved for Roadway Seating,Qualify Alcohol,Time of Submission,Latitude,Longitude


In [112]:
processed_openRest = processed_openRest.groupby(by='Postcode').sum()
processed_openRest[processed_openRest['Postcode'] == 7002]

KeyError: 'Postcode'

#### EDA on cleaned data

In [None]:
# plot range of open restaurant application submissions 
fig = px.histogram(processed_openRest, x="Time of Submission",marginal="box")
fig.show()

In [None]:
# plot range of sidewalk area ranges
fig = px.histogram(processed_openRest, x="Sidewalk Dimensions (Area)",marginal="box")
fig.show()

In [None]:
# plot range of roadway area ranges
fig = px.histogram(processed_openRest, x="Roadway Dimensions (Area)",marginal="box")
fig.show()