In [1]:
import pandas as pd
import numpy as np

In [1]:
"""
    https://data.cityofnewyork.us/Public-Safety/EMS-Incident-Dispatch-Data/76xm-jjuj/explore/query/
    SELECT%0A%20%20%60cad_incident_id
    %60%2C%0A%20%20%60incident_datetime
    %60%2C%0A%20%20%60incident_travel_tm_seconds_qy
    %60%2C%0A%20%20%60zipcode
    %60%0AORDER%20BY%20%60incident_datetime
    %60%20DESC%20NULL%20FIRST/page/aggregate
"""

'\n    https://data.cityofnewyork.us/Public-Safety/EMS-Incident-Dispatch-Data/76xm-jjuj/explore/query/\n    SELECT%0A%20%20%60cad_incident_id\n    %60%2C%0A%20%20%60incident_datetime\n    %60%2C%0A%20%20%60incident_travel_tm_seconds_qy\n    %60%2C%0A%20%20%60zipcode\n    %60%0AORDER%20BY%20%60incident_datetime\n    %60%20DESC%20NULL%20FIRST/page/aggregate\n'

## Load & Parse EMS Data

In [2]:
ems_src = "https://data.cityofnewyork.us/resource/76xm-jjuj.json?$limit=243979&%24where=cad_incident_id%20%25%2095%20%3D%200"
ems_df = pd.read_json(ems_src)
print(ems_df.columns)
ems_df

Index(['cad_incident_id', 'incident_datetime', 'initial_call_type',
       'initial_severity_level_code', 'final_call_type',
       'final_severity_level_code', 'first_assignment_datetime',
       'valid_dispatch_rspns_time_indc', 'dispatch_response_seconds_qy',
       'first_activation_datetime', 'first_on_scene_datetime',
       'valid_incident_rspns_time_indc', 'incident_response_seconds_qy',
       'incident_travel_tm_seconds_qy', 'first_to_hosp_datetime',
       'first_hosp_arrival_datetime', 'incident_close_datetime',
       'held_indicator', 'incident_disposition_code', 'borough',
       'incident_dispatch_area', 'zipcode', 'policeprecinct',
       'citycouncildistrict', 'communitydistrict', 'communityschooldistrict',
       'congressionaldistrict', 'reopen_indicator', 'special_event_indicator',
       'standby_indicator', 'transfer_indicator'],
      dtype='object')


Unnamed: 0,cad_incident_id,incident_datetime,initial_call_type,initial_severity_level_code,final_call_type,final_severity_level_code,first_assignment_datetime,valid_dispatch_rspns_time_indc,dispatch_response_seconds_qy,first_activation_datetime,...,zipcode,policeprecinct,citycouncildistrict,communitydistrict,communityschooldistrict,congressionaldistrict,reopen_indicator,special_event_indicator,standby_indicator,transfer_indicator
0,223655175,2022-12-31T23:59:32.000,CARD,3,CARD,3,2023-01-01T00:03:16.000,Y,224,2023-01-01T00:03:41.000,...,10457,46.0,15.0,205.0,9.0,15.0,N,N,N,N
1,223655080,2022-12-31T23:39:30.000,SICK,6,SICK,6,2022-12-31T23:53:50.000,Y,860,2022-12-31T23:53:57.000,...,11692,100.0,31.0,414.0,27.0,5.0,N,N,N,N
2,223654985,2022-12-31T23:22:05.000,STATEP,2,STATEP,2,2022-12-31T23:24:39.000,Y,154,2022-12-31T23:24:51.000,...,10039,32.0,9.0,110.0,5.0,13.0,N,N,N,N
3,223654795,2022-12-31T22:37:50.000,EDP,7,EDP,7,,N,0,,...,10028,19.0,5.0,108.0,2.0,12.0,N,N,N,N
4,223654700,2022-12-31T22:16:14.000,ALTMFC,3,ALTMFC,3,2022-12-31T22:16:26.000,Y,12,2022-12-31T22:17:07.000,...,10039,32.0,9.0,110.0,5.0,13.0,N,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243974,60310180,2006-01-31T01:34:40.000,SICK,6,SICK,6,2006-01-31T01:35:15.000,Y,35,2006-01-31T01:35:28.000,...,11225,71.0,40.0,309.0,17.0,9.0,N,N,N,N
243975,60303625,2006-01-30T23:30:13.000,UNC,2,UNC,2,2006-01-30T23:30:50.000,Y,37,2006-01-30T23:34:10.000,...,10458,48.0,15.0,206.0,10.0,15.0,N,N,N,N
243976,60303530,2006-01-30T22:42:12.000,DRUG,4,DRUG,4,2006-01-30T22:42:57.000,Y,45,2006-01-30T22:45:02.000,...,10009,13.0,4.0,106.0,2.0,12.0,N,N,N,N
243977,60303435,2006-01-30T21:57:23.000,INHALE,5,INHALE,5,2006-01-30T21:57:40.000,Y,17,2006-01-30T21:57:47.000,...,11212,73.0,42.0,316.0,23.0,8.0,N,N,N,N


In [3]:
# adding basic filters
target_cols = ["incident_travel_tm_seconds_qy", "incident_datetime", "zipcode"]
df = ems_df[target_cols].dropna()
df = df.rename(columns={"incident_travel_tm_seconds_qy": "travel_time"})

# specifying types
df["zipcode"] = df["zipcode"].astype("category")
df["travel_time"] = pd.to_numeric(df["travel_time"])
df["incident_datetime"] = pd.to_datetime(df["incident_datetime"])

# splitting datetime
df["date"] = pd.to_datetime(df["incident_datetime"]).dt.date
df["time"] = pd.to_datetime(df["incident_datetime"]).dt.time
df["time_block"] = df["time"].apply(lambda x: x.hour)
df = df.drop(columns=["incident_datetime"])
df

Unnamed: 0,travel_time,zipcode,date,time,time_block
0,575.0,10457,2022-12-31,23:59:32,23
1,316.0,11692,2022-12-31,23:39:30,23
2,521.0,10039,2022-12-31,23:22:05,23
4,668.0,10039,2022-12-31,22:16:14,22
5,242.0,10456,2022-12-31,21:50:28,21
...,...,...,...,...,...
243974,298.0,11225,2006-01-31,01:34:40,1
243975,253.0,10458,2006-01-30,23:30:13,23
243976,422.0,10009,2006-01-30,22:42:12,22
243977,93.0,11212,2006-01-30,21:57:23,21


## Load and Parse Traffic Data

In [4]:
traffic_src = "https://data.cityofnewyork.us/resource/btm5-ppia.json?$limit=42800"
traffic_df = pd.read_json(traffic_src)
traffic_df["date"] = pd.to_datetime(traffic_df["date"])
print(traffic_df.columns)
traffic_df

Index(['id', 'segmentid', 'roadway_name', 'from', 'to', 'direction', 'date',
       '_12_00_1_00_am', '_1_00_2_00am', '_2_00_3_00am', '_3_00_4_00am',
       '_4_00_5_00am', '_5_00_6_00am', '_6_00_7_00am', '_7_00_8_00am',
       '_8_00_9_00am', '_9_00_10_00am', '_10_00_11_00am', '_11_00_12_00pm',
       '_12_00_1_00pm', '_1_00_2_00pm', '_2_00_3_00pm', '_3_00_4_00pm',
       '_4_00_5_00pm', '_5_00_6_00pm', '_6_00_7_00pm', '_7_00_8_00pm',
       '_8_00_9_00pm', '_9_00_10_00pm', '_10_00_11_00pm', '_11_00_12_00am'],
      dtype='object')


Unnamed: 0,id,segmentid,roadway_name,from,to,direction,date,_12_00_1_00_am,_1_00_2_00am,_2_00_3_00am,...,_2_00_3_00pm,_3_00_4_00pm,_4_00_5_00pm,_5_00_6_00pm,_6_00_7_00pm,_7_00_8_00pm,_8_00_9_00pm,_9_00_10_00pm,_10_00_11_00pm,_11_00_12_00am
0,1,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-09,20.0,10.0,11.0,...,104.0,105.0,147.0,120.0,91.0,83.0,74.0,49.0,42.0,42.0
1,2,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-10,21.0,16.0,8.0,...,102.0,98.0,133.0,131.0,95.0,73.0,70.0,63.0,42.0,35.0
2,3,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-11,27.0,14.0,6.0,...,115.0,115.0,130.0,143.0,106.0,89.0,68.0,64.0,56.0,43.0
3,4,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-12,22.0,7.0,7.0,...,71.0,127.0,122.0,144.0,122.0,76.0,64.0,58.0,64.0,43.0
4,5,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-13,31.0,17.0,7.0,...,113.0,126.0,133.0,135.0,102.0,106.0,58.0,58.0,55.0,54.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42751,373,35832,WEST 49th STREET,Rockefeller Plaza,6th Avenue,WB,2020-11-18,68.0,63.0,31.0,...,152.0,177.0,162.0,164.0,183.0,173.0,143.0,113.0,127.0,133.0
42752,373,35832,WEST 49th STREET,Rockefeller Plaza,6th Avenue,WB,2020-11-19,71.0,59.0,42.0,...,166.0,162.0,187.0,182.0,180.0,190.0,162.0,155.0,234.0,166.0
42753,373,35832,WEST 49th STREET,Rockefeller Plaza,6th Avenue,WB,2020-11-20,111.0,79.0,54.0,...,157.0,186.0,170.0,158.0,194.0,196.0,222.0,174.0,182.0,178.0
42754,373,35832,WEST 49th STREET,Rockefeller Plaza,6th Avenue,WB,2020-11-21,129.0,95.0,63.0,...,139.0,147.0,153.0,183.0,163.0,184.0,183.0,169.0,187.0,211.0


In [5]:
new_cols = ["time_block", "date", "zipcode", "traffic_volume"]
new_data = dict([[name, []] for name in new_cols])
for i, row in traffic_df.iterrows():
    for j in range(7,len(row)):  # 7 to 30
        new_data["date"].append(row[6])
#         location_data = [name.lower().strip() for name in row[2:5]]
        new_data["zipcode"].append(row[2].lower().strip())
        
        new_data["traffic_volume"].append(row[j])
        new_data["time_block"].append(j-7)

In [6]:
tvol_df = pd.DataFrame(new_data)
tvol_df

Unnamed: 0,time_block,date,zipcode,traffic_volume
0,0,2012-01-09,beach street,20.0
1,1,2012-01-09,beach street,10.0
2,2,2012-01-09,beach street,11.0
3,3,2012-01-09,beach street,14.0
4,4,2012-01-09,beach street,13.0
...,...,...,...,...
1026139,19,2020-11-22,west 49th street,177.0
1026140,20,2020-11-22,west 49th street,160.0
1026141,21,2020-11-22,west 49th street,162.0
1026142,22,2020-11-22,west 49th street,147.0


In [31]:
# save streets for testing street to zip
# example_streets = pd.DataFrame(list(tvol_df["zipcode"].unique()), columns=["streets"])
# example_streets["zipcodes"] = [""]*len(example_streets)
# street_loc = "data/example_streets.csv"
# example_streets.to_csv(street_loc, index=False)  # np.savetxt(street_loc, example_streets, delimiter=",", fmt='%s')
# print("streets saved for later at:", street_loc)
# example_streets

In [34]:
# Dependency: https://github.com/ipython/ipynb
from ipynb.fs.full.streetzip import street_to_zip

zips = list(tvol_df["zipcode"])
translated_zipcodes = street_to_zip(zips, True)
translated_zipcodes

TypeError: street_to_zip() takes 1 positional argument but 2 were given

In [32]:
for i, row in tvol_df.iterrows():
    if row["zipcode"] == zips[i]:
        if translated_zipcodes[i] != '':
            tvol_df.loc[i, "zipcode"] == translated_zipcodes[i]
    else:
        print("street doesnt match", row["zipcode"], "vs.", zips[i])

IndentationError: expected an indented block (2374487287.py, line 2)

## Merging the Dataset

In [7]:
# TODO: finish later

# travel_time	zipcode	date	time	time_block
# time_block	date	zipcode	traffic_volume
traffic_vol_ordered = []
for i, row in df.iterrows():
    merge_df = tvol_df[(tvol_df["time_block"] == row["time_block"]) &
                       (tvol_df["date"] == tvol_df["date"]) &
                       (tvol_df["zipcode"] == tvol_df["zipcode"])]
    assert len(merge_df) == 1, "actual len:"+str(len(merge_df))
    traffic_vol_ordered = merge_df["traffic_volume"].sum()

AssertionError: actual len:42756

In [None]:
all_streets = []
for streets in tvol_df["zipcode"]:
#     for street in streets:
    if street not in all_streets:
        all_streets.append(street)

In [None]:
street_to_zip = dict([[name, 0] for name in tvol_df["zipcode"].unique()])  # 
print(len(street_to_zip))
print(street_to_zip.keys())