In [1]:
import pandas as pd
import numpy as np

In [1]:
"""
    https://data.cityofnewyork.us/Public-Safety/EMS-Incident-Dispatch-Data/76xm-jjuj/explore/query/
    SELECT%0A%20%20%60cad_incident_id
    %60%2C%0A%20%20%60incident_datetime
    %60%2C%0A%20%20%60incident_travel_tm_seconds_qy
    %60%2C%0A%20%20%60zipcode
    %60%0AORDER%20BY%20%60incident_datetime
    %60%20DESC%20NULL%20FIRST/page/aggregate
"""

'\n    https://data.cityofnewyork.us/Public-Safety/EMS-Incident-Dispatch-Data/76xm-jjuj/explore/query/\n    SELECT%0A%20%20%60cad_incident_id\n    %60%2C%0A%20%20%60incident_datetime\n    %60%2C%0A%20%20%60incident_travel_tm_seconds_qy\n    %60%2C%0A%20%20%60zipcode\n    %60%0AORDER%20BY%20%60incident_datetime\n    %60%20DESC%20NULL%20FIRST/page/aggregate\n'

## Load & Parse EMS Data

In [2]:
ems_src = "https://data.cityofnewyork.us/resource/76xm-jjuj.json?$limit=243979&%24where=cad_incident_id%20%25%2095%20%3D%200"
ems_df = pd.read_json(ems_src)
print(ems_df.columns)
ems_df

Index(['cad_incident_id', 'incident_datetime', 'initial_call_type',
       'initial_severity_level_code', 'final_call_type',
       'final_severity_level_code', 'first_assignment_datetime',
       'valid_dispatch_rspns_time_indc', 'dispatch_response_seconds_qy',
       'first_activation_datetime', 'first_on_scene_datetime',
       'valid_incident_rspns_time_indc', 'incident_response_seconds_qy',
       'incident_travel_tm_seconds_qy', 'first_to_hosp_datetime',
       'first_hosp_arrival_datetime', 'incident_close_datetime',
       'held_indicator', 'incident_disposition_code', 'borough',
       'incident_dispatch_area', 'zipcode', 'policeprecinct',
       'citycouncildistrict', 'communitydistrict', 'communityschooldistrict',
       'congressionaldistrict', 'reopen_indicator', 'special_event_indicator',
       'standby_indicator', 'transfer_indicator'],
      dtype='object')


Unnamed: 0,cad_incident_id,incident_datetime,initial_call_type,initial_severity_level_code,final_call_type,final_severity_level_code,first_assignment_datetime,valid_dispatch_rspns_time_indc,dispatch_response_seconds_qy,first_activation_datetime,...,zipcode,policeprecinct,citycouncildistrict,communitydistrict,communityschooldistrict,congressionaldistrict,reopen_indicator,special_event_indicator,standby_indicator,transfer_indicator
0,223655175,2022-12-31T23:59:32.000,CARD,3,CARD,3,2023-01-01T00:03:16.000,Y,224,2023-01-01T00:03:41.000,...,10457,46.0,15.0,205.0,9.0,15.0,N,N,N,N
1,223655080,2022-12-31T23:39:30.000,SICK,6,SICK,6,2022-12-31T23:53:50.000,Y,860,2022-12-31T23:53:57.000,...,11692,100.0,31.0,414.0,27.0,5.0,N,N,N,N
2,223654985,2022-12-31T23:22:05.000,STATEP,2,STATEP,2,2022-12-31T23:24:39.000,Y,154,2022-12-31T23:24:51.000,...,10039,32.0,9.0,110.0,5.0,13.0,N,N,N,N
3,223654795,2022-12-31T22:37:50.000,EDP,7,EDP,7,,N,0,,...,10028,19.0,5.0,108.0,2.0,12.0,N,N,N,N
4,223654700,2022-12-31T22:16:14.000,ALTMFC,3,ALTMFC,3,2022-12-31T22:16:26.000,Y,12,2022-12-31T22:17:07.000,...,10039,32.0,9.0,110.0,5.0,13.0,N,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243974,60310180,2006-01-31T01:34:40.000,SICK,6,SICK,6,2006-01-31T01:35:15.000,Y,35,2006-01-31T01:35:28.000,...,11225,71.0,40.0,309.0,17.0,9.0,N,N,N,N
243975,60303625,2006-01-30T23:30:13.000,UNC,2,UNC,2,2006-01-30T23:30:50.000,Y,37,2006-01-30T23:34:10.000,...,10458,48.0,15.0,206.0,10.0,15.0,N,N,N,N
243976,60303530,2006-01-30T22:42:12.000,DRUG,4,DRUG,4,2006-01-30T22:42:57.000,Y,45,2006-01-30T22:45:02.000,...,10009,13.0,4.0,106.0,2.0,12.0,N,N,N,N
243977,60303435,2006-01-30T21:57:23.000,INHALE,5,INHALE,5,2006-01-30T21:57:40.000,Y,17,2006-01-30T21:57:47.000,...,11212,73.0,42.0,316.0,23.0,8.0,N,N,N,N


In [3]:
# adding basic filters
target_cols = ["incident_travel_tm_seconds_qy", "incident_datetime", "zipcode"]
df = ems_df[target_cols].dropna()
df = df.rename(columns={"incident_travel_tm_seconds_qy": "travel_time"})

# specifying types
df["zipcode"] = df["zipcode"].astype("category")
df["travel_time"] = pd.to_numeric(df["travel_time"])
df["incident_datetime"] = pd.to_datetime(df["incident_datetime"])

# splitting datetime
df["date"] = pd.to_datetime(df["incident_datetime"]).dt.date
df["time"] = pd.to_datetime(df["incident_datetime"]).dt.time
df["time_block"] = df["time"].apply(lambda x: x.hour)
df = df.drop(columns=["incident_datetime"])
df

Unnamed: 0,travel_time,zipcode,date,time,time_block
0,575.0,10457,2022-12-31,23:59:32,23
1,316.0,11692,2022-12-31,23:39:30,23
2,521.0,10039,2022-12-31,23:22:05,23
4,668.0,10039,2022-12-31,22:16:14,22
5,242.0,10456,2022-12-31,21:50:28,21
...,...,...,...,...,...
243974,298.0,11225,2006-01-31,01:34:40,1
243975,253.0,10458,2006-01-30,23:30:13,23
243976,422.0,10009,2006-01-30,22:42:12,22
243977,93.0,11212,2006-01-30,21:57:23,21


## Load and Parse Traffic Data

In [4]:
traffic_src = "https://data.cityofnewyork.us/resource/btm5-ppia.json?$limit=42800"
traffic_df = pd.read_json(traffic_src)
traffic_df["date"] = pd.to_datetime(traffic_df["date"])
print(traffic_df.columns)
traffic_df

Index(['id', 'segmentid', 'roadway_name', 'from', 'to', 'direction', 'date',
       '_12_00_1_00_am', '_1_00_2_00am', '_2_00_3_00am', '_3_00_4_00am',
       '_4_00_5_00am', '_5_00_6_00am', '_6_00_7_00am', '_7_00_8_00am',
       '_8_00_9_00am', '_9_00_10_00am', '_10_00_11_00am', '_11_00_12_00pm',
       '_12_00_1_00pm', '_1_00_2_00pm', '_2_00_3_00pm', '_3_00_4_00pm',
       '_4_00_5_00pm', '_5_00_6_00pm', '_6_00_7_00pm', '_7_00_8_00pm',
       '_8_00_9_00pm', '_9_00_10_00pm', '_10_00_11_00pm', '_11_00_12_00am'],
      dtype='object')


Unnamed: 0,id,segmentid,roadway_name,from,to,direction,date,_12_00_1_00_am,_1_00_2_00am,_2_00_3_00am,...,_2_00_3_00pm,_3_00_4_00pm,_4_00_5_00pm,_5_00_6_00pm,_6_00_7_00pm,_7_00_8_00pm,_8_00_9_00pm,_9_00_10_00pm,_10_00_11_00pm,_11_00_12_00am
0,1,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-09,20.0,10.0,11.0,...,104.0,105.0,147.0,120.0,91.0,83.0,74.0,49.0,42.0,42.0
1,2,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-10,21.0,16.0,8.0,...,102.0,98.0,133.0,131.0,95.0,73.0,70.0,63.0,42.0,35.0
2,3,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-11,27.0,14.0,6.0,...,115.0,115.0,130.0,143.0,106.0,89.0,68.0,64.0,56.0,43.0
3,4,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-12,22.0,7.0,7.0,...,71.0,127.0,122.0,144.0,122.0,76.0,64.0,58.0,64.0,43.0
4,5,15540,BEACH STREET,UNION PLACE,VAN DUZER STREET,NB,2012-01-13,31.0,17.0,7.0,...,113.0,126.0,133.0,135.0,102.0,106.0,58.0,58.0,55.0,54.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42751,373,35832,WEST 49th STREET,Rockefeller Plaza,6th Avenue,WB,2020-11-18,68.0,63.0,31.0,...,152.0,177.0,162.0,164.0,183.0,173.0,143.0,113.0,127.0,133.0
42752,373,35832,WEST 49th STREET,Rockefeller Plaza,6th Avenue,WB,2020-11-19,71.0,59.0,42.0,...,166.0,162.0,187.0,182.0,180.0,190.0,162.0,155.0,234.0,166.0
42753,373,35832,WEST 49th STREET,Rockefeller Plaza,6th Avenue,WB,2020-11-20,111.0,79.0,54.0,...,157.0,186.0,170.0,158.0,194.0,196.0,222.0,174.0,182.0,178.0
42754,373,35832,WEST 49th STREET,Rockefeller Plaza,6th Avenue,WB,2020-11-21,129.0,95.0,63.0,...,139.0,147.0,153.0,183.0,163.0,184.0,183.0,169.0,187.0,211.0


In [5]:
new_cols = ["time_block", "date", "zipcode", "traffic_volume"]
new_data = dict([[name, []] for name in new_cols])
for i, row in traffic_df.iterrows():
    for j in range(7,len(row)):  # 7 to 30
        new_data["date"].append(row[6])
#         location_data = [name.lower().strip() for name in row[2:5]]
        new_data["zipcode"].append(row[2].lower().strip())
        
        new_data["traffic_volume"].append(row[j])
        new_data["time_block"].append(j-7)

In [6]:
tvol_df = pd.DataFrame(new_data)
tvol_df

Unnamed: 0,time_block,date,zipcode,traffic_volume
0,0,2012-01-09,beach street,20.0
1,1,2012-01-09,beach street,10.0
2,2,2012-01-09,beach street,11.0
3,3,2012-01-09,beach street,14.0
4,4,2012-01-09,beach street,13.0
...,...,...,...,...
1026139,19,2020-11-22,west 49th street,177.0
1026140,20,2020-11-22,west 49th street,160.0
1026141,21,2020-11-22,west 49th street,162.0
1026142,22,2020-11-22,west 49th street,147.0


In [28]:
# save streets for testing street to zip
example_streets = pd.DataFrame(list(tvol_df["zipcode"].unique()), columns=["streets"])
example_streets["zipcodes"] = [""]*len(example_streets)
street_loc = "data/example_streets.csv"
example_streets.to_csv(street_loc, sep='\t')  # np.savetxt(street_loc, example_streets, delimiter=",", fmt='%s')
print("streets saved for later at:", street_loc)
example_streets

streets saved for later at: data/example_streets.csv


Unnamed: 0,streets,zipcodes
0,beach street,
1,little clove road,
2,narrows road south,
3,ocean terrace,
4,bay st,
...,...,...
892,west 53rd street,
893,west 55th street,
894,west 56th street,
895,west 57th street,


In [23]:
# Dependency: https://github.com/ipython/ipynb
from ipynb.fs.full.streetzip import street_to_zip  # Note: needs to be finalized

tvol_df["zipcode"] = tvol_df["zipcode"].apply(lambda x: street_to_zip[x])
tvol_df

beach street => beach street
little clove road => little clove road
narrows road south => narrows road south
ocean terrace => ocean terrace
bay st => bay st
clove rd => clove rd
hylan blvd => hylan blvd
manor rd => manor rd
targee st => targee st
todt hill rd => todt hill rd
van duzer street => van duzer street
victory boulevard => victory boulevard
morgan ave => morgan ave
park avenue => park avenue
8 ave => 8 ave
kingsland avenue => kingsland avenue
lewis ave => lewis ave
humboldt street => humboldt street
kent avenue => kent avenue
ralph ave => ralph ave
rockaway ave => rockaway ave
stuyvesant avenue => stuyvesant avenue
throop avenue => throop avenue
union avenue => union avenue
malcolm x blvd => malcolm x blvd
graham avenue => graham avenue
greene avenue => greene avenue
greenpoint avenue => greenpoint avenue
kingston ave => kingston ave
3 ave => 3 ave
4 ave => 4 ave
5 ave => 5 ave
6 ave => 6 ave
7 ave => 7 ave
new york ave => new york ave
quincy st => quincy st
smith st => smith 

hutchinson river pkwy => hutchinson river pkwy
boston road => boston road
maryland rd => maryland rd
jewel ave => jewel ave
union street => union street
sanford ave => sanford ave
roosevelt ave => roosevelt ave
east 177 street => east 177 street
lafayette avenue => lafayette avenue
underhill ave => underhill ave
homelawn street => homelawn street
hollis court blvd => hollis court blvd
35 ave => 35 ave
56 ave => 56 ave
26 ave => 26 ave
marathon pkwy => marathon pkwy
fingerboard road => fingerboard road
west 155 st => west 155 st
manhattan west 155 st => manhattan west 155 st
flushing ave => flushing ave
glen street => glen street
grand ave => grand ave
nostrand avenue => nostrand avenue
louis nine  boulevard => louis nine  boulevard
columbus ave => columbus ave
west gun hill road => west gun hill road
colonial road => colonial road
115 road => 115 road
vernon blvd => vernon blvd
springfield blvd => springfield blvd
francis lewis blvd => francis lewis blvd
atlantic ave => atlantic ave
wo

TypeError: 'function' object is not subscriptable

## Merging the Dataset

In [7]:
# travel_time	zipcode	date	time	time_block
# time_block	date	zipcode	traffic_volume
traffic_vol_ordered = []
for i, row in df.iterrows():
    merge_df = tvol_df[(tvol_df["time_block"] == row["time_block"]) &
                       (tvol_df["date"] == tvol_df["date"]) &
                       (tvol_df["zipcode"] == tvol_df["zipcode"])]
    assert len(merge_df) == 1, "actual len:"+str(len(merge_df))
    traffic_vol_ordered = merge_df["traffic_volume"].sum()

AssertionError: actual len:42756

In [None]:
all_streets = []
for streets in tvol_df["zipcode"]:
#     for street in streets:
    if street not in all_streets:
        all_streets.append(street)

In [None]:
street_to_zip = dict([[name, 0] for name in tvol_df["zipcode"].unique()])  # 
print(len(street_to_zip))
print(street_to_zip.keys())

### Get Street to Zip Translation:

In [103]:
src_311 = "https://data.cityofnewyork.us/resource/erm2-nwe9.json?$limit=82800"
df311 = pd.read_json(src_311)
df311

Unnamed: 0,unique_key,created_date,agency,agency_name,complaint_type,descriptor,location_type,incident_zip,incident_address,street_name,...,landmark,bridge_highway_name,bridge_highway_direction,bridge_highway_segment,closed_date,taxi_pick_up_location,due_date,road_ramp,vehicle_type,taxi_company_borough
0,59341585,2023-11-06T12:00:00.000,DSNY,Department of Sanitation,Derelict Vehicles,Derelict Vehicles,Street,11221.0,970 DEKALB AVENUE,DEKALB AVENUE,...,,,,,,,,,,
1,59335912,2023-11-06T12:00:00.000,DSNY,Department of Sanitation,Derelict Vehicles,Derelict Vehicles,Street,11385.0,1980 STARR STREET,STARR STREET,...,,,,,,,,,,
2,59337873,2023-11-06T12:00:00.000,DSNY,Department of Sanitation,Derelict Vehicles,Derelict Vehicles,Street,11419.0,124-02 ATLANTIC AVENUE,ATLANTIC AVENUE,...,,,,,,,,,,
3,59338637,2023-11-06T02:07:29.000,NYPD,New York City Police Department,Blocked Driveway,Partial Access,Street/Sidewalk,11239.0,650 SCHROEDERS AVENUE,SCHROEDERS AVENUE,...,SCHROEDERS AVENUE,,,,,,,,,
4,59341554,2023-11-06T02:07:02.000,NYPD,New York City Police Department,Noise - Residential,Loud Music/Party,Residential Building/House,11235.0,3062 BRIGHTON 7 STREET,BRIGHTON 7 STREET,...,BRIGHTON 7 STREET,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82795,59258230,2023-10-28T14:35:41.000,HPD,Department of Housing Preservation and Develop...,UNSANITARY CONDITION,PESTS,RESIDENTIAL BUILDING,10026.0,182 ST NICHOLAS AVENUE,ST NICHOLAS AVENUE,...,,,,,,,,,,
82796,59258664,2023-10-28T14:35:00.000,DEP,Department of Environmental Protection,Sewer,Sewer Backup (Use Comments) (SA),,10304.0,55 SEAVIEW AVENUE,SEAVIEW AVENUE,...,,,,,2023-10-28T15:00:00.000,,,,,
82797,59251623,2023-10-28T14:34:49.000,NYPD,New York City Police Department,Noise - Commercial,Loud Music/Party,Club/Bar/Restaurant,10469.0,3007 EASTCHESTER ROAD,EASTCHESTER ROAD,...,EASTCHESTER ROAD,,,,2023-10-28T14:54:52.000,,,,,
82798,59259668,2023-10-28T14:34:42.000,DOT,Department of Transportation,Street Condition,Defective Hardware,Street,10024.0,145 WEST 86 STREET,WEST 86 STREET,...,WEST 86 STREET,,,,2023-11-01T13:04:54.000,,,,,


In [104]:
df311 = df311[df311['incident_zip'].notna() & df311['incident_address'].notna()]
df311 = df311[["incident_zip", "incident_address"]]
df311["incident_address"] = df311["incident_address"].apply(lambda var: var.lower().strip())
df311

Unnamed: 0,incident_zip,incident_address
0,11221.0,970 dekalb avenue
1,11385.0,1980 starr street
2,11419.0,124-02 atlantic avenue
3,11239.0,650 schroeders avenue
4,11235.0,3062 brighton 7 street
...,...,...
82795,10026.0,182 st nicholas avenue
82796,10304.0,55 seaview avenue
82797,10469.0,3007 eastchester road
82798,10024.0,145 west 86 street


In [112]:
unknown_streets = []
for street, zp in street_to_zip.items():  # each street in ems data
    entry = df311[df311['incident_address'].str.contains(street)]  # subset of 311 data containing street name
    if len(entry) > 0:
        street_to_zip[street] = int(entry.iloc[0]["incident_zip"])
    else:
        unknown_streets.append(street)
        print(street)

little clove road
narrows road south
clove rd
hylan blvd
manor rd
todt hill rd
malcolm x blvd
marcus garvey blvd
veterans avenue
richmond rd
amboy rd
4th ave
shore blvd
shore pkwy north
shore pkwy south
shore rd
shore pkwy
ave h
so conduit ave
east 174 street
murdock ave nue
empire avenue
seagirt blvd
east 167 street
east 170 street
east 188 street
east 165 street
east 169 street
east 181 street
east 134 street
cross island pkwy sr south
east 138 street
east 149 street
east 163 street
west 239 street
ave u
east 222 street
east 241 street
east 161 street
st mary's street
morley avenue
little clove rd
86th st
kings hwy
bay ridge pkwy
ave p
bay pkwy
trinity pl
ave z
ave n
ave m
32nd ave
56th rd
linden blvd
e 4th st
east 14 street
west 29 street
west 30 street
west 34 street
west 39 street
ash st
west 135 street
150th st
cross island pkwy sr
hungry harbor rd
39th st
48th st
east 135 street
sheridana avenue
henry hudson pkwy
hutchinson river pkwy
maryland rd
east 177 street
homelawn street


In [None]:
import csv

with open(..., 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(mylist)

In [113]:
unknown_count = 0
for street, zipcode in street_to_zip.items():
    if zipcode == 0:
        unknown_count += 1
unknown = unknown_count / len(street_to_zip)
known = 1 - unknown
print("Known Percentage:", known*100)

Known Percentage: 66.66666666666667


In [88]:
stz_dir = "snd23b/snd23Bcow.txt"
stz_data = []
line_num = 0
for line in f:
    if line_num != 0:  # header
        line = line.replace("\n", '')
        temp_data = [info.strip() for info in line.split(' ') if len(info) != 0]
        print(temp_data)
        assert False
    line_num += 1

['14POLICE', 'OFFICER', 'A', 'ABRUZZO', 'PLACE', 'VS46549005600', 'M30POLICE', 'OFFICER', 'A', 'ABRUZZO', 'PLACE']


AssertionError: 