<a href="https://colab.research.google.com/github/jennahgosciak/nyc_fire_risk/blob/main/00_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# setup
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import requests
import calendar
import geopandas as gpd
import os.path as os
import scipy.stats
import seaborn.palettes
import seaborn.utils
import sys
from census import Census
from us import states
import http.client, urllib.request, urllib.parse, urllib.error, base64
import config

# Project directories. NOTE: `os` in this notebook is the `os.path` module
# (imported as `import os.path as os`), so `os.join` is `os.path.join`.
# NOTE(review): `root` is a machine-specific absolute path; change it before running elsewhere.
root= r"C:/Users/Jennah/Desktop/Code/machine-learning-final"
# raw input files read by this notebook
inp= os.join(root, "data", "1_raw")
# intermediate files written by this notebook
out= os.join(root, "data", "2_intermediate")

# Estimating Fires in NYC

## Load data on fire dispatch events
* Only structural fires are included
* Only residential fires are kept (multiple dwellings, private dwellings, and untenanted buildings)


In [2]:
# load data on all fire dispatch events for structural fires
url_fire_ev = 'https://data.cityofnewyork.us/resource/8m42-w767.csv?$limit=1000000&$where=INCIDENT_CLASSIFICATION_GROUP="Structural%20Fires"'
fire_ev = pd.read_csv(url_fire_ev)
n_fire_rows, n_fire_cols = fire_ev.shape
print((n_fire_rows, n_fire_cols))

# the query is capped at 1,000,000 rows; a row count at the cap would mean
# the download was truncated
print(n_fire_rows)
assert n_fire_rows < 1000000

(456376, 29)
456376


In [3]:
# list the distinct incident classifications present in the structural-fire data
fire_ev["incident_classification"].unique()

array(["Multiple Dwelling 'A' - Other fire",
       "Multiple Dwelling 'A' - Compactor fire",
       'Other Commercial Building Fire',
       "Multiple Dwelling 'A' - Food on the stove fire",
       'Private Dwelling Fire', "Multiple Dwelling 'B' Fire",
       'Store Fire', 'Hospital Fire', 'Transit System - Structural',
       'Church Fire', 'Construction or Demolition Building Fire',
       'Other Public Building Fire', 'Factory Fire', 'School Fire',
       'Untenanted Building Fire', 'Theater or TV Studio Fire'],
      dtype=object)

In [4]:
# keep only the incident classifications treated as residential in this analysis
residential_classes = [
    "Multiple Dwelling 'A' - Other fire",
    "Multiple Dwelling 'A' - Compactor fire",
    "Multiple Dwelling 'A' - Food on the stove fire",
    "Private Dwelling Fire",
    "Multiple Dwelling 'B' Fire",
    "Untenanted Building Fire",
]
is_residential = fire_ev["incident_classification"].isin(residential_classes)
# .copy() so later column assignments do not write into a view of the full frame
fire_ev = fire_ev.loc[is_residential, :].copy()

In [5]:
# create month, date, and year variables
# parse the timestamp column once and reuse it for every derived field
incident_ts = pd.to_datetime(fire_ev["incident_datetime"])
fire_ev["incident_date"] = incident_ts.dt.date
fire_ev["incident_month"] = incident_ts.dt.month
fire_ev["incident_day"] = incident_ts.dt.day
fire_ev["incident_year"] = incident_ts.dt.year

# "month-day" label (e.g. "1-1"), built with vectorized string concatenation
# instead of a row-by-row apply
fire_ev["incident_md"] = fire_ev["incident_month"].astype(str) + "-" + fire_ev["incident_day"].astype(str)
print("\nFire events by year")
fire_ev["incident_year"].value_counts().sort_index()


Fire events by year


2005    26182
2006    26270
2007    26364
2008    25226
2009    24960
2010    25234
2011    24038
2012    24220
2013    21452
2014    21822
2015    22501
2016    22599
2017    23218
2018    21874
2019    21256
2020    21102
2021     7223
Name: incident_year, dtype: int64

In [6]:
# preview the filtered events together with the derived date columns
fire_ev.head()

Unnamed: 0,starfire_incident_id,incident_datetime,alarm_box_borough,alarm_box_number,alarm_box_location,incident_borough,zipcode,policeprecinct,citycouncildistrict,communitydistrict,...,incident_response_seconds_qy,incident_travel_tm_seconds_qy,engines_assigned_quantity,ladders_assigned_quantity,other_units_assigned_quantity,incident_date,incident_month,incident_day,incident_year,incident_md
0,500192400000000.0,2005-01-01T00:07:32.000,QUEENS,9237,N/SVC RD H. HARDING EXPY & 99 ST,QUEENS,11368.0,110.0,21.0,404.0,...,338,236,3,2,2,2005-01-01,1,1,2005,1-1
1,500114900000000.0,2005-01-01T00:14:40.000,MANHATTAN,1493,BWAY & W125 ST\M.L.KING JR BLVD,MANHATTAN,10027.0,26.0,7.0,109.0,...,266,217,2,2,1,2005-01-01,1,1,2005,1-1
4,500106500000000.0,2005-01-01T00:24:58.000,BROOKLYN,653,LAFAYETTE & CLASSON AVES,BROOKLYN,11238.0,79.0,35.0,303.0,...,226,189,3,2,1,2005-01-01,1,1,2005,1-1
5,500116500000000.0,2005-01-01T00:27:19.000,MANHATTAN,1649,RIVERSIDE DR & 150 ST,MANHATTAN,10031.0,30.0,7.0,109.0,...,274,200,5,3,5,2005-01-01,1,1,2005,1-1
6,500116500000000.0,2005-01-01T00:27:19.000,MANHATTAN,1649,RIVERSIDE DR & 150 ST,MANHATTAN,10031.0,30.0,7.0,109.0,...,274,200,5,3,5,2005-01-01,1,1,2005,1-1


In [7]:
## save file in output folder
# residential fire events, including the derived date columns
fire_ev.to_csv(os.join(out, "fire_dispatch.csv"))

In [8]:
## load in service alarm boxes
# read from a local GeoJSON file in the raw-data folder
alarm_box= gpd.read_file(os.join(inp, "In-Service Alarm Box Locations.geojson"))

## load census tracts
# 2010 census tracts from the ArcGIS feature service; the query asks for all
# fields and for coordinates in EPSG:4326 (outSR=4326)
tracts= gpd.read_file("https://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/NYC_Census_Tracts_for_2010_US_Census/FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=pgeojson")

In [9]:
# spatial join to get info
# left join: every alarm box is kept and picks up the attributes of the
# census tract it intersects
# NOTE(review): newer geopandas releases spell the `op` keyword as `predicate` —
# confirm against the installed version
alarm_box_t= gpd.sjoin(alarm_box, tracts, how = "left", op = "intersects")
alarm_box_t.head()

Unnamed: 0,location,latitude,zip,borobox,communitydistict,longitude,box_type,citycouncil,borough,geometry,...,BoroCode,BoroName,CT2010,BoroCT2010,CDEligibil,NTACode,NTAName,PUMA,Shape__Area,Shape__Length
0,3 AVE & 65 ST,40.63932033,11220,B2653,BK07,-74.02354939,ERS,38,Brooklyn,POINT (-74.02355 40.63932),...,3,Brooklyn,7000,3007000,E,BK31,Bay Ridge,4013,1609298.0,7191.182362
1,WOODSIDE AVE & 69 ST,40.7426855,11377,Q7917,QN02,-73.89565167,BARS,26,Queens,POINT (-73.89565 40.74269),...,4,Queens,48300,4048300,E,QN50,Elmhurst-Maspeth,4109,2399981.0,7509.356297
2,MYRTLE AVE & PALMETTO ST,40.69953211,11237,B0801,QN05,-73.9110349,ERS,34,Brooklyn,POINT (-73.91103 40.69953),...,3,Brooklyn,43900,3043900,E,BK77,Bushwick North,4002,2152393.0,8162.503636
3,NEW YORK AVE & LEFFERTS AVE,40.66253364,11225,B1046,BK09,-73.94791393,ERS,40,Brooklyn,POINT (-73.94791 40.66253),...,3,Brooklyn,80600,3080600,E,BK60,Prospect Lefferts Gardens-Wingate,4011,1790169.0,6017.644684
4,RIVER & NORTH 3 STS,40.71837562,11211,B0109,BK01,-73.96462115,ERS,33,Brooklyn,POINT (-73.96462 40.71838),...,3,Brooklyn,55500,3055500,I,BK73,North Side-South Side,4001,2838296.0,7349.098694


In [10]:
# alarm box number as a 4-character string, left-padded with zeros (e.g. 653 -> "0653")
fire_ev["alarm_box_number_char"]= fire_ev["alarm_box_number"].astype(str).str.pad(width = 4, fillchar = "0")

In [11]:
# build the borough-prefixed alarm box id ("borobox"), e.g. "Q9237" or "B0653",
# which is the key used to match events to the alarm box locations
borough_prefixes = {
    "QUEENS": "Q",
    "MANHATTAN": "M",
    "BRONX": "X",
    "BROOKLYN": "B",
    "STATEN ISLAND": "R",
}
borough_conditions = [fire_ev["alarm_box_borough"] == name for name in borough_prefixes]
borough_choices = [prefix + fire_ev["alarm_box_number_char"] for prefix in borough_prefixes.values()]
# rows whose borough matches none of the names get np.select's default value
fire_ev["borobox"] = np.select(borough_conditions, borough_choices)
fire_ev

Unnamed: 0,starfire_incident_id,incident_datetime,alarm_box_borough,alarm_box_number,alarm_box_location,incident_borough,zipcode,policeprecinct,citycouncildistrict,communitydistrict,...,engines_assigned_quantity,ladders_assigned_quantity,other_units_assigned_quantity,incident_date,incident_month,incident_day,incident_year,incident_md,alarm_box_number_char,borobox
0,5.001924e+14,2005-01-01T00:07:32.000,QUEENS,9237,N/SVC RD H. HARDING EXPY & 99 ST,QUEENS,11368.0,110.0,21.0,404.0,...,3,2,2,2005-01-01,1,1,2005,1-1,9237,Q9237
1,5.001149e+14,2005-01-01T00:14:40.000,MANHATTAN,1493,BWAY & W125 ST\M.L.KING JR BLVD,MANHATTAN,10027.0,26.0,7.0,109.0,...,2,2,1,2005-01-01,1,1,2005,1-1,1493,M1493
4,5.001065e+14,2005-01-01T00:24:58.000,BROOKLYN,653,LAFAYETTE & CLASSON AVES,BROOKLYN,11238.0,79.0,35.0,303.0,...,3,2,1,2005-01-01,1,1,2005,1-1,0653,B0653
5,5.001165e+14,2005-01-01T00:27:19.000,MANHATTAN,1649,RIVERSIDE DR & 150 ST,MANHATTAN,10031.0,30.0,7.0,109.0,...,5,3,5,2005-01-01,1,1,2005,1-1,1649,M1649
6,5.001165e+14,2005-01-01T00:27:19.000,MANHATTAN,1649,RIVERSIDE DR & 150 ST,MANHATTAN,10031.0,30.0,7.0,109.0,...,5,3,5,2005-01-01,1,1,2005,1-1,1649,M1649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456371,2.112560e+15,2021-05-05T21:28:00.000,QUEENS,6117,CORBETT RD & 221 ST,QUEENS,11361.0,111.0,19.0,411.0,...,1,1,0,2021-05-05,5,5,2021,5-5,6117,Q6117
456372,2.112510e+15,2021-05-05T21:31:00.000,MANHATTAN,1442,LEXINGTON AVE & 121 ST,MANHATTAN,10035.0,25.0,9.0,111.0,...,4,2,1,2021-05-05,5,5,2021,5-5,1442,M1442
456373,2.112510e+15,2021-05-05T22:23:00.000,MANHATTAN,1325,AMSTERDAM AVE & 109 ST,MANHATTAN,10025.0,24.0,7.0,107.0,...,3,2,1,2021-05-05,5,5,2021,5-5,1325,M1325
456374,2.112530e+15,2021-05-05T23:16:00.000,BRONX,3170,BRYANT AVE & BRONX PARK SO.,BRONX,10460.0,48.0,15.0,206.0,...,3,2,1,2021-05-05,5,5,2021,5-5,3170,X3170


In [12]:
# attach census-tract attributes to each fire event through the alarm box id
# outer merge: events with no matching alarm box AND alarm boxes with no
# events are both kept, so some rows carry no fire event
fire_ev_tracts= fire_ev.merge(alarm_box_t, on = "borobox", how = "outer")
fire_ev_tracts.to_csv(os.join(out, "fire_dispatch_tracts.csv"))

In [13]:
# tract-level averages of the dispatch measures
# numeric_only=True states explicitly that a non-numeric column in the list
# (highest_alarm_level is absent from the averaged output, so presumably text)
# is left out rather than averaged
fire_tract_avgs= fire_ev_tracts.groupby("BoroCT2010")[["engines_assigned_quantity", "ladders_assigned_quantity", "highest_alarm_level",\
                                 "dispatch_response_seconds_qy"]].mean(numeric_only = True).reset_index()
# number of fire events per tract: count non-missing incident ids, so that
# alarm boxes kept by the outer merge without any fire event are not counted
# as events (such tracts get 0)
# NOTE(review): assumes starfire_incident_id is populated for every fire event
fire_tract_counts= fire_ev_tracts.groupby("BoroCT2010")["starfire_incident_id"].count()\
                                 .rename("num_fire_ev").reset_index()
print(fire_tract_avgs)
print(fire_tract_counts)

# one row per tract with the averages and the event count side by side
fire_tract_sum= fire_tract_avgs.merge(fire_tract_counts, on = "BoroCT2010", how = "outer")
fire_tract_sum.head()

     BoroCT2010  engines_assigned_quantity  ladders_assigned_quantity  \
0       1000201                   3.218978                   2.094891   
1       1000202                   3.271817                   2.160229   
2       1000600                   3.083815                   2.089595   
3       1000700                   2.450000                   1.787500   
4       1000800                   3.222222                   2.093567   
...         ...                        ...                        ...   
2039    5030301                        NaN                        NaN   
2040    5030302                        NaN                        NaN   
2041    5031901                        NaN                        NaN   
2042    5031902                        NaN                        NaN   
2043    5032300                        NaN                        NaN   

      dispatch_response_seconds_qy  
0                        34.591241  
1                        39.273247  
2           

Unnamed: 0,BoroCT2010,engines_assigned_quantity,ladders_assigned_quantity,dispatch_response_seconds_qy,num_fire_ev
0,1000201,3.218978,2.094891,34.591241,139
1,1000202,3.271817,2.160229,39.273247,699
2,1000600,3.083815,2.089595,38.008671,346
3,1000700,2.45,1.7875,43.13125,163
4,1000800,3.222222,2.093567,39.330409,344


In [14]:
# save the tract-level fire summary (averages and event counts)
fire_tract_sum.to_csv(os.join(out, "fire_tract_sum.csv"))

# Load data on vacate orders from HPD, due to fire
## From January 1st, 2017 through the present

* Order to repair/vacate orders: https://data.cityofnewyork.us/resource/tb8q-a3ar
* Fire Department building vacate list: https://data.cityofnewyork.us/resource/n5xc-7jfa
* Future work: combine old and new vacate orders? First dataset only has 93 records

In [15]:
# Fire Department building vacate list
url_vac_old = 'https://data.cityofnewyork.us/resource/n5xc-7jfa.csv?$limit=1000000'
vac_old = pd.read_csv(url_vac_old)
print("Size of data:", vac_old.shape)
# format date variable; the unparsed value is preserved in vac_date_orig
vac_old["vac_date_orig"] = vac_old["vac_date"].copy()
vac_date_parsed = pd.to_datetime(vac_old["vac_date"])
vac_old["vac_date"] = vac_date_parsed.dt.date
vac_old["vac_year"] = vac_date_parsed.dt.year
# sort data: earliest years first
vac_old.sort_values("vac_year").head()

HTTPError: HTTP Error 500: Server Error

In [None]:
# load ALL vacate orders (no filter on the vacate reason)
url_vac_all = 'https://data.cityofnewyork.us/resource/tb8q-a3ar.csv?$limit=1000000'
vac_all = pd.read_csv(url_vac_all)
n_vac_all = vac_all.shape[0]
print(vac_all.shape)

# a row count at the 1,000,000 query limit would mean the download was truncated
print(n_vac_all)
assert n_vac_all < 1000000
vac_all.to_csv(os.join(out, "all_vacate.csv"))

In [None]:
# load vacate orders whose primary vacate reason is fire damage
url_vac = 'https://data.cityofnewyork.us/resource/tb8q-a3ar.csv?$limit=1000000&$where=primary_vacate_reason="Fire%20Damage"'
vac = pd.read_csv(url_vac)
n_vac = vac.shape[0]
print(vac.shape)

# a row count at the 1,000,000 query limit would mean the download was truncated
print(n_vac)
assert n_vac < 1000000

In [None]:
# create date, month, and year variables
# parse the effective date once and reuse it for every derived field
vacate_ts = pd.to_datetime(vac["vacate_effective_date"])
vac["vacate_effective_date2"] = vacate_ts.dt.date
vac["vacate_effective_month"] = vacate_ts.dt.month
vac["vacate_effective_day"] = vacate_ts.dt.day
vac["vacate_effective_year"] = vacate_ts.dt.year

# "month-day" label, built with vectorized string concatenation instead of a
# row-by-row apply
vac["vacate_effective_md"] = vac["vacate_effective_month"].astype(str) + "-" + vac["vacate_effective_day"].astype(str)

In [None]:
# number of fire-damage vacate orders per effective year
vac["vacate_effective_year"].value_counts().sort_index()

In [None]:
# number of fire-damage vacate orders per lot (bbl)
# rename_axis + reset_index(name=...) yields the columns "bbl" and
# "num_vac_orders" regardless of how value_counts labels its result
# (the column name in older pandas, "count" in pandas >= 2.0)
vac_sub= vac["bbl"].value_counts().rename_axis("bbl").reset_index(name = "num_vac_orders")
vac_sub

In [None]:
## save file in output folder
# order-level records
vac.to_csv(os.join(out, "fire_vacate.csv"))
# per-lot (bbl) order counts
vac_sub.to_csv(os.join(out, "fire_vacate_bbl.csv"))

## Load PLUTO

In [None]:
## load pluto
# MapPLUTO lot shapes, restricted to land use codes '01'-'04'; only the lot id
# (renamed to lower-case "bbl") and the geometry are kept
pluto = gpd.read_file(os.join(inp, "nyc_mappluto_21v4_shp/MapPLUTO.shp"))
keep_landuse = pluto["LandUse"].isin(['01','02','03','04'])
pluto = pluto.loc[keep_landuse, ["BBL", "geometry"]].rename(columns = {"BBL":"bbl"})

In [None]:
# number of lots (rows) kept after the land use filter
pluto.shape

In [None]:
# attribute columns to read from the PLUTO csv (the shapefile above only
# contributes the lot id and geometry)
usecols = ["borough", "bbl", "cd", "ct2010", "zipcode", "address", "bldgclass", "landuse", "ownertype", "ownername",
           "lotarea", "bldgarea", "numbldgs", "numfloors", "unitsres", "unitstotal", "assessland", "assesstot",\
           "exempttot", "yearbuilt", "yearalter1", "yearalter2", "sanborn"]

pluto_df= pd.read_csv(os.join(inp, "nyc_pluto_21v4_csv/pluto_21v4.csv"),
                      usecols = usecols)
# same land use restriction as the shapefile; here the codes are compared as numbers
pluto_df= pluto_df.loc[pluto_df["landuse"].isin([1,2,3,4]), :]
print(pluto_df.shape)
pluto_df

### Spatial join redlining
* Add to Pluto data via spatial join

In [None]:
## red lining maps
# one GeoJSON file per borough (Manhattan, Brooklyn, Bronx, Queens, Staten Island)
redlining_files = (
    "NYManhattan1937.geojson",
    "NYBrooklyn1938.geojson",
    "NYBronx1938.geojson",
    "NYQueens1938.geojson",
    "NYStatenIsland1940.geojson",
)
mn_rl, bk_rl, bx_rl, qn_rl, si_rl = [gpd.read_file(os.join(inp, fname)) for fname in redlining_files]

# stack the five maps into one layer and reproject it to EPSG:2263
borough_maps = [mn_rl, bk_rl, bx_rl, qn_rl, si_rl]
all_rl = gpd.GeoDataFrame(pd.concat(borough_maps, ignore_index = True)).to_crs(2263)

In [None]:
# represent each lot by its centroid so the spatial join tests a single point per lot
pluto_cen= pluto.copy()
pluto_cen["geometry"]= pluto_cen["geometry"].centroid

# left join: every lot is kept; lots whose centroid falls in no redlining
# polygon get missing values for the joined columns
# NOTE(review): a centroid inside overlapping polygons would yield more than
# one row for that bbl — verify uniqueness before merging on bbl
# NOTE(review): assumes pluto is already in the same CRS as all_rl (EPSG:2263) — confirm
pluto_rl= gpd.sjoin(pluto_cen, all_rl, how = "left", op = "intersects")
pluto_rl.head()

In [None]:
print(pluto_rl["holc_grade"].unique())

# 0/1 indicators for groups of HOLC grades; lots without a grade get 0 in
# every indicator
holc_grade = pluto_rl["holc_grade"]

# grade A or B
pluto_rl["holc_AB"] = np.where(holc_grade.isin(["A", "B"]), 1, 0)

# grade C or D
pluto_rl["holc_CD"] = np.where(holc_grade.isin(["C", "D"]), 1, 0)

# grade D only
pluto_rl["holc_D"] = np.where(holc_grade.isin(["D"]), 1, 0)

In [None]:
# visual check of the spatial join: redlining polygons, with the first 100
# lot centroids that matched no polygon (missing index_right) drawn in red
fig, ax = plt.subplots (figsize = (15,15))
all_rl.plot(ax=ax)
pluto_rl.loc[pluto_rl["index_right"].isna()].head(100).plot(ax=ax, color='red')

In [None]:
## merge to pluto
# left merge: every PLUTO attribute row is kept and gains the three HOLC indicators
pluto_df_rl= pluto_df.merge(pluto_rl[["bbl", "holc_AB", "holc_CD", "holc_D"]], on = "bbl", how = "left")

In [None]:
# save output to intermediate file
# lot-level PLUTO attributes with the HOLC indicators attached
pluto_df_rl.to_csv(os.join(out, "pluto_df.csv"))

In [None]:
# one-hot encode land use and building class
landuse_bldg_dummies = pd.get_dummies(pluto_df_rl[["landuse", "bldgclass"]], columns = ["landuse", "bldgclass"])

# names of the indicator columns created above; a missing value gets no
# indicator column, so it must not produce a name either
landusecols = ["landuse_" + str(x) for x in pluto_df_rl["landuse"].dropna().unique()]
bldgclasscols = ["bldgclass_" + str(x) for x in pluto_df_rl["bldgclass"].dropna().unique()]

# append the indicators row-for-row (same index) instead of merging on "bbl":
# a self-merge on a key that is not unique would multiply rows
pluto_df_rl= pd.concat([pluto_df_rl, landuse_bldg_dummies], axis = 1)

In [None]:
## create same file at census tract level for dispatch data
meancols = ["lotarea", "bldgarea", "numbldgs", "numfloors", "unitsres", "unitstotal", "assessland", "assesstot",\
           "exempttot", "yearbuilt", "holc_AB", "holc_CD", "holc_D"] + landusecols + bldgclasscols
tract_keys = ["borough", "ct2010"]
# tract-level means of the numeric and indicator columns
tract_means = pluto_df_rl.groupby(tract_keys)[meancols].mean()
# most common building class and land use per tract
# NOTE(review): pd.Series.mode can return several values when there is a tie
tract_modes = pluto_df_rl.groupby(tract_keys)[["bldgclass", "landuse"]].agg(pd.Series.mode)
# join on the full (borough, ct2010) index: the groups are keyed by both
# fields, so matching on ct2010 alone could pair rows from different boroughs
# and would drop the borough level from the result
pluto_df_ct = tract_means.join(tract_modes, how = "outer")
pluto_df_ct

In [None]:
# save the tract-level PLUTO summary
pluto_df_ct.to_csv(os.join(out, "pluto_ct.csv"))

## Load data on electricity and water usage (Local Law 84)

In [None]:
# store urls (one resource per reporting year)
url_2020= "https://data.cityofnewyork.us/resource/usc3-8zwd.csv?$limit=100000"
url_2019= "https://data.cityofnewyork.us/resource/wcm8-aq5w.csv?$limit=100000"
url_2018= "https://data.cityofnewyork.us/resource/4tys-3tzj.csv?$limit=100000"
url_2017= "https://data.cityofnewyork.us/resource/4t62-jm4m.csv?$limit=100000"
url_2016= "https://data.cityofnewyork.us/resource/utpj-74fz.csv?$limit=100000"
# NOTE(review): url_2015 uses the same resource id as url_2017 — looks like a
# copy-paste slip; confirm the 2015 id before loading that year
url_2015= "https://data.cityofnewyork.us/resource/4t62-jm4m.csv?$limit=100000"
url_2014= "https://data.cityofnewyork.us/resource/nbun-wekj.csv?$limit=100000"
url_2013= "https://data.cityofnewyork.us/resource/yr5p-wjer.csv?$limit=100000"
url_2012= "https://data.cityofnewyork.us/resource/r6ub-zhff.csv?$limit=100000"

# load and append data from each year
urls= [url_2012, url_2013, url_2014, url_2015, url_2016, url_2017, url_2018, url_2019, url_2020]
# columns to read per year, in the same order as `urls`; the lot-id and
# energy columns are named differently in the first and last years
bbl_names = ["bbl"] + ["nyc_borough_block_and_lot"] * 7 + ["nyc_borough_block_and_lot_bbl"]
eui_names = ["weather_normalized_site_eui"] * 8 + ["weather_normalized_site_eui_kbtu_ft"]
cols= [["property_id", "year_ending", bbl_name, eui_name] for bbl_name, eui_name in zip(bbl_names, eui_names)]
# read the data files for positions 5-8 of `urls` (2017 through 2020) into a list
dfs= [pd.read_csv(year_url, usecols = year_cols) for year_url, year_cols in zip(urls[5:9], cols[5:9])]
dfs

In [None]:
# make names of columns uniform
# Target names are the ones the later cells read: "nyc_borough_block_and_lot"
# (bbl text clean-up) and "weather_normalized_site_eui" (renamed to energy_usage).
uniform_names = ["property_id", "year_ending", "nyc_borough_block_and_lot", "weather_normalized_site_eui"]
# dfs holds the frames for the LAST len(dfs) entries of `cols`, so dfs[i] was
# read with the column list cols[first_loaded + i]
first_loaded = len(cols) - len(dfs)
# iterate over every loaded frame (indices 0..len(dfs)-1) and rename by column
# NAME rather than by position, because read_csv(usecols=...) returns columns
# in file order, not in the order of the usecols list
for i in range(len(dfs)):
    dfs[i] = dfs[i].rename(columns = dict(zip(cols[first_loaded + i], uniform_names)))

In [None]:
# concatenate along rows (i.e. append data from each year)
# ignore_index gives the stacked frame a fresh 0..n-1 index
df_ew= pd.concat(dfs, axis = 0, ignore_index = True)
df_ew

In [None]:
# text formatting of bbl
# Goal: turn the free-text lot field into ";"-separated candidate lot ids.
raw_bbl = df_ew.loc[:, "nyc_borough_block_and_lot"]
# drop a leading "letter + 3 digits + dash" prefix
bbl_clean = raw_bbl.str.replace("[A-z]{1}[0-9]{3}-", "", regex = True)
# remove dashes inside an id; spaces become separators
bbl_clean = bbl_clean.str.replace("-", "").str.replace(" ", ";")
# a slash directly after 10 digits separates two ids
bbl_clean = bbl_clean.str.replace("(?<=[0-9]{10})/", ";", regex = True)
# remaining punctuation and filler words: either removed or turned into separators
for old_text, new_text in [("/", ""), (",", ";"), (":", ";"), ("and", ";"), ("&", ";"),
                           ("NotAvailable", ""), ("multiple", "")]:
    bbl_clean = bbl_clean.str.replace(old_text, new_text)
df_ew.loc[:, "bbl"] = bbl_clean
# compare the cleaned value with the original text
df_ew.loc[:, ["bbl", "nyc_borough_block_and_lot"]]

In [None]:
# need to split bbl into several columns
# max_num is the largest number of ";" separators in any value, so a value can
# hold up to max_num + 1 ids
max_num= int(df_ew["bbl"].str.count(";").max())
print(max_num)
# one column per position: bbl0 .. bbl{max_num}
df_ew[["bbl" + str(i) for i in range(0, (max_num + 1))]]= df_ew["bbl"].str.split(pat= ";", n=-1, expand = True)
df_ew[["bbl" + str(i) for i in range(0, (max_num + 1))]].head()

In [None]:
# reshape and reformat bbl column
# pivot data long using melt
# id columns are everything except the combined "bbl" column and ALL of the
# split columns bbl0 .. bbl{max_num} (max_num + 1 of them), so that every
# split column is melted into the long "bbl" column
split_cols = ["bbl" + str(i) for i in range(0, max_num + 1)]
cols = [x for x in df_ew.columns if x not in split_cols + ["bbl"]]

# melt pivots data long: one row per (record, split position)
df_ewlong= pd.melt(df_ew.drop("bbl", axis = 1), id_vars=cols,var_name='bbl_num', value_name='bbl')
# recode comparison: show cleaned ids that are still longer than 10 characters
print(df_ewlong[["bbl", "nyc_borough_block_and_lot"]].dropna().loc[(df_ewlong["bbl"].dropna()).map(len) > 10, :])

# keep rows that carry an id, plus the first-position row of records with no id
# (the comparison is parenthesized because `&` binds tighter than `==`)
df_ewlong= df_ewlong[(df_ewlong["bbl"].notna()) | (df_ewlong["bbl"].isna() & \
                                                   (df_ewlong["bbl_num"] == "bbl0"))].drop_duplicates()

In [None]:
df_ewlong= df_ewlong.rename(columns = {"weather_normalized_site_eui":"energy_usage"})

# drop these values, not useful: ids that are not exactly 10 characters long
# (this includes missing ids) or that are known placeholders
bad_bbl = (df_ewlong["bbl"].str.len() != 10) | \
          df_ewlong["bbl"].isin(["Code9Code9", "Not Available", "XXXXXXXXXX"])
df_ewlong= df_ewlong.drop(index = df_ewlong.index[bad_bbl])

# drop these values, not useful: placeholder text in the energy column
bad_energy = df_ewlong["energy_usage"].isin(["Code9Code9", "Not Available"])
df_ewlong= df_ewlong.drop(index = df_ewlong.index[bad_energy])

In [None]:
# reporting year taken from the year_ending date
df_ewlong["year"]= pd.to_datetime(df_ewlong["year_ending"]).dt.year
# the placeholder strings were dropped above, so the column can be cast to float
df_ewlong["energy_usage"]= df_ewlong["energy_usage"].astype(float)

In [None]:
# one row per (bbl, year): first drop exact repeats of the same reading, then
# average whatever distinct readings remain
is_repeat = df_ewlong[["bbl", "year", "energy_usage"]].duplicated()
df_ewlong = df_ewlong.loc[~is_repeat, :]
yearly_usage = df_ewlong.groupby(["bbl", "year"])["energy_usage"].mean()
df_ewlong = yearly_usage.reset_index()
df_ewlong

In [None]:
# average energy usage per lot across the years available for that lot
# (years side by side, then a row mean that skips missing years)
# Series.rename takes just the new name here; `axis = 1` is not a valid axis
# for a Series and is rejected by pandas versions that validate it
df_ew_avg= df_ewlong.pivot(index = "bbl", columns = "year", values = "energy_usage").mean(axis = 1).rename("average_energy_usage").reset_index()

In [None]:
# preview, sanity-check, and save the per-lot averages
# (no merge happens here; the file is written for later use)
print(df_ew_avg[["bbl","average_energy_usage"]].head())
# every lot id must be exactly 10 characters long
assert (df_ew_avg["bbl"].str.len() == 10).all()

df_ew_avg.to_csv(os.join(out, "ll84_avgenergy.csv"))

## Development Patterns

In [None]:
# permits filtered server-side: job type starting with 'A' or 'N', residential,
# issued, initial filing, filed on or after 2012-01-01, permit type 'AL' or 'NB'
url= "https://data.cityofnewyork.us/resource/ipu4-2q9a.csv?$LIMIT=1000000&$WHERE=(starts_with(job_type,'A')%20OR%20starts_with(job_type,'N'))%20AND%20residential='YES'%20AND%20permit_status='ISSUED'%20AND%20filing_date>='2012-01-01'%20AND%20filing_status='INITIAL'%20AND%20permit_type%20IN%20('AL','NB')"
# only the columns needed to build the lot id and count permits per year
dev_perm= pd.read_csv(url, usecols = ["borough", "block", "lot", "job__", "permit_type", "filing_date"])
dev_perm

In [None]:
# (bare expression: not displayed, because it is not the last line of the cell)
dev_perm.columns
# filing year of each permit
dev_perm["year"]= pd.to_datetime(dev_perm["filing_date"]).dt.year

In [None]:
# one-character borough code used as the first character of the lot id (bbl)
# default is given explicitly as the string "0": np.select's implicit default
# is the integer 0, which newer NumPy versions refuse to combine with string
# choices; "0" is also what the older string promotion produced for rows
# matching no borough
dev_perm["borough_code"]= np.select( [dev_perm["borough"] == "QUEENS",\
                               dev_perm["borough"] == "MANHATTAN",\
                               dev_perm["borough"] == "BRONX",\
                               dev_perm["borough"] == "BROOKLYN",\
                               dev_perm["borough"] == "STATEN ISLAND"], ["4", "1", "2", "3", "5"],
                               default = "0")

In [None]:
# 10-character lot id: borough code (1) + block padded to 5 + lot padded to 4
# Strip only a TRAILING ".0" left by float formatting. The pattern is anchored
# and escaped because an unanchored ".0" is treated as a regular expression by
# older pandas, where "." matches any character and would delete digits
# (e.g. the "10" in "1005").
# NOTE(review): a missing block or lot becomes the text "nan" before padding — verify upstream
block_str = dev_perm["block"].astype(str).str.replace(r"\.0$", "", regex = True)\
                             .str.pad(width = 5, side = "left", fillchar = "0")
lot_str = dev_perm["lot"].astype(str).str.replace(r"\.0$", "", regex = True)\
                         .str.pad(width = 4, side = "left", fillchar = "0")
dev_perm["bbl"]= dev_perm["borough_code"] + block_str + lot_str
dev_perm["bbl"]

In [None]:
# number of permits per (lot, year, permit type)
dev_perm_wide= dev_perm[["bbl", "year", "permit_type"]].value_counts().rename("count").reset_index()
# years side by side, one row per (lot, permit type); combinations with no permits become 0
dev_perm_wide= dev_perm_wide.pivot(index = ["bbl", "permit_type"], columns = "year", values = "count").fillna(0)

# mean yearly count across every year column present at this point
dev_perm_wide["avg_permits"]=dev_perm_wide.mean(axis = 1)
# one row per lot, with a column for each (measure, permit type) pair
# NOTE(review): the explicit 2012-2020 list requires each of those year columns
# to exist, and leaves any later year out of the output — confirm this is intended
dev_perm_wide= dev_perm_wide.reset_index().pivot(index = "bbl", columns = "permit_type", values = ["avg_permits"] + list(range(2012, 2021))).fillna(0)
dev_perm_wide

In [None]:
# save the per-lot permit counts and averages
dev_perm_wide.to_csv(os.join(out, "dev_permits.csv"))

## Euclidean distance to a fire station

In [None]:
## load firehouses
firest = pd.read_csv("https://data.cityofnewyork.us/resource/hc8x-tcnd.csv?$limit=100000")
# point geometry from the longitude/latitude columns, declared as EPSG:4326
# and then reprojected to EPSG:2263
firest_points = gpd.points_from_xy(firest["longitude"], firest["latitude"])
firest_lonlat = gpd.GeoDataFrame(firest, geometry = firest_points).set_crs(4326)
firest_geo = firest_lonlat.to_crs(2263)

In [None]:
# display the firehouse points
firest_geo

## Speculation Watchlist
* List of sales of select rent-regulated multiple dwellings where there may be potential for speculation. This list is promulgated pursuant to Local Law 7 of 2018 and 28 RCNY Chapter 52. This list is a subset of data provided in the LL7-2018 Qualified Transactions file.

In [None]:
# NOTE: `url` is reused here; it no longer refers to the permits query
url= "https://data.cityofnewyork.us/resource/adax-9mit.csv?$limit=100000"
# only the lot id is read; every listed lot gets the flag speculation = 1
spec = pd.read_csv(url, usecols = ["bbl"])
spec["speculation"]= 1
spec.to_csv(os.join(out, "speculation_hpd.csv"))