<a href="https://colab.research.google.com/github/jennahgosciak/nyc_fire_risk/blob/main/00_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# setup
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import requests
import calendar
import geopandas as gpd
import os.path as os
import scipy.stats
import seaborn.palettes
import seaborn.utils
import sys
from census import Census
from us import states
import http.client, urllib.request, urllib.parse, urllib.error, base64
import config
import quickstart

# Project root on the author's machine; the intermediate inputs and the
# cleaned outputs both live under its "data" directory.
# Note: `os` here is the alias this file gives to os.path.
root = r"C:/Users/Jennah/Desktop/Code/machine-learning-final"
data_dir = os.join(root, "data")
inp = os.join(data_dir, "2_intermediate")
out = os.join(data_dir, "3_clean")

In [None]:
from pydrive.auth import GoogleAuth

# Build the Google auth object and run the interactive login flow.
# `gauth` is what the Drive client in the next cell is constructed from.
gauth = GoogleAuth()
gauth.LocalWebserverAuth() # Creates local webserver and auto handles authentication.

In [None]:
from pydrive.drive import GoogleDrive

# Drive client used by every download cell below.
drive = GoogleDrive(gauth)

# List everything in the given parent folder that is not in the trash.
folder_query = {'q': "'1kiYbPtSYavHQKzyMWtXaLhP-JM2toXRm' in parents and trashed=false"}
fileList = drive.ListFile(folder_query).GetList()

for file in fileList:
    print(f"Title: {file['title']}, ID: {file['id']}")
    # Keep the ID of the entry titled "To Share".
    if file['title'] == "To Share":
        fileID = file['id']

## Load outcomes (fire vacate orders)

In [None]:
# Download the fire vacate order extract from Drive into the working directory.
vacate_handle = drive.CreateFile({'id': '1caDnIMowquAoJuZi1tDlyEqAS4HhXiU0'})
vacate_handle.GetContentFile('fire_vacate_bbl.csv')

# Only the join key and the order count are kept.
outcome_cols = ["bbl", "num_vac_orders"]
df = pd.read_csv('fire_vacate_bbl.csv')[outcome_cols]

# Store the key as text and strip any literal ".0"
# (presumably the suffix left when the key was parsed as a float).
bbl_text = df["bbl"].astype(str)
df["bbl"] = bbl_text.str.replace(".0", "", regex=False)

### Load PLUTO

In [None]:
# Download the PLUTO extract from Drive into the working directory.
pluto_handle = drive.CreateFile({'id': '1j2KR6Tpa4CTA0k1omVfE52UYlMBMz17o'})
pluto_handle.GetContentFile('pluto_df.csv')

# "Unnamed: 0" is dropped right after loading
# (presumably a row index saved with the CSV).
pluto = pd.read_csv('pluto_df.csv').drop(columns="Unnamed: 0")

# Store the key as text and strip any literal ".0" so it matches the other tables.
pluto_bbl = pluto["bbl"].astype(str)
pluto["bbl"] = pluto_bbl.str.replace(".0", "", regex=False)

In [None]:
# Preview the first rows of PLUTO to sanity-check the load.
pluto.head()

In [None]:
# Left join from PLUTO so every property is kept, whether or not it has a
# fire vacate order; indicator=True adds a "_merge" column with the match status.
df_pl = pluto.merge(df, how="left", on="bbl", indicator=True)
print(df_pl["_merge"].value_counts())

# Outcome flag: 1 when the lot was found in the vacate-order data, else 0.
df_pl["vacate_ind"] = np.where(df_pl["_merge"] == "both", 1, 0)

# Lots without a match have no vacate orders. The count column loaded from
# the outcome file is "num_vac_orders"; filling any other name would leave
# NaN in the real column and add a stray, mostly-empty one.
df_pl.loc[df_pl["_merge"] == "left_only", "num_vac_orders"] = 0

print("\nTab of outcome var (vacate ind)")
print(df_pl[["vacate_ind", "_merge"]].value_counts())

# The merge status is only needed for the steps above.
df_pl.drop("_merge", axis=1, inplace=True)

### HPD Speculations

In [None]:
# Download the HPD speculation extract from Drive into the working directory.
spec_handle = drive.CreateFile({'id': '1yf86QR6yv2Z9Q6WNINW87RASUsdHPDl6'})
spec_handle.GetContentFile('hpd_spec.csv')

# "Unnamed: 0" is dropped right after loading
# (presumably a row index saved with the CSV).
hpd_spec = pd.read_csv('hpd_spec.csv').drop(columns="Unnamed: 0")

# Store the key as text and strip any literal ".0" so it matches PLUTO's key.
spec_bbl = hpd_spec["bbl"].astype(str)
hpd_spec["bbl"] = spec_bbl.str.replace(".0", "", regex=False)

In [None]:
# Preview the first rows of the HPD speculation table.
hpd_spec.head()

In [None]:
# Left join the HPD speculation records onto the PLUTO/outcome frame,
# keeping the merge status so matched lots can be flagged.
df_hpd_spec = df_pl.merge(hpd_spec, on="bbl", how="left", indicator=True)
merge_status = df_hpd_spec["_merge"]
print(merge_status.value_counts())

# speculation = 1 when the lot has a record in hpd_spec, 0 otherwise.
df_hpd_spec["speculation"] = (merge_status == "both").astype(int)

# The merge status is no longer needed once the flag exists.
df_hpd_spec = df_hpd_spec.drop(columns="_merge")

In [None]:
# Preview the first rows of hpd_spec.
# NOTE(review): this repeats the preview of the raw table shown before the
# merge; the merged df_hpd_spec may have been the intended target — confirm.
hpd_spec.head()

### HPD Violations

In [None]:
# Download the HPD violation counts from Drive into the working directory.
viol_handle = drive.CreateFile({'id': '13-4GIdxui77MIVhk6QmfBUdmTTZS1U3v'})
viol_handle.GetContentFile('hpd_viol.csv')

# Skip the first two rows of the file, then give the count and key columns
# the names used elsewhere in this notebook.
viol_names = {"Unnamed: 1": "viol_count", "BBL": "bbl"}
hpd_viol = pd.read_csv('hpd_viol.csv', skiprows=2).rename(columns=viol_names)

# Discard rows whose key is below 1.
below_one = hpd_viol["bbl"] < 1
hpd_viol = hpd_viol.drop(index=hpd_viol.loc[below_one].index)

# Store the key as text and strip any literal ".0" so it matches PLUTO's key.
viol_bbl = hpd_viol["bbl"].astype(str)
hpd_viol["bbl"] = viol_bbl.str.replace(".0", "", regex=False)

In [None]:
# Preview the first rows of the HPD violation counts.
hpd_viol.head()

In [None]:
# Left join the violation counts onto the frame that already carries the
# speculation flag (df_hpd_spec). Joining onto df_pl instead would silently
# drop that flag from every later step and from the exported file.
df_hpd_viol = df_hpd_spec.merge(hpd_viol, how="left", on="bbl", indicator=True)
print(df_hpd_viol["_merge"].value_counts())

# Lots with no violation record are given a count of zero.
df_hpd_viol.loc[df_hpd_viol["_merge"] == "left_only", "viol_count"] = 0

# The merge status is only needed for the fill above.
df_hpd_viol.drop("_merge", axis=1, inplace=True)

In [None]:
# Violations divided by "unitsres" (presumably PLUTO's residential unit
# count — confirm). A lot with violations but zero units would yield +/-inf,
# which is not a usable value in the analytic file, so those ratios are
# stored as missing instead.
viol_per_unit = df_hpd_viol["viol_count"].div(df_hpd_viol["unitsres"])
df_hpd_viol["viol_count_per_unit"] = viol_per_unit.replace([np.inf, -np.inf], np.nan)

### DOB Building Permits

In [None]:
# Download the DOB permit averages from Drive into the working directory.
permit_handle = drive.CreateFile({'id': '1mH_X0bLa_a1r-hpzwz43HpC7fFcayufp'})
permit_handle.GetContentFile('dev_perm.csv')

# The file is read with a two-row header and its first column as the index.
# Keep only the 'avg_permits' group, then move the index back into a column.
dev = (
    pd.read_csv('dev_perm.csv', header=[0, 1], index_col=[0])['avg_permits']
    .reset_index()
)

# Store the key as text and strip any literal ".0" so it matches PLUTO's key.
permit_bbl = dev["bbl"].astype(str)
dev["bbl"] = permit_bbl.str.replace(".0", "", regex=False)
dev.head()

In [None]:
# Left join the DOB permit averages onto the running frame,
# keeping the merge status so unmatched lots can be filled.
df_dev = df_hpd_viol.merge(dev, on="bbl", how="left", indicator=True)
print(df_dev["_merge"].value_counts())

# Lots with no permit record get zero in both permit columns.
no_permit_record = df_dev["_merge"] == "left_only"
for permit_col in ("AL", "NB"):
    df_dev.loc[no_permit_record, permit_col] = 0

# The merge status is only needed for the fill above.
df_dev = df_dev.drop(columns="_merge")

### LL 84

In [None]:
# Download the LL84 averages from Drive into the working directory.
ll84_handle = drive.CreateFile({'id': '1NOKtbGyL3_TBSMp_eqybVaQc67t33yuo'})
ll84_handle.GetContentFile('ll84_avg.csv')

# "Unnamed: 0" is dropped right after loading
# (presumably a row index saved with the CSV).
ll84 = pd.read_csv('ll84_avg.csv').drop(columns="Unnamed: 0")

# Store the key as text and strip any literal ".0" so it matches PLUTO's key.
ll84_bbl = ll84["bbl"].astype(str)
ll84["bbl"] = ll84_bbl.str.replace(".0", "", regex=False)

# Discard rows whose key has fewer than 10 characters.
too_short = ll84["bbl"].apply(len) < 10
ll84 = ll84.drop(index=ll84.loc[too_short].index)

In [None]:
# Left join the LL84 energy averages onto the running frame,
# keeping the merge status so missingness can be recorded.
df_ll84 = df_dev.merge(ll84, on="bbl", how="left", indicator=True)
print(df_ll84["_merge"].value_counts())

unmatched = df_ll84["_merge"] == "left_only"
matched = df_ll84["_merge"] == "both"

# Missing-data flag: 1 when the lot has no LL84 record, 0 when it has one.
df_ll84.loc[unmatched, "average_energy_usage_mi"] = 1
df_ll84.loc[matched, "average_energy_usage_mi"] = 0

# Unmatched lots get a usage of 0; the flag above tells them apart from real zeros.
df_ll84.loc[unmatched, "average_energy_usage"] = 0

# The merge status is only needed for the fills above.
df_ll84 = df_ll84.drop(columns="_merge")

### Export analytic file

In [None]:
# Write the final lot-level analytic file to the cleaned-data folder.
# Note: `os` here is the alias this file gives to os.path.
analysis_path = os.join(root, "data", "3_clean", "analysis_bbl.csv")
df_ll84.to_csv(analysis_path)

In [None]:
# Preview the first rows of the exported analytic frame.
df_ll84.head()

## Predictors to add?
* Number of times sold
* Sale within 2012?
* Tax Lien Sale
* Unpaid charges