In [29]:
import pandas as pd
import geopandas as gpd
import re
import os
import numpy as np
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process

## Exploring feasibility of layering parks properties dataset into db-checkbook

### setup

In [5]:
csdb_df = pd.read_csv('../.output/historical_spend.csv')
csdb_gdf = gpd.GeoDataFrame(csdb_df)
parks_df = pd.read_csv('Parks_Properties.csv')
parks_gdf = gpd.GeoDataFrame(parks_df)

### exploratory on parks gdf

In [13]:
print(parks_gdf.shape)
print(parks_gdf.columns)
parks_gdf.head(5)

(2044, 35)
Index(['ACQUISITIONDATE', 'ACRES', 'ADDRESS', 'BOROUGH', 'CLASS',
       'COMMUNITYBOARD', 'COUNCILDISTRICT', 'DEPARTMENT', 'EAPPLY', 'GISOBJID',
       'GISPROPNUM', 'GlobalID', 'JURISDICTION', 'LOCATION', 'MAPPED',
       'NAME311', 'NYS_ASSEMBLY', 'NYS_SENATE', 'OBJECTID', 'OMPPROPID',
       'PARENTID', 'PERMIT', 'PERMITDISTRICT', 'PERMITPARENT', 'PIP_RATABLE',
       'PRECINCT', 'RETIRED', 'SIGNNAME', 'SUBCATEGORY', 'TYPECATEGORY', 'URL',
       'US_CONGRESS', 'WATERFRONT', 'ZIPCODE', 'multipolygon'],
      dtype='object')


Unnamed: 0,ACQUISITIONDATE,ACRES,ADDRESS,BOROUGH,CLASS,COMMUNITYBOARD,COUNCILDISTRICT,DEPARTMENT,EAPPLY,GISOBJID,...,PRECINCT,RETIRED,SIGNNAME,SUBCATEGORY,TYPECATEGORY,URL,US_CONGRESS,WATERFRONT,ZIPCODE,multipolygon
0,2002-11-20 00:00:00.0000000,0.34,,X,PARK,202,17,X-02,Bryant Hill Garden,100004968.0,...,41,False,Bryant Hill Garden,Greenthumb,Garden,http://www.nycgovparks.org/parks/X315/,14.0,False,10474,MULTIPOLYGON (((-73.88736572860253 40.81970462...
1,1935-01-09 00:00:00.0000000,0.717,1300 SPOFFORD AVENUE,X,PARK,202,17,X-02,Hunts PointPlayground,100005079.0,...,41,False,Hunts Point Playground,JOP,Jointly Operated Playground,http://www.nycgovparks.org/parks/X113/,14.0,False,10474,MULTIPOLYGON (((-73.88619730395624 40.81384928...
2,2009-11-20 00:00:00.0000000,9.375,,Q,PARK,414,32,Q-14,Sunset Cove Park,100003992.0,...,100,False,Sunset Cove Park,,Undeveloped,http://www.nycgovparks.org/parks/Q498/,5.0,True,11693,MULTIPOLYGON (((-73.82218300936414 40.59892072...
3,1937-06-01 00:00:00.0000000,249.389,,Q,PARK,401403404406,1921222429,Q-03,Grand Central Parkway Ext,100000441.0,...,114,False,Grand Central Parkway Extension,EXWY,Parkway,http://www.nycgovparks.org/parks/Q084A/,614.0,True,11103113671136811369113701137111375,MULTIPOLYGON (((-73.8587476480729 40.767414466...
4,1949-03-24 00:00:00.0000000,2.035,61-25 LITTLE NECK PARKWAY,Q,PARK,411,23,Q-11,Challenge Playground,100000009.0,...,111,False,Challenge Playground,JOP,Jointly Operated Playground,http://www.nycgovparks.org/parks/Q346/,3.0,False,11362,MULTIPOLYGON (((-73.72738293199147 40.75605209...


In [None]:
parks_name_cols = ['EAPPLY', 'ADDRESS', 'NAME311', 'SIGNNAME']

In [16]:
gdf_nogeom = csdb_gdf[csdb_gdf['has_geometry']==False]
gdf = gdf_nogeom[gdf_nogeom['final_category']=='Fixed Asset'] # focus on unmapped fixed assets
gdf.shape

(8198, 23)

### exploring what unmapped capital projects mention parks

In [19]:
gdf.columns

Index(['Unnamed: 0', 'fms_id', 'check_amount', 'contract_purpose',
       'budget_code', 'agency', 'bc_category', 'cp_category', 'cpdb_category',
       'ccpversion', 'maprojid', 'magency', 'magencyacr', 'projectid',
       'descriptio', 'geomsource', 'dataname', 'datasource', 'datadate',
       'geometry', 'cartodb_id', 'has_geometry', 'final_category'],
      dtype='object')

In [32]:
parksearch = ['PARK', 'PLAYGROUND', 'GARDEN']

temp = gdf.dropna(subset=['budget_code', 'contract_purpose'])
bc_park = temp[temp['budget_code'].str.contains('|'.join(parksearch))]['budget_code']
cp_park = temp[temp['contract_purpose'].str.contains('|'.join(parksearch))]['contract_purpose']
print(bc_park.shape)
print(cp_park.shape)

(591,)
(230,)


In [33]:
sample_parks_names = parks_gdf['EAPPLY'].sample(n=100)

# source: this medium article https://towardsdatascience.com/fuzzywuzzy-basica-and-merging-datasets-on-names-with-different-transcriptions-e2bb6e179fbf
keys = {}
for bc in bc_park: #names in smaller dataset to compare and match
#get closest match of `name` compared to larger data `member_names`
    keys[bc] = ((process.extract(bc, sample_parks_names, limit=2)))
#you can limit to 1 with extractOne to take less time but I wanted to check if names were returning as correct.

In [35]:
print(keys.items())

dict_items([('BPFE (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPIT (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPIT (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPIT (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPIT (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPIT (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPIT (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPFE (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPFE (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPFE (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPIT (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPIT (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE);BPFE (BPL: BROWER PARK LIBRARY @ BKLYN CHILDRE)', [('Bunker Ponds Park', 57, 1504), ('Estella Diggs Park', 57, 1922)]), ('PKS3 (PARK SLOPE BRANCH LIBRARY, BKLYN: PHASE);PKS3 (PARK SLOPE BRANCH LIBRARY, BKLYN: PHASE)', [('Bunker Ponds Park', 86, 1504), ('Estella Diggs Park', 86, 1922)]), ('LQNF (NORTH FOREST PARK BRANCH LIBRARY, 98-27);LQNF (NORTH FOREST PARK BRANCH LIBRARY, 98-27)', [('Bunker Ponds Park', 8

In [None]:
mask = gdf['budget_code'].isin()

591 capital projects categorized as Fixed Asset and not yet assigned a geometry have budget codes containing words relating to parks
230 capital project with the same criteria have contract purposes containing words relating to parks
There is some overlap between the two but this is promising regardless, given that we have only mapped 3880 capital projects so far

*Idea:* filter unmapped capital projects to be as small a subset as possible, then apply fuzzy string mtching to get park name or address out of budget code / contract purpose