# Step 1: Initial data work and extraction of geodata geometries
###### Geographic Data Cleaning and Assembly for Google Maps using Jupyter Gmaps;
------
Before starting the data cleaning process, we obtained a list of zip codes within the DC Metro area through a web utility attempting to select as many as possible zip codes that exist within the bounds of the D.C. Beltway.

Utilizing data published on census.gov, we start here with some simple organization and slimming down of the bulk of the dataset. Some of the datasets the team had found included data for all US zipcodes (the GeoJSON data comes to mind specifically).

Ultimately, before the conclusion of the project, much of this work would be implemented in more fully developed code in other notebooks. Given more time, this particular notebook would be revised if not outright retired. However, some specific output documents are still in use by other project components.


In [1]:
import pandas as pd
import json
import csv
from keys import gapikey
from matplotlib.cm import winter
from matplotlib.colors import to_hex
from area import area
import ast

with open('zcta/zcta.json') as a:
    geoj = json.load(a)
with open('resources/zip_targets.csv', newline='\n', encoding='utf-8-sig') as b:
    ziptgt = pd.read_csv(b)
with open('resources/dmv_pop.csv', newline='\n', encoding='utf-8-sig') as c:
    dc_pop = pd.read_csv(c)
# with open('resources/places_geo.csv', newline='\n', encoding='utf-8') as d:
#     places_geo = pd.read_csv(d)
with open('resources/median_income.csv', newline='\n', encoding='utf-8-sig') as f:
    dmv_income = pd.read_csv(f)

In [2]:
dmv_income.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_EST_VC02,HC01_MOE_VC02,HC02_EST_VC02,HC02_MOE_VC02,HC03_EST_VC02,HC03_MOE_VC02,HC01_EST_VC04,...,HC02_EST_VC52,HC02_MOE_VC52,HC03_EST_VC52,HC03_MOE_VC52,HC01_EST_VC53,HC01_MOE_VC53,HC02_EST_VC53,HC02_MOE_VC53,HC03_EST_VC53,HC03_MOE_VC53
0,Id,Id2,Geography,Number; Estimate; Households,Number; Margin of Error; Households,Percent Distribution; Estimate; Households,Percent Distribution; Margin of Error; Households,Median income (dollars); Estimate; Households,Median income (dollars); Margin of Error; Hous...,Number; Estimate; Households - One race-- - White,...,Percent Distribution; Estimate; NONFAMILY HOUS...,Percent Distribution; Margin of Error; NONFAMI...,Median income (dollars); Estimate; NONFAMILY H...,Median income (dollars); Margin of Error; NONF...,Number; Estimate; NONFAMILY HOUSEHOLDS - Nonfa...,Number; Margin of Error; NONFAMILY HOUSEHOLDS ...,Percent Distribution; Estimate; NONFAMILY HOUS...,Percent Distribution; Margin of Error; NONFAMI...,Median income (dollars); Estimate; NONFAMILY H...,Median income (dollars); Margin of Error; NONF...
1,8600000US20001,20001,ZCTA5 20001,18764,434,18764,434,100447,4062,9200,...,31.1,2.1,84023,8194,2337,252,18.4,2.0,162112,13233
2,8600000US20002,20002,ZCTA5 20002,26217,498,26217,498,82022,4585,11775,...,29.9,2.1,54726,9409,2376,250,15.3,1.6,139167,14723
3,8600000US20003,20003,ZCTA5 20003,12394,249,12394,249,118607,4707,8971,...,29.2,2.9,85862,7966,1219,154,17.2,2.1,155804,11155
4,8600000US20004,20004,ZCTA5 20004,1163,125,1163,125,144583,9150,934,...,45.3,8.6,138125,21908,79,37,8.9,4.0,168558,108697


In [3]:
dmv_incomedf = dmv_income[['GEO.id2', 'HC03_EST_VC02']].replace('-',0)
dmv_incomedf = dmv_incomedf.replace('250,000+',0)
dmv_incomedf = dmv_incomedf.rename(columns={'GEO.id2': 'zip', 'HC03_EST_VC02': 'median_household_income'}).drop(0).astype('int64')

dc_maindf = ziptgt.merge(dmv_incomedf, on='zip', how='outer').set_index('zip').fillna(value=0)

cu_id = dc_maindf[ dc_maindf['city'] == 0].index
dc_maindf = dc_maindf.drop(cu_id)
cu_id = dc_maindf[ dc_maindf['median_household_income'] == 0].index 
dc_maindf = dc_maindf.drop(cu_id)

dc_maindf.head()

Unnamed: 0_level_0,city,county,median_household_income
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20001,"Washington, DC",District Of Columbia,100447.0
20002,"Washington, DC",District Of Columbia,82022.0
20003,"Washington, DC",District Of Columbia,118607.0
20004,"Washington, DC",District Of Columbia,144583.0
20005,"Washington, DC",District Of Columbia,94506.0


In [4]:
# extract just population and zip from dc_pop.csv
dc_popdf = dc_pop[['GEO.id2','HC01_EST_VC01']]
# rename columns
dc_popdf = dc_popdf.rename(columns={'GEO.id2': 'zip', 'HC01_EST_VC01': 'population'}).drop(0).astype('int64')
# outer merge with target zipcode list, set index to zip and fill NaN
dc_maindf = dc_maindf.merge(dc_popdf, on='zip', how='outer').set_index('zip').fillna(value=0)

cu_id = dc_maindf[ dc_maindf['city'] == 0].index
dc_maindf = dc_maindf.drop(cu_id)
cu_id = dc_maindf[ dc_maindf['population'] == 0].index 
dc_maindf = dc_maindf.drop(cu_id)

zipdict = dict(dc_maindf['population'])

dc_maindf.head()

Unnamed: 0_level_0,city,county,median_household_income,population
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20001,"Washington, DC",District Of Columbia,100447.0,41692
20002,"Washington, DC",District Of Columbia,82022.0,52867
20003,"Washington, DC",District Of Columbia,118607.0,26330
20004,"Washington, DC",District Of Columbia,144583.0,1610
20005,"Washington, DC",District Of Columbia,94506.0,12311


In [5]:
geoj_scrubbed = {}
areadict = {}
features = geoj['features']
dc_maindf['feature'] = 0

for feature in features:
    try:
        feature_geoid = int(feature['properties']['GEOID10'])
        if feature_geoid in dc_maindf.index.values.tolist():
            dc_maindf.loc[[feature_geoid],['feature']]= str(feature)
            zarea = area(feature['geometry'])/int(1E6)
            areadict.update({feature_geoid:zarea})
            
            
    except ValueError:
            print(f'Value Error')

areadf = pd.DataFrame.from_dict(areadict, orient='index', columns=['area'])
dc_maindf2 = dc_maindf.join(areadf, how='left')
dc_maindf2['density'] = dc_maindf2['population']/dc_maindf2['area']

dc_maindf2.head()

Unnamed: 0_level_0,city,county,median_household_income,population,feature,area,density
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20001,"Washington, DC",District Of Columbia,100447.0,41692,"{'type': 'Feature', 'geometry': {'type': 'Poly...",5.835524,7144.51697
20002,"Washington, DC",District Of Columbia,82022.0,52867,"{'type': 'Feature', 'geometry': {'type': 'Poly...",14.228399,3715.59728
20003,"Washington, DC",District Of Columbia,118607.0,26330,"{'type': 'Feature', 'geometry': {'type': 'Poly...",6.447784,4083.573672
20004,"Washington, DC",District Of Columbia,144583.0,1610,"{'type': 'Feature', 'geometry': {'type': 'Mult...",0.904905,1779.192084
20005,"Washington, DC",District Of Columbia,94506.0,12311,"{'type': 'Feature', 'geometry': {'type': 'Poly...",1.122787,10964.678013


In [6]:
min_pop = min(dc_maindf2['population'])
max_pop = max(dc_maindf2['population'])
pop_range = max_pop - min_pop

dc_maindf2['nml_dens'] = ((dc_maindf2['population'] - min_pop)/pop_range)

min_inc = min(dc_maindf2['median_household_income'])
max_inc = max(dc_maindf2['median_household_income'])
inc_range = max_inc - min_inc

dc_maindf2['nml_inc'] = ((dc_maindf2['median_household_income'] - min_inc)/inc_range)

dc_maindf2.head()

Unnamed: 0_level_0,city,county,median_household_income,population,feature,area,density,nml_dens,nml_inc
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20001,"Washington, DC",District Of Columbia,100447.0,41692,"{'type': 'Feature', 'geometry': {'type': 'Poly...",5.835524,7144.51697,0.759751,0.386274
20002,"Washington, DC",District Of Columbia,82022.0,52867,"{'type': 'Feature', 'geometry': {'type': 'Poly...",14.228399,3715.59728,0.964587,0.284281
20003,"Washington, DC",District Of Columbia,118607.0,26330,"{'type': 'Feature', 'geometry': {'type': 'Poly...",6.447784,4083.573672,0.478169,0.4868
20004,"Washington, DC",District Of Columbia,144583.0,1610,"{'type': 'Feature', 'geometry': {'type': 'Mult...",0.904905,1779.192084,0.025057,0.630593
20005,"Washington, DC",District Of Columbia,94506.0,12311,"{'type': 'Feature', 'geometry': {'type': 'Poly...",1.122787,10964.678013,0.221204,0.353387


In [9]:
feat_list = []
for feature in dc_maindf2['feature']:
    feat_list.append(ast.literal_eval(feature))

geoj_clean = {'type': 'FeatureCollection', 'features': feat_list}

In [10]:
with open('resources/features.json', 'w') as outfile:  
    json.dump(geoj_clean, outfile)