In [15]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census

# Census API Key
from config import api_key
c = Census(api_key, year=2018)

In [16]:
# Run Census Search to retrieve data on all zip codes (2017 ACS5 Census)
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
census_data = c.acs5.get(("NAME", "B19013_001E", "B01003_001E", "B01002_001E",
                          "B19301_001E",
                          "B17001_002E"), {'for': 'zip code tabulation area:*'})

# Convert to DataFrame
census_pd = pd.DataFrame(census_data)

# Column Reordering
census_pd = census_pd.rename(columns={"B01003_001E": "Population",
                                      "B01002_001E": "Median Age",
                                      "B19013_001E": "Household Income",
                                      "B19301_001E": "Per Capita Income",
                                      "B17001_002E": "Poverty Count",
                                      "NAME": "Name", "zip code tabulation area": "Zipcode"})



In [17]:
# Add in Poverty Rate (Poverty Count / Population)
# census_pd["Poverty Rate"] = 100 * \
#     census_pd["Poverty Count"].astype(
#         int) / census_pd["Population"].fillna(0).astype(int)

# Final DataFrame
census_pd = census_pd[["Zipcode", "Population", "Median Age", "Household Income",
                       "Per Capita Income", "Poverty Count"]]

# Visualize
print(len(census_pd))
census_pd.head()

33120


Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count
0,601,17242.0,40.5,13092.0,6999.0,10772.0
1,602,38442.0,42.3,16358.0,9277.0,19611.0
2,603,48814.0,41.1,16603.0,11307.0,24337.0
3,606,6437.0,43.3,12832.0,5943.0,4163.0
4,610,27073.0,42.1,19309.0,10220.0,11724.0


In [18]:
census_pd["Poverty Rate"]=census_pd["Poverty Count"]*100/ census_pd["Population"]

In [19]:
census_pd

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate
0,00601,17242.0,40.5,13092.0,6999.0,10772.0,62.475351
1,00602,38442.0,42.3,16358.0,9277.0,19611.0,51.014515
2,00603,48814.0,41.1,16603.0,11307.0,24337.0,49.856599
3,00606,6437.0,43.3,12832.0,5943.0,4163.0,64.672984
4,00610,27073.0,42.1,19309.0,10220.0,11724.0,43.305138
...,...,...,...,...,...,...,...
33115,87515,363.0,44.2,,,,
33116,87518,9.0,-666666666.0,,,,
33117,87511,2896.0,36.0,,,,
33118,87578,245.0,48.0,,,,


In [20]:
census_pd.dropna(subset=['Household Income', 'Per Capita Income', 'Poverty Rate'], inplace=True)
census_pd

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate
0,00601,17242.0,40.5,13092.0,6999.0,10772.0,62.475351
1,00602,38442.0,42.3,16358.0,9277.0,19611.0,51.014515
2,00603,48814.0,41.1,16603.0,11307.0,24337.0,49.856599
3,00606,6437.0,43.3,12832.0,5943.0,4163.0,64.672984
4,00610,27073.0,42.1,19309.0,10220.0,11724.0,43.305138
...,...,...,...,...,...,...,...
33079,99922,330.0,39.5,34028.0,18213.0,129.0,39.090909
33081,99925,927.0,43.6,57375.0,25840.0,172.0,18.554477
33082,99926,1635.0,34.5,53409.0,22453.0,235.0,14.373089
33083,99927,38.0,55.5,-666666666.0,13658.0,28.0,73.684211


In [21]:
census_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32456 entries, 0 to 33084
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Zipcode            32456 non-null  object 
 1   Population         32456 non-null  float64
 2   Median Age         32456 non-null  float64
 3   Household Income   32456 non-null  float64
 4   Per Capita Income  32456 non-null  float64
 5   Poverty Count      32456 non-null  float64
 6   Poverty Rate       32456 non-null  float64
dtypes: float64(6), object(1)
memory usage: 2.0+ MB


In [22]:
census_pd['Zipcode'] = census_pd.Zipcode.astype(float)
census_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32456 entries, 0 to 33084
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Zipcode            32456 non-null  float64
 1   Population         32456 non-null  float64
 2   Median Age         32456 non-null  float64
 3   Household Income   32456 non-null  float64
 4   Per Capita Income  32456 non-null  float64
 5   Poverty Count      32456 non-null  float64
 6   Poverty Rate       32456 non-null  float64
dtypes: float64(7)
memory usage: 2.0 MB


In [23]:
# Save as a csv
# Note to avoid any issues later, use encoding="utf-8"
census_pd.to_csv("census_data_2018.csv", encoding="utf-8", index=False)

In [24]:
clean_2018_census_pd = census_pd.dropna(how='any')
print(len(census_pd))

32456


In [25]:
clean_2018_census_pd.to_csv("Resources/census_data_clean_2018.csv", encoding="utf-8", index=False)