In [1]:
# Import dependencies
import numpy as np
import pandas as pd
import requests
from census import Census

# Census API Key
# The key is imported from the local `config` module instead of being written into the notebook.
from config import census_api_key
# Census client with the year fixed to 2018; the data request in the next cell goes through `c`.
c = Census(census_api_key, year=2018)

In [2]:
# The following code is a slightly modified version of that found in the bootcamp on sections pertaining to the census module.

# Variable codes to request. Every code appears exactly once; listing a code
# twice only pads the request without adding a column.
summary_vars = ("B19013_001E", "B01003_001E", "B01002_001E", "B19301_001E", "B17001_002E")
# B15003_025E down to B15003_001E (descending, generated so that no code can be
# skipped or repeated by hand).
education_vars = tuple("B15003_{:03d}E".format(n) for n in range(25, 0, -1))

census_data = c.acs5.get(summary_vars + education_vars,
                         {'for': 'zip code tabulation area:*'})

# Column Renaming
# Note that many columns will not be renamed, as they will be combined for the sake of minimizing unique values.
#    In particular, columns B15003_005E through B15003_015E are simply grades 1 through 11.
column_names = {
    "zip code tabulation area": "Zip Code Tabulation Area",
    "B01003_001E": "Population",
    "B01002_001E": "Median Age",
    "B19013_001E": "Household Income",
    "B19301_001E": "Per Capita Income",
    "B17001_002E": "Poverty Count",
    "B15003_001E": "Population 25 and Over",
    "B15003_002E": "Population 25 and Over w/ No Schooling",
    "B15003_003E": "Population 25 and Over w/ Nursery School",
    "B15003_004E": "Population 25 and Over w/ Kindergarten",
    "B15003_016E": "Population 25 and Over w/ 12th Grade, no diploma",
    "B15003_017E": "Population 25 and Over w/ Regular High School diploma",
    "B15003_018E": "Population 25 and Over w/ GED or alternative credential",
    "B15003_019E": "Population 25 and Over w/ Some college, less than 1 year",
    "B15003_020E": "Population 25 and Over w/ Some college, 1 or more years",
    "B15003_021E": "Population 25 and Over w/ Associate's degree",
    "B15003_022E": "Population 25 and Over w/ Bachelor's degree",
    "B15003_023E": "Population 25 and Over w/ Master's degree",
    "B15003_024E": "Population 25 and Over w/ Professional school degree",
    "B15003_025E": "Population 25 and Over w/ Doctorate degree",
}

# Convert to DataFrame and apply the readable names in one step, so `census_raw`
# is bound once and re-running the cell always starts from the API response.
census_raw = pd.DataFrame(census_data).rename(columns=column_names)

# Poverty Rate (Poverty Count / Population) is intentionally not computed here (disabled).
# TODO: if re-enabled, compute it from `census_raw`; the earlier draft referenced `census_pd`,
#       which this notebook never defines:
#census_raw["Poverty Rate (%)"] = 100 * census_raw["Poverty Count"].astype(int) / census_raw["Population"].astype(int)

# Dataframe to be cleaned is put to screen
census_raw.head()

Unnamed: 0,Household Income,Population,Median Age,Per Capita Income,Poverty Count,Population 25 and Over w/ Doctorate degree,Population 25 and Over w/ Professional school degree,Population 25 and Over w/ Master's degree,Population 25 and Over w/ Bachelor's degree,Population 25 and Over w/ Associate's degree,...,B15003_009E,B15003_008E,B15003_007E,B15003_006E,B15003_005E,Population 25 and Over w/ Kindergarten,Population 25 and Over w/ Nursery School,Population 25 and Over w/ No Schooling,Population 25 and Over,Zip Code Tabulation Area
0,13092.0,17242.0,40.5,6999.0,10772.0,79.0,15.0,216.0,1781.0,888.0,...,181.0,174.0,584.0,167.0,68.0,0.0,19.0,492.0,11838.0,601
1,16358.0,38442.0,42.3,9277.0,19611.0,312.0,201.0,1119.0,3692.0,3387.0,...,687.0,917.0,571.0,236.0,137.0,11.0,27.0,901.0,27411.0,602
2,16603.0,48814.0,41.1,11307.0,24337.0,280.0,297.0,1437.0,5888.0,2370.0,...,861.0,806.0,420.0,195.0,92.0,14.0,35.0,924.0,34274.0,603
3,12832.0,6437.0,43.3,5943.0,4163.0,0.0,0.0,120.0,311.0,241.0,...,78.0,151.0,162.0,63.0,64.0,0.0,0.0,185.0,4516.0,606
4,19309.0,27073.0,42.1,10220.0,11724.0,66.0,54.0,749.0,2498.0,2241.0,...,487.0,454.0,456.0,206.0,132.0,0.0,0.0,643.0,19164.0,610


In [3]:
# Count the rows whose zip code already appeared in an earlier row (0 means no repeats)
repeated_zcta = census_raw.duplicated(subset=["Zip Code Tabulation Area"])
int(repeated_zcta.sum())

0

In [4]:
# Keep only the rows in which every column holds a value
census_no_na = census_raw.dropna(axis="index", how="any")
census_no_na.head()

Unnamed: 0,Household Income,Population,Median Age,Per Capita Income,Poverty Count,Population 25 and Over w/ Doctorate degree,Population 25 and Over w/ Professional school degree,Population 25 and Over w/ Master's degree,Population 25 and Over w/ Bachelor's degree,Population 25 and Over w/ Associate's degree,...,B15003_009E,B15003_008E,B15003_007E,B15003_006E,B15003_005E,Population 25 and Over w/ Kindergarten,Population 25 and Over w/ Nursery School,Population 25 and Over w/ No Schooling,Population 25 and Over,Zip Code Tabulation Area
0,13092.0,17242.0,40.5,6999.0,10772.0,79.0,15.0,216.0,1781.0,888.0,...,181.0,174.0,584.0,167.0,68.0,0.0,19.0,492.0,11838.0,601
1,16358.0,38442.0,42.3,9277.0,19611.0,312.0,201.0,1119.0,3692.0,3387.0,...,687.0,917.0,571.0,236.0,137.0,11.0,27.0,901.0,27411.0,602
2,16603.0,48814.0,41.1,11307.0,24337.0,280.0,297.0,1437.0,5888.0,2370.0,...,861.0,806.0,420.0,195.0,92.0,14.0,35.0,924.0,34274.0,603
3,12832.0,6437.0,43.3,5943.0,4163.0,0.0,0.0,120.0,311.0,241.0,...,78.0,151.0,162.0,63.0,64.0,0.0,0.0,185.0,4516.0,606
4,19309.0,27073.0,42.1,10220.0,11724.0,66.0,54.0,749.0,2498.0,2241.0,...,487.0,454.0,456.0,206.0,132.0,0.0,0.0,643.0,19164.0,610


In [5]:
# Cast every column of the frame to float64
# NOTE(review): this also converts "Zip Code Tabulation Area", so zip codes become
#               floats (e.g. 601.0) - confirm that is acceptable for an identifier column.
census_no_na = census_no_na.astype("float64")
census_no_na.dtypes

Household Income                                            float64
Population                                                  float64
Median Age                                                  float64
Per Capita Income                                           float64
Poverty Count                                               float64
Population 25 and Over w/ Doctorate degree                  float64
Population 25 and Over w/ Professional school degree        float64
Population 25 and Over w/ Master's degree                   float64
Population 25 and Over w/ Bachelor's degree                 float64
Population 25 and Over w/ Associate's degree                float64
Population 25 and Over w/ Some college, 1 or more years     float64
Population 25 and Over w/ Some college, less than 1 year    float64
Population 25 and Over w/ GED or alternative credential     float64
Population 25 and Over w/ Regular High School diploma       float64
Population 25 and Over w/ 12th Grade, no diploma

In [6]:
# All rows containing a negative value are dropped

# Boolean Series indexed like the frame: True for each row that has at least one
# value below zero. One vectorized comparison replaces the per-row / per-column
# Python loop and the one-at-a-time in-place drops.
has_negative = (census_no_na < 0).any(axis=1)

# Keep the rows without any negative value (index labels are preserved).
# .copy() makes the result an independent frame rather than a slice of the old one.
census_no_na = census_no_na.loc[~has_negative].copy()

census_no_na.head()

Unnamed: 0,Household Income,Population,Median Age,Per Capita Income,Poverty Count,Population 25 and Over w/ Doctorate degree,Population 25 and Over w/ Professional school degree,Population 25 and Over w/ Master's degree,Population 25 and Over w/ Bachelor's degree,Population 25 and Over w/ Associate's degree,...,B15003_009E,B15003_008E,B15003_007E,B15003_006E,B15003_005E,Population 25 and Over w/ Kindergarten,Population 25 and Over w/ Nursery School,Population 25 and Over w/ No Schooling,Population 25 and Over,Zip Code Tabulation Area
0,13092.0,17242.0,40.5,6999.0,10772.0,79.0,15.0,216.0,1781.0,888.0,...,181.0,174.0,584.0,167.0,68.0,0.0,19.0,492.0,11838.0,601.0
1,16358.0,38442.0,42.3,9277.0,19611.0,312.0,201.0,1119.0,3692.0,3387.0,...,687.0,917.0,571.0,236.0,137.0,11.0,27.0,901.0,27411.0,602.0
2,16603.0,48814.0,41.1,11307.0,24337.0,280.0,297.0,1437.0,5888.0,2370.0,...,861.0,806.0,420.0,195.0,92.0,14.0,35.0,924.0,34274.0,603.0
3,12832.0,6437.0,43.3,5943.0,4163.0,0.0,0.0,120.0,311.0,241.0,...,78.0,151.0,162.0,63.0,64.0,0.0,0.0,185.0,4516.0,606.0
4,19309.0,27073.0,42.1,10220.0,11724.0,66.0,54.0,749.0,2498.0,2241.0,...,487.0,454.0,456.0,206.0,132.0,0.0,0.0,643.0,19164.0,610.0


In [7]:
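# # Final DataFrame is made (disabled: `census_clean` is not defined in the cells above - confirm the intended source frame before re-enabling)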
# census_df = census_clean[["Zip Code Tabulation Area", "Population", "Median Age", "Household Income",
#                        "Per Capita Income", "Poverty Count"]]

# # Output Dataframe to csv and screen
# census_df.to_csv("../data/acs5_2018.csv")
# census_df