In [2]:
# import dependencies
import requests
import json
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# connect to API key
from config import gkey


In [3]:
# read zip codes csv
zipcodes_data = "ny_state_zipcodes.csv" 
zipcodes_df = pd.read_csv(zipcodes_data)
zipcodes_df.head()


Unnamed: 0,Zip Code
0,10001
1,10002
2,10003
3,10004
4,10005


In [4]:
# Add columns for income, # single, # no cards, # chinese
zipcodes_df["Avg Income ($)"] = ""
zipcodes_df["# Single"] = ""
zipcodes_df["# No Cars"] = ""
zipcodes_df["# Chinese"] = ""
zipcodes_df.head()


Unnamed: 0,Zip Code,Avg Income ($),# Single,# No Cars,# Chinese
0,10001,,,,
1,10002,,,,
2,10003,,,,
3,10004,,,,
4,10005,,,,


In [5]:
zipcodes_df = zipcodes_df.astype({"Zip Code": object})
zipcodes_df.dtypes

Zip Code          object
Avg Income ($)    object
# Single          object
# No Cars         object
# Chinese         object
dtype: object

In [6]:
# establish base URL 
base_url = "https://api.census.gov/data/2017/acs/acs5/profile"

# print the response object to the console
response = requests.get(base_url)
print(response)


<Response [200]>


In [10]:
# loop through all zips in the df, adding the census variables to each row

base_url = "https://api.census.gov/data/2017/acs/acs5/profile"

avg_income = "DP03_0063E"   
single_hh = "DP02_0011E"
hh_no_vehicles = "DP04_0058E"
chinese_pop = "DP05_0046E"

for index, row in zipcodes_df.iterrows():    
    print(zip)
    zip = row['Zip Code']
    
    query_url = f"{base_url}?get={hh_no_vehicles},NAME&for=zip%20code%20tabulation%20area:{zip}&key={gkey}"
    
    try:
        response = requests.get(query_url).json()
        #pprint(response)
        
        print(f"Finding demo data for zip code: {zip}")
        
        zipcodes_df.loc[index, "Avg Income ($)"] = response[1][0]
        zipcodes_df.loc[index, "# Single"] = response[1][1]
        zipcodes_df.loc[index, "# No Cars"] = response[1][2]
        zipcodes_df.loc[index, "# Chinese"] = response[1][3]  
    
    except json.decoder.JSONDecodeError:
        print("Missing field/result... skipping.")
        
    print("-------------------------------------------")

10001
Finding demo data for zip code: 10001


IndexError: list index out of range

In [12]:
# output data frame with info added in new columns
print(len(zipcodes_df))
zipcodes_df.head(10)


2202


Unnamed: 0,Zip Code,Avg Income ($),# Single,# No Cars,# Chinese
0,10001,161227.0,6554.0,10193.0,2467.0
1,10002,65718.0,14219.0,27306.0,30774.0
2,10003,182531.0,15067.0,20895.0,3166.0
3,10004,191760.0,808.0,1287.0,381.0
4,10005,201734.0,1728.0,3538.0,699.0
5,10006,196771.0,881.0,1527.0,208.0
6,10007,376573.0,1029.0,1961.0,446.0
7,10008,,,,
8,10009,90253.0,14663.0,23906.0,4768.0
9,10010,177950.0,8178.0,12097.0,1537.0


In [14]:
# drop rows with null values
zipcodes_df.replace('', np.nan, inplace=True)
operational_ny_df = zipcodes_df.dropna()
print(len(operational_ny_df))
operational_ny_df.head(10)


1772


Unnamed: 0,Zip Code,Avg Income ($),# Single,# No Cars,# Chinese
0,10001,161227,6554,10193,2467
1,10002,65718,14219,27306,30774
2,10003,182531,15067,20895,3166
3,10004,191760,808,1287,381
4,10005,201734,1728,3538,699
5,10006,196771,881,1527,208
6,10007,376573,1029,1961,446
8,10009,90253,14663,23906,4768
9,10010,177950,8178,12097,1537
10,10011,197050,16387,23073,2024


In [16]:
# explore data types
operational_ny_df.dtypes


Zip Code           int64
Avg Income ($)    object
# Single          object
# No Cars         object
# Chinese         object
dtype: object

In [21]:
# convert data types 
operational_ny_df= operational_ny_df.astype({"Avg Income ($)": int, 
                                             "# Single": int, 
                                             "# No Cars": int, 
                                             "# Chinese": int})
operational_ny_df.dtypes


Zip Code          int64
Avg Income ($)    int64
# Single          int64
# No Cars         int64
# Chinese         int64
dtype: object

In [22]:
# explore data

print(operational_ny_df.mean())

Zip Code          1.270949e+04
Avg Income ($)   -3.622324e+07
# Single          1.211324e+03
# No Cars         1.181729e+03
# Chinese         3.972438e+02
dtype: float64


In [25]:
# mean above suggests there are major erroneous negative numbers in the income data
# remove negative numbers from the dataframe
clean_operational_ny_df = operational_ny_df[(operational_ny_df["Avg Income ($)"]>=0) & 
                                           (operational_ny_df["# Single"]>=0) & 
                                           (operational_ny_df["# No Cars"]>=0) &
                                           (operational_ny_df["# Chinese"]>=0)]

print(len(clean_operational_ny_df))

1690


In [33]:
# create a DF with just the zip codes and output to new CSV file for use in Yelp API searches
clean_operational_ny_zip_codes = clean_operational_ny_df["Zip Code"]
clean_operational_ny_zip_codes.to_csv("clean_operational_ny_zips.csv", 
                                      index=False, 
                                      header=True)
