# Census data notebook: test on #10022


In [12]:
!pip install us

Collecting us
  Downloading https://files.pythonhosted.org/packages/72/83/8731cbf5afcf3434c0b24cfc520c11fd27bfc8a6878114662f4e3dbdab71/us-1.0.0.tar.gz
Collecting jellyfish==0.5.6 (from us)
[?25l  Downloading https://files.pythonhosted.org/packages/94/48/ddb1458d966f0a84e472d059d87a9d1527df7768a725132fc1d810728386/jellyfish-0.5.6.tar.gz (132kB)
[K     |████████████████████████████████| 133kB 6.1MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: us, jellyfish
  Building wheel for us (setup.py) ... [?25ldone
[?25h  Created wheel for us: filename=us-1.0.0-cp37-none-any.whl size=11834 sha256=dc13028893244d184ab757b7df813d105eeafca78aa859014a3b3d3c6f2a358b
  Stored in directory: /Users/alentersakyan/Library/Caches/pip/wheels/b3/98/40/cb8be35d7779a0ae4372c84e7a585c947bfc41540fd8999e53
  Building wheel for jellyfish (setup.py) ... [?25ldone
[?25h  Created wheel for jellyfish: filename=jellyfish-0.5.6-cp37-cp37m-macosx_10_9_x86_64.whl size=23144 sha256=261df20cf7b1a232fb993721

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

%config InlineBackend.figure_format = 'svg'
%matplotlib inline 
df = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_190921.txt', parse_dates=[['DATE', 'TIME']])
df['day_of_week'] = df['DATE_TIME'].dt.weekday_name
df.columns

Index(['DATE_TIME', 'C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION',
       'DESC', 'ENTRIES',
       'EXITS                                                               ',
       'day_of_week'],
      dtype='object')

In [2]:
# Rename exit
df.rename(columns ={"EXITS                                                               ":"EXITS"}, inplace=True)

In [3]:
# Get only regular/ working units. The irregulars could be contributing to outliers
df = df[df['DESC'] == 'REGULAR']

In [4]:
# Create dif scores for entries / exits
df['ENTRIES_diff'] = df.ENTRIES.diff()
df['EXITS_diff'] = df.EXITS.diff()

In [5]:
# Replace outliers: delete under zero, replace > 20,000 with mean
# GROUPED BY STATION-- so means are coming by station

df['ENTRIES_diff'] = df.groupby(['STATION']).ENTRIES_diff.transform(
    lambda x: np.where((x<0)|(x>20000),x.mask((x<0)|(x>20000)).mean(),x))

In [6]:
df['EXITS_diff'] = df.groupby(['STATION']).EXITS_diff.transform(
    lambda x: np.where((x<0)|(x>20000),x.mask((x<0)|(x>20000)).mean(),x))

In [7]:
df['TRAFFIC_FLOW'] = df['ENTRIES_diff'] + df['EXITS_diff']

In [15]:
from census import Census
from us import states

In [13]:
c = Census("cc11fdd181d0dd4254743a9fee5c695176ed0f25")

In [16]:
c.acs5.get(('NAME', 'B25034_010E'),
          {'for': 'state:{}'.format(states.MD.fips)})

[{'NAME': 'Maryland', 'B25034_010E': 129556.0, 'state': '24'}]

In [17]:
!pip install uszipcode

Collecting uszipcode
[?25l  Downloading https://files.pythonhosted.org/packages/34/e1/f828cd05732433d0074a17a2623a8c1281c5dc7fb265fef1d3867692037b/uszipcode-0.2.2-py2.py3-none-any.whl (137kB)
[K     |████████████████████████████████| 143kB 4.4MB/s eta 0:00:01
[?25hCollecting pathlib-mate (from uszipcode)
[?25l  Downloading https://files.pythonhosted.org/packages/ff/f2/a1e6044fe90784e7bbc05286f2e8616aa2ff167f7275f5a6f2df479092c0/pathlib_mate-0.0.15-py2.py3-none-any.whl (195kB)
[K     |████████████████████████████████| 204kB 5.6MB/s eta 0:00:01
[?25hCollecting sqlalchemy (from uszipcode)
[?25l  Downloading https://files.pythonhosted.org/packages/fc/49/82d64d705ced344ba458197dadab30cfa745f9650ee22260ac2b275d288c/SQLAlchemy-1.3.8.tar.gz (5.9MB)
[K     |████████████████████████████████| 5.9MB 5.9MB/s eta 0:00:01     |████████████████                | 3.0MB 5.9MB/s eta 0:00:01
Collecting autopep8 (from pathlib-mate->uszipcode)
[?25l  Downloading https://files.pythonhosted.org/packag

In [18]:
from uszipcode import SearchEngine

In [19]:
search = SearchEngine(simple_zipcode=True) # set simple_zipcode=False to use rich info database

Start downloading data for simple zipcode database, total size 9MB ...
  1 MB finished ...
  2 MB finished ...
  3 MB finished ...
  4 MB finished ...
  5 MB finished ...
  6 MB finished ...
  7 MB finished ...
  8 MB finished ...
  9 MB finished ...
  10 MB finished ...
  Complete!


In [20]:
zipcode = search.by_zipcode("10022")

In [27]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [22]:
zip_10022 = zipcode.to_dict()

In [29]:
#pp.pprint(zip_10022)

In [237]:
def getDemographics(test2):
    ''' Get demographics for a zipcode. Input str of zip ("12345")'''
    from uszipcode import SearchEngine
    import pprint
    census_data = []
    median_incomes = []
    zipcodes = []
    for zipcode_entry in test2: #zipcode_entry is a zipcode from the list 
        zipcode_census_info = search.by_zipcode(zipcode_entry)
        census_data.append(zipcode_census_info) # Whole set of census data for all zips
        #pp = pprint.PrettyPrinter(indent=4)
        for zipcode_object in census_data: #zipcode_object is a single set of census data for a single zipcode
            median_incomes.append(zipcode_object.values()[-5]) #median income for single output
            zipcodes.append(zipcode_object.values()[0]) #zipcode for that single output
        
        #zip_with_income = list(zip(median_income,zipcode))

    
            # y2 = list(zip(x[1].keys(), x[1].values()))
    return zipcode_object.values()[-5]

In [268]:
census_data = []
median_incomes = []
zipcodes = []
for zipcode_entry in list_of_zips: #zipcode_entry is a zipcode from the list 
    zipcode_census_info = search.by_zipcode(zipcode_entry)
    census_data.append(zipcode_census_info) # Whole set of census data for all zips
    #pp = pprint.PrettyPrinter(indent=4)
    for item in census_data: #item is a single set of census data for a single zipcode
        median_incomes.append(item.values()[-5]) #median income for single output
        zipcodes.append(item.values()[0]) #zipcode for that single output
        
    #zip_with_income = list(zip(median_income,zipcode))
#print(zipcode_entry)
#print(list_of_zips)
#print(census_data)
print(len(zipcodes))
print(len(median_incomes))
print(census_data[0])

45
45
SimpleZipcode(zipcode='10001', zipcode_type='Standard', major_city='New York', post_office_city='New York, NY', common_city_list=['New York'], county='New York County', state='NY', lat=40.75, lng=-73.99, timezone='Eastern', radius_in_miles=0.9090909090909091, area_code_list=['718', '917', '347', '646'], population=21102, population_density=33959.0, land_area_in_sqmi=0.62, water_area_in_sqmi=0.0, housing_units=12476, occupied_housing_units=11031, median_home_value=650200, median_household_income=81671, bounds_west=-74.008621, bounds_east=-73.984076, bounds_north=40.759731, bounds_south=40.743451)


In [None]:
# def getMedianIncome(final_out):
#     '''From the list of census zipcode data, parse out median household income for each zip'''
#     zipcodes = []
#     incomes = []
#     # later zip two together
#     for x in final_out:
#         median_income = x[0].values()[-5]
#     print(x)
    

In [240]:
getDemographics(list_of_zips)

In [199]:
zipcode_objects = getDemographics(list_of_zips)
#print(x[0].values()[-5])

In [195]:
x

[SimpleZipcode(zipcode='10001', zipcode_type='Standard', major_city='New York', post_office_city='New York, NY', common_city_list=['New York'], county='New York County', state='NY', lat=40.75, lng=-73.99, timezone='Eastern', radius_in_miles=0.9090909090909091, area_code_list=['718', '917', '347', '646'], population=21102, population_density=33959.0, land_area_in_sqmi=0.62, water_area_in_sqmi=0.0, housing_units=12476, occupied_housing_units=11031, median_home_value=650200, median_household_income=81671, bounds_west=-74.008621, bounds_east=-73.984076, bounds_north=40.759731, bounds_south=40.743451),
 SimpleZipcode(zipcode='10010', zipcode_type='Standard', major_city='New York', post_office_city='New York, NY', common_city_list=['New York'], county='New York County', state='NY', lat=40.73, lng=-73.98, timezone='Eastern', radius_in_miles=1.0, area_code_list=['212', '646', '917', '718'], population=31834, population_density=81487.0, land_area_in_sqmi=0.39, water_area_in_sqmi=0.0, housing_un

In [200]:
for zipcode_object in zipcode_objects:
    median_income = zipcode_object.values()[-5]
    zipcode = zipcode_object.values()[0]
    
    #zip_with_income = list(zip(median_income,zipcode))

    
    # y2 = list(zip(x[1].keys(), x[1].values()))

In [198]:
median_income

103534

In [137]:
y = list(zip(x[0].keys(), x[0].values()))

In [138]:
y

[('zipcode', '10001'),
 ('zipcode_type', 'Standard'),
 ('major_city', 'New York'),
 ('post_office_city', 'New York, NY'),
 ('common_city_list', ['New York']),
 ('county', 'New York County'),
 ('state', 'NY'),
 ('lat', 40.75),
 ('lng', -73.99),
 ('timezone', 'Eastern'),
 ('radius_in_miles', 0.9090909090909091),
 ('area_code_list', ['718', '917', '347', '646']),
 ('population', 21102),
 ('population_density', 33959.0),
 ('land_area_in_sqmi', 0.62),
 ('water_area_in_sqmi', 0.0),
 ('housing_units', 12476),
 ('occupied_housing_units', 11031),
 ('median_home_value', 650200),
 ('median_household_income', 81671),
 ('bounds_west', -74.008621),
 ('bounds_east', -73.984076),
 ('bounds_north', 40.759731),
 ('bounds_south', 40.743451)]

In [139]:
y2 = list(zip(x[1].keys(), x[1].values()))

In [140]:
y2

[('zipcode', '10010'),
 ('zipcode_type', 'Standard'),
 ('major_city', 'New York'),
 ('post_office_city', 'New York, NY'),
 ('common_city_list', ['New York']),
 ('county', 'New York County'),
 ('state', 'NY'),
 ('lat', 40.73),
 ('lng', -73.98),
 ('timezone', 'Eastern'),
 ('radius_in_miles', 1.0),
 ('area_code_list', ['212', '646', '917', '718']),
 ('population', 31834),
 ('population_density', 81487.0),
 ('land_area_in_sqmi', 0.39),
 ('water_area_in_sqmi', 0.0),
 ('housing_units', 18030),
 ('occupied_housing_units', 16556),
 ('median_home_value', 746200),
 ('median_household_income', 97955),
 ('bounds_west', -73.994028),
 ('bounds_east', -73.971566),
 ('bounds_north', 40.745421),
 ('bounds_south', 40.73231)]

In [201]:
for i,x in enumerate(y):
    if i == 14:
        print((i,x[-1]))

(0, '10001')
(1, 'Standard')
(2, 'New York')
(3, 'New York, NY')
(4, ['New York'])
(5, 'New York County')
(6, 'NY')
(7, 40.75)
(8, -73.99)
(9, 'Eastern')
(10, 0.9090909090909091)
(11, ['718', '917', '347', '646'])
(12, 21102)
(13, 33959.0)
(14, 0.62)
(15, 0.0)
(16, 12476)
(17, 11031)
(18, 650200)
(19, 81671)
(20, -74.008621)
(21, -73.984076)
(22, 40.759731)
(23, 40.743451)


In [102]:
enumerate(y)[0]

TypeError: 'enumerate' object is not subscriptable

### Picking top 5 from above based on census + gmaps api
#### Once those are selected we can do further analyses
- 34 ST-PENN STA-- delete 
- GRD CNTRL-42 ST-- delete
- 34 ST-HERALD SQ = 10001
- 23rd st zip = 10010
- 14 ST-UNION SQ zip = 10003
- TIMES SQ-42 ST--- delete
- FULTON ST--- zip = 10038
- 42 ST-PORT AUTH--- zip = 10036
- 86 ST--- zip = 10028
- 125 ST--- zip = 10027
- CANAL ST--- zip = 10013
- 59 ST COLUMBUS--- zip = 10023

In [47]:
list_of_zips = [10001, 10010, 10003, 10038, 10036, 10028, 10027, 10013, 10023]

In [146]:
getDemographics(list_of_zips)

[SimpleZipcode(zipcode='10001', zipcode_type='Standard', major_city='New York', post_office_city='New York, NY', common_city_list=['New York'], county='New York County', state='NY', lat=40.75, lng=-73.99, timezone='Eastern', radius_in_miles=0.9090909090909091, area_code_list=['718', '917', '347', '646'], population=21102, population_density=33959.0, land_area_in_sqmi=0.62, water_area_in_sqmi=0.0, housing_units=12476, occupied_housing_units=11031, median_home_value=650200, median_household_income=81671, bounds_west=-74.008621, bounds_east=-73.984076, bounds_north=40.759731, bounds_south=40.743451),
 SimpleZipcode(zipcode='10010', zipcode_type='Standard', major_city='New York', post_office_city='New York, NY', common_city_list=['New York'], county='New York County', state='NY', lat=40.73, lng=-73.98, timezone='Eastern', radius_in_miles=1.0, area_code_list=['212', '646', '917', '718'], population=31834, population_density=81487.0, land_area_in_sqmi=0.39, water_area_in_sqmi=0.0, housing_un

In [None]:
y = list(zip(x[0].keys(), x[0].values()))

In [144]:
def parseDemographics(final_out):
    for zipcode in final_out:
        y = list(zip(final_out[0].keys, x[0].values()))
    return y
    

In [147]:
parseDemographics(final_out)

NameError: name 'final_out' is not defined

In [270]:
census_data = []
median_incomes = []
zipcodes = []
for zipcode_entry in list_of_zips: #zipcode_entry is a zipcode from the list 
    zipcode_census_info = search.by_zipcode(zipcode_entry)
    census_data.append(zipcode_census_info) # Whole set of census data for all zips
    #pp = pprint.PrettyPrinter(indent=4)
    for item in census_data: #item is a single set of census data for a single zipcode
        print(item)
#         med_income = item.values()[-5]
#         print(med_income)
#         median_incomes.append(med_income) #median income for single output
#         zipcodes.append(item.values()[0]) #zipcode for that single output
        
    #zip_with_income = list(zip(median_income,zipcode))
#print(zipcode_entry)
#print(list_of_zips)
#print(census_data)
print(len(zipcodes))
print(len(median_incomes))
print(census_data[0])

SimpleZipcode(zipcode='10001', zipcode_type='Standard', major_city='New York', post_office_city='New York, NY', common_city_list=['New York'], county='New York County', state='NY', lat=40.75, lng=-73.99, timezone='Eastern', radius_in_miles=0.9090909090909091, area_code_list=['718', '917', '347', '646'], population=21102, population_density=33959.0, land_area_in_sqmi=0.62, water_area_in_sqmi=0.0, housing_units=12476, occupied_housing_units=11031, median_home_value=650200, median_household_income=81671, bounds_west=-74.008621, bounds_east=-73.984076, bounds_north=40.759731, bounds_south=40.743451)
SimpleZipcode(zipcode='10001', zipcode_type='Standard', major_city='New York', post_office_city='New York, NY', common_city_list=['New York'], county='New York County', state='NY', lat=40.75, lng=-73.99, timezone='Eastern', radius_in_miles=0.9090909090909091, area_code_list=['718', '917', '347', '646'], population=21102, population_density=33959.0, land_area_in_sqmi=0.62, water_area_in_sqmi=0.0

In [271]:
test_zip = search.by_zipcode(10030)

In [348]:
test_zip

SimpleZipcode(zipcode='10030', zipcode_type='Standard', major_city='New York', post_office_city='New York, NY', common_city_list=['New York'], county='New York County', state='NY', lat=40.818, lng=-73.943, timezone='Eastern', radius_in_miles=0.5681818181818182, area_code_list=['212', '646', '917'], population=26999, population_density=96790.0, land_area_in_sqmi=0.28, water_area_in_sqmi=0.0, housing_units=12976, occupied_housing_units=11395, median_home_value=509000, median_household_income=31925, bounds_west=-73.948677, bounds_east=-73.936232, bounds_north=40.824032, bounds_south=40.812791)

In [276]:
list_of_zips

[10001, 10010, 10003, 10038, 10036, 10028, 10027, 10013, 10023]

# Final function to get demographics

In [343]:
def getIncomeByZip(listofzipcodes):
    ''' Take in a list of zipcodes and return the median income for that zipcode'''
    from uszipcode import SearchEngine # must have this installed to use search.by_zipcode()
    search = SearchEngine(simple_zipcode=True) # set simple_zipcode=False to use rich info database
    
    #census_data = []
    median_incomes = []
    zipcodes = []
    pop_density = []
    for zipcode_entry in list_of_zips: #zipcode_entry is a zipcode from the list 
        zipcode_census_info = search.by_zipcode(zipcode_entry)
        zipcodes.append(zipcode_census_info.zipcode)
        median_incomes.append(zipcode_census_info.median_household_income)
        pop_density.append(zipcode_census_info.population_density)
        zipped = list(zip(zipcodes, median_incomes,pop_density))
        zipped.sort(key = lambda zipped: zipped[1]) 
    #census_data.append(zipcode_census_info) # Whole set of census data for all zips
    return (zipped)
    


In [344]:
y = getIncomeByZip(list_of_zips)

In [345]:
y

[('10027', 37872, 68513.0),
 ('10038', 66074, 68214.0),
 ('10036', 66599, 55745.0),
 ('10001', 81671, 33959.0),
 ('10013', 83725, 50154.0),
 ('10003', 92540, 97188.0),
 ('10010', 97955, 81487.0),
 ('10023', 103534, 124357.0),
 ('10028', 104638, 143683.0)]

In [None]:
# list_of_zips = [10001, 10010, 10003, 10038, 10036, 10028, 10027, 10013, 10023]
# 34 ST-PENN STA-- delete
# GRD CNTRL-42 ST-- delete
# 34 ST-HERALD SQ = 10001
# 23rd st zip = 10010
# 14 ST-UNION SQ zip = 10003
# TIMES SQ-42 ST--- delete
# FULTON ST--- zip = 10038
# 42 ST-PORT AUTH--- zip = 10036
# 86 ST--- zip = 10028
# 125 ST--- zip = 10027
# CANAL ST--- zip = 10013
# 59 ST COLUMBUS--- zip = 10023

In [338]:
x

[('10027', 37872, 68513.0),
 ('10038', 66074, 68214.0),
 ('10036', 66599, 55745.0),
 ('10001', 81671, 33959.0),
 ('10013', 83725, 50154.0),
 ('10003', 92540, 97188.0),
 ('10010', 97955, 81487.0),
 ('10023', 103534, 124357.0),
 ('10028', 104638, 143683.0)]