In [None]:
#In this notebook we add economic data collected by the American Census to each entry of our dataset. Specifically, we
#determine which American Census Bureau defined neighborhood the start and end bike stations are located in, and then
#add the start and end neighborhoods, the boroughs of those neighborhoods, and the median incomes of those
#neighborhoods to the dataset.

In [421]:
import pandas as pd
import geopandas as gpd
import numpy as np
import time 
from shapely.geometry import Point

In [422]:
# Load the Neighborhood Tabulation Area (NTA) shapefile into a GeoDataFrame
# and reproject it to the coordinate system used for the bike stations.
fname = "nycgiszoningfeatures_201711shp/nynta.shp"
# NOTE(review): newer pyproj/geopandas releases reportedly deprecate the
# {'init': ...} spelling -- confirm before upgrading the environment.
coord_system = {'init': 'epsg:4326'}

ntas = (
    gpd.read_file(fname)
    .to_crs(coord_system)
    # Keep only the boroughs of interest (the boroughs with Citi bikes).
    .loc[lambda frame: frame['BoroName'].isin(['Queens', 'Brooklyn', 'Manhattan'])]
    # Fresh 0..n-1 index after the filter.
    .reset_index(drop=True)
    # 'GeoID' is the key shared with the economic data loaded further down.
    .rename(columns={'NTACode': 'GeoID', 'NTAName': 'Neighborhood'})
)

# Slim view holding just the neighborhood name, its key and its shape.
nta = ntas[['Neighborhood', 'GeoID', 'geometry']]


In [None]:
# NTA-level economic data, read from the nta folder (nynta_17d):
#   eco2013acs5yrntadata.csv, downloaded from
#   http://catalog.opendata.city/hr/dataset/nyc-neighborhood-acs-economic-data/resource/671ebb5a-672e-4005-9712-45310afd4308
ecos = pd.read_csv('nynta_17d/eco2013acs5yrntadata.csv', encoding='latin1')

# Select the wanted boroughs (rows) and the three needed columns in one step,
# then renumber the rows from zero.
wanted_boroughs = ['Queens', 'Brooklyn', 'Manhattan']
in_wanted_borough = ecos['Borough'].isin(wanted_boroughs)
eco = ecos.loc[in_wanted_borough, ['GeoID', 'Borough', 'MdHHIncE']].reset_index(drop=True)

# The income column is read as strings containing thousands separators
# (the .str accessor is used on it); strip the commas and convert to numbers.
md = eco['MdHHIncE']
eco['MdHHIncE'] = pd.to_numeric(md.str.replace(',', ''))

In [424]:
# Join the NTA shapes with their economic figures using the shared 'GeoID' key.
# The default inner join is spelled out: only GeoIDs present in both frames are kept.
# The merged frame is used as-is; no reindexing is needed afterwards.
result = pd.merge(left=nta, right=eco, how='inner', on='GeoID')

In [440]:
# GeoFrame Helper functions
#These two functions take a row's start (or end) station longitude and latitude and turn them into a shapely Point
def make_start_points(row):
    """Return a shapely Point built from the row's start-station coordinates.

    row: mapping-like object with 'start station longitude' and
         'start station latitude' entries.
    The Point is constructed as (longitude, latitude).
    """
    lon = row['start station longitude']
    lat = row['start station latitude']
    return Point(lon, lat)
def make_end_points(row):
    """Return a shapely Point built from the row's end-station coordinates.

    row: mapping-like object with 'end station longitude' and
         'end station latitude' entries.
    The Point is constructed as (longitude, latitude).
    """
    lon = row['end station longitude']
    lat = row['end station latitude']
    return Point(lon, lat)

#Takes a data frame and returns a GeoDataFrame with the same data, and 
#where the long/lat points are the 'geometry' for the GeoDataFrame
def make_geoframe(df):
    """Turn a trip DataFrame into a GeoDataFrame with start/end point columns.

    A shapely Point is built per row for the start station and for the end
    station. They are stored in the 'start point' and 'end point' columns,
    and 'end point' is left as the active geometry. The elapsed wall-clock
    time is printed before returning.

    df: DataFrame with 'start station longitude', 'start station latitude',
        'end station longitude' and 'end station latitude' columns.
    Returns the GeoDataFrame, with its crs set to {'init': 'epsg:4326'}.
    """
    began = time.time()

    # Start stations: attach as geometry, then move it to a 'start point' column.
    start_geoms = df.apply(make_start_points, axis=1)
    gdf = gpd.GeoDataFrame(df, geometry=start_geoms)
    gdf = gdf.rename(columns={'geometry': 'start point'})
    gdf = gdf.set_geometry('start point')

    # End stations: same procedure, ending up in an 'end point' column.
    end_geoms = df.apply(make_end_points, axis=1)
    gdf = gpd.GeoDataFrame(gdf, geometry=end_geoms)
    gdf = gdf.rename(columns={'geometry': 'end point'})
    gdf = gdf.set_geometry('end point')

    # Define CRS
    gdf.crs = {'init': 'epsg:4326'}

    finished = time.time()
    print('took', finished - began, 'seconds to run...')

    return gdf

In [None]:
# In case you want to add more Census data, the census csvs are available in zip format. You don't have
# to write a new function like the one below, we've done the hard work for you! 

# Use the census dictionary csvs to identify the neighborhood data traits you want to use. Import the csv with the
# data you want and create a dataframe. Then create a dataframe with only the columns you want (e.g. 'vacant housing
# units') and the 'GeoID' column.

# Merge that dataframe with the NTA GPD df defined above. Just like with the income data, merge on the shared 'GeoID'
# so that you get the corresponding neighborhoods for the GeoIDs. It will then just be a matter of merging this
# dataframe onto the bike-trip dataframe twice: once on the 'start neighborhood' column and once on the
# 'end neighborhood' column.


In [451]:
#For each row in a dataframe('data') adds the neighborhood and the associated neighborhood data 
#for the start and end bike stations 
def Add_Start_Station_Finance_And_Neighborhood12(data, neighborhoods=None):
    """Add start/end neighborhood, borough and median income columns to ``data``.

    For every neighborhood polygon, the trips whose start (resp. end) point
    lies within that polygon receive the polygon's 'Neighborhood', 'Borough'
    and 'MdHHIncE' values. Trips whose point falls in no polygon keep the
    placeholders 'NA' (neighborhood, borough) and 0 (median income).

    data: GeoDataFrame with 'start point' and 'end point' geometry columns
        (as produced by make_geoframe). It is modified in place; six columns
        are added: 'start neighborhood', 'start borough',
        'start median income', 'end neighborhood', 'end borough',
        'end median income'.
    neighborhoods: optional GeoDataFrame with a geometry plus 'Neighborhood',
        'Borough' and 'MdHHIncE' columns. Defaults to the notebook-level
        ``result`` frame, which keeps existing one-argument calls working.

    Returns None. Prints the elapsed wall-clock time.
    """
    star = time.time()

    # Fall back to the merged NTA/economic frame built earlier in the notebook.
    if neighborhoods is None:
        neighborhoods = result

    # Two GeoDataFrames holding only the start points / end points as geometry.
    start_points = gpd.GeoDataFrame(data['start point'], geometry=data['start point'])
    end_points = gpd.GeoDataFrame(data['end point'], geometry=data['end point'])

    # Placeholder columns. They share data.index so that the final column
    # assignments line up row-for-row even when ``data`` does not carry a
    # default 0..n-1 index (the previous RangeIndex holders silently
    # misaligned in that case).
    start_neighborhood = pd.Series('NA', index=data.index)
    end_neighborhood = pd.Series('NA', index=data.index)
    start_borough = pd.Series('NA', index=data.index)
    end_borough = pd.Series('NA', index=data.index)
    start_neighborhood_MdHHIncE = pd.Series(0, index=data.index)
    end_neighborhood_MdHHIncE = pd.Series(0, index=data.index)

    # Walk the neighborhoods row by row; zipping the columns pairs each
    # geometry with its own attributes without a hand-maintained counter or
    # label lookups into the frame.
    per_neighborhood = zip(
        neighborhoods.geometry,
        neighborhoods['Neighborhood'],
        neighborhoods['Borough'],
        neighborhoods['MdHHIncE'],
    )
    for shape, name, borough, income in per_neighborhood:
        # Positional boolean masks: True where the trip's point lies inside
        # the current neighborhood polygon.
        in_start = np.asarray(start_points.within(shape), dtype=bool)
        in_end = np.asarray(end_points.within(shape), dtype=bool)

        start_neighborhood.loc[in_start] = name
        start_borough.loc[in_start] = borough
        start_neighborhood_MdHHIncE.loc[in_start] = income

        end_neighborhood.loc[in_end] = name
        end_borough.loc[in_end] = borough
        end_neighborhood_MdHHIncE.loc[in_end] = income

    # Add columns with NTA info to 'data'
    data['start neighborhood'] = start_neighborhood
    data['start borough'] = start_borough
    data['start median income'] = start_neighborhood_MdHHIncE

    data['end neighborhood'] = end_neighborhood
    data['end borough'] = end_borough
    data['end median income'] = end_neighborhood_MdHHIncE

    end = time.time()
    print('took', end - star, 'seconds to run...')

In [455]:
#Add neighborhoods and Median Income to April CSV

April=pd.read_csv('FinalData/April_2016.csv')
# Lower-case April's own column names (this line previously referenced the
# August frame, which is not defined yet at this point and left April's
# headers untouched). make_geoframe looks up lower-case column names such as
# 'start station longitude'.
April.columns = map(str.lower, April.columns)
April = make_geoframe(April)
Add_Start_Station_Finance_And_Neighborhood12(April)

#Surprisingly, some points are not contained in neighborhoods! (A few stations are located in Jersey.
#We did not study these stations, which was fine since Jersey bikes rarely enter NYC and vice-versa.)
#We dropped the rows with stations located in Jersey; those rows still carry the 'NA' placeholder.

April = April[April['end neighborhood'] != 'NA']
April = April[April['start neighborhood'] != 'NA']
April=April.reset_index(drop=True)
April.to_csv('April_2016_MDincome.csv', index = False)

took 111.8298659324646 seconds to run...
took 5933.925531864166 seconds to run...


In [456]:
# August 2016: attach neighborhoods, boroughs and median incomes, then save.

August = pd.read_csv('FinalData/August_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
August.columns = [name.lower() for name in August.columns]
August = make_geoframe(August)
Add_Start_Station_Finance_And_Neighborhood12(August)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (August['end neighborhood'] != 'NA') & (August['start neighborhood'] != 'NA')
August = August[inside_nta].reset_index(drop=True)
August.to_csv('August_2016_MDincome.csv', index=False)

took 143.19871520996094 seconds to run...
took 5914.577415943146 seconds to run...


In [457]:
# December 2016: attach neighborhoods, boroughs and median incomes, then save.

December = pd.read_csv('FinalData/December_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
December.columns = [name.lower() for name in December.columns]
December = make_geoframe(December)
Add_Start_Station_Finance_And_Neighborhood12(December)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (December['end neighborhood'] != 'NA') & (December['start neighborhood'] != 'NA')
December = December[inside_nta].reset_index(drop=True)
December.to_csv('December_2016_MDincome.csv', index=False)

took 100.42866826057434 seconds to run...
took 5201.549489021301 seconds to run...


In [458]:
# February 2016: attach neighborhoods, boroughs and median incomes, then save.

February = pd.read_csv('FinalData/February_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
February.columns = [name.lower() for name in February.columns]
February = make_geoframe(February)
Add_Start_Station_Finance_And_Neighborhood12(February)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (February['end neighborhood'] != 'NA') & (February['start neighborhood'] != 'NA')
February = February[inside_nta].reset_index(drop=True)
February.to_csv('February_2016_MDincome.csv', index=False)

took 64.71481895446777 seconds to run...
took 3349.4047288894653 seconds to run...


In [459]:
# January 2016: attach neighborhoods, boroughs and median incomes, then save.

January = pd.read_csv('FinalData/January_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
January.columns = [name.lower() for name in January.columns]
January = make_geoframe(January)
Add_Start_Station_Finance_And_Neighborhood12(January)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (January['end neighborhood'] != 'NA') & (January['start neighborhood'] != 'NA')
January = January[inside_nta].reset_index(drop=True)
January.to_csv('January_2016_MDincome.csv', index=False)

took 60.362560987472534 seconds to run...
took 3047.705817937851 seconds to run...


In [460]:
# July 2016: attach neighborhoods, boroughs and median incomes, then save.

July = pd.read_csv('FinalData/July_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
July.columns = [name.lower() for name in July.columns]
July = make_geoframe(July)
Add_Start_Station_Finance_And_Neighborhood12(July)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (July['end neighborhood'] != 'NA') & (July['start neighborhood'] != 'NA')
July = July[inside_nta].reset_index(drop=True)
July.to_csv('July_2016_MDincome.csv', index=False)

took 148.92507600784302 seconds to run...
took 7696.264832019806 seconds to run...


In [461]:
# June 2016: attach neighborhoods, boroughs and median incomes, then save.

June = pd.read_csv('FinalData/June_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
June.columns = [name.lower() for name in June.columns]
June = make_geoframe(June)
Add_Start_Station_Finance_And_Neighborhood12(June)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (June['end neighborhood'] != 'NA') & (June['start neighborhood'] != 'NA')
June = June[inside_nta].reset_index(drop=True)
June.to_csv('June_2016_MDincome.csv', index=False)

took 163.55002188682556 seconds to run...
took 8273.517446279526 seconds to run...


In [462]:
# March 2016: attach neighborhoods, boroughs and median incomes, then save.

March = pd.read_csv('FinalData/March_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
March.columns = [name.lower() for name in March.columns]
March = make_geoframe(March)
Add_Start_Station_Finance_And_Neighborhood12(March)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (March['end neighborhood'] != 'NA') & (March['start neighborhood'] != 'NA')
March = March[inside_nta].reset_index(drop=True)
March.to_csv('March_2016_MDincome.csv', index=False)

took 101.18156123161316 seconds to run...
took 5306.189533948898 seconds to run...


In [463]:
# May 2016: attach neighborhoods, boroughs and median incomes, then save.

May = pd.read_csv('FinalData/May_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
May.columns = [name.lower() for name in May.columns]
May = make_geoframe(May)
Add_Start_Station_Finance_And_Neighborhood12(May)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (May['end neighborhood'] != 'NA') & (May['start neighborhood'] != 'NA')
May = May[inside_nta].reset_index(drop=True)
May.to_csv('May_2016_MDincome.csv', index=False)

took 148.23473572731018 seconds to run...
took 6671.4576761722565 seconds to run...


In [464]:
# November 2016: attach neighborhoods, boroughs and median incomes, then save.

November = pd.read_csv('FinalData/November_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
November.columns = [name.lower() for name in November.columns]
November = make_geoframe(November)
Add_Start_Station_Finance_And_Neighborhood12(November)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (November['end neighborhood'] != 'NA') & (November['start neighborhood'] != 'NA')
November = November[inside_nta].reset_index(drop=True)
November.to_csv('November_2016_MDincome.csv', index=False)

took 137.36907410621643 seconds to run...
took 7348.719714164734 seconds to run...


In [466]:
# October 2016: attach neighborhoods, boroughs and median incomes, then save.

October = pd.read_csv('FinalData/October_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
October.columns = [name.lower() for name in October.columns]
October = make_geoframe(October)
Add_Start_Station_Finance_And_Neighborhood12(October)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (October['end neighborhood'] != 'NA') & (October['start neighborhood'] != 'NA')
October = October[inside_nta].reset_index(drop=True)
October.to_csv('October_2016_MDincome.csv', index=False)

took 245.97137904167175 seconds to run...
took 9540.511973142624 seconds to run...


In [468]:
# September 2016: attach neighborhoods, boroughs and median incomes, then save.

September = pd.read_csv('FinalData/September_2016.csv')
# Normalise the headers to lower case before the geometry helpers read them.
September.columns = [name.lower() for name in September.columns]
September = make_geoframe(September)
Add_Start_Station_Finance_And_Neighborhood12(September)

# Drop trips whose start or end station matched no neighborhood ('NA' placeholder).
inside_nta = (September['end neighborhood'] != 'NA') & (September['start neighborhood'] != 'NA')
September = September[inside_nta].reset_index(drop=True)
September.to_csv('September_2016_MDincome.csv', index=False)

took 437.0577619075775 seconds to run...
took 12574.982161998749 seconds to run...
