In [252]:
import pandas as pd 
import numpy as np

# Let pandas render up to 500 rows and 500 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [170]:
# Load the docked bikeshare station locations file (path is relative to the notebook).
df = pd.read_csv('../data/Locations_of_Docked_Bikeshare_Stations_by_System_and_Year.csv')

In [171]:
# Replace the column labels with lower-case names.
# The assignment is positional: the file must have exactly these 16 columns,
# in this order, otherwise pandas raises a length-mismatch error.
df.columns = ['the_geom', 'id', 'fac_id', 'bike_id', 'system_id', 'system_name',
       'year', 'asofdate', 'fac_name', 'address', 'city', 'state', 'zipcode',
       'cbsa_code', 'longitude', 'latitude']

In [172]:
# Keep only station records from years before 2020.
df = df.loc[df['year'] < 2020]

In [173]:
# Fix issue with address column - a portion of the dataframe has an extra
# column of 0s shifting everything off. Those rows are recognisable by a
# 'zipcode' of 0, so split them out (df2) from the well-formed rows (df1).
has_shifted_columns = df['zipcode'] == 0
df1 = df[~has_shifted_columns]
df2 = df[has_shifted_columns]

In [174]:
# Relabel df2 by position so its values line up with their real meaning:
# the columns previously labelled address, city, state and zipcode become
# city, state, zipcode and to_delete respectively (the last one holds the
# 0s that df2 was selected on). All other labels keep their position.
df2.columns = ['the_geom', 'id', 'fac_id', 'bike_id', 'system_id', 'system_name',
       'year', 'asofdate', 'fac_name', 'city', 'state', 'zipcode','to_delete',
       'cbsa_code', 'longitude', 'latitude']

In [175]:
# Remove the column each half does not share with the other, so both end up
# with the same set of column names.
df2 = df2.drop(columns='to_delete')
df1 = df1.drop(columns='address')

In [176]:
# Stack the two halves back into one frame; concat aligns columns by name,
# so the differing column order of df1 and df2 does not matter.
df = pd.concat([df1, df2], axis='index')

In [177]:
# Fill missing system names with the Hubway / Blue Bikes label.
# NOTE(review): this assumes every row with a missing system_name belongs to
# that system - confirm against the data.
df = df.fillna({'system_name': 'Hubway (03/2018 re-launched as Blue Bikes)'})

In [178]:
#Fix zips - make strings, restore leading zeros, delete zips associated with 2 states
# zfill(5) left-pads to five characters. Prepending a single "0" only repaired
# zips that lost one leading zero; a value such as 501 became "0501" instead
# of "00501". Values already 5+ characters long are left untouched.
df['zipcode'] = df['zipcode'].astype('str').str.zfill(5)

# Count the distinct states each zip appears under; a zip listed under more
# than one state is ambiguous, so every row for such a zip is dropped.
temp = df.groupby('zipcode')['state'].nunique().reset_index()
zips_2_states = temp.loc[temp['state'] > 1, 'zipcode']
df = df[~df['zipcode'].isin(zips_2_states)]

In [179]:
# Inspect the distinct as-of dates present in the station data.
df['asofdate'].unique()

array([201512, 201612, 201712, 201812, 201912])

In [180]:
# bs_year_usable = the first year in which a snapshot can be used as a feature.
# A year's data only becomes available in December of that year (e.g. the
# 2018 data is the most recent available at the start of 2019), hence year + 1.
df = df.assign(bs_year_usable=df['year'] + 1)
df['bs_year_usable'].unique()

array([2016, 2017, 2018, 2019, 2020])

In [189]:
# Drop snapshots that only become usable in 2020 or later.
df = df.loc[df['bs_year_usable'] < 2020]

In [190]:
# Distinct usable years that remain in the data.
df['bs_year_usable'].unique()

array([2016, 2017, 2018, 2019])

In [212]:
# Aggregate to one row per (zipcode, year, asofdate, bs_year_usable) with 2 features:
#   bs_total_stations - number of distinct docked bike stations (fac_id) in the zip code
#   bs_total_systems  - number of distinct bike sharing systems (system_id) in the zip code
#                       -- only 40 zips have more than 1 brand
# The remaining key columns get a 'bs_' prefix; bs_year_usable already has it.
bikes = (
    df.groupby(['zipcode', 'year', 'asofdate', 'bs_year_usable'])
      .agg(bs_total_stations=('fac_id', 'nunique'),
           bs_total_systems=('system_id', 'nunique'))
      .reset_index()
      .rename(columns={'zipcode': 'bs_zipcode',
                       'year': 'bs_year',
                       'asofdate': 'bs_asofdate'})
)

In [231]:
# Cast the three year-like columns to pandas' string dtype in one pass.
bikes = bikes.astype({
    'bs_year': 'string',
    'bs_asofdate': 'string',
    'bs_year_usable': 'string',
})


In [232]:
# Number of distinct zip codes in the bike-share feature table.
bikes['bs_zipcode'].nunique()

1489

In [233]:
# Load the ZRI table and normalise the columns used as merge keys.
zri = pd.read_csv('../data/zri_multifamily_v2.csv')

# Zip codes as 5-character strings. zfill(5) restores every missing leading
# zero (a single prepended "0" would turn 501 into "0501", not "00501").
# astype(str) keeps the column as plain object strings, as before.
# NOTE(review): presumably 'zip' is read as integers with no missing values - confirm.
zri['zip'] = zri['zip'].astype(str).str.zfill(5)

# Parse the month once, then derive the calendar year from it as a string so
# it can be compared with the string-typed bike-share year columns.
zri['year-month'] = pd.to_datetime(zri['year-month'], format="%Y-%m")
zri['year'] = zri['year-month'].dt.year.astype('string')

In [234]:
# Number of distinct zip codes in the ZRI table.
zri['zip'].nunique()

1302

In [236]:
# Row count of the ZRI merge-key columns (zip, year).
zri[['zip','year']].shape

(93744, 2)

In [237]:
# Review dtypes and non-null counts of the bike-share feature table.
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4662 entries, 0 to 4661
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   bs_zipcode         4662 non-null   object
 1   bs_year            4662 non-null   string
 2   bs_asofdate        4662 non-null   string
 3   bs_year_usable     4662 non-null   string
 4   bs_total_stations  4662 non-null   int64 
 5   bs_total_systems   4662 non-null   int64 
dtypes: int64(2), object(1), string(3)
memory usage: 218.7+ KB


In [238]:
# Review dtypes and non-null counts of the ZRI table.
zri.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93744 entries, 0 to 93743
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   zip         93744 non-null  object        
 1   City        93744 non-null  object        
 2   State       93744 non-null  object        
 3   Metro       93672 non-null  object        
 4   CountyName  93744 non-null  object        
 5   year-month  93744 non-null  datetime64[ns]
 6   zri         93744 non-null  float64       
 7   year        93744 non-null  string        
dtypes: datetime64[ns](1), float64(1), object(5), string(1)
memory usage: 5.7+ MB


In [247]:
# Left-join the bike-share features onto every ZRI row: a ZRI row for
# (zip, year) receives the bike data whose bs_year_usable equals that year,
# and ZRI rows without a match keep NaN in the bs_* columns.
# validate='many_to_one' makes pandas raise a MergeError if `bikes` ever holds
# more than one row per (bs_zipcode, bs_year_usable), instead of silently
# duplicating ZRI rows.
zri_bikes = pd.merge(
    zri,
    bikes,
    how='left',
    left_on=['zip', 'year'],
    right_on=['bs_zipcode', 'bs_year_usable'],
    validate='many_to_one',
)

In [None]:
# Of the 93,744 rows of zips and years, 13,968 have information about bikes while the other 79,776 are null.

In [256]:
# Spot-check four random rows of the merged frame (no random_state is set,
# so the rows shown differ from run to run).
zri_bikes.sample(4)

Unnamed: 0,zip,City,State,Metro,CountyName,year-month,zri,year,bs_zipcode,bs_year,bs_asofdate,bs_year_usable,bs_total_stations,bs_total_systems
46575,33145,Miami,FL,Miami-Fort Lauderdale-West Palm Beach,Miami-Dade County,2019-04-01,2013.0,2019,33145.0,2018.0,201812.0,2019.0,1.0,1.0
93741,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,2019-10-01,1299.0,2019,,,,,,
17774,10463,New York,NY,New York-Newark-Jersey City,New York County,2019-03-01,1787.0,2019,,,,,,
21639,11225,New York,NY,New York-Newark-Jersey City,Kings County,2017-04-01,2264.0,2017,,,,,,


In [263]:
# The bike-side key columns were only needed for the join; keep the ZRI
# columns plus the two bike-share count features.
bikes_features = zri_bikes.drop(
    columns=['bs_zipcode', 'bs_year', 'bs_asofdate', 'bs_year_usable']
)

In [287]:
# ZRI rows without a bike-share match get 0 stations and 0 systems.
# Only the two count columns are filled: a blanket fillna(0) would also
# replace genuinely missing values in other columns (Metro has nulls) with 0.
bikes_features = bikes_features.fillna(
    {'bs_total_stations': 0, 'bs_total_systems': 0}
)

In [293]:
# Boolean flag: True where the zip/year has at least one docked station.
bikes_features['has_bike_sharing'] = bikes_features['bs_total_stations'].gt(0)

In [322]:
# Preview the first two rows of the engineered feature table.
bikes_features.head(2)

Unnamed: 0,zip,City,State,Metro,CountyName,year-month,zri,year,bs_total_stations,bs_total_systems,has_bike_sharing
0,1013,Chicopee,MA,Springfield,Hampden County,2014-01-01,928.0,2014,0.0,0.0,False
1,1013,Chicopee,MA,Springfield,Hampden County,2014-02-01,931.0,2014,0.0,0.0,False


In [320]:
#bikes_features[(bikes_features['has_bike_sharing']==True) & (bikes_features['year']=='2014')]['zip'].nunique()

In [321]:
#2014 usable (data from 2013) --> data unavailable, assume no bike sharing
#2015 usable (data from 2014) --> data unavailable, assume no bike sharing 
#2016 usable (data from 2015), there are 160 zips with bike sharing
#2017 usable (data from 2016), there are 292 zips with bike sharing
#2018 usable (data from 2017), there are 340 zips with bike sharing
#2019 usable (data from 2018), there are 372 zips with bike sharing

In [299]:
# Write the engineered features to disk.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; confirm whether downstream readers expect it.
bikes_features.to_csv('./../data/bikeshare_engineered_features.csv')