In [3]:
import pandas as pd 
import numpy as np

# Raise pandas' display limits to 500 rows / 500 columns for interactive inspection.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [4]:
# Load the raw docked-bikeshare station locations from the data directory (relative path).
df = pd.read_csv(
    filepath_or_buffer='../data/Locations_of_Docked_Bikeshare_Stations_by_System_and_Year.csv'
)

In [5]:
# Replace the CSV header with lower-case column names.
# The assignment is positional: it only works if the file has exactly these
# 16 columns, in this order.
df.columns = [
    'the_geom', 'id', 'fac_id', 'bike_id',
    'system_id', 'system_name', 'year', 'asofdate',
    'fac_name', 'address', 'city', 'state',
    'zipcode', 'cbsa_code', 'longitude', 'latitude',
]

In [6]:
# Keep only the records whose year is earlier than 2020.
df = df.loc[df['year'].lt(2020)]

In [7]:
# Fix issue with address column - a portion of the dataframe has an extra column
# of 0s shifting everything off. Rows where 'zipcode' reads 0 are the shifted
# ones (df2); all remaining rows are kept as they are (df1).
shifted_rows = df['zipcode'].eq(0)
df1 = df[~shifted_rows]
df2 = df[shifted_rows]

In [8]:
# Relabel the shifted rows: the four positions that were read as
# address/city/state/zipcode become city/state/zipcode/to_delete, so each
# value sits under its real name and the filler column can be dropped.
df2.columns = [
    'the_geom', 'id', 'fac_id', 'bike_id',
    'system_id', 'system_name', 'year', 'asofdate',
    'fac_name', 'city', 'state', 'zipcode',
    'to_delete', 'cbsa_code', 'longitude', 'latitude',
]

In [9]:
# Drop the column that is not shared by both halves: 'address' from the
# regular rows and the filler 'to_delete' from the shifted rows.
df1 = df1.drop(columns='address')
df2 = df2.drop(columns='to_delete')

In [10]:
# Stack the two halves back into one frame (row labels are kept, not renumbered).
df = pd.concat((df1, df2), axis='index')

In [11]:
#Fill NA values: every missing system name is labelled as Hubway / Blue Bikes
# NOTE(review): assumes all unnamed systems belong to Hubway - confirm against the raw data
df = df.fillna({'system_name': 'Hubway (03/2018 re-launched as Blue Bikes)'})

In [12]:
#Fix zips - make strings, left-pad to 5 characters, delete zips associated with 2 states
# zfill(5) restores every dropped leading zero (e.g. '601' -> '00601');
# prepending a single "0" would leave 3-digit values one character short.
df['zipcode'] = df['zipcode'].astype('str').str.zfill(5)

# Count the distinct states seen for each zip and drop every zip that maps to
# more than one state.
states_per_zip = df.groupby('zipcode')['state'].nunique()
zips_2_states = states_per_zip[states_per_zip > 1].index
df = df[~df['zipcode'].isin(zips_2_states)]

In [13]:
# Distinct snapshot dates remaining after cleaning.
pd.unique(df['asofdate'])

array([201512, 201612, 201712, 201812, 201912])

In [14]:
#year usable - each year's data is only usable in the following year, e.g. at the
#start of 2019 the most recent data available is 2018 (made available December of 2018)
# `df` is the result of a boolean filter, so the new column is added with assign()
# (returns a new frame) rather than item assignment, which can raise
# SettingWithCopyWarning on a filtered frame.
df = df.assign(bs_year_usable=df['year'] + 1)
df['bs_year_usable'].unique()

array([2016, 2017, 2018, 2019, 2020])

In [15]:
# Keep only rows whose usable year is earlier than 2020.
df = df.loc[df['bs_year_usable'].lt(2020)]

In [16]:
#Resulting usable years after the filter
pd.unique(df['bs_year_usable'])

array([2016, 2017, 2018, 2019])

In [17]:
#Create 2 features per zip code and year:
# 1) Number of docked bike stations per zip code (bs_total_stations) = distinct fac_id
# 2) Number of bike sharing systems per zip code (bs_total_systems) = distinct system_id
#    -- only 40 zips have more than 1 brand
# Named aggregation sets the feature names directly; the grouping keys get the
# bs_ prefix afterwards (bs_year_usable already carries it).
bikes = (
    df.groupby(['zipcode', 'year', 'asofdate', 'bs_year_usable'])
      .agg(
          bs_total_stations=('fac_id', 'nunique'),
          bs_total_systems=('system_id', 'nunique'),
      )
      .reset_index()
      .rename(columns={
          'zipcode': 'bs_zipcode',
          'year': 'bs_year',
          'asofdate': 'bs_asofdate',
      })
)

In [18]:
#Convert the year-like columns to pandas' string dtype in a single cast
bikes = bikes.astype({
    'bs_year': 'string',
    'bs_asofdate': 'string',
    'bs_year_usable': 'string',
})


In [19]:
# Number of distinct zip codes that have bikeshare features.
bikes['bs_zipcode'].nunique(dropna=True)

1489

In [20]:
# Load the ZRI multifamily series and derive the join keys used later.
zri = pd.read_csv('../data/zri_multifamily_v2.csv')

# Zip codes as 5-character strings. zfill(5) restores every dropped leading zero,
# whereas prepending a single "0" leaves 3-digit values one character short.
# astype(str) keeps the column as plain object dtype.
zri['zip'] = zri['zip'].astype(str).str.zfill(5)

# Parse 'year-month' once, then take the calendar year from it as a string.
zri['year-month'] = pd.to_datetime(zri['year-month'], format="%Y-%m")
zri['year'] = zri['year-month'].dt.year.astype('string')

In [21]:
# Number of distinct zip codes in the ZRI data.
zri['zip'].nunique(dropna=True)

1302

In [22]:
# Row count of the ZRI join keys (zip, year).
zri.loc[:, ['zip', 'year']].shape

(93744, 2)

In [23]:
# Inspect column dtypes and non-null counts of the aggregated bikeshare table.
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4662 entries, 0 to 4661
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   bs_zipcode         4662 non-null   object
 1   bs_year            4662 non-null   string
 2   bs_asofdate        4662 non-null   string
 3   bs_year_usable     4662 non-null   string
 4   bs_total_stations  4662 non-null   int64 
 5   bs_total_systems   4662 non-null   int64 
dtypes: int64(2), object(1), string(3)
memory usage: 218.7+ KB


In [24]:
# Inspect column dtypes and non-null counts of the ZRI table.
zri.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93744 entries, 0 to 93743
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   zip         93744 non-null  object        
 1   City        93744 non-null  object        
 2   State       93744 non-null  object        
 3   Metro       93672 non-null  object        
 4   CountyName  93744 non-null  object        
 5   year-month  93744 non-null  datetime64[ns]
 6   zri         93744 non-null  float64       
 7   year        93744 non-null  string        
dtypes: datetime64[ns](1), float64(1), object(5), string(1)
memory usage: 5.7+ MB


In [25]:
# Left-join the bikeshare features onto every ZRI row, matching the ZRI year to
# the year in which the bikeshare data becomes usable (bs_year_usable).
# validate='many_to_one' makes the merge raise if `bikes` ever holds more than one
# row per (bs_zipcode, bs_year_usable), instead of silently duplicating ZRI rows.
zri_bikes = pd.merge(
    zri,
    bikes,
    how='left',
    left_on=['zip', 'year'],
    right_on=['bs_zipcode', 'bs_year_usable'],
    validate='many_to_one',
)

In [26]:
#Of the 93,744 rows in the merged frame, 13,968 have bikeshare information while the other 79,776 are null

In [27]:
# Spot-check a few merged rows; the fixed seed keeps the displayed sample
# identical across reruns.
zri_bikes.sample(4, random_state=42)

Unnamed: 0,zip,City,State,Metro,CountyName,year-month,zri,year,bs_zipcode,bs_year,bs_asofdate,bs_year_usable,bs_total_stations,bs_total_systems
69513,78240,San Antonio,TX,San Antonio-New Braunfels,Bexar County,2016-10-01,1048.0,2016,,,,,,
53164,44111,Cleveland,OH,Cleveland-Elyria,Cuyahoga County,2016-05-01,733.0,2016,,,,,,
68921,78201,San Antonio,TX,San Antonio-New Braunfels,Bexar County,2015-06-01,831.0,2015,,,,,,
6570,2453,Waltham,MA,Boston-Cambridge-Newton,Middlesex County,2015-07-01,2285.0,2015,,,,,,


In [28]:
# The bikeshare key columns are only needed for the join; keep the ZRI columns
# plus the two bikeshare counts.
bikes_features = zri_bikes.drop(
    columns=['bs_zipcode', 'bs_year', 'bs_asofdate', 'bs_year_usable']
)

In [29]:
# Rows without a bikeshare match get zero stations / zero systems.
# Only the two count columns are filled: a frame-wide fillna(0) would also
# overwrite missing values in unrelated columns (e.g. 'Metro') with 0.
bikes_features = bikes_features.fillna({'bs_total_stations': 0, 'bs_total_systems': 0})

In [30]:
# Boolean flag: True where at least one docked station is recorded for the zip/year.
bikes_features['has_bike_sharing'] = bikes_features['bs_total_stations'].gt(0)

In [31]:
# Preview the first two rows of the engineered feature table.
bikes_features.iloc[:2]

Unnamed: 0,zip,City,State,Metro,CountyName,year-month,zri,year,bs_total_stations,bs_total_systems,has_bike_sharing
0,1013,Chicopee,MA,Springfield,Hampden County,2014-01-01,928.0,2014,0.0,0.0,False
1,1013,Chicopee,MA,Springfield,Hampden County,2014-02-01,931.0,2014,0.0,0.0,False


In [32]:
#bikes_features[(bikes_features['has_bike_sharing']==True) & (bikes_features['year']=='2014')]['zip'].nunique()

In [33]:
#Zips with bike sharing, by usable year (data from year N is usable in year N+1):
#2014 usable (data from 2013) --> data unavailable, assume no bike sharing
#2015 usable (data from 2014) --> data unavailable, assume no bike sharing
#2016 usable (data from 2015) --> 160 zips with bike sharing
#2017 usable (data from 2016) --> 292 zips with bike sharing
#2018 usable (data from 2017) --> 340 zips with bike sharing
#2019 usable (data from 2018) --> 372 zips with bike sharing

In [34]:
#bikes_features.to_csv('./../data/bikeshare_engineered_features.csv')

In [45]:
# Look at every row for a single zip code (07302) as a sanity check.
bikes_features.loc[bikes_features['zip'].eq("07302")]

Unnamed: 0,zip,City,State,Metro,CountyName,year-month,zri,year,bs_total_stations,bs_total_systems,has_bike_sharing
14256,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-01-01,2135.0,2014,0.0,0.0,False
14257,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-02-01,2135.0,2014,0.0,0.0,False
14258,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-03-01,2124.0,2014,0.0,0.0,False
14259,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-04-01,2121.0,2014,0.0,0.0,False
14260,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-05-01,2130.0,2014,0.0,0.0,False
14261,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-06-01,2137.0,2014,0.0,0.0,False
14262,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-07-01,2164.0,2014,0.0,0.0,False
14263,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-08-01,2204.0,2014,0.0,0.0,False
14264,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-09-01,2232.0,2014,0.0,0.0,False
14265,7302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,2014-10-01,2256.0,2014,0.0,0.0,False
