In [40]:
import pandas as pd
import numpy as np
import fastparquet
import geopandas as gpd
from shapely.geometry import Point

In [41]:
df_raw = pd.read_csv("../../data/philadelphia_2016.csv")

In [42]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 655058 entries, 0 to 655057
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   start_time          655058 non-null  object 
 1   end_time            655058 non-null  object 
 2   start_station_id    655048 non-null  float64
 3   end_station_id      655058 non-null  int64  
 4   bike_id             655058 non-null  int64  
 5   user_type           655058 non-null  object 
 6   end_station_name    655058 non-null  object 
 7   start_station_name  655048 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 40.0+ MB


You can see from the df_raw.info(), that for start and end station id and name there are 10 missing rows, otherwise there aren't any, in addition the that start_station_id is a float when in reality it should be int, due to having no decimal place.
Also the times are not formatted in the proper datetime format.

In [43]:
#Only take the rows that are not na in name or id
df = df_raw[df_raw['start_station_id'].notna() & df_raw['start_station_name'].notna()].copy()
df['start_station_id'] = df['start_station_id'].astype('int64')

#Format to Datetime
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 655048 entries, 0 to 655057
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   start_time          655048 non-null  datetime64[ns]
 1   end_time            655048 non-null  datetime64[ns]
 2   start_station_id    655048 non-null  int64         
 3   end_station_id      655048 non-null  int64         
 4   bike_id             655048 non-null  int64         
 5   user_type           655048 non-null  object        
 6   end_station_name    655048 non-null  object        
 7   start_station_name  655048 non-null  object        
dtypes: datetime64[ns](2), int64(3), object(3)
memory usage: 45.0+ MB


You can see now, that there are no null values and everything is formatted correctly

In [45]:
df['duration'] = (df['end_time'] - df['start_time'])

print(f"Shortest: {df['duration'].min()}")
print(f"Longest: {df['duration'].max()}")

Shortest: -1 days +23:04:00
Longest: 19 days 00:59:00


In [46]:
#Remove all the invalid times
df = df[(df['duration'] <= pd.Timedelta("1d")) & (df['duration'] >= pd.Timedelta("1m")) | ((df['duration'] <= pd.Timedelta("5m")) & (df['start_station_id'] == df['end_station_id']))]
#Also Remove Station 3000 Called "Virtual Station" supposedly for Test Trips
df = df[(df['start_station_id'] != 3000) & (df['end_station_id'] != 3000)]

In [47]:
#Look at different Types
df['user_type'].unique()

array(['Indego30', 'Walk-up', 'IndegoFlex'], dtype=object)

In [48]:
df_stations = pd.read_parquet('../../data/stations.parquet')
del df_stations['name']

Merge Station data into the dataframe

In [49]:
df = df.merge(df_stations, left_on='start_station_id', right_on='id')
del df['id']
df['start_lat'] = df['lat']
del df['lat']
df['start_lon'] = df['lon']
del df['lon']

df = df.merge(df_stations, left_on='end_station_id', right_on='id')
del df['id']
df['end_lat'] = df['lat']
del df['lat']
df['end_lon'] = df['lon']
del df['lon']


Use Haversine function to calculate dist, between two GPS points

In [50]:
# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [51]:
df["distance"] = haversine(
    df["start_lat"],
    df["start_lon"],
    df["end_lat"],
    df["end_lon"],
)

print(f"Smallest distance: {df['distance'].min()} km")
print(f"Greatest distance: {df['distance'].max()} km")

Smallest distance: 0.0 km
Greatest distance: 15.114165029636501 km


Calculate Speed to filter out unrealistic values. The maximum allowed speed of an ebike in the US is 20 mph, assuming the person has done no stops and as we are calculating the airline, these trips are almost certain to be faulty

In [52]:
df["speed"] = df["distance"] / df["duration"].apply(
    lambda duration: duration.total_seconds() / (60 * 60)
)

max_allowed_kmh = 20 * 1.60934 # 20mp/h in km/h

df = df[df['speed'] < max_allowed_kmh]

Calculate the "average timepoint" of every ride, this is a simplification to not have to juggle start and end time all the time

In [53]:
df['average_time'] = df['end_time'] - (df['duration'] / 2)

Because there are weather datapoints missing, merge to the nearest, with pd.merge_asof

In [54]:
df_weather = pd.read_parquet("../../data/weather_hourly_philadelphia_cleaned.parquet")

df_weather= df_weather.reset_index()
df_weather.head()

df = df.sort_values('average_time')
df_weather_and_trips = pd.merge_asof(df, df_weather, left_on='average_time', right_on='date_time', direction='nearest')
del df['average_time']
df_weather_and_trips.to_parquet('../../data/bike_trips_cleaned.parquet')

In [55]:
## Neighborhood Feature

In [56]:
df_bike_trips = df_weather_and_trips
df_bike_trips.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,end_station_name,start_station_name,duration,start_lat,start_lon,end_lat,end_lon,distance,speed,average_time,date_time,max_temp,min_temp,precip
0,2016-01-01 00:04:00,2016-01-01 00:14:00,3046,3041,3564,Indego30,"Girard Station, MFL",2nd & Market,0 days 00:10:00,39.949895,-75.143749,39.969241,-75.13618,1.005648,6.033888,2016-01-01 00:09:00,2016-01-01 00:00:00,6.7,6.7,0.0
1,2016-01-01 00:07:00,2016-01-01 00:29:00,3028,3019,2629,Indego30,6th & Race,4th & Bainbridge,0 days 00:22:00,39.940735,-75.14937,39.954694,-75.149496,0.397798,1.084905,2016-01-01 00:18:00,2016-01-01 00:00:00,6.7,6.7,0.0
2,2016-01-01 00:14:00,2016-01-01 00:27:00,3007,3066,3430,Indego30,19th & Lombard,"11th & Pine, Kahn Park",0 days 00:13:00,39.945123,-75.159952,39.945664,-75.173272,1.480278,6.83205,2016-01-01 00:20:30,2016-01-01 00:00:00,6.7,6.7,0.0
3,2016-01-01 00:20:00,2016-01-01 00:30:00,3045,3028,3715,Indego30,4th & Bainbridge,13th & Locust,0 days 00:10:00,39.947947,-75.162361,39.940735,-75.14937,1.458133,8.748797,2016-01-01 00:25:00,2016-01-01 00:00:00,6.7,6.7,0.0
4,2016-01-01 00:20:00,2016-01-01 00:42:00,3005,3064,3351,Walk-up,"18th & Washington, Chew Playground","Welcome Park, NPS",0 days 00:22:00,39.947383,-75.144145,39.93828,-75.173873,3.313616,9.037136,2016-01-01 00:31:00,2016-01-01 01:00:00,7.2,7.2,0.0


In [57]:
# von github? find source
neighborhoods= gpd.read_file('../../data/Neighborhoods_Philadelphia.geojson')


In [58]:
neighborhoods = neighborhoods.reset_index()

In [59]:
# create an empty dictionary to store the mapping
neighborhood_map = {}


# iterate over the rows of the DataFrame
for index, row in neighborhoods.iterrows():
    neighborhood_index = row['index']
    neighborhood_name = row['listname']
    # add the mapping to the dictionary
    neighborhood_map[neighborhood_name] = neighborhood_index

neighborhood_map["None"]=158
print(neighborhood_map)
%store neighborhood_map

{'Pennypack Park': 0, 'Overbrook': 1, 'Germantown, Southwest': 2, 'East Parkside': 3, 'Germany Hill': 4, 'Mount Airy, East': 5, 'Mechanicsville': 6, 'Dearnley Park': 7, 'Wissahickon Hills': 8, 'Wissinoming': 9, 'Bella Vista': 10, 'Allegheny West': 11, 'Glenwood': 12, 'Greenwich': 13, 'Francisville': 14, 'Penrose': 15, 'Powelton': 16, 'Garden Court': 17, 'Belmont': 18, 'Normandy Village': 19, 'Pennsport': 20, 'Stanton': 21, 'West Oak Lane': 22, 'Winchester Park': 23, 'Burholme': 24, 'Fern Rock': 25, 'Dunlap': 26, 'Melrose Park Gardens': 27, 'Franklinville': 28, 'Hawthorne': 29, 'Bartram Village': 30, 'East Poplar': 31, 'Bustleton': 32, 'Cedarbrook': 33, 'Ludlow': 34, 'Bridesburg': 35, 'East Falls': 36, 'North Central': 37, 'Port Richmond': 38, 'Grays Ferry': 39, 'Frankford': 40, 'Chestnut Hill': 41, 'Olney': 42, 'Mount Airy, West': 43, 'Somerton': 44, 'Mantua': 45, 'Northeast Phila Airport': 46, 'Modena': 47, 'Roxborough Park': 48, 'Wissahickon Park': 49, 'Morrell Park': 50, 'Riverfront

In [60]:
# add columns for geojson 
df_bike_trips['start_geometry'] = df_bike_trips.apply(lambda row: Point(np.array([row['start_lon'], row['start_lat']])), axis=1)
# df_bike_trips['end_geometry'] = df_bike_trips.apply(lambda row: Point(np.array([row['end_lon'], row['end_lat']])), axis=1)

  arr = construct_1d_object_array_from_listlike(values)


In [61]:
# adjust crs of df_bike_trips to prepare for merge
gdf = gpd.GeoDataFrame(df_bike_trips, geometry='start_geometry')
gdf.crs = 'epsg:4326'
print(neighborhoods.crs)
print(gdf.crs)

epsg:4326
epsg:4326


In [62]:
merged = gpd.sjoin(gdf, neighborhoods, op='within')

In [63]:
merged.tail()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,end_station_name,start_station_name,duration,start_lat,...,index_right,index,name,listname,mapname,shape_leng,shape_area,cartodb_id,created_at,updated_at
644034,2016-12-28 09:42:00,2016-12-28 10:07:00,3117,3119,2633,Indego30,42nd & Lancaster,ParkWest Town Center,0 days 00:25:00,39.978087,...,126,126,WEST_PARKSIDE,West Parkside,West Parkside,11392.249332,7502885.0,132,2013-03-19T17:41:50,2013-03-19T17:41:50
644861,2016-12-29 05:04:00,2016-12-29 05:24:00,3117,3020,2502,Indego30,University City Station,ParkWest Town Center,0 days 00:20:00,39.978087,...,126,126,WEST_PARKSIDE,West Parkside,West Parkside,11392.249332,7502885.0,132,2013-03-19T17:41:50,2013-03-19T17:41:50
645108,2016-12-29 15:29:00,2016-12-29 16:04:00,3117,3056,3279,Indego30,Broad & Oxford,ParkWest Town Center,0 days 00:35:00,39.978087,...,126,126,WEST_PARKSIDE,West Parkside,West Parkside,11392.249332,7502885.0,132,2013-03-19T17:41:50,2013-03-19T17:41:50
646038,2016-12-30 15:24:00,2016-12-30 15:34:00,3117,3111,2611,Indego30,"Parkside & Belmont, Case Building",ParkWest Town Center,0 days 00:10:00,39.978087,...,126,126,WEST_PARKSIDE,West Parkside,West Parkside,11392.249332,7502885.0,132,2013-03-19T17:41:50,2013-03-19T17:41:50
646374,2016-12-30 22:44:00,2016-12-30 23:08:00,3117,3078,11004,Indego30,19th & Market,ParkWest Town Center,0 days 00:24:00,39.978087,...,126,126,WEST_PARKSIDE,West Parkside,West Parkside,11392.249332,7502885.0,132,2013-03-19T17:41:50,2013-03-19T17:41:50


In [64]:
most_common_listname = merged.groupby(df_bike_trips['start_time'].dt.floor('H'))['listname'].agg(lambda x:x.value_counts().index[0])
# most_common_listname = merged.groupby(df_bike_trips['start_time'].dt.floor('H'))['name'].agg(lambda x:x.value_counts().index[0])

In [65]:
print(len(most_common_listname))

8534


In [66]:
# most_common_listname.head()
most_common_listname.isna().sum()

0

In [67]:
# most_common_listname.head()
most_common_listname = most_common_listname.to_frame()
most_common_listname.to_parquet('../../data/most_common_starting_neighborhoods.parquet')
