In [188]:
import pandas as pd
import numpy as np
import fastparquet

In [189]:
df_raw = pd.read_csv("../../data/philadelphia_2016_raw.csv")

In [190]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 655058 entries, 0 to 655057
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   start_time          655058 non-null  object 
 1   end_time            655058 non-null  object 
 2   start_station_id    655048 non-null  float64
 3   end_station_id      655058 non-null  int64  
 4   bike_id             655058 non-null  int64  
 5   user_type           655058 non-null  object 
 6   end_station_name    655058 non-null  object 
 7   start_station_name  655048 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 40.0+ MB


You can see from the df_raw.info(), that for start and end station id and name there are 10 missing rows, otherwise there aren't any, in addition the that start_station_id is a float when in reality it should be int, due to having no decimal place.
Also the times are not formatted in the proper datetime format.

In [191]:
#Only take the rows that are not na in name or id
df = df_raw[df_raw['start_station_id'].notna() & df_raw['start_station_name'].notna()]
df['start_station_id'] = df['start_station_id'].astype('int64')

#Format to Datetime
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start_station_id'] = df['start_station_id'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start_time'] = pd.to_datetime(df['start_time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end_time'] = pd.to_datetime(df['end_time'])


In [192]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 655048 entries, 0 to 655057
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   start_time          655048 non-null  datetime64[ns]
 1   end_time            655048 non-null  datetime64[ns]
 2   start_station_id    655048 non-null  int64         
 3   end_station_id      655048 non-null  int64         
 4   bike_id             655048 non-null  int64         
 5   user_type           655048 non-null  object        
 6   end_station_name    655048 non-null  object        
 7   start_station_name  655048 non-null  object        
dtypes: datetime64[ns](2), int64(3), object(3)
memory usage: 45.0+ MB


You can see now, that there are no null values and everything is formatted correctly

In [193]:
df['duration'] = (df['end_time'] - df['start_time'])

print(f"Shortest trip: {df['duration'].min()}")
print(f"Longest trip: {df['duration'].max()}")

Shortest trip: -1 days +23:04:00
Longest trip: 19 days 00:59:00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['duration'] = (df['end_time'] - df['start_time'])


In [194]:
#Remove all the invalid times
df = df[(df['duration'] <= pd.Timedelta("1d")) & (df['duration'] >= pd.Timedelta("1m")) | ((df['duration'] <= pd.Timedelta("5m")) & (df['start_station_id'] == df['end_station_id']))]
#Also Remove Station 3000 Called "Virtual Station" supposedly for Test Trips
df = df[(df['start_station_id'] != 3000) & (df['end_station_id'] != 3000)]

In [195]:
#Look at different Types
df['user_type'].unique()

array(['Indego30', 'Walk-up', 'IndegoFlex'], dtype=object)

In [196]:
df_stations = pd.read_csv('../../data/stations.csv')
del df_stations['name']

In [197]:
df = df.merge(df_stations, left_on='start_station_id', right_on='id')
del df['id']
df['start_lat'] = df['lat']
del df['lat']
df['start_lon'] = df['lon']
del df['lon']

df = df.merge(df_stations, left_on='end_station_id', right_on='id')
del df['id']
df['end_lat'] = df['lat']
del df['lat']
df['end_lon'] = df['lon']
del df['lon']


In [198]:
def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [199]:
df["distance"] = haversine(
    df["start_lat"],
    df["start_lon"],
    df["end_lat"],
    df["end_lon"],
)

print(f"Smallest distance: {df['distance'].min()} km")
print(f"Greatest distance: {df['distance'].max()} km")

Smallest distance: 0.0 km
Greatest distance: 15.114165029636501 km


Calculate Speed to filter out unrealistic values. The maximum allowed speed of an ebike in the US is 20 mph, assuming the person has done no stops and as we are calculating the airline, these trips are almost certain to be faulty

In [200]:
df["speed"] = df["distance"] / df["duration"].apply(
    lambda duration: duration.total_seconds() / (60 * 60)
)

max_allowed_kmh = 20 * 1.60934 # 20mp/h in km/h

df = df[df['speed'] < max_allowed_kmh]

In [201]:
df['average_time'] = df['end_time'] - (df['duration'] / 2)

In [203]:
#Because we have weather datapoints missing, we are merging to the nearest existing weather datapoint
df_weather = pd.read_csv("../../data/weather_hourly_philadelphia_cleaned.parquet")

df_weather['date_time'] = pd.to_datetime(df_weather['date_time'])

df_weather = df_weather.sort_values('date_time')

df = df.sort_values('average_time')

df_weather_and_trips = pd.merge_asof(df, df_weather, left_on='average_time', right_on='date_time', direction='nearest')

In [205]:
del df['average_time']

In [209]:
df_weather_and_trips.to_parquet('../../data/bike_trips_cleaned.parquet')