## Import Libraries and Files

In [158]:
# Import Libraries 
import pandas as pd
import numpy as np 
import os 
import requests
import json
from datetime import datetime
from dotenv import load_dotenv
import geopandas as gpd
from shapely.geometry import Point
from geopy.distance import geodesic
import itertools

In [160]:
# Import bikesharing data for 2023
folderpaths = r'../Data/Original/2023'

In [162]:
filepaths = [os.path.join(folderpaths, name) 
            for name in os.listdir(folderpaths)]

In [163]:
# Concatenate files
bike = pd.concat((pd.read_csv(f) for f in filepaths), ignore_index = True)

In [690]:
bike.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

In [9]:
bike.shape

(988851, 13)

In [110]:
# Import weather data
load_dotenv() # Load the .env filea

True

In [112]:
#Token 
token = os.getenv('APItoken')

In [114]:
# Get Data
r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00014732&startdate=2023-01-01&enddate=2023-12-31', headers={'token':token})

In [116]:
# Load the API responses Json
d = json.loads(r.text)

In [118]:
# Load stations file
stations = pd.read_csv(r'../Data/Original/stations.csv', index_col=0)

## Data Cleaning

In [872]:
bike.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,E2E964A161F786AB,classic_bike,2023-08-07 19:37:47,2023-08-07 19:41:14,6 St & Grand St,HB302,Madison St & 10 St,HB503,40.744398,-74.034501,40.749943,-74.035865,member
1,0660F2E48E3BB87F,classic_bike,2023-08-01 13:16:22,2023-08-01 13:26:02,6 St & Grand St,HB302,6 St & Grand St,HB302,40.744398,-74.034501,40.744398,-74.034501,member
2,940FC7C675232897,classic_bike,2023-08-15 17:28:23,2023-08-15 17:50:35,Heights Elevator,JC059,Heights Elevator,JC059,40.748721,-74.04048,40.748716,-74.040443,member
3,E967660CC5CD585B,classic_bike,2023-08-01 12:44:24,2023-08-01 12:49:45,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,HB102,40.735279,-74.04683,40.736068,-74.029127,member
4,D997CB0B855FE2D6,classic_bike,2023-08-08 12:31:16,2023-08-08 12:40:18,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,HB102,40.735208,-74.046964,40.736068,-74.029127,member


In [875]:
# Look for Duplicates
dup = bike.duplicated()
dup.sum()

0

In [877]:
# Look for missing values
bike.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name      85
start_station_id        85
end_station_name      3319
end_station_id        3319
start_lat                0
start_lng                0
end_lat                954
end_lng                954
member_casual            0
dtype: int64

In [11]:
# Isolate NaN values for in depth check
bike_nan = bike[bike['start_station_name'].isna()]

In [880]:
bike_nan.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
215557,55E37C3B31D0CA41,classic_bike,2023-12-21 20:47:37,2023-12-21 20:53:27,,,Marshall St & 2 St,HB408,40.74,-74.03,40.740802,-74.042521,casual
215703,698206F9E30736D0,classic_bike,2023-12-13 19:36:28,2023-12-13 19:55:50,,,Newark Ave,JC032,40.72,-74.05,40.721525,-74.046305,member
215855,E259F6D2E4000E7A,classic_bike,2023-12-06 00:35:31,2023-12-06 00:40:25,,,11 St & Washington St,HB502,40.74,-74.03,40.749985,-74.02715,member
215864,1C14C743E76756E0,classic_bike,2023-12-13 00:13:52,2023-12-13 01:14:01,,,,,40.73,-74.03,40.73,-74.03,member
253486,F3741EB4D3CCABD9,classic_bike,2023-04-18 09:19:02,2023-04-18 10:19:52,,,,,40.73,-74.06,40.73,-74.06,casual


In [19]:
# Check stations DataFrame 
stations.head()

Unnamed: 0,station_id,name,lat,lon
0,1965202298784063998,Lafayette Ave & Franklin Ave,40.689271,-73.957324
1,66de63cd-0aca-11e7-82f6-3863bb44ef7c,Columbia St & Degraw St,40.68593,-74.002424
2,8d650b03-920d-4d44-a48f-8d9a1a0ba264,W 190 St & Broadway,40.856487,-73.93297
3,5645b05e-85be-460d-8506-cacd20bff233,36 St & 3 Ave,40.655716,-74.006664
4,1846085734612252774,98 St & 41 Ave,40.74773,-73.86649


In [11]:
# Parameters for sampling
sample_size = 500

In [13]:
# Take a random sample of stations (ensure reproducibility with random_state)
sample = bike.sample(n=min(sample_size, len(stations)), random_state=42)

In [15]:
# Calculate average distance between the stations
def calculate_distance(row1, row2):
    loc1 = (row1['start_lat'], row1['start_lng'])
    loc2 = (row2['start_lat'], row2['start_lng'])
    return geodesic(loc1, loc2).meters  

In [17]:
# Check if sample has more than 1 station
if len(sample) < 2:
    print("Not enough stations in the sample to calculate distances.")
else:
    # Get all pairs of stations in the sample
    pairs = list(itertools.combinations(sample.iterrows(), 2))

In [19]:
# Calculate the distances for each pair and store them
distances = []
for pair in pairs:
    row1, row2 = pair[0][1], pair[1][1]  # Extract the rows (skip the index)
    distance = calculate_distance(row1, row2)
    distances.append(distance)

In [21]:
# Ensure distances list is not empty before calculating the average
if distances:
        average_distance = sum(distances) / len(distances)
        print(f"Average distance between sampled stations: {average_distance:.2f} meters")
else:
        print("No valid pairs of stations to calculate distance.")

Average distance between sampled stations: 2113.66 meters


In [166]:
# Convert bike and stations DataFrames to GeoDataFrames for end stations
bike['geometry'] = bike.apply(lambda row: Point(row['end_lng'], row['end_lat']), axis=1)
stations['geometry'] = stations.apply(lambda row: Point(row['lon'], row['lat']), axis=1)

In [168]:
# Convert both DataFrames to GeoDataFrames
bike_gdf = gpd.GeoDataFrame(bike, geometry='geometry')
stations_gdf = gpd.GeoDataFrame(stations, geometry='geometry')

In [170]:
# Set coordinate reference system (CRS) for both GeoDataFrames
bike_gdf.set_crs("EPSG:4326", allow_override=True, inplace=True)
stations_gdf.set_crs("EPSG:4326", allow_override=True, inplace=True)

Unnamed: 0,station_id,name,lat,lon,geometry
0,1965202298784063998,Lafayette Ave & Franklin Ave,40.689271,-73.957324,POINT (-73.95732 40.68927)
1,66de63cd-0aca-11e7-82f6-3863bb44ef7c,Columbia St & Degraw St,40.685930,-74.002424,POINT (-74.00242 40.68593)
2,8d650b03-920d-4d44-a48f-8d9a1a0ba264,W 190 St & Broadway,40.856487,-73.932970,POINT (-73.93297 40.85649)
3,5645b05e-85be-460d-8506-cacd20bff233,36 St & 3 Ave,40.655716,-74.006664,POINT (-74.00666 40.65572)
4,1846085734612252774,98 St & 41 Ave,40.747730,-73.866490,POINT (-73.86649 40.74773)
...,...,...,...,...,...
2224,5aa81a37-ea1d-4bf6-ab40-4751c6b4fb08,Fairmount Ave,40.725726,-74.071959,POINT (-74.07196 40.72573)
2225,46813ecf-8df4-4c8f-9579-0179e0b36ba6,Marshall St & 2 St,40.740802,-74.042521,POINT (-74.04252 40.7408)
2226,66ddd93e-0aca-11e7-82f6-3863bb44ef7c,Jersey & 6th St,40.725289,-74.045572,POINT (-74.04557 40.72529)
2227,66ddddb4-0aca-11e7-82f6-3863bb44ef7c,Monmouth and 6th,40.725685,-74.048790,POINT (-74.04879 40.72569)


In [174]:
# Reproject both GeoDataFrames to a projected CRS (e.g., EPSG:3857 for Web Mercator)
bike_gdf = bike_gdf.to_crs("EPSG:3857")
stations_gdf = stations_gdf.to_crs("EPSG:3857")

In [176]:
# Perform the spatial join - find the nearest station within a distance threshold 
joined_gdf = gpd.sjoin_nearest(bike_gdf, stations_gdf, how="left", max_distance=2000)

In [178]:
# Access the `end_station_name` from the `stations` dataset and update `bike` DataFrame
bike['end_station_name'] = joined_gdf['name']
bike['end_station_id'] = joined_gdf['station_id']

In [180]:
# Look for missing values
bike.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name      85
start_station_id        85
end_station_name      1074
end_station_id        1074
start_lat                0
start_lng                0
end_lat                954
end_lng                954
member_casual            0
geometry                 0
dtype: int64

In [182]:
# Convert bike and stations DataFrames to GeoDataFrames for start stations
bike['geometry'] = bike.apply(lambda row: Point(row['start_lng'], row['start_lat']), axis=1)

In [183]:
# Convert to GeoDataFrames
bike_gdf = gpd.GeoDataFrame(bike, geometry='geometry')

In [186]:
# Set coordinate reference system (CRS)
bike_gdf.set_crs("EPSG:4326", allow_override=True, inplace=True)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,geometry
0,E2E964A161F786AB,classic_bike,2023-08-07 19:37:47,2023-08-07 19:41:14,6 St & Grand St,HB302,Madison St & 10 St,e385c162-6a1a-4e7a-90c1-45d5b6fa1195,40.744398,-74.034501,40.749943,-74.035865,member,POINT (-74.0345 40.7444)
1,0660F2E48E3BB87F,classic_bike,2023-08-01 13:16:22,2023-08-01 13:26:02,6 St & Grand St,HB302,6 St & Grand St,9d344652-976b-4c2d-bede-2ef19b0fbf13,40.744398,-74.034501,40.744398,-74.034501,member,POINT (-74.0345 40.7444)
2,940FC7C675232897,classic_bike,2023-08-15 17:28:23,2023-08-15 17:50:35,Heights Elevator,JC059,Heights Elevator,66dd63de-0aca-11e7-82f6-3863bb44ef7c,40.748721,-74.040480,40.748716,-74.040443,member,POINT (-74.04048 40.74872)
3,E967660CC5CD585B,classic_bike,2023-08-01 12:44:24,2023-08-01 12:49:45,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,0b56ffe7-056e-43ef-a0ac-9e1790f02f3d,40.735279,-74.046830,40.736068,-74.029127,member,POINT (-74.04683 40.73528)
4,D997CB0B855FE2D6,classic_bike,2023-08-08 12:31:16,2023-08-08 12:40:18,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,0b56ffe7-056e-43ef-a0ac-9e1790f02f3d,40.735208,-74.046964,40.736068,-74.029127,member,POINT (-74.04696 40.73521)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988846,EE6CF90DF3984253,classic_bike,2023-11-22 16:20:23,2023-11-22 16:26:38,Glenwood Ave,JC094,Bergen Ave & Sip Ave,581211b2-4e42-48f2-8a8f-5f968cb1c5df,40.727551,-74.071061,40.731009,-74.064437,member,POINT (-74.07106 40.72755)
988847,DB574978E7FABC08,classic_bike,2023-11-21 07:01:57,2023-11-21 07:06:14,Glenwood Ave,JC094,Bergen Ave & Sip Ave,581211b2-4e42-48f2-8a8f-5f968cb1c5df,40.727551,-74.071061,40.731009,-74.064437,member,POINT (-74.07106 40.72755)
988848,E8E26D6A35E922FA,classic_bike,2023-11-14 07:51:51,2023-11-14 08:04:20,Glenwood Ave,JC094,Montgomery St,6107e75a-2493-4e3d-a3e0-d4886d3416e5,40.727755,-74.071096,40.719420,-74.050990,member,POINT (-74.0711 40.72775)
988849,0D6ECB6E18E32C63,classic_bike,2023-11-17 08:15:06,2023-11-17 08:26:57,Glenwood Ave,JC094,Montgomery St,6107e75a-2493-4e3d-a3e0-d4886d3416e5,40.727624,-74.071103,40.719420,-74.050990,member,POINT (-74.0711 40.72762)


In [188]:
# Re-project to a projected CRS (e.g., EPSG:3857)
bike_gdf = bike_gdf.to_crs(epsg=3857)

In [190]:
# Perform the spatial join - find the nearest station within a distance threshold 
joined_gdf = gpd.sjoin_nearest(bike_gdf, stations_gdf, how="left", max_distance=2000)

In [192]:
# Access the `start_station_name` from the `stations` dataset and update `bike` DataFrame
bike['start_station_name'] = joined_gdf['name']
bike['start_station_id'] = joined_gdf['station_id']

In [194]:
# Look for missing values
bike.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name      15
start_station_id        15
end_station_name      1074
end_station_id        1074
start_lat                0
start_lng                0
end_lat                954
end_lng                954
member_casual            0
geometry                 0
dtype: int64

## Data Wrangling

In [196]:
# Get all items that correspond to TAVG
avg_temps = [item for item in d['results'] if item['datatype']=='TAVG']

In [198]:
# Get only all average temperature readings
data_temp = [item['date'] for item in avg_temps]

In [200]:
# Get the temperature from all average temperature readings
temps = [item['value'] for item in avg_temps]

In [201]:
# Put lists in Dataframe
df_temp = pd.DataFrame()

In [204]:
# Convert date format
df_temp['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in data_temp]

In [206]:
# Convert temperature format
df_temp['avg_temp'] = [float(v)/10.0 for v in temps]

In [207]:
# Convert bike format to datetime
bike['started_at'] = pd.to_datetime(bike['started_at'], dayfirst=False)
bike['date'] = pd.to_datetime(bike['started_at'], format='%Y-%m-%d').dt.date

In [208]:
# Match date format
bike['date'] = pd.to_datetime(bike['started_at'], format='%Y-%m-%d').dt.date
df_temp['date'] = pd.to_datetime(df_temp['date'], format='%Y-%m-%d').dt.date

In [212]:
%%time
df_merged = bike.merge(df_temp, how = 'left', on = 'date', indicator = True)

CPU times: user 379 ms, sys: 124 ms, total: 502 ms
Wall time: 573 ms


In [214]:
# Check Merge
df_merged['_merge'].value_counts(dropna = False)

_merge
both          988851
left_only          0
right_only         0
Name: count, dtype: int64

In [216]:
# Drop _merge col and geometry cols.
df_merged.drop(columns=['_merge', 'geometry'], inplace=True)

In [218]:
## Summary Statistics
df_merged.describe()

Unnamed: 0,started_at,start_lat,start_lng,end_lat,end_lng,avg_temp
count,988851,988851.0,988851.0,987897.0,987897.0,988851.0
mean,2023-07-10 23:54:00.570720512,40.732335,-74.040128,40.732309,-74.039856,16.491544
min,2023-01-01 00:06:36,40.678334,-74.087223,40.64507,-74.19,-10.7
25%,2023-04-27 20:39:55.500000,40.721124,-74.045572,40.721124,-74.044247,10.1
50%,2023-07-16 01:12:25,40.735208,-74.037683,40.735208,-74.037683,17.3
75%,2023-09-23 23:27:24,40.742301,-74.031028,40.742258,-74.03097,23.4
max,2023-12-31 23:59:57,40.863943,-73.941173,40.86448,-73.888719,30.7
std,,0.012192,0.011929,0.012319,0.011992,7.820234


In [220]:
# Rename cols
df_merged.rename(columns={'rideable_type':'bike_type', 'started_at':'start_time', 'ended_at':'end_time', 'start_station_name':'start_station', 'end_station_name':'end_station', 'member_casual':'membership'}, inplace=True)

## Save File

In [222]:
df_merged.to_pickle(r'../Data/Prepared/bike_final.pkl')