# Importing Libraries and Data

In [2]:
# Importing Libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import geopandas as gpd
from keplergl import KeplerGl
from pyproj import CRS

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="keplergl")

In [3]:
# Importing Data
Path = r'D:\Data_Analysis\05-12-2025_Bike_Dashboard\02.Data'
df_bikes = pd.read_pickle(os.path.join(Path, 'Prepared Data', 'Bike_Trips_Clean.pkl'))
df_bikes.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,avgTemp,daily_rides,trip_duration
0,115C78C3039FFA89,electric_bike,2022-01-01 09:21:14,2022-01-01 09:35:46,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member,2022-01-01,11.6,592,872.0
1,7FFD810CAA7A919E,classic_bike,2022-01-01 02:43:56,2022-01-01 02:43:57,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member,2022-01-01,11.6,592,1.0
2,E715E8432031B72C,classic_bike,2022-01-01 02:13:33,2022-01-01 02:18:42,Essex Light Rail,JC038,Washington St,JC098,40.712774,-74.036486,40.724294,-74.035483,member,2022-01-01,11.6,592,309.0
3,BF1B7B1E1961A87B,electric_bike,2022-01-01 17:18:46,2022-01-01 18:55:25,Grand St,JC102,W 27 St & 7 Ave,6247.06,40.715178,-74.037683,40.746647,-73.993915,casual,2022-01-01,11.6,592,5799.0
4,4A01F0E53C6F4386,electric_bike,2022-01-01 11:23:32,2022-01-01 11:29:27,Christ Hospital,JC034,Hoboken Terminal - Hudson St & Hudson Pl,HB101,40.734786,-74.050444,40.735938,-74.030305,member,2022-01-01,11.6,592,355.0


# Data Preprocessing

In [4]:
# Aggregated data frame with routes based on start and end stations
df_bikes['value'] = 1
df_route = df_bikes.groupby(['start_station_name', 'end_station_name'])['value'].count().reset_index()
print(df_route.head())
print(f'\n{df_route.tail()}')

      start_station_name                   end_station_name  value
0  11 St & Washington St              11 St & Washington St   1132
1  11 St & Washington St                   12 Ave & W 40 St      1
2  11 St & Washington St               12 St & Sinatra Dr N    253
3  11 St & Washington St  14 St Ferry - 14 St & Shipyard Ln    395
4  11 St & Washington St                    4 St & Grand St    350

        start_station_name      end_station_name  value
6948  York St & Marin Blvd        Van Vorst Park     18
6949  York St & Marin Blvd             Warren St     42
6950  York St & Marin Blvd         Washington St     16
6951  York St & Marin Blvd    Willow Ave & 12 St      1
6952  York St & Marin Blvd  York St & Marin Blvd     47


In [None]:
# verifying previous grouping step execution worked
print(f"df_route              {df_route['value'].sum()}")
print(df_bikes[['start_station_name', 'end_station_name']].count())
print(f"Captured {df_route['value'].sum()/df_bikes[['start_station_name', 'end_station_name']].count().iloc[0]:.2%} of all trips.")

df_route              892281
start_station_name    895475
end_station_name      892281
dtype: int64
Captured a 99.64% of all trips.


Based on the end_station_name and df_route value counts of 892281, all start station and end station combinations with non-null data were combined successfully. With it, we captured 99.64% of all routes with remaining 3,194 (895475-892281) potential routes accounting for less than 0.4%.

In [6]:
# Creating start and end coordinate data frames
start_coordinates = df_bikes[['start_station_name', 'start_lat', 'start_lng']].drop_duplicates(subset='start_station_name')
end_coordinates = df_bikes[['end_station_name', 'end_lat', 'end_lng']].drop_duplicates(subset='end_station_name')
print(start_coordinates.info())
print(f'\n{end_coordinates.info()}')

<class 'pandas.core.frame.DataFrame'>
Index: 84 entries, 0 to 688182
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   start_station_name  84 non-null     object 
 1   start_lat           84 non-null     float64
 2   start_lng           84 non-null     float64
dtypes: float64(2), object(1)
memory usage: 2.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 321 entries, 0 to 891558
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   end_station_name  320 non-null    object 
 1   end_lat           320 non-null    float64
 2   end_lng           320 non-null    float64
dtypes: float64(2), object(1)
memory usage: 10.0+ KB

None


In [7]:
# Adding start coordinates to df_route
df_route = df_route.merge(start_coordinates, on='start_station_name', how='left', indicator='mergedStr')
df_route['mergedStr'].value_counts(dropna=False)

mergedStr
both          6953
left_only        0
right_only       0
Name: count, dtype: int64

In [8]:
# Adding end coordinates to df_route
df_route = df_route.merge(end_coordinates, on='end_station_name', how='left', indicator='mergedEnd')
df_route['mergedEnd'].value_counts(dropna=False)

mergedEnd
both          6953
left_only        0
right_only       0
Name: count, dtype: int64

In [9]:
df_route.head()

Unnamed: 0,start_station_name,end_station_name,value,start_lat,start_lng,mergedStr,end_lat,end_lng,mergedEnd
0,11 St & Washington St,11 St & Washington St,1132,40.749985,-74.02715,both,40.749985,-74.02715,both
1,11 St & Washington St,12 Ave & W 40 St,1,40.749985,-74.02715,both,40.760875,-74.002777,both
2,11 St & Washington St,12 St & Sinatra Dr N,253,40.749985,-74.02715,both,40.750604,-74.02402,both
3,11 St & Washington St,14 St Ferry - 14 St & Shipyard Ln,395,40.749985,-74.02715,both,40.752961,-74.024353,both
4,11 St & Washington St,4 St & Grand St,350,40.749985,-74.02715,both,40.742258,-74.035111,both


In [10]:
# Freeing memory with cyclic garbage collector
import gc
gc.collect()

0

In [11]:
df_route.drop(columns=['mergedStr', 'mergedEnd'], axis=1, inplace=True)
df_route.rename(columns={'value':'trips'}, inplace=True)
df_route.tail()

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng
6948,York St & Marin Blvd,Van Vorst Park,18,40.716615,-74.042412,40.718489,-74.047727
6949,York St & Marin Blvd,Warren St,42,40.716615,-74.042412,40.721124,-74.038051
6950,York St & Marin Blvd,Washington St,16,40.716615,-74.042412,40.724294,-74.035483
6951,York St & Marin Blvd,Willow Ave & 12 St,1,40.716615,-74.042412,40.751867,-74.030377
6952,York St & Marin Blvd,York St & Marin Blvd,47,40.716615,-74.042412,40.716615,-74.042412


In [12]:
df_route.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6953 entries, 0 to 6952
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   start_station_name  6953 non-null   object 
 1   end_station_name    6953 non-null   object 
 2   trips               6953 non-null   int64  
 3   start_lat           6953 non-null   float64
 4   start_lng           6953 non-null   float64
 5   end_lat             6953 non-null   float64
 6   end_lng             6953 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 380.4+ KB


In [14]:
df_route.to_csv(os.path.join(Path, 'Prepared Data', 'CitiBike_Routes2022.csv'), index=False)
df_route.to_pickle(os.path.join(Path, 'Prepared Data', 'CitiBike_Routes2022.pkl'))

# Plotting the Map

In [15]:
map = KeplerGl(height=700)
map1 = map.add_data(df_route, "Trip Routes")
map

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'Trip Routes': {'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,…

In order to identify stations requiring attention, it was important to discover the most and least popular bike trip stations and their surroundings with The Origin Destination Flow Map. For it, I used the data-agnostic and high-performance geolocation functions of Kepler.gl to create a custom map visualization on a positron base style that focused attention on stations and routes. I proceeded to customize the 3 layers to enhance the map exploration interactivity of the trip data: fill color green with white outline for start stations to signal the start of a key spot, orange with red outline to signal distinct end points or stop areas and start-end arc as yellow for source and light blue for target to show routes with stroke widths mapped to the number of trips between stations. Activated filtering with an initial 850 to 999 trip view, geocoder interaction to enable location name or coordinate searches, and 3D polygon buildings to get a more realistic view of the route CitiBike users may take from the start to the end station. This aided in easily viewing distance, most popular station pairs and zones.

# Exporting Config and Final Visualization Map

In [None]:
# Saving the kepler map settings to object config
config = map.config
config

{'version': 'v1',
 'config': {'visState': {'filters': [{'dataId': ['Trip Routes'],
     'id': 'za7cusemo',
     'name': ['trips'],
     'type': 'range',
     'value': [850, 999],
     'plotType': 'histogram',
     'animationWindow': 'free',
     'yAxis': None,
     'view': 'side',
     'speed': 1,
     'enabled': True}],
   'layers': [{'id': 'ulgktnb',
     'type': 'point',
     'config': {'dataId': 'Trip Routes',
      'label': 'start',
      'color': [184, 196, 54],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'lat': 'start_lat', 'lng': 'start_lng'},
      'isVisible': True,
      'visConfig': {'radius': 10,
       'fixedRadius': False,
       'opacity': 0.8,
       'outline': True,
       'thickness': 0,
       'strokeColor': [255, 254, 230],
       'colorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '

In [None]:
# Saving the config file to the prepared data folder
import json
savingPath = r'D:\Data_Analysis\05-12-2025_Bike_Dashboard\02.Data\Prepared Data\config.json'
with open(savingPath, "w") as outfile:
    json.dump(config, outfile, indent=4)

In [None]:
# Creating function that resizes the browser automatically after 150 ms to open keplergl map on full screen
def full_browser_map(map_object, file_path, config=None):
    # Creating Kepler HTML string
    dir_path = os.path.dirname(file_path)
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    html_content = map_object._repr_html_(config=config)

    if isinstance(html_content, bytes):
        html_content = html_content.decode('utf-8')

    # Browser page Auto-resizing
    resizing = ('<script>window.addEventListener("load",()=>{'
               'setTimeout(()=>window.dispatchEvent(new Event("resize")),150)'
               '})</script>')
    
    # Find insertion point and build final HTML
    body_close_idx = html_content.rfind("</body>")
    if body_close_idx != -1:
        fixed_html = (html_content[:body_close_idx] + resizing + html_content[body_close_idx:])
    else: fixed_html = html_content + resizing

    # Write file
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(fixed_html)

In [None]:
# Executing function to export interactive map as HTML file
full_browser_map(map_object=map, file_path=r"D:\Data_Analysis\05-12-2025_Bike_Dashboard\02.Data\Prepared Data\CitiBike_Trip_Routes_Map.html", config=config)

# Interpreting Results

The final interactive 3D Arc map reveals that the origin-destination pair Marshall St. & 2nd St. to City Hall – Washington St. & 1 St. is the most popular trip start and end points with a total of 999 trips, but also one of the shortest with only 14 blocks away. While South Waterfront Walkway – Sinatra Dr. & 1st St to Bloomfield St. & 15th St is the 2nd longest among the most popular with a total of 977 trips. Hoboken Terminal -River St. & Hudson Place is the start station with the highest number of trips followed by Columbus Park – Clinton St. & 9th St. making Hoboken City the busiest zone with the most intra-zonal flows and New Jersey City the second busiest zone. It is interesting that the least popular zone is New York City, which also shows many end stations with little to no activity, per our dataset.