### This notebook contains the following:
1. Importing Libraries and Dataset
2. Data Preprocessing
3. Geospatial Plotting
4. Exporting Visualization

## 1. Importing Libraries and Dataset

In [2]:
%%capture

import pandas as pd
import os
try:
  from keplergl import KeplerGl
except:
  !pip install keplergl
  from keplergl import KeplerGl
from pyproj import CRS
import numpy as np
from matplotlib import pyplot as plt

In [3]:
# Relative folder that holds the prepared data
folderpath = "../Citibike_Project/Data/Prepared_data"
# Build the full file path first, then read the pickled DataFrame from it
pickle_path = os.path.join(folderpath, 'cleaned_nyc_bike_weather_data.pkl')
df = pd.read_pickle(pickle_path)

In [4]:
# Check the shape (rows, columns) of the loaded DataFrame
df.shape

(29838166, 21)

In [5]:
# List the dtype of every column in the loaded DataFrame
df.dtypes

ride_id                       object
rideable_type               category
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name          category
start_station_id            category
end_station_name            category
end_station_id              category
start_lat                    float32
start_lng                    float32
end_lat                      float32
end_lng                      float32
member_casual                 object
date                  datetime64[ns]
avgTemp                      float32
trip_duration                  int64
month                          int64
season                        object
value                          int64
bike_rides_daily               int64
log_trip_duration            float64
dtype: object

In [6]:
# The station-name columns are categoricals; cast both to plain strings
for name_col in ('start_station_name', 'end_station_name'):
    df[name_col] = df[name_col].astype(str)

In [7]:
# Confirm the two station-name columns are now object dtype
df.dtypes

ride_id                       object
rideable_type               category
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id            category
end_station_name              object
end_station_id              category
start_lat                    float32
start_lng                    float32
end_lat                      float32
end_lng                      float32
member_casual                 object
date                  datetime64[ns]
avgTemp                      float32
trip_duration                  int64
month                          int64
season                        object
value                          int64
bike_rides_daily               int64
log_trip_duration            float64
dtype: object

## 2. Data Preprocessing

### 2.1. FINDING TOTAL TRIPS FROM AND TO EACH STATION

In [9]:
# Count trips for every (start station, end station) pair.
# groupby(...).size() counts the rows in each group directly, so there is no
# need to write a helper column into `df` (which already carries a 'value'
# column from the pickle that would otherwise be silently overwritten).
# The count column is still named 'value' so the rename step that follows keeps working.
df_group = (
    df.groupby(['start_station_name', 'end_station_name'])
    .size()
    .reset_index(name='value')
)

In [10]:
# Check the output: display the per-route trip counts
df_group

Unnamed: 0,start_station_name,end_station_name,value
0,1 Ave & E 110 St,1 Ave & E 110 St,791
1,1 Ave & E 110 St,1 Ave & E 18 St,2
2,1 Ave & E 110 St,1 Ave & E 30 St,4
3,1 Ave & E 110 St,1 Ave & E 39 St,1
4,1 Ave & E 110 St,1 Ave & E 44 St,12
...,...,...,...
1015109,Yankee Ferry Terminal,West St & Liberty St,4
1015110,Yankee Ferry Terminal,West Thames St,1
1015111,Yankee Ferry Terminal,Yankee Ferry Terminal,5759
1015112,Yankee Ferry Terminal,,73


In [11]:
# Give the count column a descriptive name
df_group = df_group.rename(columns={'value': 'trips'})

In [12]:
# Adding geospatial coordinates to total trips

In [13]:
# Keep only the station names and coordinates, one row per distinct combination
route_columns = ['start_station_name', 'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng']
stations = df[route_columns].drop_duplicates()
stations = stations.reset_index(drop=True)
     

In [14]:
# Preview the first rows of the deduplicated route coordinates
stations.head()

Unnamed: 0,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng
0,West End Ave & W 107 St,Mt Morris Park W & W 120 St,40.802116,-73.968178,40.804039,-73.945923
1,Riverside Dr & W 91 St,Riverside Dr & W 82 St,40.793137,-73.977005,40.787209,-73.981277
2,Henry St & Atlantic Ave,Clinton St & Tillary St,40.690891,-73.996124,40.696232,-73.991417
3,Cleveland Pl & Spring St,Rivington St & Ridge St,40.722103,-73.997246,40.718502,-73.983299
4,1 Ave & E 62 St,Allen St & Rivington St,40.761227,-73.960938,40.720196,-73.989975


In [15]:
# Size of the deduplicated name/coordinate table before aggregation
stations.shape


(4939318, 6)

### 2.2. USING THE MEDIAN TO APPLY COORDINATES

In [17]:
# Collapse to one row per route, taking the median of each coordinate column
route_keys = ['start_station_name', 'end_station_name']
coord_cols = ['start_lat', 'start_lng', 'end_lat', 'end_lng']
stations = stations.groupby(route_keys)[coord_cols].median().reset_index()
     

In [18]:
# Size after grouping by start and end station name
stations.shape


(1015114, 6)

In [19]:
# Attach the median coordinates to each route's trip count.
# Both frames come from a groupby on (start, end) station name, so
# validate='one_to_one' makes pandas raise MergeError if either side ever
# contains duplicate keys.
# With how='inner' the indicator column is always 'both', so unmatched routes
# would be dropped silently; the row-count assertion below is what actually
# detects them.
df_final = df_group.merge(
    stations,
    how='inner',
    on=['start_station_name', 'end_station_name'],
    validate='one_to_one',
    indicator='merge_flag',
)
assert len(df_final) == len(df_group), "inner merge dropped routes that have no coordinates"
     

In [20]:
# Check the dimensions of the merged frame
df_final.shape

(1015114, 8)

In [21]:
# Tally the merge indicator values
# NOTE(review): the merge uses how='inner', for which every row is flagged 'both',
# so this tally cannot reveal unmatched routes on its own
df_final['merge_flag'].value_counts()


both          1015114
left_only           0
right_only          0
Name: merge_flag, dtype: int64

In [22]:
# Replace NaN values with empty strings
# NOTE(review): this fills every column, numeric ones included — confirm that is intended
df_final = df_final.fillna("")

In [23]:
# Turn +/-inf into NaN, then blank those NaNs out with empty strings
df_final = df_final.replace([np.inf, -np.inf], np.nan).fillna("")

## 3. Geospatial Plotting

In [24]:
# Build the Kepler.gl widget from the merged route table and render it

kepler_data = {"data_1": df_final}
m = KeplerGl(data=kepler_data, height=700)
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data_1':             start_station_name       end_station_name  trips  start_lat  \
0         …

### Kepler.gl Map Customization and Analysis  
#### Customizations Made  
- **Color Gradient:** I applied a gradient to the arcs, using **yellow** for the source and **blue** for the target. This helps visualize the flow of trips and makes it easier to see the directionality of movement.    
- **Start and End Points:** I set both start and end points to **red** to maintain a consistent visual theme while keeping them distinct from the arcs.   
- **Trip Frequency Filter:** I added a filter to **only show trips with at least 1,500 occurrences**. This helps highlight the most commonly used routes and removes noise from infrequent trips.  
#### Key Observations  
**Busy Zones:** The densest areas of trips are concentrated in **Midtown and Lower Manhattan**, with heavy traffic around Times Square. There are also significant movements between Manhattan and Brooklyn. These areas are major commercial and transit hubs, which explains the high trip volume.  

These patterns align with New York City's transit infrastructure, where business districts and major transportation hubs generate the highest trip activity. The visualization helps understand how people move through the city and which areas see the most traffic.  


## 4. Exporting Visualization

In [25]:
# Capture the map widget's current configuration
# NOTE(review): presumably this includes the styling and filters set interactively in the map above — confirm
config = m.config

In [26]:
import json

# Serialize the map configuration and write it to config.json
with open("config.json", "w") as config_file:
    config_file.write(json.dumps(config))
     

In [27]:
# Export the map to Citi_Bike_Trips.html using the captured config
# NOTE(review): read_only=False presumably keeps the Kepler.gl side panel editable in the exported file — confirm
m.save_to_html(file_name = 'Citi_Bike_Trips.html', read_only = False, config = config)


Map saved to Citi_Bike_Trips.html!
