#### Imports

In [2]:
import pandas as pd
import boto3
import requests
import geopandas as gpd
import gzip
import folium
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from IPython.display import display
from shapely.geometry import Point


#### Data 

In [3]:
bike_data = pd.read_csv("202409-citibike-tripdata/202409-citibike-tripdata/202409-citibike-tripdata_1.csv.zip", compression="zip")
bike_data.info()

  bike_data = pd.read_csv("202409-citibike-tripdata/202409-citibike-tripdata/202409-citibike-tripdata_1.csv.zip", compression="zip")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   ride_id             1000000 non-null  object 
 1   rideable_type       1000000 non-null  object 
 2   started_at          1000000 non-null  object 
 3   ended_at            1000000 non-null  object 
 4   start_station_name  999566 non-null   object 
 5   start_station_id    999566 non-null   object 
 6   end_station_name    999885 non-null   object 
 7   end_station_id      999768 non-null   object 
 8   start_lat           1000000 non-null  float64
 9   start_lng           1000000 non-null  float64
 10  end_lat             999991 non-null   float64
 11  end_lng             999991 non-null   float64
 12  member_casual       1000000 non-null  object 
dtypes: float64(4), object(9)
memory usage: 99.2+ MB


In [4]:
nypd_data = pd.read_csv("Motor_Vehicle_Collisions_-_Crashes_20241028.csv")
print(nypd_data.shape)
display(nypd_data.head())
nypd_data.info()

  nypd_data = pd.read_csv("Motor_Vehicle_Collisions_-_Crashes_20241028.csv")


(2129381, 29)


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547,Sedan,,,,
2,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
3,09/11/2021,9:35,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",,,1211 LORING AVENUE,...,,,,,4456314,Sedan,,,,
4,12/14/2021,8:13,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,,...,,,,,4486609,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129381 entries, 0 to 2129380
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH DATE                     object 
 1   CRASH TIME                     object 
 2   BOROUGH                        object 
 3   ZIP CODE                       object 
 4   LATITUDE                       float64
 5   LONGITUDE                      float64
 6   LOCATION                       object 
 7   ON STREET NAME                 object 
 8   CROSS STREET NAME              object 
 9   OFF STREET NAME                object 
 10  NUMBER OF PERSONS INJURED      float64
 11  NUMBER OF PERSONS KILLED       float64
 12  NUMBER OF PEDESTRIANS INJURED  int64  
 13  NUMBER OF PEDESTRIANS KILLED   int64  
 14  NUMBER OF CYCLIST INJURED      int64  
 15  NUMBER OF CYCLIST KILLED       int64  
 16  NUMBER OF MOTORIST INJURED     int64  
 17  NUMBER OF MOTORIST KILLED      int64  
 18  CO

#### Data Cleaning and Transformation

In [5]:
# drop NaNs and assure that times are datef
bike_data = bike_data.dropna()
# ensure there are no duplicates
bike_data = bike_data.drop_duplicates(subset="ride_id")
# ensure datetime format for time points
bike_data["started_at"] = pd.to_datetime(bike_data['started_at'])
bike_data["ended_at"] = pd.to_datetime(bike_data['ended_at'])

print(bike_data.shape)

(999338, 13)


In [7]:
# filter NYPD data for bicycle-related accidents
nypd_data = nypd_data[(nypd_data["NUMBER OF CYCLIST INJURED"] > 0) | (nypd_data["NUMBER OF CYCLIST KILLED"] > 0)] 


In [17]:
nypd_data.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
27,12/14/2021,12:54,BROOKLYN,11217.0,40.687534,-73.9775,"(40.687534, -73.9775)",FULTON STREET,SAINT FELIX STREET,,...,Unspecified,,,,4487052,Sedan,Bike,,,
31,12/14/2021,16:25,,,40.784615,-73.953964,"(40.784615, -73.953964)",EAST 93 STREET,,,...,Driver Inattention/Distraction,,,,4486581,Van,Bike,,,
51,04/24/2022,15:35,MANHATTAN,10019.0,40.767242,-73.986206,"(40.767242, -73.986206)",WEST 56 STREET,9 AVENUE,,...,Unspecified,,,,4521853,Station Wagon/Sport Utility Vehicle,Bike,,,
66,12/09/2021,20:20,BROOKLYN,11223.0,40.59207,-73.96299,"(40.59207, -73.96299)",EAST 7 STREET,CRAWFORD AVENUE,,...,Unspecified,,,,4485150,Bike,,,,
72,12/09/2021,23:15,BROOKLYN,11218.0,40.640835,-73.98967,"(40.640835, -73.98967)",12 AVENUE,41 STREET,,...,Driver Inattention/Distraction,,,,4485355,Sedan,Bike,,,


In [8]:
print(bike_data.shape)
display(bike_data.head())

(999338, 13)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,D86F678648E7A867,electric_bike,2024-09-10 22:50:16.212,2024-09-10 23:30:44.697,Hudson St & W 13 St,6115.06,Broadway & W 58 St,6948.1,40.740057,-74.005274,40.766953,-73.981693,casual
1,032D1788CD512084,electric_bike,2024-09-22 05:51:00.609,2024-09-22 05:56:50.446,W 37 St & 5 Ave,6398.06,9 Ave & W 45 St,6717.06,40.75038,-73.98339,40.760193,-73.991255,member
2,DA55381E5121F0F9,electric_bike,2024-09-24 11:07:40.618,2024-09-24 11:29:23.460,Greenpoint Ave & West St,5752.09,2 Ave & E 72 St,6925.09,40.729803,-73.959099,40.768762,-73.958408,member
3,F67A042C028C6367,classic_bike,2024-09-03 14:25:28.732,2024-09-03 14:33:51.075,E 85 St & 3 Ave,7212.05,2 Ave & E 72 St,6925.09,40.778012,-73.954071,40.768762,-73.958408,member
4,31F722D5EAB9C780,electric_bike,2024-09-09 15:46:50.376,2024-09-09 15:50:16.411,7 Ave & Park Pl,4125.07,Carroll St & 6 Ave,4019.06,40.677615,-73.973243,40.674089,-73.978728,member


#### Spatial

In [9]:
# Convert CitiBike start and end locations to GeoDataFrames
bike_start_gdf = gpd.GeoDataFrame(bike_data, geometry=gpd.points_from_xy(bike_data.start_lng, bike_data.start_lat))
bike_end_gdf = gpd.GeoDataFrame(bike_data, geometry=gpd.points_from_xy(bike_data.end_lng, bike_data.end_lat))


In [11]:
# Convert NYPD crash data to GeoDataFrame
nypd_spatial = nypd_data.dropna(subset=['LATITUDE', 'LONGITUDE'])
nypd_gdf = gpd.GeoDataFrame(nypd_spatial,
                            geometry=gpd.points_from_xy(nypd_spatial['LONGITUDE'], nypd_spatial['LATITUDE']))

In [12]:
bike_start_gdf = bike_start_gdf.set_crs("EPSG:2263")
bike_end_gdf = bike_end_gdf.set_crs("EPSG:2263")
nypd_gdf = nypd_gdf.set_crs("EPSG:2263")

In [13]:
# Perform spatial join to find crashes near CitiBike stations within 100 meters
#nypd_near_start = gpd.sjoin_nearest(nypd_gdf, bike_start_gdf, distance_col="distance", max_distance=100)
#nypd_near_end = gpd.sjoin_nearest(nypd_gdf, bike_end_gdf, distance_col="distance", max_distance=100)
nypd_gdf["LATITUDE"].unique()

array([40.687534, 40.784615, 40.767242, ..., 40.50958 , 40.625103,
       40.769352])

#### MAP

In [16]:
display(high_risk_gdf.head())

Unnamed: 0,LATITUDE,LONGITUDE,incident_count,geometry
0,0.0,0.0,197,POINT (0 0)
354,40.579197,-73.98195,11,POINT (-73.98195 40.5792)
460,40.58196,-73.959946,10,POINT (-73.95995 40.58196)
526,40.583725,-73.89372,14,POINT (-73.89372 40.58372)
608,40.58667,-73.966156,16,POINT (-73.96616 40.58667)


In [14]:
# Aggregate by location and count incidents
high_risk_zones = nypd_gdf.groupby(['LATITUDE', 'LONGITUDE']).size().reset_index(name='incident_count')
high_risk_gdf = gpd.GeoDataFrame(high_risk_zones, geometry=gpd.points_from_xy(high_risk_zones['LONGITUDE'], high_risk_zones['LATITUDE']))

# Define a threshold for high-risk locations
threshold = 10  # Adjust based on data to highlight significant hotspots
high_risk_gdf = high_risk_gdf[high_risk_gdf['incident_count'] >= threshold]

In [15]:
high_risk_gdf["incident_count"].unique()

array([197,  11,  10,  14,  16,  13,  12,  23,  20,  22,  15,  17,  33,
        18,  19,  38,  26,  29,  21])

In [35]:

# Initialize a map centered around New York City
nyc_map = folium.Map(location=[40.7128, -74.0060], zoom_start=12)

for _, row in high_risk_gdf.iterrows():
    folium.CircleMarker(
        location=(row['LATITUDE'], row['LONGITUDE']),
        radius=row['incident_count'] * 0.5,  # Scale radius by incident count
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.6,
        popup=f"Incidents: {row['incident_count']}"
    ).add_to(nyc_map)


LATITUDE                  0.0
LONGITUDE                 0.0
incident_count            197
geometry          POINT (0 0)
Name: 0, dtype: object


ValueError: 

In [34]:
nyc_map

#### 1. Risk modeling and accident prediction

In [None]:
nyc_map

spatial analysis - what are the risky stations?
temporal analysis - what are the risky hours?

In [None]:
# Define features and target for risk modeling
features = citibike_data[['start_hour', 'day_of_week', 'location_risk_score']]
target = merged_data['accident_occurrence']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

In [None]:



import folium

# Create map with risk zones
map_nyc = folium.Map(location=[40.7128, -74.0060], zoom_start=12)
for _, row in high_risk_zones.iterrows():
    folium.CircleMarker(
        location=(row['lat'], row['lon']),
        radius=5,
        color='red',
        fill=True,
        fill_opacity=0.6
    ).add_to(map_nyc)
