In [1]:
import pandas as pd
from sklearn.cluster import DBSCAN
import numpy as np

df = pd.read_csv("cleaned_dataset.csv")
print(df)

# Selecting only the spatial coordinates for clustering
coordinates = df[['Latitude', 'Longitude']]


        Accident_Index  Location_Easting_OSGR  Location_Northing_OSGR  \
0        200501BS00001               525680.0                178240.0   
1        200501BS00002               524170.0                181650.0   
2        200501BS00003               524520.0                182240.0   
3        200501BS00004               526900.0                177530.0   
4        200501BS00005               528060.0                179040.0   
...                ...                    ...                     ...   
1041497  201201LX50301               530140.0                170990.0   
1041498  201201LX50302               529820.0                170510.0   
1041499  201201LX50303               530650.0                176730.0   
1041500  201201LX50304               532220.0                172500.0   
1041501  201201LX50305               531180.0                177000.0   

         Longitude   Latitude Local_Authority_(District)  \
0        -0.191170  51.489096     Kensington and Chelsea   
1  

In [3]:
# DBSCAN Clustering
# eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other
# min_samples: The number of samples in a neighborhood for a point to be considered as a core point

dbscan = DBSCAN(eps=0.01, min_samples=10)  # these parameters need to be tuned
clusters = dbscan.fit_predict(coordinates)

# Adding the cluster labels to your dataset
df['Cluster'] = clusters


In [4]:
# Count the number of accidents in each cluster
cluster_counts = df['Cluster'].value_counts()
threshold = 50

# Identify potential hotspots (clusters with high counts)
hotspots = cluster_counts[cluster_counts > threshold]  # define an appropriate threshold

print(hotspots)


Cluster
 0       601442
-1        64380
 147      30489
 2279     22601
 1299     18888
          ...  
 1089        51
 1266        51
 541         51
 1882        51
 1290        51
Name: count, Length: 512, dtype: int64


In [5]:
import folium

sample_data = df.sample(n=1000)  # Adjust n to your needs
# Then use sample_data instead of new_dataset for plotting


# Create a map centered around an average location
map_center = [sample_data['Latitude'].mean(), sample_data['Longitude'].mean()]
map = folium.Map(location=map_center, zoom_start=12)

# Add each accident point to the map
for _, row in sample_data.iterrows():
    cluster = row['Cluster']
    if cluster != -1:  # -1 is for noise points
        folium.CircleMarker(location=[row['Latitude'], row['Longitude']],
                            radius=3,
                            color='blue' if cluster == 0 else 'red',  # Change colors if you have more clusters
                            fill=True,
                            fill_color='blue' if cluster == 0 else 'red',
                            fill_opacity=0.6).add_to(map)

# Display the map
map


In [2]:
import pandas as pd
#Data preperation.

# Group by Local_Authority_(District) and count the accidents
district_counts = df.groupby('Local_Authority_(District)').agg({'Latitude': 'mean', 'Longitude': 'mean', 'Accident_Index': 'count'}).reset_index()
district_counts.rename(columns={'Accident_Index': 'Accident_Count'}, inplace=True)

In [5]:
import folium

average_latitude = df['Latitude'].mean()
average_longitude = df['Longitude'].mean()


# Create a base map
map = folium.Map(location=[average_latitude, average_longitude], zoom_start=6)

# Add markers to the map
for _, row in district_counts.iterrows():
    location = [row['Latitude'], row['Longitude']]
    tooltip = f"{row['Local_Authority_(District)']}: {row['Accident_Count']} accidents"
    
    folium.Marker(
        location=location,
        popup=tooltip,
        icon=folium.Icon(color='blue', icon='info-sign')
    ).add_to(map)

# Display the map
map
