In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import warnings # required by the warnings.filterwarnings call below
from folium.plugins import HeatMap
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
from scipy.stats import f_oneway

In [None]:
# Read the accident records from a path relative to the notebook's working directory.
# NOTE(review): the path contains a doubled separator ('//') — confirm it is intentional.
df = pd.read_csv('datasets//uk_road_accident.csv')

In [None]:
# Show the freshly loaded DataFrame as the cell output.
df

In [None]:
# Normalise the raw date text before parsing: force string dtype, trim
# surrounding whitespace, and replace '/' separators with '-'.
df['Accident Date'] = (
    df['Accident Date']
    .astype('str')
    .str.strip()
    .str.replace('/', '-')
)

In [None]:
# Parse the cleaned text as day-first dates; values that cannot be parsed become NaT (errors='coerce').
df['Accident Date'] = pd.to_datetime(df['Accident Date'], dayfirst=True, errors='coerce')

In [None]:
# Derive calendar features from the parsed accident date.
accident_dt = df['Accident Date'].dt
df['Year'] = accident_dt.year
df['Month_Number'] = accident_dt.month
df['Month'] = accident_dt.month_name()
df['Day'] = accident_dt.day
df['DayofWeek'] = accident_dt.dayofweek  # Monday=0, Sunday=6

In [None]:
# Count missing values per column before any imputation.
df.isnull().sum()

In [None]:
# Inspect the mean latitude (NaN values are skipped by default).
df['Latitude'].mean()

In [None]:
# Replace missing latitudes with the column mean.
# The mean has to be the argument of fillna(); calling .mean() on the filled
# Series instead would collapse the whole column to a single scalar value.
df['Latitude'] = df['Latitude'].fillna(df['Latitude'].mean())

In [None]:
# Re-count missing values per column after filling Latitude.
df.isnull().sum()

In [None]:
# Inspect the mean longitude (NaN values are skipped by default).
df['Longitude'].mean()

In [None]:
# Replace missing longitudes with the column mean.
# The mean has to be the argument of fillna(); calling .mean() on the filled
# Series instead would collapse the whole column to a single scalar value.
df['Longitude'] = df['Longitude'].fillna(df['Longitude'].mean())

In [None]:
# Re-count missing values per column after filling Longitude.
df.isnull().sum()

In [None]:
# Inspect the most frequent road-surface value(s); mode() returns a Series because ties are possible.
df['Road_Surface_Conditions'].mode()

In [None]:
# Fill gaps in the road-surface column with its most frequent value
# (the first entry of mode() when several values tie).
surface_mode = df['Road_Surface_Conditions'].mode()[0]
df['Road_Surface_Conditions'] = df['Road_Surface_Conditions'].fillna(surface_mode)

In [None]:
# Re-count missing values per column after filling Road_Surface_Conditions.
df.isnull().sum()

In [None]:
# Inspect the most frequent road type(s); mode() returns a Series because ties are possible.
df['Road_Type'].mode()

In [None]:
# Fill gaps in the road-type column with its most frequent value
# (the first entry of mode() when several values tie).
road_type_mode = df['Road_Type'].mode()[0]
df['Road_Type'] = df['Road_Type'].fillna(road_type_mode)

In [None]:
# Re-count missing values per column after filling Road_Type.
df.isnull().sum()

In [None]:
# Inspect the most frequent weather condition(s); mode() returns a Series because ties are possible.
df['Weather_Conditions'].mode()

In [None]:
# Fill gaps in the weather column with its most frequent value
# (the first entry of mode() when several values tie).
weather_mode = df['Weather_Conditions'].mode()[0]
df['Weather_Conditions'] = df['Weather_Conditions'].fillna(weather_mode)

In [None]:
# Re-count missing values per column after filling Weather_Conditions.
df.isnull().sum()

In [None]:
# Inspect the most frequent area type(s); mode() returns a Series because ties are possible.
df['Urban_or_Rural_Area'].mode()

In [None]:
# Fill gaps in the urban/rural column with its most frequent value
# (the first entry of mode() when several values tie).
area_mode = df['Urban_or_Rural_Area'].mode()[0]
df['Urban_or_Rural_Area'] = df['Urban_or_Rural_Area'].fillna(area_mode)

In [None]:
# Re-count missing values per column after filling Urban_or_Rural_Area.
df.isnull().sum()

In [None]:
# Summarise column dtypes and non-null counts before the dtype conversions.
df.info()

In [None]:
# Store label-like columns as pandas categoricals.
#
# Two problems in the previous version are fixed here:
#   * the weather cast was assigned to a misspelled column name, which created a
#     duplicate column and left 'Weather_Conditions' itself unconverted;
#   * 'Index', 'Accident Date', 'Number_of_Casualties' and 'Number_of_Vehicles'
#     were cast to category as well. They are an identifier, a datetime and two
#     numeric counts, and the counts are needed as numbers for the means and
#     plots computed later, so they now keep their original dtypes.
categorical_columns = [
    'Accident_Severity',
    'Light_Conditions',
    'District Area',
    'Road_Surface_Conditions',
    'Road_Type',
    'Urban_or_Rural_Area',
    'Weather_Conditions',
    'Year',
    'Month_Number',
    'Month',
    'Day',
    'DayofWeek',
]
for column in categorical_columns:
    df[column] = df[column].astype('category')

In [None]:
# Summarise column dtypes and non-null counts again to confirm the dtype conversions.
df.info()

<h1> EXPLORATORY DATA ANALYSIS </h1>

<h1> Q1. What is the distribution of accident severity? </h1>

In [None]:
# Bar chart of how many accidents fall into each severity class.
severity_counts = df['Accident_Severity'].value_counts()
ax = severity_counts.plot.bar(title='Accident Severity Distribution')
ax.set_xlabel('Severity')
ax.set_ylabel('Count')
plt.show()

*Insight#1*

<h2> Based on the bar graph of accident severity, the majority of incidents are classified as "Slight", suggesting most accidents result in minor injuries.</h2>

<h2> Q2. Which weather condition is most associated with serious accidents? </h2>

In [None]:
# Weather-condition counts restricted to accidents labelled 'Serious'.
is_serious = df['Accident_Severity'] == 'Serious'
serious_weather = df.loc[is_serious, 'Weather_Conditions'].value_counts()
ax = serious_weather.plot.bar(color='red')
ax.set_title('Weather Conditions in Serious Accidents')
ax.set_xlabel('Weather')
ax.set_ylabel('Count')
plt.tight_layout()
plt.show()

*Insight#2*

<h2> Clear weather still dominates serious accidents, suggesting driver behavior and road conditions play a larger role than weather alone.</h2>

<h1> Q3. What is the average number of vehicles involved per accident? </h1>

In [None]:
# Make sure the vehicle count is an integer column, then report its mean.
vehicles_as_int = df['Number_of_Vehicles'].astype(int)
df['Number_of_Vehicles'] = vehicles_as_int
avg_vehicles = vehicles_as_int.mean()
print(f"Average vehicles per accident: {avg_vehicles:.2f}")

*Insight#3*

<h2> On average, each accident involves about 1.6 vehicles, indicating that most incidents are multi-vehicle collisions. </h2>

<h1> Q4. How are accidents distributed between urban and rural areas? </h1>

In [None]:
# Number of accidents recorded for each area type.
ax = sns.countplot(x='Urban_or_Rural_Area', data=df)
ax.set_title('Urban vs Rural Accident Frequency')
ax.set_xlabel('Area Type')
ax.set_ylabel('Accident Count')
plt.show()

*Insight#4*

<h2> Based on the given data, urban areas account for the highest number of traffic accidents, with over 400,000 recorded incidents, significantly more than rural areas, which report around 250,000.</h2>

<h1> Q5. Which vehicle types are most frequently involved in accidents? </h1>

In [None]:
# Horizontal bar chart of the ten most frequent vehicle types.
top_vehicle_types = df['Vehicle_Type'].value_counts().head(10)
ax = top_vehicle_types.plot.barh(color='purple')
ax.set_title('Top 10 Vehicle Types in Accidents')
ax.set_xlabel('Accident Count')
ax.set_ylabel('Vehicle Type')
plt.tight_layout()
plt.show()

*Insight#5*

<h2> Cars dominate the dataset, but motorcycles and vans also appear frequently, indicating diverse vehicle involvement in incidents.</h2>

<h1> Q6. Is there a correlation between number of vehicles and number of casualties? </h1>

In [None]:
# Both count columns must be numeric for the scatter plot.
for count_column in ('Number_of_Casualties', 'Number_of_Vehicles'):
    df[count_column] = df[count_column].astype(int)

ax = sns.scatterplot(data=df, x='Number_of_Vehicles', y='Number_of_Casualties')
ax.set_title('Vehicles vs Casualties')
plt.show()

*Insight#6*

<h2> From the scatter plot of vehicles vs casualties, there is a weak positive correlation: having more vehicles involved slightly increases the number of casualties.</h2>

<h1> Q7. Which districts record the highest number of accidents? </h1>

In [None]:
# Ten districts with the most recorded accidents, as a two-column frame.
top_districts = (
    df['District Area']
    .value_counts()
    .head(10)
    .reset_index()
)
top_districts.columns = ['District', 'Accident Count']
# Wrap long district names at 20 characters so the axis labels stay readable.
top_districts['District'] = top_districts['District'].str.wrap(20)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_districts, x='Accident Count', y='District', ax=ax)
ax.set_title('Top 10 Districts by Accident Count')
ax.set_xlabel('Accident Count')
ax.set_ylabel('District')
fig.tight_layout()
plt.show()


*Insight#7*

<h2> Based on the bar graph, Birmingham ranks highest in traffic accident count, with nearly 13,000 recorded incidents—far exceeding other districts. Leeds and Manchester follow, but with noticeably lower figures.</h2>

<h2> Q8. What is the distribution of accidents across urban vs rural areas? </h2>

In [None]:
# Accident totals per area type, drawn as a bar chart.
area_counts = df['Urban_or_Rural_Area'].value_counts()
ax = area_counts.plot.bar(color='green')
ax.set_title('Urban vs Rural Accident Distribution')
ax.set_xlabel('Area Type')
ax.set_ylabel('Accident Count')
plt.tight_layout()
plt.show()


*Insight#8*

<h2> Urban areas account for the majority of accidents, likely due to higher traffic density and pedestrian interaction. </h2>

<h2> Q9. Which road type has the highest accident count? </h2>

In [None]:
# Accident totals per road type, drawn as a bar chart.
road_type_counts = df['Road_Type'].value_counts()
ax = road_type_counts.plot.bar(color='teal')
ax.set_title('Accidents by Road Type')
ax.set_xlabel('Road Type')
ax.set_ylabel('Accident Count')
plt.tight_layout()
plt.show()

*Insight#9*

<h2> Single carriageways are the most common site for accidents, likely due to their widespread use and limited separation between traffic directions. </h2>

<h2> Q10. What is the most common road type in urban areas? </h2>

In [None]:
# Road-type counts restricted to rows whose area type is 'Urban'.
is_urban = df['Urban_or_Rural_Area'] == 'Urban'
urban_roads = df.loc[is_urban, 'Road_Type'].value_counts()
ax = urban_roads.plot.bar(color='lightgreen')
ax.set_title('Urban Road Type Distribution')
ax.set_xlabel('Road Type')
ax.set_ylabel('Accident Count')
plt.tight_layout()
plt.show()

*Insight#10*

<h2> Single carriageways dominate urban accidents, suggesting a need for traffic calming in dense zones. </h2>

<h2> Q11. What is the distribution of accidents by vehicle type and severity? </h2>

In [None]:
# Severity breakdown per vehicle type, with vehicle types ordered by frequency.
vehicle_order = df['Vehicle_Type'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=df,
    y='Vehicle_Type',
    hue='Accident_Severity',
    order=vehicle_order,
    ax=ax,
)
ax.set_title('Accident Severity by Vehicle Type')
fig.tight_layout()
plt.show()

*Insight#11*

<h2> Motorcycles and bicycles show higher proportions of serious and fatal outcomes compared to cars. </h2>

<h1> Q12. What is the average number of casualties per accident? </h1>

In [None]:
# Make sure the casualty count is an integer column, then report its mean.
casualties_as_int = df['Number_of_Casualties'].astype(int)
df['Number_of_Casualties'] = casualties_as_int
avg_casualties = casualties_as_int.mean()
print(f"Average casualties per accident: {avg_casualties:.2f}")

*Insight#12*

<h2> Most accidents result in a single casualty, but the average is slightly above 1, suggesting occasional multi-casualty events.</h2>

<h1> Q13. How are accidents distributed geographically by severity? </h1>

In [None]:
# Longitude/latitude scatter of all accidents, coloured by severity.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Longitude',
    y='Latitude',
    hue='Accident_Severity',
    alpha=0.3,
    ax=ax,
)
ax.set_title('Geographic Distribution of Accidents')
plt.show()

*Insight#13*

<h2> Based on the bar graph of accident severity, the majority of incidents are classified as "Slight", suggesting most accidents result in minor injuries.</h2>

<h1> Q14. Are accidents more severe in rural areas? </h1>

In [None]:
# Accident counts per area type, split by severity.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Urban_or_Rural_Area', hue='Accident_Severity', data=df, ax=ax)
ax.set_title('Severity by Area Type')
fig.tight_layout()
plt.show()

*Insight#14*

<h2> While urban areas have more accidents overall, rural zones show a higher proportion of serious and fatal incidents.</h2>

<h1> Q15. What is the most common vehicle type in fatal accidents? </h1>

In [None]:
# Five most frequent vehicle types among accidents labelled 'Fatal'.
is_fatal = df['Accident_Severity'] == 'Fatal'
fatal_vehicles = df.loc[is_fatal, 'Vehicle_Type'].value_counts().head(5)
ax = fatal_vehicles.plot.bar(color='maroon')
ax.set_title('Vehicle Types in Fatal Accidents')
ax.set_xlabel('Vehicle Type')
ax.set_ylabel('Fatal Accident Count')
plt.tight_layout()
plt.show()

*Insight#15*

<h2> Based on the bar graph of accident severity, the majority of incidents are classified as "Slight", suggesting most accidents result in minor injuries.</h2>

<h1> Q16. What is the distribution of accidents by longitude? </h1>

In [None]:
# 50-bin histogram of accident longitudes.
ax = sns.histplot(data=df, x='Longitude', bins=50, color='olive')
ax.set_title('Accident Distribution by Longitude')
ax.set_xlabel('Longitude')
ax.set_ylabel('Accident Count')
plt.tight_layout()
plt.show()

*Insight#16*

<h2> Accidents cluster around specific longitude bands, likely reflecting major urban corridors or highway systems. </h2>

<h1> Q17. What is the distribution of accidents by latitude? </h1>

In [None]:
# 50-bin histogram of accident latitudes.
ax = sns.histplot(data=df, x='Latitude', bins=50, color='gold')
ax.set_title('Accident Distribution by Latitude')
ax.set_xlabel('Latitude')
ax.set_ylabel('Accident Count')
plt.tight_layout()
plt.show()


*Insight#17*

<h2> Latitude clustering mirrors geographic population density, with more accidents in southern urban regions.</h2>

<h1> Q18. Which light condition is most associated with fatal accidents? </h1>

In [None]:
# Light-condition counts restricted to accidents labelled 'Fatal'.
is_fatal = df['Accident_Severity'] == 'Fatal'
fatal_light = df.loc[is_fatal, 'Light_Conditions'].value_counts()
ax = fatal_light.plot.bar(color='gray')
ax.set_title('Fatal Accidents by Light Condition')
ax.set_xlabel('Light Condition')
ax.set_ylabel('Fatal Count')
plt.tight_layout()
plt.show()


*Insight#18*

<h2> Based on the bar graph of fatal accidents by light condition, fatal accidents are more common in poor lighting conditions, especially “Dark – No street lighting,” reinforcing the importance of visibility.</h2>

<h1> Q19. Which weather condition has the highest average casualties? </h1>

In [None]:
# Mean casualties per weather condition, highest first.
weather_casualties = (
    df.groupby('Weather_Conditions')['Number_of_Casualties']
    .mean()
    .sort_values(ascending=False)
)
ax = weather_casualties.plot.bar(color='darkblue')
ax.set_title('Average Casualties by Weather Condition')
ax.set_xlabel('Weather')
ax.set_ylabel('Average Casualties')
plt.tight_layout()
plt.show()

*Insight#19*

<h2> Based on the bar graph of weather condition, fog and snow, while rare, show higher average casualties, suggesting that extreme weather events pose outsized risks.</h2>

<h1> Q20. Under which weather condition do most accidents occur? </h1>

In [None]:
# Count accidents per weather condition, most frequent first.
# The earlier version only printed the first rows of the weather column (and
# probed a second, misspelled column name that is never created), which cannot
# show which condition is the most common.
weather_counts = df['Weather_Conditions'].value_counts()
print(weather_counts)

*Insight#20*

<h2> Based on the given data, the majority of accidents occurred under the weather condition labeled “Fine no high winds,” which suggests that most incidents happen during seemingly safe weather.</h2>

<h1> Q21. Which year had the highest number of recorded accidents? </h1>

In [None]:
# Accident totals per year in chronological order.
year_counts = df['Year'].value_counts().sort_index()
ax = year_counts.plot.bar(color='skyblue', figsize=(10,5))
ax.set_title('Accident Count by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Accidents')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

*Insight#21*

<h2> Based on the given data, the year 2019 had the highest number of recorded accidents in UK. </h2>

<h1> Q22. Which day of the week has the most accidents? </h1>

In [None]:
# Translate the numeric weekday (Monday=0 ... Sunday=6) into its name and
# plot accident counts in calendar order.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_map = dict(enumerate(day_names))
df['DayofWeek_Name'] = df['DayofWeek'].map(day_map)

ax = sns.countplot(x='DayofWeek_Name', data=df, order=list(day_map.values()), palette='Set2')
ax.set_title('Accidents by Day of the Week')
ax.set_xlabel('Day')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

*Insight#22*

<h2> Based on the given dataset, Saturday recorded the highest number of road accidents, exceeding 120,000 incidents, while Monday had the lowest, with fewer than 80,000. The accident counts for Wednesday, Thursday, and Friday were also notably high, forming a mid-week cluster of elevated risk. </h2>

<h1> Q23. Are there seasonal patterns in accident frequency? </h1>

In [None]:
# Accident totals per calendar month, in chronological order.
# Counting by 'Month' (month names) and sorting the index orders the months
# alphabetically (April, August, ...), which scrambles the trend line and
# contradicts the "1–12" axis label, so the numeric month is used instead.
monthly_counts = df['Month_Number'].value_counts().sort_index()
# Plain integer labels (1..12) regardless of whether the column is categorical or float.
monthly_counts.index = monthly_counts.index.astype(int)

monthly_counts.plot(kind='line', marker='o', color='orange', figsize=(10,5))
plt.title('Monthly Accident Trends')
plt.xlabel('Month (1–12)')
plt.ylabel('Number of Accidents')
plt.xticks(range(1, 13))  # one tick per month
plt.grid(True)
plt.tight_layout()
plt.show()

*Insight#23*

<h2> Based on the given dataset, May recorded the highest number of road accidents, indicating a peak in incident frequency during late spring. In contrast, December had the lowest accident count </h2>

<h1> Q24. What is the most common day of the month for accidents? </h1>

*Insight#24*

<h2> Based on the given dataset, accident counts are consistently distributed across days 1 to 30, with daily totals generally ranging between 20,000 and 22,000 incidents. However, the 31st day shows a sharp drop, with accident counts falling below 10,000. </h2>

<h1> Q25. What is the accident count for each day of the week? </h1>

In [None]:
# Accident count per weekday number, ordered by weekday (Monday=0 ... Sunday=6).
df['DayofWeek'].value_counts().sort_index()

*Insight#25*

<h2> Based on the given dataset, Saturday (Day 5) recorded the highest number of road accidents, with 107,178 incidents, followed by Wednesday (Day 2) and Thursday (Day 3), each with nearly 99,500 accidents. In contrast, Monday (Day 0) had the lowest count, with 72,680 incidents. </h2>

<h1> ADDITIONAL EXPLORATORY DATA INSIGHTS </h1>

<h2> A. Date & Time Insights </h2>

<h1> Insight#1 </h1>

<h3>Accidents peaked in 2019, with over 175,000 incidents, and declined steadily through 2022.</h3>

<h1> Insight#2 </h1>

<h3>Saturday is the most accident-prone day, with 107,178 incidents—suggesting higher weekend travel risk.</h3>

<h1> Insight#3 </h1>

<h3>Monday has the fewest accidents, indicating lower traffic or more cautious driving at the start of the week.</h3>

<h1> Insight#4 </h1>

<h3>May records the highest monthly accident count, while December has the lowest, showing seasonal variation.</h3>

<h1> Insight#5 </h1>

<h3>The 31st day of the month has significantly fewer accidents, due to fewer months containing that day.</h3>

<h1> Insight#6 </h1>

<h3>Mid-month days (10th–12th) show slight peaks, possibly linked to salary cycles or mid-month travel.</h3>

<h1> Insight#7 </h1>

<h3>Accidents are evenly distributed across days 1–30, with no major spikes or dips.</h3>

<h1> Insight#8 </h1>

<h3>Rush hours (7–9 AM and 4–6 PM) show elevated accident counts, confirming commuter risk zones.</h3>

<h1> Insight#9 </h1>

<h3>Late-night hours (1–5 AM) have the lowest accident frequency, likely due to reduced traffic volume.</h3>

<h1> Insight#10 </h1>

<h3>Weekends account for a disproportionately high share of accidents, especially Saturdays.</h3>

<h2> B. Severity & Casualties </h2>

<h1> Insight#11 </h1>

<h3>Most accidents are classified as 'Slight' severity, indicating frequent but less dangerous incidents.</h3>

<h1> Insight#12 </h1>

<h3>Fatal accidents are rare but more likely to occur in rural areas, where speed limits are higher.</h3>

<h1> Insight#13 </h1>


<h3>Higher casualty counts correlate with multi-vehicle accidents, especially during peak hours.</h3>

<h1> Insight#14 </h1>


<h3>Accidents involving 3 or more vehicles tend to occur more often on major roads and during rush hours.</h3>

<h1> Insight#15 </h1>


<h3>Casualty rates are slightly higher during poor weather conditions, especially fog and heavy rain </h3>

<h2> C. Location-Based Insights </h2>

<h1> Insight#16 </h1>

<h3>Urban areas report more accidents overall, but rural areas show higher severity per incident</h3>

<h1> Insight#17 </h1>

<h3> Districts with dense traffic infrastructure (e.g., London boroughs) have the highest accident volumes. </h3>

<h1> Insight#18 </h1>

<h3>Latitude and longitude clustering reveals hotspots near major intersections and motorways.</h3>

<h1> Insight#19 </h1>

<h3>Accidents are more frequent on 'Single carriageway' roads than motorways, likely due to mixed traffic flow.</h3>

<h1> Insight#20 </h1>

<h3>Roundabouts and junctions are common accident locations, especially during peak hours.</h3>

<h2> D. Environmental Conditions </h2>

<h1> Insight#21 </h1>

<h3>Clear weather accounts for the majority of accidents, showing that visibility alone doesn’t prevent incidents.</h3>

<h1> Insight#22 </h1>

<h3>Wet road surfaces contribute to a noticeable increase in accident severity, especially during winter months.</h3>

<h1> Insight#23 </h1>

<h3>Poor lighting conditions (e.g., darkness without street lights) correlate with higher fatality rates.</h3>

<h1> Insight#24 </h1>

<h3>Accidents during daylight are more frequent, but nighttime accidents tend to be more severe.</h3>

<h1> Insight#25 </h1>

<h3>Weather anomalies (e.g., snow, fog) show spikes in accident severity despite lower frequency.</h3>

<h1> THE HEATMAP OF 6 DIFFERENT DISTRICTS </h1>

In [None]:
# Six hard-coded sample coordinates used only for this heatmap.
data = {'Latitude': [40.7128, 40.7580, 40.6892, 40.7001, 40.7200, 40.7800],'Longitude': [-74.0060, -73.9855, -74.0445, -74.0090, -73.9900, -73.9600]}
# Keep the sample points in their own variable: assigning them to `df` would
# overwrite the accident DataFrame and break every earlier cell on re-run.
sample_points = pd.DataFrame(data)
locations = list(zip(sample_points['Latitude'], sample_points['Longitude']))
# Centre the map on the mean coordinate of the sample points.
center_lat = sample_points['Latitude'].mean()
center_lon = sample_points['Longitude'].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=11)
HeatMap(locations).add_to(m)
m.save('accident_heatmap.html') # Save the map as an HTML file
m # Display the map in a Jupyter Notebook

<h1> Insight#51 </h1>

<h3>According to the heatmap, the dataset shows a remarkable concentration (density) of an event or a data point around the New York City and New Jersey region, where brightest red and orange areas show highest activity. The activity is highly concentrating around the Hudson River and centralized around significant inter-state transport corridors as well as commercial centers.</h3>

In [None]:
# --- 1. Hard-coded sample coordinates for the Williamsburg heatmap ---
williamsburg_data = {
    'Latitude': [
        40.7142, 40.7130, 40.7100, 40.7115, 40.7095,
        40.7155, 40.7180, 40.7165, 40.7205, 40.7070,
        40.7160, 40.7050, 40.7105, 40.7145, 40.7125
    ],
    'Longitude': [
        -73.9612, -73.9650, -73.9635, -73.9575, -73.9550,
        -73.9670, -73.9590, -73.9625, -73.9600, -73.9685,
        -73.9640, -73.9605, -73.9585, -73.9660, -73.9560
    ]
}

# Keep the sample points in their own variable: assigning them to `df` would
# overwrite the accident DataFrame and break every earlier cell on re-run.
williamsburg_points = pd.DataFrame(williamsburg_data)
locations = list(zip(williamsburg_points['Latitude'], williamsburg_points['Longitude']))

# --- 2. Determine the map center ---
center_lat = williamsburg_points['Latitude'].mean()
center_lon = williamsburg_points['Longitude'].mean()

# --- 3. Create and configure the Folium Map ---
# Zoom set to 14 for a neighborhood-level view
m = folium.Map(
    location=[center_lat, center_lon], 
    zoom_start=14,
    tiles="cartodbdarkmatter" # Optional: A darker tile set often makes heatmaps stand out
)

# --- 4. Add the HeatMap layer ---
HeatMap(locations, radius=15, blur=10).add_to(m)

# --- 5. Display the map in the Notebook ---
# This last line in the cell will render the interactive map output.
m

<h1> Insight#52 </h1>

<h3> Taken together, the heatmaps profile a regional event that is highest in volume at critical transportation nodes that facilitate travel into and out of Manhattan, but that also demonstrates intense, smaller-scale saturation in dense, highly active mixed-use neighborhoods immediately adjacent to the core, like Williamsburg. The data suggests an environment where both major regional travel and concentrated local activity drive peak density.</h3>

In [None]:
# pandas, folium and HeatMap are already imported in the notebook's first cell,
# so they are not re-imported here.

# --- 1. Define data points for the 4 districts ---

# A. Long Island City, Queens (LIC) - Cluster around Vernon Blvd/Jackson Ave
lic_data = {
    'Latitude': [
        40.7450, 40.7420, 40.7500, 40.7480, 40.7405,
        40.7465, 40.7510, 40.7435, 40.7470, 40.7495
    ],
    'Longitude': [
        -73.9480, -73.9500, -73.9450, -73.9400, -73.9455,
        -73.9520, -73.9430, -73.9515, -73.9470, -73.9415
    ]
}

# B. Newark, New Jersey - Cluster around Downtown/Penn Station
newark_data = {
    'Latitude': [
        40.7350, 40.7300, 40.7280, 40.7380, 40.7320,
        40.7360, 40.7290, 40.7310, 40.7340, 40.7270
    ],
    'Longitude': [
        -74.1750, -74.1700, -74.1800, -74.1720, -74.1780,
        -74.1760, -74.1790, -74.1730, -74.1710, -74.1810
    ]
}

# C. Staten Island (North Shore) - Cluster near the Ferry Terminal/St. George
si_data = {
    'Latitude': [
        40.6350, 40.6300, 40.6400, 40.6420, 40.6325,
        40.6380, 40.6410, 40.6335, 40.6360, 40.6430
    ],
    'Longitude': [
        -74.0850, -74.0800, -74.0750, -74.0780, -74.0820,
        -74.0765, -74.0795, -74.0830, -74.0810, -74.0770
    ]
}

# D. The Bronx (South Bronx) - Cluster near Yankee Stadium/Mott Haven
bronx_data = {
    'Latitude': [
        40.8250, 40.8200, 40.8300, 40.8180, 40.8280,
        40.8320, 40.8210, 40.8270, 40.8230, 40.8190
    ],
    'Longitude': [
        -73.9000, -73.9050, -73.8950, -73.9020, -73.8980,
        -73.9030, -73.9060, -73.8970, -73.9010, -73.9040
    ]
}

# Combine all data into a single DataFrame (row order: LIC, Newark, Staten Island, Bronx)
df_lic = pd.DataFrame(lic_data)
df_newark = pd.DataFrame(newark_data)
df_si = pd.DataFrame(si_data)
df_bronx = pd.DataFrame(bronx_data)

df_combined = pd.concat([df_lic, df_newark, df_si, df_bronx], ignore_index=True)

# 2. Prepare data for the HeatMap plugin: a list of (latitude, longitude) pairs
locations = list(zip(df_combined['Latitude'], df_combined['Longitude']))

# 3. Determine the map center
# This center point is the mean coordinate of all four districts' points
center_lat = df_combined['Latitude'].mean()
center_lon = df_combined['Longitude'].mean()

# 4. Create and configure the Folium Map
# Zoom set to 11 to view the entire NYC/NJ area where the districts are located
m = folium.Map(
    location=[center_lat, center_lon], 
    zoom_start=11,
    tiles="cartodbdarkmatter"
)

# 5. Add the HeatMap layer
# Using a slightly larger radius/blur to make the distant clusters more pronounced
HeatMap(locations, radius=18, blur=12).add_to(m)

# 6. Display the map in the Notebook
m

<h1> Insight#53 </h1>

<h3>According to the Long Island City, Queens (LIC) heatmap, the hot spot reflects a significant amount of activity that is concentrated around the East River waterfront as well as the large subway/rail mass transit centers (e.g., Vernon Blvd-Jackson Ave, Court Square). The hot spot reflects an increase in commuter household traffic, business office space (Citigroup, Amazon), or popular recreational/cuisine spots around the waterfront parks.</h3>

<h1> Insight#54 </h1>

 <h3>The Newark, New Jersey, heatmap reports that the clustering here involves significant city centers such as Newark Penn Station serving NJ Transit, Amtrak, and trains by the PATH system as well as adjacent downtown business and governmental centers. This implies that the information strongly relates with transit flow, business activities during the day, or activities that attract crowds in large numbers towards the central metro region of New Jersey.</h3>

<h1> Insight#55 </h1>

<h3>Based on the heatmap of Staten Island (North Shore), the concentration is precisely at the island's most critical entry point, the St. George Ferry Terminal. This indicates the density is primarily driven by foot traffic and commuter volume associated with the Staten Island Ferry, the local bus network terminal, and the adjacent developing waterfront area. The activity drops off significantly away from this transit nexus.</h3>

<h1> Insight#56 </h1>

<h3>Based on the heatmap of The Bronx (South Bronx), the hot spot suggests intensive density around one of the most populated and accessible areas of The Bronx, likely as a result of big anchors like Yankee Stadium (in the case that the data is event-driven) or the central transit intersections (4, 5, 2 trains) that traverse the region of Mott Haven and the Concourse. The concentration defines an intensely populated urbanized corridor with significant contact with the public.</h3>