In [1]:
import os
# Record the kernel's current working directory; later cells build paths from it.
current_dir = os.getcwd()
# Bare expression so the notebook displays the path as the cell output.
current_dir

'/Users/mahahussain/Desktop/UK-Road-Accident-Analysis/UK-Road-Accident-Analysis/jupyter_notebooks'

In [2]:
import os
# Same steps as the previous cell: capture the current working directory and display it.
current_dir = os.getcwd()
current_dir

'/Users/mahahussain/Desktop/UK-Road-Accident-Analysis/UK-Road-Accident-Analysis/jupyter_notebooks'

In [3]:
# Step up from the notebooks folder to its parent directory, which is where the
# dataset is later read from.
# The check on the live working directory makes this cell safe to re-run: an
# unconditional chdir climbs one more level on every execution.
if os.path.basename(os.getcwd()) == 'jupyter_notebooks':
    os.chdir(os.path.dirname(os.getcwd()))
    print("You set a new current directory")
else:
    print("Current directory is already outside 'jupyter_notebooks'; nothing changed")

You set a new current directory


In [4]:
# Re-read the working directory after the chdir above so current_dir matches it.
current_dir = os.getcwd()
# Display the updated path as the cell output.
current_dir

'/Users/mahahussain/Desktop/UK-Road-Accident-Analysis/UK-Road-Accident-Analysis'

In [5]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import plotly.express as px

In [None]:
# Build the dataset path from the current directory, load it, and preview the first rows.
dataset_path = os.path.join(current_dir, 'filtered_accident_data_set.csv')
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Preview the first few values of the accident date column.
accident_dates = df['Accident Date']
accident_dates.head()

## Hypothesis 1: Road conditions affect the number of casualties

In [None]:
import pandas as pd
import scipy.stats as stats

# Hypothesis 1: compare casualties per accident on wet/damp vs. dry road surfaces.

# Parse the day-month-year strings into real datetimes (later cells use the .dt accessor).
df['Accident Date'] = pd.to_datetime(df['Accident Date'], format='%d-%m-%Y')

# Select the casualty counts of each group with a boolean mask on the surface column.
surface = df['Road_Surface_Conditions']
wet = df.loc[surface == 'Wet or damp', 'Number_of_Casualties']
dry = df.loc[surface == 'Dry', 'Number_of_Casualties']

# Two-sided Mann-Whitney U test: a rank-based alternative to the t-test that
# does not require normally distributed data.
stat, p = stats.mannwhitneyu(wet, dry, alternative='two-sided')
print(f"Road Surface Conditions Test: U={stat}, p-value={p}")


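A significant result (p < 0.05) means the number of casualties per accident differs between wet/damp and dry road surfaces. Because the test is two-sided, it does not show the direction of the difference; compare the group means or medians before concluding that wet/damp conditions result in more casualties than dry conditions.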

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of casualties per accident for each road surface condition.
plt.figure(figsize=(10, 5))
# Passing `palette` without `hue` is deprecated in seaborn 0.13+, so the x
# variable is also assigned to `hue` and the redundant legend is switched off
# (the same form the urban/rural bar plot in this notebook uses).
sns.boxplot(
    data=df,
    x="Road_Surface_Conditions",
    y="Number_of_Casualties",
    hue="Road_Surface_Conditions",
    palette="coolwarm",
    legend=False,
)
plt.xticks(rotation=45)
plt.title("Impact of Road Surface Conditions on Number of Casualties")
plt.xlabel("Road Surface Conditions")
plt.ylabel("Number of Casualties")
plt.show()


## Hypothesis 2: Rural accidents are more severe than urban accidents

In [None]:
# Check hypothesis 2: rural accidents are more severe (more casualties per
# accident) than urban accidents.

urban = df[df['Urban_or_Rural_Area'] == 'Urban']['Number_of_Casualties']
rural = df[df['Urban_or_Rural_Area'] == 'Rural']['Number_of_Casualties']

# One-sided Mann-Whitney U test. With (urban, rural) as the sample order,
# alternative='less' is the direction that matches the hypothesis: urban
# casualties are stochastically smaller than rural ones. 'greater' would test
# the opposite claim (urban > rural).
stat, p = stats.mannwhitneyu(urban, rural, alternative='less')
print(f"Urban vs Rural Test: U={stat}, p-value={p}")



In [None]:
# Bar plot of the mean number of casualties per accident in each area type
# (seaborn's barplot aggregates with the mean by default).
sns.barplot(
    data=df,
    x="Urban_or_Rural_Area",
    y="Number_of_Casualties",
    hue="Urban_or_Rural_Area",
    palette="Set2",
    errorbar=None,
    legend=False,
)
# Title and axis labels so the figure stands alone, like the other plots in this notebook.
plt.title("Average Number of Casualties per Accident: Urban vs. Rural")
plt.xlabel("Area Type")
plt.ylabel("Average Number of Casualties")
plt.show()


In [None]:
# Mean casualties per accident, computed separately for each area type.
area_type = df['Urban_or_Rural_Area']
casualty_counts = df['Number_of_Casualties']
urban_avg = casualty_counts[area_type == 'Urban'].mean()
rural_avg = casualty_counts[area_type == 'Rural'].mean()

# Report both averages rounded to two decimals.
print(f"Average number of casualties in Urban areas: {urban_avg:.2f}")
print(f"Average number of casualties in Rural areas: {rural_avg:.2f}")


If the p-value is above 0.05, the test gives no significant evidence of a difference in casualties per accident between urban and rural areas in the tested direction, even though there are far more accidents in urban areas.

## Hypothesis 3: More accidents occur on weekends than weekdays

In [None]:
# Check hypothesis 3

df['Day_of_Week'] = df['Accident Date'].dt.dayofweek  # 0 - Monday, 6 - Sunday
weekend = df[df['Day_of_Week'] >= 5]['Number_of_Casualties']
weekday = df[df['Day_of_Week'] < 5]['Number_of_Casualties']

# Mann-Whitney U test on casualties per accident (weekday vs. weekend rows).
stat, p = stats.mannwhitneyu(weekday, weekend, alternative='two-sided')
print(f"Weekend vs Weekday Test: U={stat}, p-value={p}")

# The hypothesis is about how many accidents occur, which the test above does
# not address. Count rows per calendar date (assumes each row of df is one
# accident - TODO confirm) and compare the daily counts of weekend days against
# weekdays. Working per day also removes the 5-vs-2 day imbalance.
daily_accidents = df.groupby('Accident Date').size()
on_weekend = daily_accidents.index.dayofweek >= 5
count_stat, count_p = stats.mannwhitneyu(
    daily_accidents[on_weekend],
    daily_accidents[~on_weekend],
    alternative='greater',  # weekend days have more accidents than weekdays
)
print(f"Daily Accident Count (Weekend > Weekday) Test: U={count_stat}, p-value={count_p}")



In [None]:
# Mean casualties per accident for the weekday and weekend groups built in the test cell.
weekdays_avg, weekends_avg = (group.mean() for group in (weekday, weekend))

# Report both averages rounded to two decimals.
print(f"Average number of casualties in weekdays: {weekdays_avg:.2f}")
print(f"Average number of casualties in weekends: {weekends_avg:.2f}")


In [None]:
df['Day_of_Week'] = df['Accident Date'].dt.dayofweek  # 0=Monday, ..., 6=Sunday
# Vectorised labelling (replaces a per-row apply): Saturday/Sunday -> "Weekend".
df['Weekend'] = (df['Day_of_Week'] >= 5).map({True: "Weekend", False: "Weekday"})

# Total casualties per day type. Note the totals span five weekdays vs. two
# weekend days, so the bars are not per-day rates.
casualties_by_day_type = df.groupby('Weekend')['Number_of_Casualties'].sum().reset_index()

plt.figure(figsize=(6, 4))
# `palette` without `hue` is deprecated in seaborn 0.13+; assign the x variable
# to `hue` and drop the redundant legend, as the urban/rural bar plot does.
sns.barplot(
    data=casualties_by_day_type,
    x="Weekend",
    y="Number_of_Casualties",
    hue="Weekend",
    palette="muted",
    legend=False,
)

plt.title("Total Number of Casualties on Weekends vs. Weekdays")
plt.xlabel("Day Type")
plt.ylabel("Number of Casualties")
plt.show()


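A significant result (p < 0.05) means the number of casualties per accident differs between weekends and weekdays. Whether accidents are more frequent on weekends has to be judged from accident counts per day, because weekdays cover five days and weekends only two.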

## Hypothesis 4: Lighting conditions affect accident severity

In [None]:
# Check hypothesis 4

# Share of each severity level within every lighting condition (descriptive only).
light_conditions = df.groupby("Light_Conditions")["Accident_Severity"].value_counts(normalize=True).unstack()

# The chi-square test of independence must be run on observed frequencies.
# Feeding it row proportions (which sum to 1 per row) discards the sample size
# and invalidates the statistic and p-value, so build a contingency table of
# raw counts instead.
light_severity_counts = pd.crosstab(df["Light_Conditions"], df["Accident_Severity"])

# Chi-square test to check for dependency
stat, p, dof, expected = stats.chi2_contingency(light_severity_counts)
print(f"Light Conditions & Severity Test: χ²={stat}, p-value={p}")


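A p-value above 0.05 means lighting conditions do not significantly impact accident severity. This reading is only valid when the chi-square test is computed on raw accident counts; row proportions make the statistic artificially small.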

In [None]:
import plotly.express as px

# Count accidents per (light condition, severity) pair and pivot severity into columns.
severity_by_light = df.groupby("Light_Conditions")["Accident_Severity"].value_counts()
severity_counts = severity_by_light.unstack().fillna(0)

# Stacked bars: one bar per light condition, one segment per severity level.
ax = severity_counts.plot(kind="bar", stacked=True, figsize=(10, 5), colormap="coolwarm")
ax.set_title("Accident Severity by Light Conditions")
ax.set_xlabel("Light Conditions")
ax.set_ylabel("Number of Accidents")
plt.xticks(rotation=45)
ax.legend(title="Accident Severity")
plt.show()


In [None]:
# Only step up when the kernel is still inside the notebooks folder. An
# unconditional chdir to the parent of current_dir here would leave the
# directory that current_dir was last set to, and the CSV load in the cells
# below would then look in the wrong place.
if os.path.basename(os.getcwd()) == 'jupyter_notebooks':
    os.chdir(os.path.dirname(os.getcwd()))
    print("You set a new current directory")
else:
    print("Current directory is already outside 'jupyter_notebooks'; nothing changed")

In [None]:
# Re-read the working directory so current_dir reflects the kernel's present location.
current_dir = os.getcwd()
# Display the path as the cell output.
current_dir

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import plotly.express as px

In [None]:
# Build the dataset path from the current directory, load it, and preview the first rows.
dataset_path = os.path.join(current_dir, 'filtered_accident_data_set.csv')
df = pd.read_csv(dataset_path)
df.head()