<h1 style='text-align: center;'>UK Road Accidents Data Analysis</h1>

<h2>Import Libraries</h2>

In [1]:
import numpy as np
import pandas as pd
import warnings
from scipy.stats import f_oneway
# Suppresses every warning for the rest of the session (no category or message filter is given).
# NOTE(review): this also hides pandas deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore")

<h2>Load Dataset into Data Frame</h2>

In [2]:
# Load the accident records from the CSV file into a DataFrame.
# A forward slash keeps this relative path valid on Windows, macOS and Linux;
# the previous hard-coded backslash only resolved on Windows.
data = pd.read_csv('dataset/uk_road_accident.csv')

<h2>Check DataFrame Information</h2>

In [3]:
# Display the loaded frame as this cell's output.
# NOTE(review): data.head() or data.info() would give a more compact overview — consider switching.
data

Unnamed: 0,Index,Accident_Severity,Accident Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200701BS64157,Serious,5/6/2019,51.506187,Darkness - lights lit,Kensington and Chelsea,-0.209082,1,2,Dry,Single carriageway,Urban,Fine no high winds,Car
1,200701BS65737,Serious,2/7/2019,51.495029,Daylight,Kensington and Chelsea,-0.173647,1,2,Wet or damp,Single carriageway,Urban,Raining no high winds,Car
2,200701BS66127,Serious,26-08-2019,51.517715,Darkness - lighting unknown,Kensington and Chelsea,-0.210215,1,3,Dry,,Urban,,Taxi/Private hire car
3,200701BS66128,Serious,16-08-2019,51.495478,Daylight,Kensington and Chelsea,-0.202731,1,4,Dry,Single carriageway,Urban,Fine no high winds,Bus or coach (17 or more pass seats)
4,200701BS66837,Slight,3/9/2019,51.488576,Darkness - lights lit,Kensington and Chelsea,-0.192487,1,2,Dry,,Urban,,Other vehicle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660674,201091NM01760,Slight,18-02-2022,57.374005,Daylight,Highland,-3.467828,2,1,Dry,Single carriageway,Rural,Fine no high winds,Car
660675,201091NM01881,Slight,21-02-2022,57.232273,Darkness - no lighting,Highland,-3.809281,1,1,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660676,201091NM01935,Slight,23-02-2022,57.585044,Daylight,Highland,-3.862727,1,3,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660677,201091NM01964,Serious,23-02-2022,57.214898,Darkness - no lighting,Highland,-3.823997,1,2,Wet or damp,Single carriageway,Rural,Fine no high winds,Motorcycle over 500cc


<h2>Basic Descriptive Statistics</h2>

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Latitude,Longitude,Number_of_Casualties,Number_of_Vehicles
count,660654.0,660653.0,660679.0,660679.0
mean,52.553866,-1.43121,1.35704,1.831255
std,1.406922,1.38333,0.824847,0.715269
min,49.91443,-7.516225,1.0,1.0
25%,51.49069,-2.332291,1.0,1.0
50%,52.315641,-1.411667,1.0,2.0
75%,53.453452,-0.232869,1.0,2.0
max,60.757544,1.76201,68.0,32.0


<h2>Check for Null Values</h2>

In [5]:
# Count the missing entries in each column (isna is the same check as isnull).
data.isna().sum()

Index                          0
Accident_Severity              0
Accident Date                  0
Latitude                      25
Light_Conditions               0
District Area                  0
Longitude                     26
Number_of_Casualties           0
Number_of_Vehicles             0
Road_Surface_Conditions      726
Road_Type                   4520
Urban_or_Rural_Area           15
Weather_Conditions         14128
Vehicle_Type                   0
dtype: int64

<h2>Fill Null Values</h2>

In [6]:
# Numeric coordinates: replace missing values with the column mean.
for coord_col in ('Latitude', 'Longitude'):
    data[coord_col] = data[coord_col].fillna(data[coord_col].mean())

# Categorical fields: replace missing values with the most frequent category.
for cat_col in (
    'Road_Surface_Conditions',
    'Road_Type',
    'Urban_or_Rural_Area',
    'Weather_Conditions',
):
    data[cat_col] = data[cat_col].fillna(data[cat_col].mode()[0])

In [7]:
# Re-count the missing entries per column to confirm none remain.
data.isna().sum()

Index                      0
Accident_Severity          0
Accident Date              0
Latitude                   0
Light_Conditions           0
District Area              0
Longitude                  0
Number_of_Casualties       0
Number_of_Vehicles         0
Road_Surface_Conditions    0
Road_Type                  0
Urban_or_Rural_Area        0
Weather_Conditions         0
Vehicle_Type               0
dtype: int64

<h1 style='text-align: center;'>Exploratory Data Analysis</h1>

<h2>Question 1</h2>
<h3>Which District area has the most fatal accidents?</h3>

In [8]:
# Keep only the rows whose severity is 'Fatal', then tally them per district,
# ordered from the highest count to the lowest.
is_fatal = data['Accident_Severity'].eq('Fatal')
fatal_accidents = data.loc[is_fatal]
fatal_accidents['District Area'].value_counts().sort_values(ascending=False)

District Area
Birmingham                  105
Leeds                        93
Highland                     88
East Riding of Yorkshire     85
Bradford                     71
                           ... 
Havant                        2
Orkney Islands                2
Harlow                        1
Clackmannanshire              1
Stevenage                     1
Name: count, Length: 422, dtype: int64

<h3>Insight:</h3>
<h4>The data shows that the Birmingham district area has the most fatal accidents among the district areas</h4>
<hr>

<h2>Question 2</h2>
<h3>What are the different categories of accident severity, and how many accidents fall into each?</h3>

In [9]:
# Number of accidents recorded for each severity category.
(
    data["Accident_Severity"]
    .value_counts()
)

Accident_Severity
Slight     563801
Serious     88217
Fatal        8661
Name: count, dtype: int64

<h3>Insight:</h3>
<h4>Most accidents are slight, and fatal accidents rarely occur.</h4>
<hr>

<h2>Question 3</h2>
<h3>Does the weather condition affect the severity of the accident?</h3>

In [10]:
# Cross-tabulate accident counts: one row per weather condition, one column per
# severity level; combinations that never occur show as 0.
bobo = pd.crosstab(data['Weather_Conditions'], data['Accident_Severity'])
bobo

Accident_Severity,Fatal,Serious,Slight
Weather_Conditions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fine + high winds,175,1245,7134
Fine no high winds,7207,73285,454521
Fog or mist,82,483,2963
Other,165,1801,15184
Raining + high winds,145,1261,8209
Raining no high winds,848,9468,69380
Snowing + high winds,3,109,773
Snowing no high winds,36,565,5637


<h3>Insight:</h3>
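<h4>The share of fatal accidents varies with the weather condition — for example, about 2.3% of accidents in fog or mist are fatal versus about 1.3% in fine weather without high winds — which suggests that weather conditions are associated with the severity of an accident.</h4>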
<hr>

<h2>Question 4</h2>
<h3>Where do Serious Accidents mostly occur? Is it on Urban or Rural areas?</h3>

In [11]:
# Keep only the rows whose severity is 'Serious', then count them per area type.
is_serious = data['Accident_Severity'].eq('Serious')
severe_accidents = data.loc[is_serious]
severe_accidents['Urban_or_Rural_Area'].value_counts()

Urban_or_Rural_Area
Urban          50904
Rural          37312
Unallocated        1
Name: count, dtype: int64

<h3>Insight:</h3>
<h4>The data analysis shown above says that Serious accidents usually happen in Urban areas</h4>
<hr>

<h2>Question 5</h2>
<h3>What is the most common weather condition during accidents?</h3>

In [12]:
# Number of accidents recorded under each weather condition.
(
    data["Weather_Conditions"]
    .value_counts()
)

Weather_Conditions
Fine no high winds       535013
Raining no high winds     79696
Other                     17150
Raining + high winds       9615
Fine + high winds          8554
Snowing no high winds      6238
Fog or mist                3528
Snowing + high winds        885
Name: count, dtype: int64

<h3>Insight:</h3>
<h4>Most accidents happened in fine weather, suggesting driver behavior is a bigger factor than poor weather.</h4>
<hr>

<h2>Question 5b</h2>
<h3>Which vehicle type has the highest number of casualties?</h3>

In [13]:
# Total casualties per vehicle type, listed from the largest total downwards.
casualties_by_vehicle = data.groupby('Vehicle_Type')['Number_of_Casualties'].sum()
casualties_by_vehicle.sort_values(ascending=False)

Vehicle_Type
Car                                      676692
Van / Goods 3.5 tonnes mgw or under       46271
Bus or coach (17 or more pass seats)      34915
Motorcycle over 500cc                     34879
Goods 7.5 tonnes mgw and over             23397
Motorcycle 125cc and under                20348
Taxi/Private hire car                     18195
Motorcycle over 125cc and up to 500cc     10286
Motorcycle 50cc and under                 10167
Goods over 3.5t. and under 7.5t            8308
Other vehicle                              7554
Minibus (8 - 16 passenger seats)           2659
Agricultural vehicle                       2613
Pedal cycle                                 270
Data missing or out of range                  9
Ridden horse                                  5
Name: Number_of_Casualties, dtype: int64

<h3>Insight:</h3>
<h4>The data shows that cars contribute the most casualties.</h4>
<hr>

<h2>Question 6</h2>
<h3>What proportion of accidents occur in urban vs rural areas?</h3>

In [14]:
# Share of accidents per area type, expressed as a percentage of all rows.
(
    data["Urban_or_Rural_Area"]
    .value_counts(normalize=True)
    .mul(100)
)

Urban_or_Rural_Area
Urban          63.824944
Rural          36.173391
Unallocated     0.001665
Name: proportion, dtype: float64

<h3>Insight:</h3>
<h4>About two-thirds of accidents occurred in urban areas, likely because of heavier traffic.</h4>
<hr>

<h2>Question 7</h2>
<h3>Is there a correlation between the number of vehicles and the number of casualties?</h3>

In [15]:
# Correlation coefficient between casualties and vehicles per accident
# (Series.corr with its default method).
casualties = data['Number_of_Casualties']
vehicles = data['Number_of_Vehicles']
casualties.corr(vehicles)

np.float64(0.2288888612692756)

<h3>Insight:</h3>
<h4>There is a weak positive correlation (r ≈ 0.23) between the number of vehicles and the number of casualties.</h4>
<hr>

<h2>Question 8</h2>
<h3>Which road type has the highest number of accidents?</h3>

In [16]:
# Number of accidents recorded on each road type.
(
    data["Road_Type"]
    .value_counts()
)

Road_Type
Single carriageway    496663
Dual carriageway       99424
Roundabout             43992
One way street         13559
Slip road               7041
Name: count, dtype: int64

<h3>Insight:</h3>
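<h4>Most accidents occurred on single carriageways. These are raw counts, so they may reflect how common single carriageways are rather than showing that they are more dangerous than other road types.
</h4>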
<hr>

<h2>Question 9</h2>
<h3>In what light condition do accidents often happen?</h3>

In [17]:
# Number of accidents recorded under each light condition.
(
    data['Light_Conditions']
    .value_counts()
)

Light_Conditions
Daylight                       484880
Darkness - lights lit          129335
Darkness - no lighting          37437
Darkness - lighting unknown      6484
Darkness - lights unlit          2543
Name: count, dtype: int64

<h3>Insight:</h3>
<h4>The analysis above shows that most accidents in the UK happen in daylight.
</h4>
<hr>

<h2>Question 10</h2>
<h3>Which weather condition has the highest average number of casualties?</h3>

In [18]:
# Average number of casualties per accident for each weather condition.
(
    data.groupby("Weather_Conditions")["Number_of_Casualties"]
    .mean()
)

Weather_Conditions
Fine + high winds        1.386018
Fine no high winds       1.347397
Fog or mist              1.452948
Other                    1.354869
Raining + high winds     1.416641
Raining no high winds    1.408214
Snowing + high winds     1.418079
Snowing no high winds    1.341776
Name: Number_of_Casualties, dtype: float64

<h3>Insight:</h3>
<h4>The highest average casualties per accident happen during fog or mist, showing low visibility is particularly dangerous.
</h4>
<hr>

<h2>Question 11</h2>
<h3>Which district area had the most casualties in an accident?</h3>

In [19]:
# Find the largest casualty count, then show every accident row that reaches it.
highest_casualties = data['Number_of_Casualties'].max()
has_most_casualties = data['Number_of_Casualties'].eq(highest_casualties)
data.loc[has_most_casualties]

Unnamed: 0,Index,Accident_Severity,Accident Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
117980,200743N002017,Fatal,3/1/2019,51.497547,Darkness - lights lit,South Bucks,-0.496697,68,1,Wet or damp,Slip road,Rural,Raining no high winds,Car


<h3>Insight:</h3>
<h4>The data analysis shown above says that the accident with the most casualties ever recorded in the dataset is in South Bucks district.</h4>
<hr>

<h2>Question 12</h2>
<h3>What's the average number of casualties by the severity of accident?</h3>

In [20]:
# Average casualties per accident for each severity level, rounded to 2 decimals.
(
    data.groupby('Accident_Severity')['Number_of_Casualties']
    .mean()
    .round(2)
)

Accident_Severity
Fatal      1.90
Serious    1.47
Slight     1.33
Name: Number_of_Casualties, dtype: float64

<h3>Insight:</h3>
<h4>The data above shows that the average number of casualties does not exceed 2 for any accident severity.</h4>
<hr>

<h2>Question 13</h2>
<h3>Which road surface condition has the highest accident severity?</h3>

In [21]:
# Accident counts per severity level within each road surface condition.
(
    data.groupby("Road_Surface_Conditions")["Accident_Severity"]
    .value_counts()
)

Road_Surface_Conditions  Accident_Severity
Dry                      Slight               381049
                         Serious               61708
                         Fatal                  5790
Flood over 3cm. deep     Slight                  842
                         Serious                 152
                         Fatal                    23
Frost or ice             Slight                16317
                         Serious                2007
                         Fatal                   193
Snow                     Slight                 5290
                         Serious                 565
                         Fatal                    35
Wet or damp              Slight               160303
                         Serious               23785
                         Fatal                  2620
Name: count, dtype: int64

<h3>Insight:</h3>
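<h4>Accidents on dry roads happen most frequently. The share of serious or fatal accidents is highest on flooded roads (about 17%) and dry roads (about 15%), and lower on wet or damp roads (about 14%), frost or ice (about 12%) and snow (about 10%).</h4>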
<hr>

<h2>Question 14</h2>
<h3>In urban areas, which road surface condition has the most accidents?</h3>

In [22]:
# Restrict to accidents flagged as 'Urban', then count them per road surface condition.
is_urban = data['Urban_or_Rural_Area'].eq('Urban')
urban_area = data.loc[is_urban]
urban_area['Road_Surface_Conditions'].value_counts()

Road_Surface_Conditions
Dry                     303397
Wet or damp             107698
Frost or ice              7564
Snow                      2788
Flood over 3cm. deep       231
Name: count, dtype: int64

<h3>Insight:</h3>
<h4>In urban areas, accidents mostly happen on dry roads.</h4>
<hr>

<h2>Question 15</h2>
<h3>Does the weather condition affect the severity of the accident?</h3>

In [23]:
# For each severity level: row count, number of distinct weather conditions,
# the most frequent condition and how often it occurs.
(
    data.groupby('Accident_Severity')['Weather_Conditions']
    .describe()
)

Unnamed: 0_level_0,count,unique,top,freq
Accident_Severity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fatal,8661,8,Fine no high winds,7207
Serious,88217,8,Fine no high winds,73285
Slight,563801,8,Fine no high winds,454521


<h3>Insight:</h3>
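<h4>"Fine no high winds" is the most common weather condition at every severity level, so this summary alone does not show the weather condition affecting the accident's severity.</h4>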
<hr>

<h2>Question 16</h2>
<h3>How many total accidents are in the dataset?</h3>

In [24]:
# Total number of accident records (rows) in the frame.
data.shape[0]

660679

<h3>Insight:</h3>
<h4>660,679 recorded accidents.</h4>
<hr>

<h2>Question 17</h2>
<h3>What is the average number of vehicles involved in accidents?</h3>

In [25]:
# Average number of vehicles involved per accident.
vehicles_per_accident = data["Number_of_Vehicles"]
vehicles_per_accident.mean()

np.float64(1.8312554205597575)

<h3>Insight:</h3>
<h4>On average, about 1.8 vehicles are involved in each accident.</h4>
<hr>

<h2>Question 18</h2>
<h3>What is the maximum number of casualties recorded in a single accident?</h3>

In [26]:
# Largest casualty count recorded for a single accident.
casualties_per_accident = data["Number_of_Casualties"]
casualties_per_accident.max()

np.int64(68)

<h3>Insight:</h3>
<h4>Maximum number of casualties in a single accident: 68</h4>
<hr>

<h2>Question 19</h2>
<h3>Which district area had the most accidents involving cars?</h3>

In [27]:
# Restrict to accidents whose vehicle type is 'Car', then keep the five
# districts with the most such accidents.
car_accidents = data.loc[data['Vehicle_Type'].eq('Car')]
datas = car_accidents['District Area'].value_counts().head(5)
datas

District Area
Birmingham    9600
Leeds         6875
Manchester    5248
Bradford      4749
Sheffield     4306
Name: count, dtype: int64

<h3>Insight:</h3>
<h4>The data shows that Birmingham had the most car road accidents</h4>
<hr>

<h2>Question 20</h2>
<h3>Which district area had the most accidents with slight severity </h3>

In [28]:
# Keep only the rows whose severity is 'Slight', then summarise the district
# column (count, distinct districts, most frequent district and its frequency).
is_slight = data['Accident_Severity'].eq('Slight')
slight_accidents = data.loc[is_slight]
slight_accidents['District Area'].describe()

count         563801
unique           422
top       Birmingham
freq           11912
Name: District Area, dtype: object

<h3>Insight:</h3>
<h4>The data shows that Birmingham had the most accidents of slight severity.</h4>
<hr>