In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway
import warnings
warnings.filterwarnings('ignore')

<h1>UK ROAD ACCIDENT DATA ANALYSIS</h1>
<h2>INCLUSIVE YEAR 2019 - 2022</h2>
<h3>Analyst: Concha, Vaughn Allystair P.</h3>

In [85]:
accident = pd.read_csv("datasets//accident_data.csv")

In [86]:
accident

Unnamed: 0,Index,Accident_Severity,Accident Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200701BS64157,Serious,5/6/2019,51.506187,Darkness - lights lit,Kensington and Chelsea,-0.209082,1,2,Dry,Single carriageway,Urban,Fine no high winds,Car
1,200701BS65737,Serious,2/7/2019,51.495029,Daylight,Kensington and Chelsea,-0.173647,1,2,Wet or damp,Single carriageway,Urban,Raining no high winds,Car
2,200701BS66127,Serious,26-08-2019,51.517715,Darkness - lighting unknown,Kensington and Chelsea,-0.210215,1,3,Dry,,Urban,,Taxi/Private hire car
3,200701BS66128,Serious,16-08-2019,51.495478,Daylight,Kensington and Chelsea,-0.202731,1,4,Dry,Single carriageway,Urban,Fine no high winds,Bus or coach (17 or more pass seats)
4,200701BS66837,Slight,3/9/2019,51.488576,Darkness - lights lit,Kensington and Chelsea,-0.192487,1,2,Dry,,Urban,,Other vehicle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660674,201091NM01760,Slight,18-02-2022,57.374005,Daylight,Highland,-3.467828,2,1,Dry,Single carriageway,Rural,Fine no high winds,Car
660675,201091NM01881,Slight,21-02-2022,57.232273,Darkness - no lighting,Highland,-3.809281,1,1,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660676,201091NM01935,Slight,23-02-2022,57.585044,Daylight,Highland,-3.862727,1,3,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660677,201091NM01964,Serious,23-02-2022,57.214898,Darkness - no lighting,Highland,-3.823997,1,2,Wet or damp,Single carriageway,Rural,Fine no high winds,Motorcycle over 500cc


In [87]:
accident.describe()

Unnamed: 0,Latitude,Longitude,Number_of_Casualties,Number_of_Vehicles
count,660654.0,660653.0,660679.0,660679.0
mean,52.553866,-1.43121,1.35704,1.831255
std,1.406922,1.38333,0.824847,0.715269
min,49.91443,-7.516225,1.0,1.0
25%,51.49069,-2.332291,1.0,1.0
50%,52.315641,-1.411667,1.0,2.0
75%,53.453452,-0.232869,1.0,2.0
max,60.757544,1.76201,68.0,32.0


In [88]:
accident.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660679 entries, 0 to 660678
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Index                    660679 non-null  object 
 1   Accident_Severity        660679 non-null  object 
 2   Accident Date            660679 non-null  object 
 3   Latitude                 660654 non-null  float64
 4   Light_Conditions         660679 non-null  object 
 5   District Area            660679 non-null  object 
 6   Longitude                660653 non-null  float64
 7   Number_of_Casualties     660679 non-null  int64  
 8   Number_of_Vehicles       660679 non-null  int64  
 9   Road_Surface_Conditions  659953 non-null  object 
 10  Road_Type                656159 non-null  object 
 11  Urban_or_Rural_Area      660664 non-null  object 
 12  Weather_Conditions       646551 non-null  object 
 13  Vehicle_Type             660679 non-null  object 
dtypes: f

In [132]:
for column in["Accident_Severity","Latitude", "Light_Conditions", "District Area", 
           "Longitude", "Road_Surface_Conditions", "Road_Type", "Urban_or_Rural_Area",
          "Weather_Conditions", "Vehicle_Type"]:
    accident[column] = accident[column].astype('category')

accident['Accident Date'] = pd.to_datetime(accident['Accident Date'], dayfirst = True, errors = "coerce")

In [133]:
accident.isnull().sum()

Index                           0
Accident_Severity               0
Accident Date              395672
Latitude                        0
Light_Conditions                0
District Area                   0
Longitude                       0
Number_of_Casualties            0
Number_of_Vehicles              0
Road_Surface_Conditions         0
Road_Type                       0
Urban_or_Rural_Area             0
Weather_Conditions          14128
Vehicle_Type                    0
dtype: int64

In [134]:
accident['Latitude'] = accident['Latitude'].fillna(accident['Latitude'].mode()[0])
accident['Longitude'] = accident['Longitude'].fillna(accident['Longitude'].mode()[0])
accident['Road_Surface_Conditions'] = accident['Road_Surface_Conditions'].fillna(accident['Road_Surface_Conditions'].mode()[0])
accident['Road_Type'] = accident['Road_Type'].fillna(accident['Road_Type'].mode()[0])
accident['Urban_or_Rural_Area'] = accident['Urban_or_Rural_Area'].fillna(accident['Urban_or_Rural_Area'].mode()[0])

<h1 style="color:#2596be">Insight No. 1</h1>
<h2 style="color:pink">which of the road types has the highest number of accidents?</h2>
<h3>The road with the highest percentage of accident is the "single carriageway" which is 63.50% of all severities and road types</h3>

In [92]:
road_serverity = accident.groupby(["Road_Type", "Accident_Severity"]).size()

In [93]:
road_serverity.unstack()

Accident_Severity,Fatal,Serious,Slight
Road_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dual carriageway,1815,11746,85863
One way street,95,1655,11809
Roundabout,142,3665,40185
Single carriageway,6560,70540,419563
Slip road,49,611,6381


<h1 style="color:#2596be">Insight No. 2</h1>
<h2 style="color:pink">What is the average number of casualties? </h2>
<h3>The average number of casualties is 1.4 </h3>

In [95]:
avg_casualties = np.round(accident['Number_of_Casualties'].mean(), 1)

In [96]:
avg_casualties

np.float64(1.4)

<h1 style="color:#2596be">Insight No. 3</h1>
<h2 style="color:pink">Which area have higher number of accidents? Urban vs. Rural</h2>
<h3>The Urban has the higher number of accidents than Rural</h3>

In [94]:
accident["Urban_or_Rural_Area"].value_counts()

Urban_or_Rural_Area
Urban          421678
Rural          238990
Unallocated        11
Name: count, dtype: int64

<h1 style="color:#2596be">Insight No. 4</h1>
<h2 style="color:pink">What is the most common accident severity ? </h2>
<h3>The most common accident severity is 'Slight'</h3>

In [99]:
avg_severity = accident['Accident_Severity'].mode()[0]

In [100]:
avg_severity

'Slight'

<h1 style="color:#2596be">Insight No. 5</h1>
<h2 style="color:pink">What is the average number of vehicles in an accident? </h2>
<h3>The average number of vehicles in an accident is 1.8 </h3>

In [97]:
avg_vehicles = np.round(accident['Number_of_Vehicles'].mean(), 1)

In [98]:
avg_vehicles

np.float64(1.8)

<h1 style="color:#2596be">Insight No. 6</h1>
<h2 style="color:pink">In which area has the has the higher number of casualties?</h2>
<h3>Urban area has a higher number of casualties</h3>

In [101]:
area_casualties = accident.groupby('Urban_or_Rural_Area')['Number_of_Casualties'].sum()

In [102]:
area_casualties

Urban_or_Rural_Area
Rural          353515
Unallocated        13
Urban          543040
Name: Number_of_Casualties, dtype: int64

<h1 style="color:#2596be">Insight No. 7</h1>
<h2 style="color:pink">On which light conditions has the highest number of accident?</h2>
<h3>Daylight has the highest number of accidents</h3>

In [135]:
lightcon_accidents = accident['Light_Conditions'].value_counts()

In [136]:
lightcon_accidents

Light_Conditions
Daylight                       484880
Darkness - lights lit          129335
Darkness - no lighting          37437
Darkness - lighting unknown      6484
Darkness - lights unlit          2543
Name: count, dtype: int64

<h1 style="color:#2596be">Insight No. 8</h1>
<h2 style="color:pink">On which weather conditions has the highest number of accident?</h2>
<h3>Fine no high winds has the highest number of accidents</h3>

In [105]:
weatherCdt_accidents = accident['Weather_Conditions'].value_counts()

In [106]:
weatherCdt_accidents

Weather_Conditions
Fine no high winds       520885
Raining no high winds     79696
Other                     17150
Raining + high winds       9615
Fine + high winds          8554
Snowing no high winds      6238
Fog or mist                3528
Snowing + high winds        885
Name: count, dtype: int64

<h1 style="color:#2596be">Insight No. 9</h1>
<h2 style="color:pink">What road type has the highest number of accident?</h2>
<h3>The road type that has the highest accident is the single carriageway</h3>

In [107]:
roadtype_accidents = accident['Road_Type'].value_counts()

In [108]:
roadtype_accidents

Road_Type
Single carriageway    496663
Dual carriageway       99424
Roundabout             43992
One way street         13559
Slip road               7041
Name: count, dtype: int64

<h1 style="color:#2596be">Insight No. 10</h1>
<h2 style="color:pink">What road surface conditions has the highest number of accident?</h2>
<h3>The road surface conditions that has the highest accident is a dry surface.</h3>

In [109]:
roadsurface_accidents = accident['Road_Surface_Conditions'].value_counts()

In [110]:
roadsurface_accidents

Road_Surface_Conditions
Dry                     448547
Wet or damp             186708
Frost or ice             18517
Snow                      5890
Flood over 3cm. deep      1017
Name: count, dtype: int64

<h1 style="color:#2596be">Insight No. 11</h1>
<h2 style="color:pink">What is the average number of casualties for each level of severity?</h2>
<h3>The average number of casualties for fatal accidents is 1.9, for the serious is 1.5 , and for the slight is 1.3.</h3>

In [111]:
avg_casualties_per_severity = np.round(accident.groupby('Accident_Severity')['Number_of_Casualties'].mean(), 1)

In [112]:
avg_casualties_per_severity

Accident_Severity
Fatal      1.9
Serious    1.5
Slight     1.3
Name: Number_of_Casualties, dtype: float64

<h1 style="color:#2596be">Insight No. 12</h1>
<h2 style="color:pink">What is the average number of casualties in the urban, rural and unallocated areas?</h2>
<h3>The average number of casualties for rural area accidents is 1.5, for the urban area is 1.3 , and for unallocated areas is 1.2.</h3>

In [113]:
avg_casualties_per_area = np.round(accident.groupby('Urban_or_Rural_Area')['Number_of_Casualties'].mean(), 1)

In [114]:
avg_casualties_per_area

Urban_or_Rural_Area
Rural          1.5
Unallocated    1.2
Urban          1.3
Name: Number_of_Casualties, dtype: float64

<h1 style="color:#2596be">Insight No. 13</h1>
<h2 style="color:pink">Each weather conditions average number of casualties</h2>
<h3>(Fine with no winds, fine with high winds, raining with high winds, raining with no winds, snowing with high winds, and other weather conditions) have an average of 1.4 casualties, then (snowing with no high winds) has an average of 1.3 casualties, while (fog or mist) has 1.5 casualties.</h3>

In [115]:
avg_casualties_per_weather = np.round(accident.groupby('Weather_Conditions')['Number_of_Casualties'].mean(), 1)

In [116]:
avg_casualties_per_weather

Weather_Conditions
Fine + high winds        1.4
Fine no high winds       1.4
Fog or mist              1.5
Other                    1.4
Raining + high winds     1.4
Raining no high winds    1.4
Snowing + high winds     1.4
Snowing no high winds    1.3
Name: Number_of_Casualties, dtype: float64

<h1 style="color:#2596be">Insight No. 14</h1>
<h2 style="color:pink">In which road area has the higher number of accidents?</h2>
<h3>Urban roads have a higher accidents than roads in rural areas.</h3>

In [117]:
accidents_per_area_road = accident.groupby(['Urban_or_Rural_Area', 'Road_Type']).size().unstack().T

In [118]:
accidents_per_area_road

Urban_or_Rural_Area,Rural,Unallocated,Urban
Road_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dual carriageway,48715,1,50708
One way street,1193,0,12366
Roundabout,15545,1,28446
Single carriageway,169243,9,327411
Slip road,4294,0,2747


<h1 style="color:#2596be">Insight No. 15</h1>
<h2 style="color:pink">Which combinations of weather and light conditions result in the highest number of casualties?</h2>
<h3>Daylight and fine with no high winds have the highest number of casualties.</h3>

In [119]:
casualties_by_weather_light = accident.groupby(['Weather_Conditions', 'Light_Conditions'])['Number_of_Casualties'].sum().unstack()

In [120]:
casualties_by_weather_light

Light_Conditions,Darkness - lighting unknown,Darkness - lights lit,Darkness - lights unlit,Darkness - no lighting,Daylight
Weather_Conditions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fine + high winds,82,2543,75,1213,7943
Fine no high winds,4903,125686,2251,38800,531805
Fog or mist,89,1323,48,1267,2399
Other,234,6535,177,2824,13466
Raining + high winds,147,4185,115,2173,7001
Raining no high winds,808,31563,620,9538,69700
Snowing + high winds,21,354,7,247,626
Snowing no high winds,119,2411,37,1129,4674


<h1 style="color:#2596be">Insight No. 16</h1>
<h2 style="color:pink">Does having more vehicles results to more casualties?</h2>
<h3>No, having many vehicles doesn't lead to more casualties.</h3>
<h3>The number of vehicles has no correlation with the number of casualties</h3>

In [121]:
vehicles_casualties_corr = accident['Number_of_Vehicles'].corr(accident['Number_of_Casualties'])

In [122]:
vehicles_casualties_corr

np.float64(0.22888886126927557)

<h1 style="color:#2596be">Insight No. 17</h1>
<h2 style="color:pink">What vehicle type has the highest number of accidents</h2>
<h3>A car has the highest number of accidents</h3>

In [123]:
vehicletype_accidents = accident['Vehicle_Type'].value_counts()

In [124]:
vehicletype_accidents

Vehicle_Type
Car                                      497992
Van / Goods 3.5 tonnes mgw or under       34160
Bus or coach (17 or more pass seats)      25878
Motorcycle over 500cc                     25657
Goods 7.5 tonnes mgw and over             17307
Motorcycle 125cc and under                15269
Taxi/Private hire car                     13294
Motorcycle over 125cc and up to 500cc      7656
Motorcycle 50cc and under                  7603
Goods over 3.5t. and under 7.5t            6096
Other vehicle                              5637
Minibus (8 - 16 passenger seats)           1976
Agricultural vehicle                       1947
Pedal cycle                                 197
Data missing or out of range                  6
Ridden horse                                  4
Name: count, dtype: int64

<h1 style="color:#2596be">Insight No. 18</h1>
<h2 style="color:pink">What district area has the highest number of accidents</h2>
<h3>Birmingham has the highest number of accidents</h3>

In [125]:
districtarea_accidents = accident['District Area'].value_counts()

In [126]:
districtarea_accidents

District Area
Birmingham            13491
Leeds                  8898
Manchester             6720
Bradford               6212
Sheffield              5710
                      ...  
Berwick-upon-Tweed      153
Teesdale                142
Shetland Islands        133
Orkney Islands          117
Clackmannanshire         91
Name: count, Length: 422, dtype: int64

<h1 style="color:#2596be">Insight No. 19</h1>
<h2 style="color:pink">Which district area has the highest average number of casualties?</h2>
<h3>Blaeu Gwent has the highest average number of casualties</h3>

In [127]:
avg_casualties_per_district = np.round(accident.groupby('District Area')['Number_of_Casualties'].mean(), 1).sort_values(ascending = False)

In [128]:
avg_casualties_per_district

District Area
Blaeu Gwent                1.7
Merthyr Tydfil             1.6
North East Lincolnshire    1.6
Blaenau Gwent              1.6
Teesdale                   1.6
                          ... 
Kensington and Chelsea     1.1
Hammersmith and Fulham     1.1
City of London             1.1
Camden                     1.1
Aberdeen City              1.1
Name: Number_of_Casualties, Length: 422, dtype: float64

<h1 style="color:#2596be">Insight No. 20</h1>
<h2 style="color:pink">Do different road surface conditions impact the number of casualties?</h2>
<h3>The road surface conditions impact the number of casualties</h3>
<h3>There is correlation between road surface condtions and number of casualties</h3>

In [129]:
dry = accident[accident['Road_Surface_Conditions'] == 'Dry']['Number_of_Casualties']
wet = accident[accident['Road_Surface_Conditions'] == 'Wet or damp']['Number_of_Casualties']
f_stat, p_value = f_oneway(dry, wet)
print(p_value)

5.097050919646033e-249
