In [1]:
#Import modules
import numpy as np
import holidays
import pandas as pd
import datetime
import sklearn
import scipy
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import math
from sklearn.model_selection import cross_val_score
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
df1 = pd.read_csv(r'Accident_Information.csv')
df2 = pd.read_csv(r'Vehicle_Information.csv')

In [3]:
print(df1.columns)
print(df2.columns)

Index(['Accident_Index', '1st_Road_Class', '1st_Road_Number', '2nd_Road_Class',
       '2nd_Road_Number', 'Accident_Severity', 'Carriageway_Hazards', 'Date',
       'Day_of_Week', 'Did_Police_Officer_Attend_Scene_of_Accident',
       'Junction_Control', 'Junction_Detail', 'Latitude', 'Light_Conditions',
       'Local_Authority_(District)', 'Local_Authority_(Highway)',
       'Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude',
       'LSOA_of_Accident_Location', 'Number_of_Casualties',
       'Number_of_Vehicles', 'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Police_Force',
       'Road_Surface_Conditions', 'Road_Type', 'Special_Conditions_at_Site',
       'Speed_limit', 'Time', 'Urban_or_Rural_Area', 'Weather_Conditions',
       'Year', 'InScotland'],
      dtype='object')
Index(['Accident_Index', 'Age_Band_of_Driver', 'Age_of_Vehicle',
       'Driver_Home_Area_Type', 'Driver_IMD_Decile', 'Engine_Capacity_.CC.',
       'Hit_Object_i

In [4]:
print("df1 shape:", df1.shape)
print("df2 shape:", df2.shape)

df1 shape: (2047256, 34)
df2 shape: (2177205, 24)


In [5]:
df3 = df1.merge(df2, on='Accident_Index')

In [6]:
df3.head()

Unnamed: 0,Accident_Index,1st_Road_Class,1st_Road_Number,2nd_Road_Class,2nd_Road_Number,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,...,Skidding_and_Overturning,Towing_and_Articulation,Vehicle_Leaving_Carriageway,Vehicle_Location.Restricted_Lane,Vehicle_Manoeuvre,Vehicle_Reference,Vehicle_Type,Was_Vehicle_Left_Hand_Drive,X1st_Point_of_Impact,Year_y
0,200501BS00002,B,450.0,C,0.0,Slight,,2005-01-05,Wednesday,1.0,...,,No tow/articulation,Did not leave carriageway,0.0,Slowing or stopping,1,Bus or coach (17 or more pass seats),No,Nearside,2005
1,200501BS00003,C,0.0,,0.0,Slight,,2005-01-06,Thursday,1.0,...,,No tow/articulation,Did not leave carriageway,0.0,Going ahead right-hand bend,1,Bus or coach (17 or more pass seats),No,Nearside,2005
2,200501BS00004,A,3220.0,,0.0,Slight,,2005-01-07,Friday,1.0,...,,No tow/articulation,Did not leave carriageway,0.0,Going ahead other,1,Car,No,Front,2005
3,200501BS00005,Unclassified,0.0,,0.0,Slight,,2005-01-10,Monday,1.0,...,Skidded,No tow/articulation,Did not leave carriageway,0.0,Going ahead other,1,Motorcycle 125cc and under,No,Front,2005
4,200501BS00006,Unclassified,0.0,,0.0,Slight,,2005-01-11,Tuesday,1.0,...,,No tow/articulation,Did not leave carriageway,0.0,Moving off,1,Car,No,Did not impact,2005


In [7]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2058408 entries, 0 to 2058407
Data columns (total 57 columns):
Accident_Index                                 object
1st_Road_Class                                 object
1st_Road_Number                                float64
2nd_Road_Class                                 object
2nd_Road_Number                                float64
Accident_Severity                              object
Carriageway_Hazards                            object
Date                                           object
Day_of_Week                                    object
Did_Police_Officer_Attend_Scene_of_Accident    float64
Junction_Control                               object
Junction_Detail                                object
Latitude                                       float64
Light_Conditions                               object
Local_Authority_(District)                     object
Local_Authority_(Highway)                      object
Location_Easting_OS

In [8]:
print("df3 shape:",df3.shape)

df3 shape: (2058408, 57)


In [9]:
df3.isnull().sum().sort_values(ascending=False)/2058408*100

2nd_Road_Class                                 40.222201
Driver_IMD_Decile                              33.486559
Age_of_Vehicle                                 16.418611
model                                          14.543132
Engine_Capacity_.CC.                           12.160709
Propulsion_Code                                11.348479
LSOA_of_Accident_Location                       6.762848
make                                            5.384987
2nd_Road_Number                                 0.919497
Pedestrian_Crossing-Physical_Facilities         0.066556
Vehicle_Location.Restricted_Lane                0.054605
Pedestrian_Crossing-Human_Control               0.031772
Time                                            0.007093
Longitude                                       0.006073
Location_Easting_OSGR                           0.006024
Location_Northing_OSGR                          0.006024
Latitude                                        0.006024
Did_Police_Officer_Attend_Scene

In [10]:
df3 = df3.drop(['2nd_Road_Class', 'Driver_IMD_Decile','Year_y'], axis=1)

df3 = df3.rename(columns={'Year_x':'Year'})

In [11]:
#standardize all columns to lowercase, and remove some characters 
#for ease of use in querying
df3.columns = map(str.lower, df3.columns)
df3.columns = df3.columns.str.replace('.','')
df3.columns = df3.columns.str.replace('(','')
df3.columns = df3.columns.str.replace(')','')
#verify
df3.columns

Index(['accident_index', '1st_road_class', '1st_road_number',
       '2nd_road_number', 'accident_severity', 'carriageway_hazards', 'date',
       'day_of_week', 'did_police_officer_attend_scene_of_accident',
       'junction_control', 'junction_detail', 'latitude', 'light_conditions',
       'local_authority_district', 'local_authority_highway',
       'location_easting_osgr', 'location_northing_osgr', 'longitude',
       'lsoa_of_accident_location', 'number_of_casualties',
       'number_of_vehicles', 'pedestrian_crossing-human_control',
       'pedestrian_crossing-physical_facilities', 'police_force',
       'road_surface_conditions', 'road_type', 'special_conditions_at_site',
       'speed_limit', 'time', 'urban_or_rural_area', 'weather_conditions',
       'year', 'inscotland', 'age_band_of_driver', 'age_of_vehicle',
       'driver_home_area_type', 'engine_capacity_cc',
       'hit_object_in_carriageway', 'hit_object_off_carriageway',
       'journey_purpose_of_driver', 'junction_l

In [12]:
year = range(2005,2018)
uk_holidays = holidays.UnitedKingdom(years=year)

In [13]:
uk_holidays.years

{2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017}

In [14]:
ukh  = pd.DataFrame(list(uk_holidays.items()))

In [15]:
ukh.dtypes

0    object
1    object
dtype: object

In [16]:
ukh = ukh.rename(columns={0:"date", 1:"holiday"})

In [17]:
df = pd.concat([df3,ukh])

In [18]:
df.head()

Unnamed: 0,1st_road_class,1st_road_number,2nd_road_number,accident_index,accident_severity,age_band_of_driver,age_of_vehicle,carriageway_hazards,date,day_of_week,...,urban_or_rural_area,vehicle_leaving_carriageway,vehicle_locationrestricted_lane,vehicle_manoeuvre,vehicle_reference,vehicle_type,was_vehicle_left_hand_drive,weather_conditions,x1st_point_of_impact,year
0,B,450.0,0.0,200501BS00002,Slight,36 - 45,3.0,,2005-01-05,Wednesday,...,Urban,Did not leave carriageway,0.0,Slowing or stopping,1.0,Bus or coach (17 or more pass seats),No,Fine no high winds,Nearside,2005.0
1,C,0.0,0.0,200501BS00003,Slight,26 - 35,5.0,,2005-01-06,Thursday,...,Urban,Did not leave carriageway,0.0,Going ahead right-hand bend,1.0,Bus or coach (17 or more pass seats),No,Fine no high winds,Nearside,2005.0
2,A,3220.0,0.0,200501BS00004,Slight,46 - 55,4.0,,2005-01-07,Friday,...,Urban,Did not leave carriageway,0.0,Going ahead other,1.0,Car,No,Fine no high winds,Front,2005.0
3,Unclassified,0.0,0.0,200501BS00005,Slight,46 - 55,10.0,,2005-01-10,Monday,...,Urban,Did not leave carriageway,0.0,Going ahead other,1.0,Motorcycle 125cc and under,No,Fine no high winds,Front,2005.0
4,Unclassified,0.0,0.0,200501BS00006,Slight,46 - 55,1.0,,2005-01-11,Tuesday,...,Urban,Did not leave carriageway,0.0,Moving off,1.0,Car,No,Raining no high winds,Did not impact,2005.0


In [19]:
#Check Missing Data
df.isnull().sum().sort_values(ascending=False)

holiday                                        2058408
age_of_vehicle                                  338152
model                                           299547
engine_capacity_cc                              250507
propulsion_code                                 233788
lsoa_of_accident_location                       139397
make                                            111035
2nd_road_number                                  19117
pedestrian_crossing-physical_facilities           1560
vehicle_locationrestricted_lane                   1314
pedestrian_crossing-human_control                  844
time                                               336
longitude                                          315
latitude                                           314
location_northing_osgr                             314
location_easting_osgr                              314
did_police_officer_attend_scene_of_accident        304
speed_limit                                        255
inscotland

In [20]:
df['holiday'].fillna("Non-Holiday", inplace=True)
df = df.dropna(subset=['accident_index'])

In [21]:
print('df shape', df.shape)

df shape (2058408, 55)


In [22]:
#Check Missing Data
df.isnull().sum().sort_values(ascending=False)/2058598*100

age_of_vehicle                                 16.417096
model                                          14.541790
engine_capacity_cc                             12.159586
propulsion_code                                11.347432
lsoa_of_accident_location                       6.762224
make                                            5.384490
2nd_road_number                                 0.919412
pedestrian_crossing-physical_facilities         0.066550
vehicle_locationrestricted_lane                 0.054600
pedestrian_crossing-human_control               0.031769
time                                            0.007092
longitude                                       0.006072
latitude                                        0.006024
location_easting_osgr                           0.006024
location_northing_osgr                          0.006024
did_police_officer_attend_scene_of_accident     0.005538
speed_limit                                     0.003157
inscotland                     

In [None]:
#Checking the unique values for 'age_of_vehicle'
print("Distinct values for age_of_vehicle:\n", set(df['age_of_vehicle']))