In [254]:
import pandas as pd 
import numpy as np

In [255]:
df_orig = pd.read_csv("traffic_accidents.csv")

df_orig.head()

Unnamed: 0,Date,Accident Spot,Area,County,Road/ Highway,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Weather conditions
0,2023-08-08,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,Passengers,4,4.30 pm,
1,2023-08-07,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,Passengers,1,5.50 pm,
2,2023-07-25,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,Drivers/Occupants,4,,
3,2022-12-02,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,Driver and passengers,3,6.00 pm,
4,2022-12-01,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,Pedestrian,1,,


In [256]:
# View Missing Values
df_orig.isna().sum()

Date                             1
Accident Spot                    0
Area                             0
County                           0
Road/ Highway                    1
Brief Accident Details/Cause     2
Victims                         18
Total people confirmed dead      9
Time of the Accidents           56
Weather conditions              80
dtype: int64

In [257]:
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Date                          80 non-null     object
 1   Accident Spot                 81 non-null     object
 2   Area                          81 non-null     object
 3   County                        81 non-null     object
 4   Road/ Highway                 80 non-null     object
 5   Brief Accident Details/Cause  79 non-null     object
 6   Victims                       63 non-null     object
 7   Total people confirmed dead   72 non-null     object
 8   Time of the Accidents         25 non-null     object
 9   Weather conditions            1 non-null      object
dtypes: object(10)
memory usage: 6.5+ KB


### Handle Missing Values

In [258]:
# Make a copy of the db
df = df_orig.copy()

df.head()

Unnamed: 0,Date,Accident Spot,Area,County,Road/ Highway,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Weather conditions
0,2023-08-08,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,Passengers,4,4.30 pm,
1,2023-08-07,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,Passengers,1,5.50 pm,
2,2023-07-25,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,Drivers/Occupants,4,,
3,2022-12-02,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,Driver and passengers,3,6.00 pm,
4,2022-12-01,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,Pedestrian,1,,


In [259]:
# Rename Columns
df = df.rename(
  columns={
    'Accident Spot': 'accident-spot',
    'Road/ Highway': 'road',
    'Brief Accident Details/Cause': 'accident-details',
    'Total people confirmed dead': 'total-people-confirmed-dead',
    'Time of the Accidents': 'accident-time',
    'Weather conditions': 'weather'
  }
)

In [260]:
df.columns

Index(['Date', 'accident-spot', 'Area', 'County', 'road', 'accident-details',
       'Victims', 'total-people-confirmed-dead', 'accident-time', 'weather'],
      dtype='object')

In [261]:

# Drop Columns (Weather Condition, Time of accident)
df = df.drop(columns=['weather', 'accident-time'], axis=1)

df.columns

Index(['Date', 'accident-spot', 'Area', 'County', 'road', 'accident-details',
       'Victims', 'total-people-confirmed-dead'],
      dtype='object')

In [262]:
df.head()

Unnamed: 0,Date,accident-spot,Area,County,road,accident-details,Victims,total-people-confirmed-dead
0,2023-08-08,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,Passengers,4
1,2023-08-07,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,Passengers,1
2,2023-07-25,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,Drivers/Occupants,4
3,2022-12-02,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,Driver and passengers,3
4,2022-12-01,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,Pedestrian,1


In [263]:
# View Missing Values
df.isna().sum()

Date                            1
accident-spot                   0
Area                            0
County                          0
road                            1
accident-details                2
Victims                        18
total-people-confirmed-dead     9
dtype: int64

In [264]:
# Drop the victims column
df.drop('Victims', axis=1, inplace=True)

df.columns

Index(['Date', 'accident-spot', 'Area', 'County', 'road', 'accident-details',
       'total-people-confirmed-dead'],
      dtype='object')

In [265]:
# Replace non-numeric values with NaN's
df['total-people-confirmed-dead'] = pd.to_numeric(df['total-people-confirmed-dead'], errors='coerce')

In [266]:
# Fill the missing values in confirmed dead using the median values
df["total-people-confirmed-dead"].fillna(df['total-people-confirmed-dead'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["total-people-confirmed-dead"].fillna(df['total-people-confirmed-dead'].median(), inplace=True)


In [267]:
# Convert date column to datetime type
df["Date"] = pd.to_datetime(df['Date'], format='%Y-%m-%d')


# Extract Date Data to Different Columns
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day

# Drop the date column
df = df.drop("Date", axis=1)

In [268]:
df.isna().sum()

accident-spot                  0
Area                           0
County                         0
road                           1
accident-details               2
total-people-confirmed-dead    0
year                           1
month                          1
day                            1
dtype: int64

In [269]:
df = df.dropna(subset=['month'])

df.isna().sum()

accident-spot                  0
Area                           0
County                         0
road                           1
accident-details               2
total-people-confirmed-dead    0
year                           0
month                          0
day                            0
dtype: int64

In [270]:
df = df.dropna()

df.isna().sum()

accident-spot                  0
Area                           0
County                         0
road                           0
accident-details               0
total-people-confirmed-dead    0
year                           0
month                          0
day                            0
dtype: int64

In [271]:
file = '1-traffic.csv'

df.to_csv(file, index=False)

In [272]:
df.head()

Unnamed: 0,accident-spot,Area,County,road,accident-details,total-people-confirmed-dead,year,month,day
0,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,4.0,2023.0,8.0,8.0
1,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,1.0,2023.0,8.0,7.0
2,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,4.0,2023.0,7.0,25.0
3,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,3.0,2022.0,12.0,2.0
4,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,1.0,2022.0,12.0,1.0


### Categorize Data

Utilize sklearn's one hot encoder to encode columns like (Area, County, and Road) into numerical classes

In [273]:
df = pd.read_csv('1-traffic.csv')

df.isna().sum()

accident-spot                  0
Area                           0
County                         0
road                           0
accident-details               0
total-people-confirmed-dead    0
year                           0
month                          0
day                            0
dtype: int64

In [274]:
df.head()

Unnamed: 0,accident-spot,Area,County,road,accident-details,total-people-confirmed-dead,year,month,day
0,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,4.0,2023.0,8.0,8.0
1,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,1.0,2023.0,8.0,7.0
2,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,4.0,2023.0,7.0,25.0
3,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,3.0,2022.0,12.0,2.0
4,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,1.0,2022.0,12.0,1.0


In [275]:
df['County'] = df['County'].astype(str)


# Encode the columns
dummies = pd.get_dummies(df['County'], prefix='County')

dummies = dummies.astype(int)

df = pd.concat([df, dummies], axis=1)

df.head()

Unnamed: 0,accident-spot,Area,County,road,accident-details,total-people-confirmed-dead,year,month,day,County_Bomet,...,County_Nairobi,County_Nairobi.1,County_Naivasha,County_Nakuru,County_Narok,County_Taita Taveta,County_Tharaka Nithi,County_Turkana,County_Uansin Gishi,County_Vihiga
0,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,4.0,2023.0,8.0,8.0,0,...,0,0,0,1,0,0,0,0,0,0
1,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,1.0,2023.0,8.0,7.0,0,...,0,0,0,1,0,0,0,0,0,0
2,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,4.0,2023.0,7.0,25.0,0,...,0,0,0,0,1,0,0,0,0,0
3,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,3.0,2022.0,12.0,2.0,0,...,0,0,0,0,1,0,0,0,0,0
4,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,1.0,2022.0,12.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0


In [276]:
# Drop the county, area and accident spot columns
df = df.drop(columns=['County', 'Area', 'accident-spot', 'road'], axis=1)

df.head()

Unnamed: 0,accident-details,total-people-confirmed-dead,year,month,day,County_Bomet,County_Homabay,County_Isiolo,County_Kajiado,County_Kakamega,...,County_Nairobi,County_Nairobi.1,County_Naivasha,County_Nakuru,County_Narok,County_Taita Taveta,County_Tharaka Nithi,County_Turkana,County_Uansin Gishi,County_Vihiga
0,Head on Collision,4.0,2023.0,8.0,8.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,vehicle and motorcycle collision,1.0,2023.0,8.0,7.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,Head on Collision,4.0,2023.0,7.0,25.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,Head on Collision,3.0,2022.0,12.0,2.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Run over,1.0,2022.0,12.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [277]:
file = '2-traffic.csv'

df.to_csv(file, index=False)

In [278]:
# Convert the accident detail column to a more cleaner version
df['accident-details'] = df['accident-details'].apply(
    lambda x: 'matatu' if 'matatu' in x.lower() else x)
df['accident-details'] = df['accident-details'].apply(
    lambda x: 'truck' if 'truck' in x.lower() else x)
df['accident-details'] = df['accident-details'].apply(
    lambda x: 'bus' if 'bus' in x.lower() else x)
df['accident-details'] = df['accident-details'].apply(
    lambda x: 'car' if any(word in x.lower() for word in ['vehicle', 'car']) else x)


df.head()

Unnamed: 0,accident-details,total-people-confirmed-dead,year,month,day,County_Bomet,County_Homabay,County_Isiolo,County_Kajiado,County_Kakamega,...,County_Nairobi,County_Nairobi.1,County_Naivasha,County_Nakuru,County_Narok,County_Taita Taveta,County_Tharaka Nithi,County_Turkana,County_Uansin Gishi,County_Vihiga
0,Head on Collision,4.0,2023.0,8.0,8.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,car,1.0,2023.0,8.0,7.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,Head on Collision,4.0,2023.0,7.0,25.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,Head on Collision,3.0,2022.0,12.0,2.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Run over,1.0,2022.0,12.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [280]:
df['accident-details'] = df['accident-details'].astype(str)


# Encode the columns
dummies = pd.get_dummies(df['accident-details'], prefix='Cause')

dummies = dummies.astype(int)

df = pd.concat([df, dummies], axis=1)

df.head()

Unnamed: 0,accident-details,total-people-confirmed-dead,year,month,day,County_Bomet,County_Homabay,County_Isiolo,County_Kajiado,County_Kakamega,...,Cause_Head on Collision,Cause_Head on collision,Cause_Head on collision.1,Cause_Motorist crashed into toll station,Cause_Run over,Cause_bus,Cause_car,Cause_matatu,Cause_shuttle rammed into a lorry,Cause_truck
0,Head on Collision,4.0,2023.0,8.0,8.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,car,1.0,2023.0,8.0,7.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,Head on Collision,4.0,2023.0,7.0,25.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,Head on Collision,3.0,2022.0,12.0,2.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,Run over,1.0,2022.0,12.0,1.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [281]:
df = df.drop('accident-details', axis=1)

df.head()

Unnamed: 0,total-people-confirmed-dead,year,month,day,County_Bomet,County_Homabay,County_Isiolo,County_Kajiado,County_Kakamega,County_Kiambu,...,Cause_Head on Collision,Cause_Head on collision,Cause_Head on collision.1,Cause_Motorist crashed into toll station,Cause_Run over,Cause_bus,Cause_car,Cause_matatu,Cause_shuttle rammed into a lorry,Cause_truck
0,4.0,2023.0,8.0,8.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1.0,2023.0,8.0,7.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,4.0,2023.0,7.0,25.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,3.0,2022.0,12.0,2.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1.0,2022.0,12.0,1.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [282]:
df.shape

(77, 45)