In [1]:
import pandas as pd 
import numpy as np

In [2]:
df_orig = pd.read_csv("traffic_accidents.csv")

df_orig.head()

Unnamed: 0,Date,Accident Spot,Area,County,Road/ Highway,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Weather conditions
0,2023-08-08,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,Passengers,4,4.30 pm,
1,2023-08-07,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,Passengers,1,5.50 pm,
2,2023-07-25,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,Drivers/Occupants,4,,
3,2022-12-02,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,Driver and passengers,3,6.00 pm,
4,2022-12-01,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,Pedestrian,1,,


In [None]:
# View Missing Values
df_orig.isna().sum()

Date                             1
Accident Spot                    0
Area                             0
County                           0
Road/ Highway                    1
Brief Accident Details/Cause     2
Victims                         18
Total people confirmed dead      9
Time of the Accidents           56
Weather conditions              80
dtype: int64

In [16]:
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Date                          80 non-null     object
 1   Accident Spot                 81 non-null     object
 2   Area                          81 non-null     object
 3   County                        81 non-null     object
 4   Road/ Highway                 80 non-null     object
 5   Brief Accident Details/Cause  79 non-null     object
 6   Victims                       63 non-null     object
 7   Total people confirmed dead   72 non-null     object
 8   Time of the Accidents         25 non-null     object
 9   Weather conditions            1 non-null      object
dtypes: object(10)
memory usage: 6.5+ KB


### Handle Missing Values

In [6]:
# Make a copy of the db
df = df_orig.copy()

df.head()

Unnamed: 0,Date,Accident Spot,Area,County,Road/ Highway,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Weather conditions
0,2023-08-08,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,Passengers,4,4.30 pm,
1,2023-08-07,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,Passengers,1,5.50 pm,
2,2023-07-25,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,Drivers/Occupants,4,,
3,2022-12-02,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,Driver and passengers,3,6.00 pm,
4,2022-12-01,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,Pedestrian,1,,


In [7]:
# Rename Columns
df = df.rename(
  columns={
    'Accident Spot': 'accident-spot',
    'Road/ Highway': 'road',
    'Brief Accident Details/Cause': 'accident-details',
    'Total people confirmed dead': 'total-people-confirmed-dead',
    'Time of the Accidents': 'accident-time',
    'Weather conditions': 'weather'
  }
)

In [8]:
df.columns

Index(['Date', 'accident-spot', 'Area', 'County', 'road', 'accident-details',
       'Victims', 'total-people-confirmed-dead', 'accident-time', 'weather'],
      dtype='object')

In [9]:

# Drop Columns (Weather Condition, Time of accident)
df = df.drop(columns=['weather', 'accident-time'], axis=1)

df.columns

Index(['Date', 'accident-spot', 'Area', 'County', 'road', 'accident-details',
       'Victims', 'total-people-confirmed-dead'],
      dtype='object')

In [10]:
df.head()

Unnamed: 0,Date,accident-spot,Area,County,road,accident-details,Victims,total-people-confirmed-dead
0,2023-08-08,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,Passengers,4
1,2023-08-07,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,Passengers,1
2,2023-07-25,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,Drivers/Occupants,4
3,2022-12-02,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,Driver and passengers,3
4,2022-12-01,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,Pedestrian,1


In [22]:
# View Missing Values
df.isna().sum()

Date                           1
accident-spot                  0
Area                           0
County                         0
road                           1
accident-details               2
total-people-confirmed-dead    0
dtype: int64

In [12]:
# Drop the victims column
df.drop('Victims', axis=1, inplace=True)

df.columns

Index(['Date', 'accident-spot', 'Area', 'County', 'road', 'accident-details',
       'total-people-confirmed-dead'],
      dtype='object')

In [None]:
# Replace non-numeric values with NaN's
df['total-people-confirmed-dead'] = pd.to_numeric(df['total-people-confirmed-dead'], errors='coerce')

In [None]:
# Fill the missing values in confirmed dead using the median values
df["total-people-confirmed-dead"].fillna(df['total-people-confirmed-dead'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["total-people-confirmed-dead"].fillna(df['total-people-confirmed-dead'].median(), inplace=True)


In [None]:
# Convert date column to datetime type
df["Date"] = pd.to_datetime(df['Date'], format='%Y-%m-%d')


# Extract Date Data to Different Columns
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day

# Drop the date column
df = df.drop("Date", axis=1)

In [34]:
df.isna().sum()

accident-spot                  0
Area                           0
County                         0
road                           1
accident-details               2
total-people-confirmed-dead    0
year                           1
month                          1
day                            1
dtype: int64

In [35]:
df = df.dropna(subset=['month'])

df.isna().sum()

accident-spot                  0
Area                           0
County                         0
road                           1
accident-details               2
total-people-confirmed-dead    0
year                           0
month                          0
day                            0
dtype: int64

In [39]:
file = '1-traffic.csv'

df.to_csv(file, index=False)

In [40]:
df.head()

Unnamed: 0,accident-spot,Area,County,road,accident-details,total-people-confirmed-dead,year,month,day
0,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,4.0,2023.0,8.0,8.0
1,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,1.0,2023.0,8.0,7.0
2,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,4.0,2023.0,7.0,25.0
3,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,3.0,2022.0,12.0,2.0
4,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,1.0,2022.0,12.0,1.0


### Categorize Data

Utilize sklearn's one hot encoder to encode columns like (Area, County, and Road) into numerical classes

In [45]:
df = pd.read_csv('1-traffic.csv')

df.isna().sum()

accident-spot                  0
Area                           0
County                         0
road                           0
accident-details               0
total-people-confirmed-dead    0
year                           0
month                          0
day                            0
dtype: int64

In [46]:
df.head()

Unnamed: 0,accident-spot,Area,County,road,accident-details,total-people-confirmed-dead,year,month,day
0,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,4.0,2023.0,8.0,8.0
1,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,1.0,2023.0,8.0,7.0
2,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,4.0,2023.0,7.0,25.0
3,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,3.0,2022.0,12.0,2.0
4,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,1.0,2022.0,12.0,1.0


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the encoder
enc = OneHotEncoder()