In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.interactive(True)
plt.ion()
matplotlib.is_interactive()

import sklearn
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
%matplotlib inline

In [2]:
## read dataset file
crash_data = pd.read_csv('Australian_Crash_Data.csv')

In [3]:
def examine_df(df:pd.DataFrame):
    """This function returns a tuple containing the shape of the dataset
    and the list of columns present in the dataset"""
    shape = df.shape
    columns = df.columns
    return shape, columns

In [4]:
examine_df(crash_data)

((52843, 23),
 Index(['Crash ID', 'State', 'Month', 'Year', 'Dayweek', 'Time', 'Crash Type',
        'Bus Involvement', 'Heavy Rigid Truck Involvement',
        'Articulated Truck Involvement', 'Speed Limit', 'Road User', 'Gender',
        'Age', 'National Remoteness Areas', 'SA4 Name 2016',
        'National LGA Name 2017', 'National Road Type', 'Christmas Period',
        'Easter Period', 'Age Group', 'Day of week', 'Time of day'],
       dtype='object'))

In [5]:
## keep an eye on missing values and memory usage
crash_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52843 entries, 0 to 52842
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Crash ID                       52843 non-null  int64 
 1   State                          52843 non-null  object
 2   Month                          52843 non-null  int64 
 3   Year                           52843 non-null  int64 
 4   Dayweek                        52843 non-null  object
 5   Time                           52803 non-null  object
 6   Crash Type                     52843 non-null  object
 7   Bus Involvement                52821 non-null  object
 8   Heavy Rigid Truck Involvement  32328 non-null  object
 9   Articulated Truck Involvement  52821 non-null  object
 10  Speed Limit                    52141 non-null  object
 11  Road User                      52843 non-null  object
 12  Gender                         52816 non-null  object
 13  A

- We can observe that this dataset has a memory usage of about 9300 KB

In [5]:
## Have a glimpse of the dataset
crash_data

Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Crash Type,Bus Involvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,...,Age,National Remoteness Areas,SA4 Name 2016,National LGA Name 2017,National Road Type,Christmas Period,Easter Period,Age Group,Day of week,Time of day
0,20212133,Vic,9,2021,Sunday,0:30,Single,,,,...,38,Inner Regional Australia,Melbourne - Outer East,Yarra Ranges (S),Arterial Road,No,No,26_to_39,Weekend,Night
1,20214022,SA,9,2021,Saturday,23:31,Multiple,No,No,No,...,28,Major Cities of Australia,Adelaide - North,Playford (C),,No,No,26_to_39,Weekend,Night
2,20212096,Vic,9,2021,Saturday,23:00,Single,,,,...,19,Inner Regional Australia,Hume,Wangaratta (RC),Access road,No,No,17_to_25,Weekend,Night
3,20212145,Vic,9,2021,Saturday,22:25,Single,,,,...,23,Outer Regional Australia,Hume,Wangaratta (RC),Arterial Road,No,No,17_to_25,Weekend,Night
4,20212075,Vic,9,2021,Saturday,5:15,Single,,,,...,46,Major Cities of Australia,Melbourne - South East,Casey (C),Local Road,No,No,40_to_64,Weekend,Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52838,19891246,NSW,1,1989,Wednesday,17:05,Single,Yes,,No,...,5,,,,,No,No,0_to_16,Weekday,Day
52839,19895088,WA,1,1989,Monday,6:00,Single,No,,No,...,19,,,,,Yes,No,17_to_25,Weekday,Day
52840,19895088,WA,1,1989,Monday,6:00,Single,No,,No,...,17,,,,,Yes,No,17_to_25,Weekday,Day
52841,19895088,WA,1,1989,Monday,6:00,Single,No,,No,...,17,,,,,Yes,No,17_to_25,Weekday,Day


In [6]:
def drop_nulls(df:pd.DataFrame):
    """This function drops missing values that lie along the row axis
    It returns a dataframe object that is free of missing values"""
    df.dropna(axis=0, inplace=True)
    return df

In [7]:
## list of columns to be dropped from the dataframe
dropped_cols = ['Crash ID', 'SA4 Name 2016', 'National LGA Name 2017', 
'National Road Type', 'Christmas Period', 'Easter Period', 
                'Day of week', 'Time of day', 'Age Group', 'National Remoteness Areas']

In [8]:
def drop_cols(df:pd.DataFrame, columns:list):
    """This function drops undesired columns which are inputted as a list.
    The function returns an updated dataframe object after the undesired columns are dropped"""
    df.drop(columns, axis=1, inplace=True)
    return df

In [9]:
def edit_speedlimit(df:pd.DataFrame):
    """This function is used to eliminate undesirable values in the Speed Limit column.
    It returns an updated dataframe object"""
    df.drop(df[df['Speed Limit'] == '<40'].index, inplace=True)
    df.drop(df[df['Speed Limit'] == 'Unspecified'].index, inplace=True)
    return df

In [10]:
## dictionary containing columns and the data type they are to be converted to
dtype_dict = {'State':'category', 'Month':'int8',
             'Year':'int32', 'Dayweek':'category',
             'Time':'datetime64[ns]', 'Crash Type':'category',
             'Bus Involvement':'bool', 'Heavy Rigid Truck Involvement':'bool',
             'Articulated Truck Involvement':'bool', 'Speed Limit':'int16',
             'Road User':'category', 'Gender':'category', 'Age':'int16'}


In [11]:
def tweak_dtypes(df, dtype_dictionary:dict):
    """This function converts the data types of the existing columns of the input dataframe
    as assigned in the input dictionary; dtype_dict. The function returns a dataframe object with updated datatypes"""
    df = df.astype(dtype_dictionary)
    return df

In [12]:
time_column = 'Time'

In [13]:
def tweak_time(df:pd.DataFrame, time_column):
    """This function converts a column in datetime format to hour-of-the-day format.
    It returns a dataframe with the updated column"""
    return df.assign(Time = lambda df:df[time_column].dt.hour)

In [14]:
idx_col = 'Year'

In [15]:
def change_index(df:pd.DataFrame, idx_col:str):
    """This function sets the input column as the dataframe's index.
    It returns a dataframe object with an updated index"""
    return df.assign(idx_col = lambda df:df.set_index(idx_col, inplace=True))

In [16]:
## this is a pipeline of operations carried out on a dataframe in sequential order
crash_data = crash_data.pipe(drop_nulls).pipe(drop_cols, dropped_cols
                                             ).pipe(edit_speedlimit).pipe(tweak_dtypes, dtype_dict).pipe(tweak_time, 
                                                         time_column).pipe(change_index, idx_col).pipe(drop_cols, 'idx_col')

#### The Pipeline of operations are carried out in the following order:
- dropping missing values
- dropping undesired columns
- changing data types of columns
- edit the format of the Time column from datetime to hour-of-the-day
- set the Year column as the index of the dataframe

In [17]:
crash_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6784 entries, 2021 to 2014
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   State                          6784 non-null   category
 1   Month                          6784 non-null   int8    
 2   Dayweek                        6784 non-null   category
 3   Time                           6784 non-null   int64   
 4   Crash Type                     6784 non-null   category
 5   Bus Involvement                6784 non-null   bool    
 6   Heavy Rigid Truck Involvement  6784 non-null   bool    
 7   Articulated Truck Involvement  6784 non-null   bool    
 8   Speed Limit                    6784 non-null   int16   
 9   Road User                      6784 non-null   category
 10  Gender                         6784 non-null   category
 11  Age                            6784 non-null   int16   
dtypes: bool(3), category(5), int16(

- The updated dataframe is seen to have significantly minimised its numbers in terms of memory usage

In [18]:
crash_data

Unnamed: 0_level_0,State,Month,Dayweek,Time,Crash Type,Bus Involvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,Speed Limit,Road User,Gender,Age
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021,Qld,9,Saturday,4,Multiple,True,True,True,100,Motorcycle rider,Male,19
2021,Qld,9,Saturday,2,Single,True,True,True,60,Driver,Male,47
2021,SA,9,Thursday,21,Single,True,True,True,80,Driver,Male,24
2021,Qld,9,Sunday,21,Multiple,True,True,True,50,Motorcycle rider,Male,52
2021,NSW,9,Tuesday,21,Single,True,True,True,60,Driver,Female,32
...,...,...,...,...,...,...,...,...,...,...,...,...
2014,SA,1,Friday,11,Multiple,True,True,True,110,Passenger,Male,40
2014,WA,1,Wednesday,11,Single,True,True,True,100,Motorcycle rider,Male,46
2014,SA,1,Monday,9,Single,True,True,True,50,Pedestrian,Female,82
2014,WA,1,Tuesday,21,Single,True,True,True,110,Driver,Male,84
