In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# Switch matplotlib into interactive mode, once through the top-level
# package and once through pyplot.
matplotlib.interactive(True)
plt.ion()
# Queries the interactive flag; the value is not displayed because this is
# not the last expression of the cell.
matplotlib.is_interactive()

import sklearn
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw crash records; the path is relative to the notebook's working directory.
crash_data = pd.read_csv('Australian_Crash_Data.csv')

In [3]:
def examine_df(df: pd.DataFrame):
    """Give a quick structural overview of a frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    tuple
        ``(df.shape, df.columns)`` -- the (rows, columns) size and the column index.
    """
    return df.shape, df.columns

In [4]:
# Show the shape and column names of the freshly loaded frame.
examine_df(crash_data)

((52843, 23),
 Index(['Crash ID', 'State', 'Month', 'Year', 'Dayweek', 'Time', 'Crash Type',
        'Bus Involvement', 'Heavy Rigid Truck Involvement',
        'Articulated Truck Involvement', 'Speed Limit', 'Road User', 'Gender',
        'Age', 'National Remoteness Areas', 'SA4 Name 2016',
        'National LGA Name 2017', 'National Road Type', 'Christmas Period',
        'Easter Period', 'Age Group', 'Day of week', 'Time of day'],
       dtype='object'))

In [5]:
# Display the raw frame; the rendered output is truncated to the first and last rows.
crash_data

Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Crash Type,Bus Involvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,...,Age,National Remoteness Areas,SA4 Name 2016,National LGA Name 2017,National Road Type,Christmas Period,Easter Period,Age Group,Day of week,Time of day
0,20212133,Vic,9,2021,Sunday,0:30,Single,,,,...,38,Inner Regional Australia,Melbourne - Outer East,Yarra Ranges (S),Arterial Road,No,No,26_to_39,Weekend,Night
1,20214022,SA,9,2021,Saturday,23:31,Multiple,No,No,No,...,28,Major Cities of Australia,Adelaide - North,Playford (C),,No,No,26_to_39,Weekend,Night
2,20212096,Vic,9,2021,Saturday,23:00,Single,,,,...,19,Inner Regional Australia,Hume,Wangaratta (RC),Access road,No,No,17_to_25,Weekend,Night
3,20212145,Vic,9,2021,Saturday,22:25,Single,,,,...,23,Outer Regional Australia,Hume,Wangaratta (RC),Arterial Road,No,No,17_to_25,Weekend,Night
4,20212075,Vic,9,2021,Saturday,5:15,Single,,,,...,46,Major Cities of Australia,Melbourne - South East,Casey (C),Local Road,No,No,40_to_64,Weekend,Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52838,19891246,NSW,1,1989,Wednesday,17:05,Single,Yes,,No,...,5,,,,,No,No,0_to_16,Weekday,Day
52839,19895088,WA,1,1989,Monday,6:00,Single,No,,No,...,19,,,,,Yes,No,17_to_25,Weekday,Day
52840,19895088,WA,1,1989,Monday,6:00,Single,No,,No,...,17,,,,,Yes,No,17_to_25,Weekday,Day
52841,19895088,WA,1,1989,Monday,6:00,Single,No,,No,...,17,,,,,Yes,No,17_to_25,Weekday,Day


In [6]:
def drop_nulls(df:pd.DataFrame):
    """Return ``df`` without the rows that contain at least one missing value.

    The input frame is not modified: a new frame is returned, so the function
    is safe to use in a ``.pipe`` chain and a failure later in the chain does
    not leave the caller's data half-cleaned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean.

    Returns
    -------
    pd.DataFrame
        New frame holding only the rows of ``df`` that have no missing values.
    """
    # No inplace=True here: mutating the argument would silently alter the
    # frame the caller still holds a reference to.
    return df.dropna(axis=0)

In [7]:
# Column labels handed to drop_cols in the cleaning pipeline.
dropped_cols = [
    'Crash ID',
    'SA4 Name 2016',
    'National LGA Name 2017',
    'National Road Type',
    'Christmas Period',
    'Easter Period',
    'Day of week',
    'Time of day',
    'Age Group',
    'National Remoteness Areas',
]

In [8]:
def drop_cols(df:pd.DataFrame, columns:list):
    """Return ``df`` without the given columns.

    The input frame is not modified; a new frame is returned so the function
    composes cleanly in a ``.pipe`` chain.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to trim.
    columns : list
        Labels of the columns to remove.

    Returns
    -------
    pd.DataFrame
        New frame with every label in ``columns`` removed.

    Raises
    ------
    KeyError
        If any label in ``columns`` is not a column of ``df``.
    """
    # No inplace=True here: the caller's frame keeps all of its columns.
    return df.drop(columns=columns)

In [9]:
def edit_speedlimit(df:pd.DataFrame):
    """Return ``df`` without the rows whose 'Speed Limit' is a non-numeric label.

    Rows where 'Speed Limit' equals ``'<40'`` or ``'Unspecified'`` are removed.
    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a 'Speed Limit' column.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` whose 'Speed Limit' is neither ``'<40'`` nor
        ``'Unspecified'``.

    Raises
    ------
    KeyError
        If ``df`` has no 'Speed Limit' column.
    """
    excluded_labels = ['<40', 'Unspecified']
    # Filter with a boolean mask rather than dropping by index label: a label
    # based drop would also remove valid rows that happen to share an index
    # label with a filtered row when the index is not unique.
    keep = ~df['Speed Limit'].isin(excluded_labels)
    return df.loc[keep]

In [10]:
# Target dtype for every column kept by the cleaning pipeline; handed to
# tweak_dtypes, which forwards it to DataFrame.astype.
# NOTE(review): the 'Time' values in the data are time-of-day strings such as
# '0:30', so the date part of the resulting datetime64 values does not come
# from the data -- confirm only the time component is used downstream.
dtype_dict = {
    'State': 'category',
    'Month': 'int8',
    'Year': 'int32',
    'Dayweek': 'category',
    'Time': 'datetime64[ns]',
    'Crash Type': 'category',
    'Bus Involvement': 'category',
    'Heavy Rigid Truck Involvement': 'category',
    'Articulated Truck Involvement': 'category',
    'Speed Limit': 'int16',
    'Road User': 'category',
    'Gender': 'category',
    'Age': 'int16',
}


In [11]:
def tweak_dtypes(df, dtype_dictionary):
    """Cast columns of ``df`` to the dtypes named in ``dtype_dictionary``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns should be converted.
    dtype_dictionary : dict
        Mapping of column label to target dtype, forwarded to ``DataFrame.astype``.

    Returns
    -------
    pd.DataFrame
        The converted frame produced by ``astype``.
    """
    return df.astype(dtype_dictionary)

In [12]:
 # Cleaning pipeline, one stage per line: remove rows with missing values,
 # remove the unused columns, remove the non-numeric speed limits, then cast
 # the remaining columns to their target dtypes.
 crash_data = (
     crash_data
     .pipe(drop_nulls)
     .pipe(drop_cols, dropped_cols)
     .pipe(edit_speedlimit)
     .pipe(tweak_dtypes, dtype_dict)
 )

In [13]:
# Check the column dtypes of the cleaned frame after the pipeline has run.
crash_data.dtypes

State                                  category
Month                                      int8
Year                                      int32
Dayweek                                category
Time                             datetime64[ns]
Crash Type                             category
Bus Involvement                        category
Heavy Rigid Truck Involvement          category
Articulated Truck Involvement          category
Speed Limit                               int16
Road User                              category
Gender                                 category
Age                                       int16
dtype: object