In [3]:
# first we need to import the libraries we're going to use
# 'import x as y' will import package x but give it the alias 'y' so you don't have to type out x all the time

import matplotlib.pyplot as plt # for making nice plots
import sklearn as skl # for the machine learning bit!
import pandas as pd # for handling and transforming the data
import numpy as np # for useful numerical operations
import graphviz # for visualizing our decision tree
import lime # for visualizing tree model predictions

## Part 0 - loading the data

In [30]:
# load the raw (uncleaned) dataset from the data folder, decoding the file as latin1
df = pd.read_csv(
    "../data/all.csv",
    encoding="latin1",
)

In [31]:
# peek at the first five rows (head() returns 5 rows by default)
df.head()

Unnamed: 0,Reference Number,Easting,Northing,Number of Vehicles,Accident Date,Time (24hr),1st Road Class,Road Surface,Lighting Conditions,Weather Conditions,Casualty Class,Casualty Severity,Sex of Casualty,Age of Casualty,Type of Vehicle
0,2609,434723,435534,1,02-Jan-09,2335,Unclassified,Dry,Darkness: street lights present and lit,Fine without high winds,Driver,Serious,Female,23,Car
1,2809,441173,433047,1,02-Jan-09,1645,Unclassified,Dry,Darkness: street lights present and lit,Fine without high winds,Pedestrian,Slight,Female,12,Car
2,2909,443402,438684,2,04-Jan-09,1605,A(M),Dry,Darkness: street lights present and lit,Fine without high winds,Driver,Slight,Male,83,Car
3,2909,443402,438684,2,04-Jan-09,1605,A(M),Dry,Darkness: street lights present and lit,Fine without high winds,Passenger,Slight,Female,65,Car
4,3109,427890,430474,1,05-Jan-09,705,A,Wet / Damp,Darkness: street lights present and lit,Fine without high winds,Driver,Slight,Male,42,Car


## Part 1 - cleaning the data (5-10 mins)

In [25]:
# people load the data and try to find three things that might cause problems for us
# valid answers: null values, mix of numerical and text category labels, data in wrong columns, not normalized

In [47]:
# categorical sanity check

# text-valued columns whose labels we can eyeball for unexpected entries
columnsToCheck = [
    '1st Road Class',
    'Road Surface',
    'Lighting Conditions',
    'Weather Conditions',
    'Casualty Class',
    'Casualty Severity',
    'Sex of Casualty',
    'Type of Vehicle',
]

# for each column, print the distinct values it holds followed by a separator line,
# so that anything we did not expect to see stands out
for column in columnsToCheck:
    valuesPresent = set(df[column].values)
    print("{} - {}".format(column, valuesPresent), "-------", sep="\n")

1st Road Class - {'A(M)', 'A', 'Unclassified', 'B', 'C', 'Motorway'}
-------
Road Surface - {nan, 'Flood (surface water over 3cm deep)', 'Snow', 'Wet / Damp', 'Frost/ Ice', '5', 'Flood', 'Frost / Ice', 'Dry'}
-------
Lighting Conditions - {'Darkness: street lights present and lit', 'Darkness: street lighting unknown', 'Daylight: street lights present', '5', 'Darkness: no street lighting', 'Darkness: street lights present but unlit'}
-------
Weather Conditions - {'Unknown', 'Snowing without high winds', 'Fine with high winds', 'Darkness: street lighting unknown', 'Snowing with high winds', 'Fog or mist \x96 if hazard', 'Raining with high winds', 'Raining without high winds', 'Fine without high winds', 'Other'}
-------
Casualty Class - {'Passenger', 'Driver', 'Pedestrian', 'Driver/Rider'}
-------
Casualty Severity - {'Slight', 'Fatal', 'Serious'}
-------
Sex of Casualty - {'Female', 'Male'}
-------
Type of Vehicle - {'Goods vehicle 7.5 tonnes mgw and over', 'Car', 'Motorcycle - Unknown C

In [50]:
# numerical columns - same idea as the categorical check above

# names of the numeric columns to validate (left empty for now)
numericalColumnsToCheck = list()

# questions to answer: is every value a number? and is it a sensible one?

## Part 1b - loading the dataset we've already prepared

In [9]:
# from here on we work with this file - it has already been cleaned and prepared for us
# (note: this rebinds `df`, so the raw frame loaded earlier is no longer reachable by that name)
df = pd.read_csv(
    "../data/cleaned-with-latlng.csv",
    encoding="latin1",
)

## Part 2 - exploratory analysis (10 mins)

In [16]:
# have a look at distribution of slight/serious end cases

In [26]:
# want to get a feel for distributions of other features and possible relationships

In [23]:
# what are we going to learn from this section?

## Part 3 - feature engineering (15 mins)

In [11]:
# not all of the existing features in the dataset are going to be useful
# e.g. without map to tie to, lat and long not useful so can exclude

In [13]:
# creating better features from existing ones (e.g. weekend vs weekday, day bins)

# some features are useful but not in their current state

In [15]:
# encoding categorical data

# some features are categorical and need to be encoded as numbers - we'll use one-hot encoding!

In [9]:
# normalization not necessary for decision trees but good to know

# in some methods you'll want to standardize numerical features
# an example would be the age feature in our model

# find mean and re-do as difference from mean, and visualise here

## Part 4 - training the model

In [16]:
# want to split into appropriate set sizes, train, cv, test etc
# check target class is well-represented in train/test split
# training the model

## Part 5 - visualising the model

In [27]:
# use graphviz and lime to visualise the model and show paths through the tree for new data?

## Part 6 - evaluating the model

In [28]:
# binary classification metrics

# want good examples to demonstrate pros/cons of different metrics

In [29]:
# testing on test data

## Part 7 - enhancing the model

In [30]:
# random forest and ensemble methods

# extending to random forest and explaining significance of new parameters
# general explanation on advantages of ensemble methods

In [31]:
# re evaluation on new model