In [1]:
# first we need to import the libraries we're going to use
# 'import x as y' will import package x but give it the alias 'y' so you don't have to type out x all the time

import matplotlib.pyplot as plt # for making nice plots
import pandas as pd # for handling and transforming the data
import datetime # for manipulating dates, timestamps etc

%matplotlib inline

In [2]:
# we'll be using this file from now on! It's been through the process of cleaning and preparing
df = pd.read_csv("../data/cleaned.csv")

In [5]:
df.head(5)

Unnamed: 0,reference_num,latitude,longitude,num_vehicles,accident_date,accident_time,road_class,road_surface,lighting,weather,casualty_class,casualty_severity,casualty_sex,casualty_age,vehicle_type
0,202609,53.891468,-1.667699,2,14-Mar-09,2330,Unclassified,Dry,Darkness: no street lighting,Fine with high winds,Driver/Rider,Slight,Male,30,Car
1,202609,53.891468,-1.667699,2,14-Mar-09,2330,Unclassified,Dry,Darkness: no street lighting,Fine with high winds,Driver/Rider,Slight,Female,20,Car
2,810209,53.933915,-1.37407,1,03-Oct-09,630,A(M),Dry,Darkness: no street lighting,Fine with high winds,Driver/Rider,Slight,Male,29,Car
3,972109,53.911349,-1.384957,4,19-Nov-09,630,A(M),Dry,Darkness: no street lighting,Fine with high winds,Driver/Rider,Slight,Male,17,Car
4,972109,53.911349,-1.384957,4,19-Nov-09,630,A(M),Dry,Darkness: no street lighting,Fine with high winds,Driver/Rider,Slight,Male,53,Goods vehicle 3.5 tonnes mgw and under


### Part 2a - removing unrelated columns

In [8]:
# we know ahead of time that there are a few columns that definitely won't be useful for classification
# at least not without extra information. One of these is the reference number. Any others?

useless_columns = ['reference_num', ???]

df = df.drop(columns=useless_columns)
df.head(5)

SyntaxError: invalid syntax (<ipython-input-8-ee5b2cc8d2ea>, line 4)

In [9]:
# we have a range of different columns left
# some are numerical, some are date/time-related, some are categorical, and some are binary

### Part 2b - exploring binary columns

In [None]:
# some of our columns are binary, in that they only ever have one of two values. 
# what are these columns?
binary_columns = [???]

In [10]:
# let's look at one of these binary columns - casualty sex. Does it matter? What can we tell from this data?

df.groupby(['casualty_severity', 'casualty_sex']).size()

# hint: men are the casualty in 58% of slight accidents but 70% of serious accidents
# this could be useful information!

casualty_severity  casualty_sex
Serious            Female           655
                   Male            1537
Slight             Female          7038
                   Male            9653
dtype: int64

In [None]:
# as our columns are binary we can encode them using 0 and 1 instead of string labels - much easier to handle later!

# write a function that will return 1 if our value matches the specified 'positive' value, and 0 otherwise
# e.g. encode_binary('Male', 'Female') = 0, encode_binary('Serious', 'Serious') = 1
def encode_binary(positive_value, our_value):
    ???


In [None]:
# we can now apply this function to get new binary columns
df['is_male'] = [encode_binary('Male', s) for s in df.casualty_sex]
df['is_serious'] = [encode_binary('Serious', s) for s in df.casualty_severity]

# we can drop the old columns, now we have the binary versions
df = df.drop(columns = binary_columns)

In [None]:
df.head(5)

### Part 2c - exploring numerical columns

In [None]:
# numerical columns - are they useful? Do we want to do anything with them?

# these are the numerical columns in our dataset
# choose one and have a look at the distribution of values
# extra points: break these down by slight/serious
numericalColumns = ['num_vehicles', 'casualty_age']

???

# hint: plt.hist(values) will give a histogram of a list of values
# hint: df.groupby(['columnsToGroupBy'])['otherColumnsToMeasureAgainst'].agg(['count', 'sum', etc])

In [11]:
# in this case, these columns are fine as they are and we don't want to normalise or modify them, so let's carry on!

### Part 2d - exploring datetime columns

In [13]:
# in their current format, fields like accident_date aren't very useful - we want to extract useful columns

monthMap = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 
            'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

# write a function to take a date string of format DD-Mon-YY and extract the month as an integer
def getMonth(s):
    ???

# write a function to take a date string of format DD-Mon-YY and extract the full year as an integer
def getYear(s):
    ???

# use these functions to add new columns to our dataframe
df['month'] = ???
df['year'] = ???

df.head(5)

SyntaxError: invalid syntax (<ipython-input-13-5032d4e8769c>, line 15)

In [14]:
# but does the year matter? let's have a look at the trends by year

# number of rows per year: once for everything, once for the slight rows only
allByYear = df.groupby('year')['year'].count()
slightByYear = df[df.is_serious == 0].groupby('year')['year'].count()

# draw both lines on the same axes and label them, so we can tell which is which
ax = allByYear.plot(label='all')
slightByYear.plot(ax=ax, label='slight only')
ax.set_ylabel('number of rows')
ax.legend()

# answer: year is probably not a good indicator of slight/serious, same for month, so let's drop these

df = df.drop(columns=['year', 'month'])

KeyError: 'year'

In [None]:
df.head(5)

In [None]:
# what about weekday? we can get this information!

day_map = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat', 7: 'Sun'}

def get_weekday(s):
    month = ???
    year = ???
    date = ???
    dt = datetime.datetime(year, month, date)
    return day_map[dt.isoweekday()]

df['weekday'] = [get_weekday(s) for s in df['accident_date']]

In [18]:
# let's look at accident casualties by day of the week!

day_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

grouped_by_day = df.groupby(???)[???].agg([???])

grouped_by_day.loc[day_order].plot()

# this looks interesting, so let's keep weekday as a feature for now

KeyError: 'weekday'

In [None]:
# that's enough datetime business - let's tidy up, drop the original columns and carry on
df = df.drop(columns = ['accident_date', 'accident_time'])

### Part 2e - exploring categorical columns

In [None]:
# which of our columns are categorical? I've started off with 'road_class'
categorical_columns = ['road_class', ???]

# let's have a look at the values we have for these categories
for cat_col in categorical_columns:
    countByRoadClass = df.groupby(cat_col)[cat_col].agg(['count'])
    countByRoadClass.plot.bar()

In [None]:
# let's look at road surface - the distribution is quite imbalanced
# we're going to try reducing this to a binary column, where it's either dry or not

df['is_dry'] = [??? for s in df.road_surface]

df = df.drop(columns = ['road_surface'])
df.head(10)

In [None]:
# we want to use one-hot encoding to translate the other categorical columns from text labels into separate binary columns
# we can build this up in a modular way

# first let's have a function with two arguments, that returns 1 if they're the same and 0 if they're not
# hint: where have we encountered this before?
def one_hot_encode_value(value, target):
    ???
    
# this function takes a category and option and turns it into a new column name
# the option is lower-cased and any spaces in it are swapped for underscores
# e.g. in the road class category, the label 'A' becomes a new column 'road_class_is_a'
def make_column_name(category_name, field_name):
    """Return '<category_name>_is_<field_name>' with field_name lower-cased and spaces turned into underscores."""
    option_slug = '_'.join(field_name.lower().split(' '))
    return '_is_'.join((category_name, option_slug))

# this function applies one-hot-encoding to a whole column
# it's a bit involved but have a read through and see if you can see what it's doing
def one_hot_encode_column(df, category_name, category_options):
    """Add one new column to df for every entry in category_options.

    Each new column is named with make_column_name(category_name, option) and is
    filled by calling one_hot_encode_value(value, option) on every value in
    df[category_name]. The columns are assigned straight onto the df that was
    passed in, and that same df is returned.
    """
    for target in category_options:
        # work out the name first, then build the values row by row
        encoded_name = make_column_name(category_name, target)
        encoded_values = []
        for value in df[category_name]:
            encoded_values.append(one_hot_encode_value(value, target))
        df[encoded_name] = encoded_values
    return df

In [None]:
# now we want to one-hot encode lots of things! Let's start with vehicle type...
# but before we do one-hot encoding, let's reduce the number of options here

# this function will take a vehicle type and reduce it to one of four options (instead of lots)
# 'Car' and 'Pedal cycle' are kept as they are, the long bus/coach label is shortened,
# and anything else is lumped together as 'Other'
def encode_vehicle_as_string(s):
    """Map a vehicle type label to 'Car', 'Pedal cycle', 'Bus or coach' or 'Other'."""
    if s in ('Car', 'Pedal cycle'):
        return s
    if s == 'Bus or coach (17 or more passenger seats)':
        return 'Bus or coach'
    return 'Other'
    
df['vehicle_type_reduced'] = [encode_vehicle_as_string(s) for s in df.vehicle_type]

# use the one-hot encoding to return a new dataframe with vehicle_type_reduced encoded
df = one_hot_encode_column(???, ???, ???)

df = df.drop(columns = ['vehicle_type', 'vehicle_type_reduced'])
df.head(10)

In [None]:
# let's one hot encode lighting
# we sort the options so the new columns come out in the same order on every run
# (looping over a set of strings directly can give a different order each time the kernel is restarted)

df = one_hot_encode_column(df, 'lighting', sorted(set(df.lighting.values)))
df = df.drop(columns = ['lighting'])
df.head(10)

In [None]:
# and casualty class and weekday from earlier!

df = ???
df = ???

df = df.drop(columns=['casualty_class', 'weekday'])
df.head(10)

In [20]:
# finally, let's look at weather a bit more carefully
# there are quite a lot of options - maybe we could simplify this a bit?

# let's try to group by weather and severity
severity_by_weather = ???

# what are all the possible options that weather can take?
weather_options = ???

x = []
y = []

# here we're going to iterate through the options and use our grouped data to count serious vs slight
# we'll measure the proportion between the two and plot it!
for weather_type in weather_options:
    subDf = severity_by_weather.loc[weather_type]
    slight_count = subDf.iloc[0]['count']
    serious_count = subDf.iloc[1]['count']
    
    x.append(???)
    y.append(???)

SyntaxError: invalid syntax (<ipython-input-20-6fb0a1a0889f>, line 5)

In [21]:
# let's plot this
# x and y are the two lists filled in by the loop in the previous cell
# (presumably the weather labels and the serious/slight proportion for each - confirm once the blanks are filled in)
plt.bar(x,y)
# turn the tick labels on their side
plt.xticks(rotation=90)
plt.ylabel('Serious/slight')

NameError: name 'x' is not defined

In [None]:
# quite a lot of variation, not really tied to fine/raining or high winds
# so let's one-hot encode the lot and move on to the exciting stuff.
# we sort the options so the new columns come out in the same order on every run
# (looping over a set of strings directly can give a different order each time the kernel is restarted)

df = one_hot_encode_column(df, 'weather', sorted(set(df.weather.values)))
df = df.drop(columns=['weather'])
df.head(10)