## Import the necessary libraries

In [1]:
import sys
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
# from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense
# Raise the notebook display limits so that wide frames (up to 100 columns)
# and long listings (up to 100 rows) are rendered without truncation.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

from google.colab import drive

# Mount Google Drive; the data file and the helper module are read from this mount.
drive.mount('/content/drive')

# Make the helper-module folder importable. The entry has to be rooted at the
# mount point ('/content/drive'): the former '/drive/MyDrive/...' entry pointed
# outside the mount, so it never contributed to module resolution.
# The membership test keeps sys.path free of duplicates when the cell is re-run.
MODULES_DIR = '/content/drive/MyDrive/ColabNotebooks/modules'
if MODULES_DIR not in sys.path:
    sys.path.append(MODULES_DIR)

# NOTE(review): these package-style imports resolve the top-level name `drive`
# as a directory on the import path (presumably the working directory that holds
# the mount — confirm), and the plain `import drive....` statement rebinds the
# name `drive` from the google.colab module to that package.
import drive.MyDrive.ColabNotebooks.modules.helperFunctions
from drive.MyDrive.ColabNotebooks.modules.helperFunctions import readFile
from drive.MyDrive.ColabNotebooks.modules.helperFunctions import checkMissingValues
from drive.MyDrive.ColabNotebooks.modules.helperFunctions import cleanMissingValues

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load the Vancouver Police Department CSV of reported crashes. The DataFrame consists of 42 columns and 93,546 observations

In [2]:
# Import the support function "readFile". This function takes a .csv file (or the path to it) as its argument, loads it, and returns the contents as a Pandas DataFrame.

In [3]:
# Path to the crash-report CSV, written relative to the working directory
# (assumes the notebook runs from the folder that contains the 'drive' mount — TODO confirm).
file = 'drive/MyDrive/ColabNotebooks/data/VPD_lowerMainLand.csv'
# Load the file through the project helper; the result is used as a DataFrame in every cell below.
df = readFile(file)
# Uncomment to preview the first rows:
# df.head(3)

In [4]:
# Number of records per value of the "Year" column, most frequent first.
df['Year'].value_counts()

2016    20743
2017    20019
2015    19942
2018    18922
2019    13920
Name: Year, dtype: int64

In [5]:
# Shape (rows, columns) of the entire dataset before any cleaning or pre-processing is applied to it.

df.shape

(93546, 42)

## First it is necessary to check the integrity of the dataframe. We check for missing values

In [6]:
# Call the support function "checkMissingValues" on the DataFrame. The displayed
# result lists every column name together with its percentage of missing values.

checkMissingValues(df)

Unnamed: 0,column_name,percent_missing
Region,Region,0.0
Year,Year,0.0
Animal Flag,Animal Flag,0.0
Crash Type,Crash Type,0.0
Cyclist Involved,Cyclist Involved,0.0
Distracted Involved,Distracted Involved,0.0
Impaired Involved,Impaired Involved,0.0
Month,Month,0.0
Motorcycle Involved,Motorcycle Involved,0.0
Pedestrian Involved,Pedestrian Involved,0.0


In [7]:
# Call the support function "cleanMissingValues". This function takes the DataFrame as an argument
# and returns it with a treatment applied for the missing values.
# Note: `df` is overwritten here, so the untreated frame is no longer available after this cell.

df = cleanMissingValues(df)

# Uncomment the line below to check how the data looks at this point.

# df.head(3)

In [8]:
# Shape after the missing-value treatment. Compared with the raw shape (93546, 42),
# the recorded result (92346, 42) has 1,200 fewer rows and the same 42 columns.
df.shape

(92346, 42)

## Now it is time to analyze the individual features and the range of values of each one, in order to learn which categories every categorical feature can take:

### "Month" values and categorization:

In [9]:
# Frequency of each month name in the "Month" column.
df['Month'].value_counts()
# Encoding table: month name -> integer code (calendar order, starting at 0).
# NOTE(review): presumably applied by the pre-processing helpers, which are not visible here — confirm.
# January   0
# February  1
# March     2
# April     3
# May       4
# June      5
# July      6
# August    7
# September 8
# October   9
# November  10
# December  11

December     8866
November     8467
October      8369
January      8175
February     7617
September    7571
June         7415
August       7371
July         7358
May          7173
March        7094
April        6870
Name: Month, dtype: int64

### "Weather" values and categorization:

In [22]:
# Frequency of each label in the "Weather" column.
df['Weather'].value_counts()
# Encoding table: original label -> grouped label -> integer code.
# "Unknown" is folded into "Clear" (code 0).
# NOTE(review): "Other" is also labelled as grouped into "Clear" but carries code 7
# instead of 0 — confirm whether it should be 0 or keep its own category.
# Clear           Clear         0
# Raining         Raining       1
# Cloudy          Cloudy        2
# Snow/Sleet      Snow/Sleet    3
# Unknown         Clear         0
# Fog             Fog           4
# Smog / Smoke    Smog / Smoke  5
# Strong Wind     Strong Wind   6
# Other           Clear         7
# Hail            Hail          8

Clear           51848
Raining         18098
Cloudy          17820
Snow/Sleet       2463
Unknown          1115
Fog               546
Smog / Smoke      184
Strong Wind       133
Other              92
Hail               47
Name: Weather, dtype: int64

### "Light" values and categorization:

In [11]:
# Frequency of each label in the "Light" column.
df['Light'].value_counts()
# Encoding table: original label -> grouped label -> integer code.
# The three "Dark / ..." variants share one group (code 2) and "Other" is folded into "Daylight" (code 1).
# Daylight                    Daylight  1
# Dark / Some Illumination    Dark      2
# Dark / Full Illumination    Dark      2
# Dusk                        Dusk      4
# Dark / No Illumination      Dark      2
# Dawn                        Dawn      3
# Other                       Daylight  1

Daylight                    58545
Dark / Some Illumination    14084
Dark / Full Illumination     9032
Dusk                         4420
Dark / No Illumination       3594
Dawn                         2594
Other                          77
Name: Light, dtype: int64

### "Land Use" values and categorization:

In [12]:
# Frequency of each label in the "Land Use" column.
df['Land Use'].value_counts()
# Encoding table: original label -> Urban/Rural group -> integer code.
# "Unknown" and "Other" share code 0 with "Urban Residential".
# Urban Residential              Urban  0
# Business/Shopping              Urban  1
# Agricultural/Undeveloped       Rural  2
# Industrial/Manufacturing       Urban  3
# Rural Residential              Rural  4
# Apartment Residential          Urban  5
# School/Playground              Urban  6
# Unknown                        Urban  0
# Recreational/Park/Camping      Rural  7
# Other                          Urban  0

Urban Residential            40572
Business/Shopping            22694
Agricultural/Undeveloped      8065
Industrial/Manufacturing      7618
Rural Residential             5251
Apartment Residential         3512
School/Playground             1837
Recreational/Park/Camping     1312
Unknown                       1186
Other                          299
Name: Land Use, dtype: int64

### "Speed Advisory" values and categorization:

In [13]:
# Frequency of each label in the "Speed Advisory" column.
df['Speed Advisory'].value_counts()
# Encoding table: label -> integer code. The code is the advisory speed divided by 10;
# "Not Applicable" is coded 0.
# Not Applicable        0
# Advisory - 10 Km/H    1
# Advisory - 20 Km/H    2
# Advisory - 30 Km/H    3
# Advisory - 40 Km/H    4
# Advisory - 50 Km/H    5
# Advisory - 60 Km/H    6
# Advisory - 70 Km/H    7
# Advisory - 80 Km/H    8
# Advisory - 90 Km/H    9
# Advisory - 100 Km/H   10
# Advisory - 110 Km/H   11

Not Applicable         87763
Advisory - 50 Km/H      2800
Advisory - 30 Km/H       538
Advisory - 60 Km/H       537
Advisory - 40 Km/H       208
Advisory - 70 Km/H       117
Advisory - 80 Km/H       116
Advisory - 10 Km/H       104
Advisory - 20 Km/H        89
Advisory - 90 Km/H        37
Advisory - 100 Km/H       35
Advisory - 110 Km/H        2
Name: Speed Advisory, dtype: int64

### "Traffic Control" values and categorization:

In [14]:
# Frequency of each label in the "Traffic Control" column.
df['Traffic Control'].value_counts()
# Encoding table: raw label -> integer code.
# All "Traffic Signal" variants share code 6, all "Flashing Signal" variants share 7,
# all "Lane Use Signal" variants share 8; "None", "Not Applicable", "Other" and
# "Unknown" all map to 0.
# NOTE(review): the counts recorded for this cell list the advance-flash labels as
# "Tra Signal W/Adv Flash ..." rather than "Traffic Signal W/Adv Flash ...", and the
# keys "Yield Sign " and "Officer/Flagman/School Guard " below end with a space —
# confirm that every key matches the raw column value exactly.
# "None":                               0,
# "Stop Sign":                          1,
# "Yield Sign ":                        2,
# "Officer/Flagman/School Guard ":      3,
# "Railroad Crossing Sign":             4,
# "Lane Use Turn Control Sign":         5,
# "Traffic Signal - Red":               6,
# "Traffic Signal - Yellow":            6,
# "Traffic Signal - Green":             6,
# "Traffic Signal W/Adv Flash - Red":   6,
# "Traffic Signal W/Adv Flash- Yellow": 6,
# "Traffic Signal W/Adv Flash - Green": 6,
# "Flashing Signal - Red":              7,
# "Flashing Signal - Yellow":           7,
# "Flashing Signal - Green":            7,
# "Lane Use Signal - Red":              8,
# "Lane Use Signal - Yellow":           8,
# "Lane Use Signal - Green":            8,
# "Not Applicable":                     0,
# "Other":                              0,
# "Unknown":                            0,

None                              44346
Stop Sign                          9853
Traffic Signal - Green             9445
Not Applicable                     8652
Traffic Signal - Red               7768
Unknown                            3336
Traffic Signal - Yellow            3113
Yield Sign                         1631
Lane Use Turn Control Sign         1084
Tra Signal W/Adv Flash - Green      718
Flashing Signal - Green             476
Tra Signal W/Adv Flash - Red        452
Other                               395
Officer/Flagman/School Guard        366
Tra Signal W/Adv Flash- Yellow      249
Railroad Crossing Sign              151
Flashing Signal - Yellow            135
Flashing Signal - Red                90
Lane Use Signal - Green              65
Lane Use Signal - Yellow             15
Lane Use Signal - Red                 6
Name: Traffic Control, dtype: int64

### "Traffic Flow" values and categorization:

In [15]:
# Frequency of each label in the "Traffic Flow" column.
df['Traffic Flow'].value_counts()
# Encoding table: label -> integer code ("Unknown" and "Other" share code 0).
# TwoWayTraffic    2
# OneWayTraffic    1
# Unknown          0
# Other            0

TwoWayTraffic    77817
OneWayTraffic    11035
Unknown           3116
Other              378
Name: Traffic Flow, dtype: int64

### "Road Class" values and categorization:

In [16]:
# Frequency of each label in the "Road Class" column.
df['Road Class'].value_counts()
# Encoding table: label -> integer code.
# The code follows the number of lanes (One = 1 ... Four = 4) regardless of the
# Divided / Undivided / Ramp suffix; five, six and seven lanes are all coded 5.
# NOTE(review): "Not Applicable", "Unknown" and "Other" are also coded 5, so that code
# mixes wide roads with missing information — confirm this is intended.
# "Two Lanes, Undivided"         2
# "Four Lanes, Divided"          4
# "Two Lanes, Divided"           2
# "Not Applicable"               5
# "Unknown"                      5
# "Four Lanes, Undivided"        4
# "One Lane, Undivided"          1
# "Six Lanes, Divided"           5
# "Three Lanes, Divided"         3
# "Three Lanes, Undivided"       3
# "One Lane, Divided"            1
# "Five Lanes, Divided"          5
# "Six Lanes, Undivided"         5
# "Five Lanes, Undivided"        5
# "One Lane, Ramp"               1
# "Seven Lanes, Divided"         5
# "Two Lanes, Ramp"              2
# "Other"                        5
# "Seven Lanes, Undivided"       5
# "Three Lanes, Ramp"            3
# "Four Lanes, Ramp"             4
# "Six Lanes, Ramp"              5
# "Five Lanes, Ramp"             5
# "Seven Lanes, Ramp"            5

Two Lanes, Undivided      21178
Four Lanes, Divided       13119
Two Lanes, Divided        11997
Not Applicable             8652
Unknown                    8469
Four Lanes, Undivided      8086
One Lane, Undivided        4379
Six Lanes, Divided         3434
Three Lanes, Divided       2523
Three Lanes, Undivided     2379
One Lane, Divided          1971
Five Lanes, Divided        1947
Six Lanes, Undivided       1392
Five Lanes, Undivided       888
One Lane, Ramp              577
Seven Lanes, Divided        527
Two Lanes, Ramp             276
Other                       214
Seven Lanes, Undivided      182
Three Lanes, Ramp            84
Four Lanes, Ramp             32
Six Lanes, Ramp              26
Five Lanes, Ramp              9
Seven Lanes, Ramp             5
Name: Road Class, dtype: int64

### "Road Character" values and categorization:

In [17]:
# Frequency of each label in the "Road Character" column.
df['Road Character'].value_counts()
# Encoding table: label -> integer code.
# Every "Straight - ..." label is coded 1; every curve or switchback label
# (Single, Sharp, Winding, Reverse Curve and Switchback) is coded 2, whatever the
# grade; "Other" and "Unknown" are coded 1 together with the straight roads.
# "Straight - Flat":              1
# "Straight - Some Grade":        1
# "Straight - Steep Grade":       1
# "Straight - Hillcrest":         1
# "Straight - Sag":               1
# "Single Curve - Flat":          2
# "Single Curve - Some Grade":    2
# "Single Curve - Steep Grade":   2
# "Single Curve - Hillcrest":     2
# "Single Curve - Sag":           2
# "Sharp Curve - Flat":           2
# "Sharp Curve - Some Grade":     2
# "Sharp Curve - Steep Grade":    2
# "Sharp Curve - Hillcrest":      2
# "Sharp Curve - Sag":            2
# "Switchback - Flat":            2
# "Switchback - Some Grade":      2
# "Switchback - Steep Grade":     2
# "Switchback - Hillcrest":       2
# "Switchback - Sag":             2
# "Winding Curve - Flat":         2
# "Winding Curve - Some Grade":   2
# "Winding Curve - Steep Grade":  2
# "Winding Curve - Hillcrest":    2
# "Winding Curve - Sag":          2
# "Reverse Curve - Flat":         2
# "Reverse Curve - Some Grade":   2
# "Reverse Curve - Steep Grade":  2
# "Reverse Curve - Hillcrest":    2
# "Reverse Curve - Sag":          2
# "Other":                        1
# "Unknown":                      1

Straight - Flat                56876
Straight - Some Grade          14659
Unknown                         5531
Single Curve - Some Grade       4439
Single Curve - Flat             4391
Straight - Steep Grade          1708
Sharp Curve - Some Grade         887
Sharp Curve - Flat               822
Winding Curve - Some Grade       549
Single Curve - Steep Grade       536
Straight - Hillcrest             416
Winding Curve - Flat             355
Sharp Curve - Steep Grade        323
Winding Curve - Steep Grade      155
Switchback - Some Grade          148
Other                            101
Switchback - Steep Grade          95
Single Curve - Hillcrest          85
Switchback - Flat                 75
Straight - Sag                    53
Sharp Curve - Hillcrest           34
Reverse Curve - Some Grade        24
Reverse Curve - Flat              22
Winding Curve - Hillcrest         20
Single Curve - Sag                18
Sharp Curve - Sag                 11
Winding Curve - Sag                6
R

### "Road Condition" values and categorization:

In [18]:
# Frequency of each label in the "Road Condition" column.
df['Road Condition'].value_counts()
# Encoding table: label -> integer code ("Unknown" and "Other" share code 7).
# Dry          1
# Wet          2
# Snow         3
# Unknown      7
# Ice          4
# Slush        5
# Other        7
# Muddy        6

Dry        59631
Wet        27452
Snow        1987
Ice         1463
Unknown     1031
Slush        647
Other         87
Muddy         48
Name: Road Condition, dtype: int64

### "Road Surface" values and categorization:

In [19]:
# Frequency of each label in the "Road Surface" column.
df['Road Surface'].value_counts()
# Encoding table: label -> integer code.
# Asphalt is 0, Concrete is 1, and every other surface (including "Unknown") is 2.
# Asphalt            0
# Concrete           1
# Unknown            2
# Gravel             2
# Earth              2
# Brick/Stone        2
# Wood               2
# Oiled Gravel       2
# Other              2

Asphalt         88233
Concrete         2711
Gravel            670
Unknown           478
Earth             136
Brick/Stone        49
Wood               27
Oiled Gravel       23
Other              19
Name: Road Surface, dtype: int64

### "Crash Type" values and categorization:

In [20]:
# Frequency of each label in the "Crash Type" column.
df['Crash Type'].value_counts()
# Encoding table: label -> binary code.
# Property damage only  0
# Casualty crash        1

Property damage only    50669
Casualty crash          41677
Name: Crash Type, dtype: int64

In [21]:
# After cleaning the DataFrame we lose 1,200 observations (93,546 rows before, 92,346 after).
# This is the DataFrame that is used in the pre-process functions next.

df.shape

(92346, 42)

## Next, we define a "helperFunctions" Python file (.py) and save it in the "ColabNotebooks/modules" folder. This file is where the pre-processing of the data is defined and documented.