In [1]:
import numpy as np  
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [2]:
filename = 'https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv'
df = pd.read_csv(filename)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.columns

Index(['SEVERITYCODE', 'X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO',
       'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE',
       'EXCEPTRSNDESC', 'SEVERITYCODE.1', 'SEVERITYDESC', 'COLLISIONTYPE',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INCDATE',
       'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC',
       'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
       'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC',
       'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR'],
      dtype='object')

In [4]:
# Create Dataframe with relevant columns
collisions = df[['SEVERITYCODE', 'ADDRTYPE', 'COLLISIONTYPE','PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'JUNCTIONTYPE', 'INATTENTIONIND', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'UNDERINFL','SPEEDING','HITPARKEDCAR']]
collisions.shape

(194673, 15)

## Data Cleaning

In [5]:
# Removing Rows where Feature values are unknowns
collisions = collisions[collisions['JUNCTIONTYPE'] != "Unknown"]
collisions = collisions[collisions['WEATHER'] != "Unknown"]
collisions = collisions[collisions['ROADCOND'] != "Unknown"]
collisions = collisions[collisions['LIGHTCOND'] != "Unknown"]

# Speeding NaN with No! 
collisions['SPEEDING'] = collisions['SPEEDING'].fillna('N')
collisions['INATTENTIONIND'] = collisions['INATTENTIONIND'].fillna('N')

# Drop NaN
collisions.dropna()
collisions = collisions.reset_index(drop=True)

In [6]:
collisions.shape

(175763, 15)

In [7]:
# Convert Y and N to 1 and 0 (numerical values)
collisions['INATTENTIONIND'].replace('N',0, inplace=True)
collisions['INATTENTIONIND'].replace('Y',0, inplace=True)

collisions['SPEEDING'].replace('N',0, inplace=True)
collisions['SPEEDING'].replace('Y',1, inplace=True)


collisions['HITPARKEDCAR'].replace('N',0, inplace=True)
collisions['HITPARKEDCAR'].replace('Y',1, inplace=True)

collisions['UNDERINFL'].replace('N',0, inplace=True)
collisions['UNDERINFL'].replace('Y',1, inplace=True)

In [8]:
# Subframe - with relevant Features for Model.
collisions = collisions[['SEVERITYCODE', 'ADDRTYPE', 'COLLISIONTYPE','WEATHER', 'LIGHTCOND','ROADCOND', 'UNDERINFL','SPEEDING','INATTENTIONIND', 'HITPARKEDCAR']]
collisions.head()

Unnamed: 0,SEVERITYCODE,ADDRTYPE,COLLISIONTYPE,WEATHER,LIGHTCOND,ROADCOND,UNDERINFL,SPEEDING,INATTENTIONIND,HITPARKEDCAR
0,2,Intersection,Angles,Overcast,Daylight,Wet,0,0,0,0
1,1,Block,Sideswipe,Raining,Dark - Street Lights On,Wet,0,0,0,0
2,1,Block,Parked Car,Overcast,Daylight,Dry,0,0,0,0
3,1,Block,Other,Clear,Daylight,Dry,0,0,0,0
4,2,Intersection,Angles,Raining,Daylight,Wet,0,0,0,0


In [9]:
# Categorical Variable to Numeric values.
collisions['ADDRTYPE'].value_counts()
collisions = collisions[pd.notnull(collisions['ADDRTYPE'])]
collisions = collisions.reset_index(drop=True)

addrtype = preprocessing.LabelEncoder()
addrtype.fit(['Block','Intersection','Alley'])
collisions['ADDRTYPE'] = addrtype.transform(collisions['ADDRTYPE'])

collisions.shape

(174926, 10)

In [10]:
collisions['COLLISIONTYPE'].value_counts()
collisions = collisions[pd.notnull(collisions['COLLISIONTYPE'])]
collisions = collisions.reset_index(drop=True)

# Remove "Other"
collisions = collisions[collisions.COLLISIONTYPE != 'Other']

colltype = preprocessing.LabelEncoder()
colltype.fit(['Angles','Parked Car','Rear Ended','Rear Ended','Sideswipe','Left Turn','Pedestrian','Cycles','Right Turn','Head On'])
collisions['COLLISIONTYPE'] = colltype.transform(collisions['COLLISIONTYPE'])

collisions.shape

(147595, 10)

In [11]:
collisions['WEATHER'].value_counts()

collisions = collisions[pd.notnull(collisions['WEATHER'])]
collisions = collisions.reset_index(drop=True)

# Remove "Other"
collisions = collisions[collisions.WEATHER != 'Other']

weathertype = preprocessing.LabelEncoder()
weathertype.fit(['Clear','Raining', 'Overcast', 'Snowing', 'Fog/Smog/Smoke', 'Sleet/Hail/Freezing Rain','Blowing Sand/Dirt','Severe Crosswind','Partly Cloudy'])
collisions['WEATHER'] = weathertype.transform(collisions['WEATHER'])

collisions.shape

(147253, 10)

In [12]:
collisions['LIGHTCOND'].value_counts()

collisions = collisions[pd.notnull(collisions['LIGHTCOND'])]
collisions = collisions.reset_index(drop=True)

collisions = collisions[collisions.LIGHTCOND != 'Other']
collisions['LIGHTCOND'] = collisions['LIGHTCOND'].replace('Dark - Street Lights On', 'Dark')
collisions['LIGHTCOND'] = collisions['LIGHTCOND'].replace('Dark - No Street Lights', 'Dark')
collisions['LIGHTCOND'] = collisions['LIGHTCOND'].replace('Dark - Street Lights Off','Dark')
collisions['LIGHTCOND'] = collisions['LIGHTCOND'].replace('Dark - Unknown Lighting', 'Dark')

light_type = preprocessing.LabelEncoder()
light_type.fit(['Daylight','Dark', 'Dusk', 'Dawn'])
collisions['LIGHTCOND'] = light_type.transform(collisions['LIGHTCOND'])

collisions.shape

(146967, 10)

In [13]:
collisions['ROADCOND'].value_counts()

collisions = collisions[pd.notnull(collisions['ROADCOND'])]
collisions = collisions.reset_index(drop=True)

collisions = collisions[collisions.ROADCOND != 'Other']


roadcond = preprocessing.LabelEncoder()
roadcond.fit(['Dry','Wet','Ice','Snow/Slush','Standing Water','Sand/Mud/Dirt','Oil'])
collisions['ROADCOND'] = roadcond.transform(collisions['ROADCOND'])

collisions.shape

(146880, 10)

In [14]:
collisions.head()

Unnamed: 0,SEVERITYCODE,ADDRTYPE,COLLISIONTYPE,WEATHER,LIGHTCOND,ROADCOND,UNDERINFL,SPEEDING,INATTENTIONIND,HITPARKEDCAR
0,2,2,0,3,2,6,0,0,0,0
1,1,1,8,5,0,6,0,0,0,0
2,1,1,4,3,2,0,0,0,0,0
3,2,2,0,5,2,6,0,0,0,0
4,1,2,0,1,2,0,0,0,0,0


In [15]:
X = collisions[['ADDRTYPE','COLLISIONTYPE','WEATHER','LIGHTCOND','ROADCOND','UNDERINFL','SPEEDING','INATTENTIONIND','HITPARKEDCAR']].values
X[0:5]

array([[2, 0, 3, 2, 6, 0, 0, 0, 0],
       [1, 8, 5, 0, 6, '0', 0, 0, 0],
       [1, 4, 3, 2, 0, '0', 0, 0, 0],
       [2, 0, 5, 2, 6, '0', 0, 0, 0],
       [2, 0, 1, 2, 0, 0, 0, 0, 0]], dtype=object)

In [16]:
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]



array([[ 1.23775641, -1.45144996,  0.55758625,  0.56591988,  1.65791983,
        -0.21328644, -0.1956852 ,  0.        , -0.18213444],
       [-0.79200491,  1.54405507,  1.7880567 , -1.58759157,  1.65791983,
        -0.21328644, -0.1956852 ,  0.        , -0.18213444],
       [-0.79200491,  0.04630256,  0.55758625,  0.56591988, -0.60860348,
        -0.21328644, -0.1956852 ,  0.        , -0.18213444],
       [ 1.23775641, -1.45144996,  1.7880567 ,  0.56591988,  1.65791983,
        -0.21328644, -0.1956852 ,  0.        , -0.18213444],
       [ 1.23775641, -1.45144996, -0.67288421,  0.56591988, -0.60860348,
        -0.21328644, -0.1956852 ,  0.        , -0.18213444]])

In [17]:
y = collisions['SEVERITYCODE']
y[0:5]

0    2
1    1
2    1
3    2
4    1
Name: SEVERITYCODE, dtype: int64

In [18]:
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.3, random_state=3)

In [19]:
print("X Train Set: ", X_trainset.shape)
print("Y Train Set: ",  y_trainset.shape)
print("X Test Set: ", X_testset.shape)
print("Y Test Set: ",  y_testset.shape)

X Train Set:  (102816, 9)
Y Train Set:  (102816,)
X Test Set:  (44064, 9)
Y Test Set:  (44064,)


## Decision Tree

In [20]:
SeverityTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
SeverityTree

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
SeverityTree.fit(X_trainset, y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
predTree = SeverityTree.predict(X_testset)

In [23]:
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))
print("DecisionTrees's Jaccard Index: ", jaccard_similarity_score(y_testset, predTree))
print("DecisionTrees's F1 Score: ", f1_score(y_testset, predTree,average='weighted'))

DecisionTrees's Accuracy:  0.7037037037037037
DecisionTrees's Jaccard Index:  0.7037037037037037
DecisionTrees's F1 Score:  0.6377511520156639


## Logistic Regression

In [24]:

LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_trainset,y_trainset)
LR

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
y = LR.predict(X_testset)
y

array([1, 1, 1, ..., 1, 1, 1])

In [26]:
print("Logistic Regression Accuracy: ", metrics.accuracy_score(y_testset, y))
print("Logistic Regression Jaccard Similarity: ", jaccard_similarity_score(y_testset, y))
print("Logistic Regression F1 Score: ", f1_score(y_testset, y,average='weighted'))

Logistic Regression Accuracy:  0.6633986928104575
Logistic Regression Jaccard Similarity:  0.6633986928104575
Logistic Regression F1 Score:  0.548167264302313


## Support Vector Machine (SVM)

In [27]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_trainset, y_trainset) 



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [28]:
y = clf.predict(X_testset)
y [0:5]

array([1, 1, 2, 1, 1])

In [29]:
print("SVM Accuracy: ", metrics.accuracy_score(y_testset, y))
print("SVM Jaccard Similarity: ", jaccard_similarity_score(y_testset, y))
print("SVM F1 Score: ", f1_score(y_testset, y,average='weighted'))

SVM Accuracy:  0.6665531953522149
SVM Jaccard Similarity:  0.6665531953522149
SVM F1 Score:  0.6132657857105431
