In [1]:
import numpy as np  
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
filename = 'https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv'
df = pd.read_csv(filename)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.columns

Index(['SEVERITYCODE', 'X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO',
       'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE',
       'EXCEPTRSNDESC', 'SEVERITYCODE.1', 'SEVERITYDESC', 'COLLISIONTYPE',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INCDATE',
       'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC',
       'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
       'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC',
       'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR'],
      dtype='object')

In [4]:
# Create Dataframe with relevant columns
collisions = df[['SEVERITYCODE', 'ADDRTYPE', 'COLLISIONTYPE','PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'JUNCTIONTYPE', 'INATTENTIONIND', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'UNDERINFL','SPEEDING','HITPARKEDCAR']]
collisions.shape

(194673, 15)

## Data Cleaning

In [5]:
# Removing Rows where Feature values are unknowns
collisions = collisions[collisions['JUNCTIONTYPE'] != "Unknown"]
collisions = collisions[collisions['WEATHER'] != "Unknown"]
collisions = collisions[collisions['ROADCOND'] != "Unknown"]
collisions = collisions[collisions['LIGHTCOND'] != "Unknown"]

# Speeding NaN with No! 
collisions['SPEEDING'] = collisions['SPEEDING'].fillna('N')
collisions['INATTENTIONIND'] = collisions['INATTENTIONIND'].fillna('N')

# Drop NaN
collisions.dropna()
collisions = collisions.reset_index(drop=True)

In [6]:
collisions.shape

(175763, 15)

## Features

In [7]:
# Convert Y and N to 1 and 0 (numerical values)
collisions['INATTENTIONIND'].replace('N',0, inplace=True)
collisions['INATTENTIONIND'].replace('Y',0, inplace=True)

collisions['SPEEDING'].replace('N',0, inplace=True)
collisions['SPEEDING'].replace('Y',1, inplace=True)


collisions['HITPARKEDCAR'].replace('N',0, inplace=True)
collisions['HITPARKEDCAR'].replace('Y',1, inplace=True)

collisions['UNDERINFL'].replace('N',0, inplace=True)
collisions['UNDERINFL'].replace('Y',1, inplace=True)

In [8]:
collisions = collisions[['SEVERITYCODE', 'ADDRTYPE', 'COLLISIONTYPE','WEATHER', 'LIGHTCOND','ROADCOND', 'UNDERINFL','SPEEDING','INATTENTIONIND', 'HITPARKEDCAR']]
collisions.head()

Unnamed: 0,SEVERITYCODE,ADDRTYPE,COLLISIONTYPE,WEATHER,LIGHTCOND,ROADCOND,UNDERINFL,SPEEDING,INATTENTIONIND,HITPARKEDCAR
0,2,Intersection,Angles,Overcast,Daylight,Wet,0,0,0,0
1,1,Block,Sideswipe,Raining,Dark - Street Lights On,Wet,0,0,0,0
2,1,Block,Parked Car,Overcast,Daylight,Dry,0,0,0,0
3,1,Block,Other,Clear,Daylight,Dry,0,0,0,0
4,2,Intersection,Angles,Raining,Daylight,Wet,0,0,0,0


In [9]:
collisions['ADDRTYPE'].value_counts()
collisions = collisions[pd.notnull(collisions['ADDRTYPE'])]
collisions = collisions.reset_index(drop=True)

addrtype = preprocessing.LabelEncoder()
addrtype.fit(['Block','Intersection','Alley'])
collisions['ADDRTYPE'] = addrtype.transform(collisions['ADDRTYPE'])
collisions.shape

(174926, 10)

In [10]:
collisions['COLLISIONTYPE'].value_counts()
collisions = collisions[pd.notnull(collisions['COLLISIONTYPE'])]
collisions = collisions.reset_index(drop=True)

# TODO: Remove "Other"
colltype = preprocessing.LabelEncoder()
colltype.fit(['Angles','Parked Car','Rear Ended','Rear Ended','Other','Sideswipe','Left Turn','Pedestrian','Cycles','Right Turn','Head On'])
collisions['COLLISIONTYPE'] = colltype.transform(collisions['COLLISIONTYPE'])

collisions.shape

(170134, 10)

In [11]:
collisions['WEATHER'].value_counts()

collisions = collisions[pd.notnull(collisions['WEATHER'])]
collisions = collisions.reset_index(drop=True)

# TODO: Remove "Other"
weathertype = preprocessing.LabelEncoder()
weathertype.fit(['Clear','Raining', 'Overcast', 'Snowing', 'Fog/Smog/Smoke', 'Other', 'Sleet/Hail/Freezing Rain','Blowing Sand/Dirt','Severe Crosswind','Partly Cloudy'])
collisions['WEATHER'] = weathertype.transform(collisions['WEATHER'])

collisions.shape

(169953, 10)

In [12]:
collisions['LIGHTCOND'].value_counts()

collisions = collisions[pd.notnull(collisions['LIGHTCOND'])]
collisions = collisions.reset_index(drop=True)

# TODO: Remove "Other" & Combine "Dark"
light_type = preprocessing.LabelEncoder()
light_type.fit(['Daylight','Dark - Street Lights On', 'Dusk', 'Dawn', 'Dark - No Street Lights', 'Dark - Street Lights Off', 'Other','Dark - Unknown Lighting'])
collisions['LIGHTCOND'] = light_type.transform(collisions['LIGHTCOND'])

collisions.shape

(169791, 10)

In [13]:
collisions['ROADCOND'].value_counts()

collisions = collisions[pd.notnull(collisions['ROADCOND'])]
collisions = collisions.reset_index(drop=True)

# TODO: Remove "Other" & Combine "Dark"
roadcond = preprocessing.LabelEncoder()
roadcond.fit(['Dry','Wet','Ice','Snow/Slush','Other','Standing Water','Sand/Mud/Dirt','Oil'])
collisions['ROADCOND'] = roadcond.transform(collisions['ROADCOND'])

collisions.shape

(169759, 10)

In [15]:
collisions.head()

Unnamed: 0,SEVERITYCODE,ADDRTYPE,COLLISIONTYPE,WEATHER,LIGHTCOND,ROADCOND,UNDERINFL,SPEEDING,INATTENTIONIND,HITPARKEDCAR
0,2,2,0,4,5,7,0,0,0,0
1,1,1,9,6,2,7,0,0,0,0
2,1,1,5,4,5,0,0,0,0,0
3,1,1,4,1,5,0,0,0,0,0
4,2,2,0,6,5,7,0,0,0,0


In [17]:
X = collisions[['ADDRTYPE','COLLISIONTYPE','WEATHER','LIGHTCOND','ROADCOND','UNDERINFL','SPEEDING','INATTENTIONIND','HITPARKEDCAR']].values
X[0:5]

array([[2, 0, 4, 5, 7, 0, 0, 0, 0],
       [1, 9, 6, 2, 7, '0', 0, 0, 0],
       [1, 5, 4, 5, 0, '0', 0, 0, 0],
       [1, 4, 1, 5, 0, 0, 0, 0, 0],
       [2, 0, 6, 5, 7, '0', 0, 0, 0]], dtype=object)

In [18]:
y = collisions['SEVERITYCODE']
y[0:5]

0    2
1    1
2    1
3    1
4    2
Name: SEVERITYCODE, dtype: int64

In [19]:
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.3, random_state=3)

In [21]:
print("X Train Set: ", X_trainset.shape)
print("Y Train Set: ",  y_trainset.shape)
print("X Test Set: ", X_testset.shape)
print("Y Test Set: ",  y_testset.shape)

X Train Set:  (118831, 9)
Y Train Set:  (118831,)
X Test Set:  (50928, 9)
Y Test Set:  (50928,)


## Decision Tree

In [26]:
SeverityTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
SeverityTree

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [27]:
SeverityTree.fit(X_trainset, y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [28]:
predTree = SeverityTree.predict(X_testset)

In [29]:
from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

DecisionTrees's Accuracy:  0.700930725730443


## Logistic Regression

In [30]:
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]



array([[ 1.3187838 , -1.53951887,  0.72412238,  0.604335  ,  1.61982777,
        -0.23615186, -0.23789845,  0.        , -0.17058177],
       [-0.73583542,  1.58065954,  1.6800851 , -1.47611814,  1.61982777,
        -0.23615186, -0.23789845,  0.        , -0.17058177],
       [-0.73583542,  0.19391358,  0.72412238,  0.604335  , -0.62415344,
        -0.23615186, -0.23789845,  0.        , -0.17058177],
       [-0.73583542, -0.15277291, -0.70982171,  0.604335  , -0.62415344,
        -0.23615186, -0.23789845,  0.        , -0.17058177],
       [ 1.3187838 , -1.53951887,  1.6800851 ,  0.604335  ,  1.61982777,
        -0.23615186, -0.23789845,  0.        , -0.17058177]])

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_similarity_score
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_trainset,y_trainset)
LR

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [36]:
y = LR.predict(X_testset)
y

array([1, 1, 1, ..., 2, 1, 1])

In [37]:
jaccard_similarity_score(y_testset, y)


0.6712613886270814