# NHTSA Data Discovery Part 2

In [1]:
import numpy as np
import csv
import matplotlib.pyplot as plt
from scipy.stats import mode

from io import StringIO
import requests
import json
import pandas as pd

import random
from sklearn.ensemble import RandomForestClassifier # This is testing, will review classifiers and modeling options soon

In [2]:
df_accidents = pd.read_csv("Data/2015/accident.csv")
df_vehicles = pd.read_csv("Data/2015/vehicle.csv")

In [3]:
toRemove = ("COUNTY", "CITY", "TWAY_ID", "TWAY_ID2", "VE_TOTAL", 
            "VE_FORMS", "YEAR", "NHS", "MILEPT", "LATITUDE", "LONGITUD", 
            "RELJCT1", "RELJCT2", "TYP_INT", "REL_ROAD", "PEDS", 
            "DAY", "MONTH", "DAY_WEEK", "HOUR", "MINUTE", "RUR_URB",
            "HOSP_HR", "HOSP_MN", "NOT_HOUR", "NOT_MIN", "ARR_HOUR",
            "ARR_MIN", "SP_JUR", "LGT_COND", "ROUTE", "RAIL", "WEATHER",
            "WEATHER1", "WEATHER2", "WRK_ZONE", "HARM_EV", "FUNC_SYS",
            "RD_OWNER", "CF1", "CF2", "CF3", "SCH_BUS")
df_accidents_copy = df_accidents

for item in toRemove:
    if item in df_accidents_copy:
        del df_accidents_copy[item]

#How many unique values?
for column in df_accidents_copy:
    print(column, len(df_accidents_copy[column].unique()))

STATE 51
ST_CASE 32538
PVH_INVL 10
PERNOTMVIT 12
PERMVIT 37
PERSONS 36
MAN_COLL 11
FATALS 8
DRUNK_DR 4


In [4]:
vehiclesToKeep = ["ST_CASE", "MAKE", "MODEL", "MOD_YEAR"]
df_vehicles_copy = df_vehicles[vehiclesToKeep]


#How many unique values?
for column in df_vehicles_copy:
    print(column, len(df_vehicles_copy[column].unique()))

ST_CASE 32166
MAKE 68
MODEL 130
MOD_YEAR 77


In [5]:
#Find blank or whitespace
df_vehicles = df_vehicles.replace(r'\s+\t+', np.nan, regex=True).replace('', np.nan)

#Find blank or whitespace
df_accidents = df_accidents.replace(r'\s+\t+', np.nan, regex=True).replace('', np.nan)

In [6]:
df_merge = pd.merge(df_accidents_copy, df_vehicles_copy, on="ST_CASE")

In [7]:
len(df_merge)

48864

In [8]:
df_merge['MULTI_FATAL'] = np.where(df_merge['FATALS']>1, True, False)

In [9]:
df_merge.astype({'STATE':'str', 'ST_CASE':'str', 'MAKE':'str', 'MODEL':'str', 'MOD_YEAR':'str'}).dtypes

STATE          object
ST_CASE        object
PVH_INVL        int64
PERNOTMVIT      int64
PERMVIT         int64
PERSONS         int64
MAN_COLL        int64
FATALS          int64
DRUNK_DR        int64
MAKE           object
MODEL          object
MOD_YEAR       object
MULTI_FATAL      bool
dtype: object

In [10]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

X=df_merge[['MAKE', 'MODEL', 'MOD_YEAR']]  # Features
y=df_merge['MULTI_FATAL']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True) # 85% training and 15% test

#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=1000)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.9028649386084584
[[6613   44]
 [ 668    5]]


In [11]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

X=df_merge[['MAKE', 'MODEL', 'MOD_YEAR']]  # Features
y=df_merge['MULTI_FATAL']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=1000)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9038199181446112


In [12]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

X=df_merge[['MAKE', 'MODEL', 'MOD_YEAR']]  # Features
y=df_merge['FATALS']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=1000)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9044338335607094


In [17]:
# Attempt to fix this issue where so much of our data is in fact a 1 fatality crash
print(df_merge.groupby(['MAKE', 'MODEL','MULTI_FATAL']).size())

MAKE  MODEL  MULTI_FATAL
1     7      False             1
      8      False             1
      398    False             1
2     1      False            34
             True              2
      402    False             9
             True              1
      403    False           289
             True             34
      404    False           501
             True             47
      405    False           160
             True             14
      406    False            18
      407    False            47
             True             10
      421    False             1
      422    False            36
             True              5
      431    False             1
      481    True              1
      482    False             4
             True              1
      999    False             5
3     402    False            11
      421    False             3
      431    False            18
      481    False             1
      884    False             1
6     10     False

In [14]:
# Attempt to fix this issue where so much of our data is in fact a 1 fatality crash
print(df_merge['FATALS'].value_counts())

1     44519
2      3533
3       564
4       150
5        72
6        24
10        1
8         1
Name: FATALS, dtype: int64


In [16]:
pd.options.display.max_rows = 5000

In [None]:
# https://stackoverflow.com/questions/20250771/remap-values-in-pandas-column-with-a-dict

In [56]:
df_merge_1fatal = df_merge.loc[df_merge['MULTI_FATAL'] == False]
print(len(df_merge_1fatal))
df_merge_Nfatal = df_merge.loc[df_merge['MULTI_FATAL'] == True]
print(len(df_merge_Nfatal))

df_merge_1fatal_sample = df_merge_1fatal.sample(n=2*len(df_merge_Nfatal), replace=False)
print(len(df_merge_1fatal_sample))

44519
4345
8690


In [57]:
df_merge_1fatal_sample_rebrand = df_merge_1fatal_sample.copy()
df_merge_1fatal_sample_rebrand['MULTI_FATAL'] = False

df_merge_Nfatal_rebrand = df_merge_Nfatal.copy()
df_merge_Nfatal_rebrand['MULTI_FATAL'] = True

frames = [df_merge_Nfatal_rebrand, df_merge_1fatal_sample_rebrand]

df_merge_concat = pd.concat(frames).sample(frac=1)
print(df_merge_concat.head())
print(len(df_merge_concat))

       STATE  ST_CASE  PVH_INVL  PERNOTMVIT  PERMVIT  PERSONS  MAN_COLL  \
22776     26   260218         0           0        9        9         6   
16576     17   170659         0           0        3        3         1   
4846       6    61098         0           0        1        1         0   
33429     39   390545         0           0        5        5         1   
9760      12   120500         0           0        2        2         0   

       FATALS  DRUNK_DR  MAKE  MODEL  MOD_YEAR  MULTI_FATAL  
22776       1         0    20      2      2013        False  
16576       1         0    20    481      2014        False  
4846        1         1    48     45      2005        False  
33429       1         0    20     24      2011        False  
9760        1         1    49    401      2001        False  
13035


In [61]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

X=df_merge_concat[['MAKE', 'MODEL', 'MOD_YEAR']]  # Features
y=df_merge_concat['FATALS']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) # 70% training and 30% test

#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=500)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6060606060606061


In [None]:
# Try and use an SVM for this now, will take more time