In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install imblearn



In [3]:
from imblearn.under_sampling import RandomUnderSampler

In [4]:
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

In [5]:
df_ml_data = pd.read_csv("final.csv")

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC

In [9]:
import joblib
import pickle

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
class PandasDummies(BaseEstimator, TransformerMixin):
    def transform(self, X, *_):
        return pd.get_dummies(X)
    
    def fit(self, *_):
        return self

In [11]:
X = df_ml_data[['Number of Vehicles', 'Manner of Collision',
                'Road Surface Condition', 'Ambient Light',
       'Weather Condition', 'PRCP', 'SNOW', 'TMAX', 'TMIN', 'Ice']]

In [12]:
X = pd.get_dummies(X)
y = df_ml_data['Crash Severity']
print(X.shape, y.shape)

(21493, 40) (21493,)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify =y)
data = X.copy()

In [15]:
model1 = make_pipeline_imb(RandomUnderSampler(), StandardScaler(), RandomForestClassifier(n_estimators=10, max_depth=20))

model1.fit(X_train, y_train)

Pipeline(steps=[('randomundersampler', RandomUnderSampler()),
                ('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=20, n_estimators=10))])

In [16]:
y_pred = model1.predict(X_test)

In [18]:
print(f"Training Data Score with Random Forest Classifier: {model1.score(X_train, y_train)}")
print(f"Testing Data Score with Random Forest Classifier: {model1.score(X_test, y_test)}")

Training Data Score with Random Forest Classifier: 0.2770643340157578
Testing Data Score with Random Forest Classifier: 0.283029400818757


In [19]:
filename = 'acc_sev_prediction.pickle'
joblib.dump(model1, filename)

['acc_sev_prediction.pickle']

In [20]:
pipe = make_pipeline_imb(RandomUnderSampler(),
                         StandardScaler(),
                         RandomForestClassifier(n_estimators=10, max_depth=10))

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [21]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.05      0.31      0.72      0.08      0.48      0.22       229
          1       0.69      0.28      0.72      0.40      0.45      0.19      3682
          2       0.26      0.17      0.82      0.21      0.38      0.13      1448
          3       0.00      0.20      0.73      0.00      0.38      0.14        15

avg / total       0.54      0.25      0.75      0.33      0.43      0.18      5374



In [22]:
print(f"Training Data Score with Random Forest Classifier: {pipe.score(X_train, y_train)}")
print(f"Testing Data Score with Random Forest Classifier: {pipe.score(X_test, y_test)}")

Training Data Score with Random Forest Classifier: 0.2588249891432471
Testing Data Score with Random Forest Classifier: 0.25139560848529957


In [23]:
lg_pipe = make_pipeline_imb(RandomUnderSampler(),
                         StandardScaler(),
                         LogisticRegression())

lg_pipe.fit(X_train, y_train)
y_pred = lg_pipe.predict(X_test)

In [24]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.07      0.31      0.80      0.11      0.50      0.24       229
          1       0.73      0.35      0.72      0.47      0.50      0.24      3682
          2       0.31      0.32      0.74      0.31      0.49      0.23      1448
          3       0.00      0.33      0.80      0.01      0.52      0.25        15

avg / total       0.59      0.34      0.73      0.41      0.50      0.24      5374



In [25]:
print(f"Training Data Score with Logistic Classifier: {lg_pipe.score(X_train, y_train)}")
print(f"Testing Data Score with Logistic Classifier: {lg_pipe.score(X_test, y_test)}")

Training Data Score with Logistic Classifier: 0.334822259445375
Testing Data Score with Logistic Classifier: 0.3386676590993673


In [26]:
filename = 'acc_sev_prediction.pickle'
joblib.dump(pipe, filename)

['acc_sev_prediction.pickle']

In [27]:
my_model_lg = joblib.load("acc_sev_prediction.pickle")

In [28]:
pd.set_option('display.max_columns', None)

'Number of Vehicles'  1-13
'PRCP',  0-4.96
'SNOW', 0-21
'TMAX', 7-103
'TMIN', -10-77
'Ice', True/False
       'Manner of Collision_0', 
       'Manner of Collision_Angle',
       'Manner of Collision_Front to Front',
       'Manner of Collision_Front to Rear', 
       'Manner of Collision_Head-on',
       'Manner of Collision_Rear to Side', 
       'Manner of Collision_Rear-end',
       'Manner of Collision_Rear-to-rear',
       'Manner of Collision_Sideswipe,opposite direction',
       'Manner of Collision_Sideswipe, same direction',
       'Manner of Collision_Single vehicle crash', 
       'Road Surface Condition_0',
       'Road Surface Condition_Dry', 
       'Road Surface Condition_Ice',
       'Road Surface Condition_Other',
       'Road Surface Condition_Sand,mud, dirt, oil, gravel',
       'Road Surface Condition_Slush', 
       'Road Surface Condition_Snow',
       'Road Surface Condition_Water (standing, moving)',
       'Road Surface Condition_Wet', 
       'Ambient Light_0',
       'Ambient Light_Dark - 0 roadway lighting',
       'Ambient Light_Dark - lighted roadway',
       'Ambient Light_Dark - roadway not lighted',
       'Ambient Light_Dawn',
       'Ambient Light_Daylight', 
       'Ambient Light_Dusk',
       'Ambient Light_Other',
       'Weather Condition_Clear',
       'Weather Condition_Cloudy',
       'Weather Condition_Cloudy/Rain',
       'Weather Condition_Fog',
       'Weather Condition_Rain', 
       'Weather Condition_Snow'

In [31]:
my_model_lg.steps[-1][1].predict([[5, 3, 0, 35,22,False, 0, 0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0]])

array([1], dtype=int64)

Not Reported 0
Property damage only (none injured) 1
Non-fatal injury 2
Fatal injury 3
