## Predicting Fatal Accidents - Final Model

### BalancedRandomForestClassifier

In [1]:
import json
from urllib.parse import quote

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sqlalchemy import create_engine
from sqlalchemy import text

from config import db_password

In [2]:
# Local folder holding the project's resource files.
# NOTE(review): machine-specific absolute path; no other cell visible in this
# notebook reads `file_dir`.
file_dir = 'C://Users/sd0066/Documents/GitHub/Final/Resources'

# Percent-encode the password before embedding it in the URL: characters such
# as "@", "/", ":" or "#" would otherwise be parsed as URL delimiters and the
# connection string would be malformed. Passwords without special characters
# are unchanged by the encoding.
encoded_password = quote(str(db_password), safe="")
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/fatal_accident_db"
engine = create_engine(db_string)

In [3]:
# Fetch region, month, light condition, weather and outcome for every
# accident/vehicle pair, stacking the 2020 rows on top of the 2019 rows.
# The statement is wrapped in text() and run on an explicit connection because
# Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0.
# The query is written as one multi-line literal so every clause is separated
# by whitespace (the previous line continuations glued `"outcome"` to `FROM`).
accident_query = text(
    """
    SELECT "REGIONNAME" as "region", "MONTHNAME" as "month",
           "LGT_CONDNAME" as "light_condition", "WEATHERNAME" as "weather",
           "Outcome" as "outcome"
    FROM accident2020
    LEFT JOIN vehicle2020 ON accident2020."CASENUM" = vehicle2020."CASENUM"
    UNION ALL
    SELECT "REGIONNAME" as "region", "MONTHNAME" as "month",
           "LGT_CONDNAME" as "light_condition", "WEATHERNAME" as "weather",
           "Outcome" as "outcome"
    FROM accident2019
    LEFT JOIN vehicle2019 ON accident2019."CASENUM" = vehicle2019."CASENUM"
    """
)

# The context manager returns the connection to the pool even if the query fails.
# `cursor` ends up holding the fully fetched list of rows, not a live cursor.
with engine.connect() as connection:
    cursor = connection.execute(accident_query).fetchall()

In [4]:
# Load the fetched rows into a DataFrame, labelling the columns by position.
# NOTE(review): the third label is spelled "light_condtion" (the SQL alias is
# "light_condition"); the spelling is kept because the encoding cell refers to
# the column by this exact name.
column_names = ["region", "month", "light_condtion", "weather", "outcome"]
accident_df = pd.DataFrame(data=cursor, columns=column_names)
accident_df

Unnamed: 0,region,month,light_condtion,weather,outcome
0,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Clear,0
1,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Clear,0
2,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,0
3,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,0
4,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,0
...,...,...,...,...,...
179275,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",July,Daylight,Clear,0
179276,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",July,Dark - Lighted,Clear,0
179277,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",February,Daylight,Clear,0
179278,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",February,Daylight,Clear,0


In [5]:
# Categorical feature columns that are converted to integer codes.
category_col = ["region", "month", "light_condtion", "weather"]
labelEncoder = preprocessing.LabelEncoder()

# For each feature, remember which original label maps to which integer code.
mapping_dict = {}
for feature in category_col:
    # The encoder is re-fitted per column and the column is overwritten in
    # place, so re-running this cell without rebuilding `accident_df` would
    # encode the integer codes instead of the original labels.
    accident_df[feature] = labelEncoder.fit_transform(accident_df[feature])

    known_labels = labelEncoder.classes_
    mapping_dict[feature] = dict(
        zip(known_labels, labelEncoder.transform(known_labels))
    )

print(mapping_dict)

{'region': {'Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, IA, MO, KS)': 0, 'Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)': 1, 'South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA, FL, AL, MS, LA, AR, OK, TX)': 2, 'West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, WY, AK, HI)': 3}, 'month': {'April': 0, 'August': 1, 'December': 2, 'February': 3, 'January': 4, 'July': 5, 'June': 6, 'March': 7, 'May': 8, 'November': 9, 'October': 10, 'September': 11}, 'light_condtion': {'Dark - Lighted': 0, 'Dark - Not Lighted': 1, 'Dawn': 2, 'Daylight': 3, 'Dusk': 4}, 'weather': {'Blowing Sand, Soil, Dirt': 0, 'Blowing Snow': 1, 'Clear': 2, 'Cloudy': 3, 'Fog, Smog, Smoke': 4, 'Freezing Rain or Drizzle': 5, 'Rain': 6, 'Severe Crosswinds': 7, 'Sleet or Hail': 8, 'Snow': 9}}


In [6]:
# A stray file path had been pasted onto the end of the first import line,
# making the whole cell a SyntaxError; the import is restored here.
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier

# Target: the "outcome" column. Features: every remaining column.
y = accident_df["outcome"]
X = accident_df.drop(columns="outcome")

In [7]:
from sklearn.model_selection import train_test_split

# Split features and target into train/test partitions. Stratifying on `y`
# keeps the outcome class proportions the same in both partitions, and the
# fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Show the size of the training partition.
X_train.shape

(134460, 4)

In [8]:
# Show the size (rows, columns) of the test feature partition.
X_test.shape

(44820, 4)

In [9]:
# Train a BalancedRandomForestClassifier on the training partition:
# 100 trees, with a fixed seed so repeated runs build the same forest.
forest_params = {"n_estimators": 100, "random_state": 1}
random_forest_model = BalancedRandomForestClassifier(**forest_params)
random_forest_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [10]:
# Predict outcomes for the test partition, then calculate the balanced
# accuracy score of those predictions against the true outcomes.
y_pred = random_forest_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.624153728860352

In [11]:
# Show the confusion matrix: rows are the true outcomes, columns the
# predicted outcomes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[30062, 13877],
       [  384,   497]], dtype=int64)

In [12]:
# Build the imbalanced-learn classification report for the test predictions
# and print it as plain text.
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.68      0.56      0.81      0.62      0.39     43939
          1       0.03      0.56      0.68      0.07      0.62      0.38       881

avg / total       0.97      0.68      0.57      0.79      0.62      0.39     44820



In [15]:
import pickle

# Persist the fitted model to "model.pkl" in the current working directory.
# The `with` block closes the file even if pickle.dump raises, which the
# previous manual open()/close() pair did not guarantee.
with open("model.pkl", "wb") as pickle_out:
    pickle.dump(random_forest_model, pickle_out)