## Predicting Fatal Accidents - Final Model

### BalancedRandomForestClassifier

In [10]:
import json
from urllib.parse import quote

import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn import preprocessing
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from sqlalchemy import text

from config import db_password

In [2]:
# Directory holding the project's resource files. No cell visible in this
# notebook reads it; it is kept in case other code relies on the name.
# NOTE(review): absolute, machine-specific path - consider a relative path.
file_dir = 'C://Users/sd0066/Documents/GitHub/Final/Resources'

# Percent-encode the password before embedding it in the connection URL.
# Characters such as "@", ":" or "/" would otherwise be read as URL
# delimiters and the connection would fail or be misparsed. A password made
# only of unreserved characters is left unchanged by quote().
encoded_password = quote(db_password, safe="")
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/fatal_accident_db"
engine = create_engine(db_string)

In [3]:
# Fetch the modelling data from the database.
#
# Each year's accident table is LEFT JOINed to its vehicle table on CASENUM,
# so an accident yields one row per matching vehicle, or a single row with
# NULL vehicle fields when there is no match. The 2020 and 2019 results are
# then stacked with UNION ALL.
#
# The query is written as one triple-quoted string so that every clause is
# separated by real whitespace (the previous backslash continuations glued
# `"outcome"` directly onto `FROM`), and wrapped in text() because executing
# a raw string is not supported by SQLAlchemy 2.0.
accident_query = text(
    """
    SELECT "REGIONNAME" AS "region",
           "MONTHNAME" AS "month",
           "LGT_CONDNAME" AS "light_condition",
           "WEATHERNAME" AS "weather",
           "MAK_MODNAME" AS "makmod",
           "MOD_YEAR" AS "year",
           "Outcome" AS "outcome"
    FROM accident2020
    LEFT JOIN vehicle2020 ON accident2020."CASENUM" = vehicle2020."CASENUM"
    UNION ALL
    SELECT "REGIONNAME" AS "region",
           "MONTHNAME" AS "month",
           "LGT_CONDNAME" AS "light_condition",
           "WEATHERNAME" AS "weather",
           "MAK_MODNAME" AS "makmod",
           "MOD_YEAR" AS "year",
           "Outcome" AS "outcome"
    FROM accident2019
    LEFT JOIN vehicle2019 ON accident2019."CASENUM" = vehicle2019."CASENUM"
    """
)

# Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0; an
# explicit connection used as a context manager works on both and is
# released as soon as the rows have been fetched.
# Despite its name, `cursor` holds the fully fetched list of rows; the name
# is kept because the next cell builds the DataFrame from it.
with engine.connect() as connection:
    cursor = connection.execute(accident_query).fetchall()

In [4]:
# Column labels for the fetched rows, in the same order as the query's SELECT
# list. NOTE: "light_condtion" is misspelled, but the dummy-encoding cell
# refers to the column by this exact name, so it must stay as is here.
accident_columns = [
    "region",
    "month",
    "light_condtion",
    "weather",
    "make_and_model",
    "year",
    "outcome",
]
accident_df = pd.DataFrame(data=cursor, columns=accident_columns)
accident_df

Unnamed: 0,region,month,light_condtion,weather,make_and_model,year,outcome
0,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Clear,Nissan/Datsun Versa,2018.0,0
1,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,GMC Medium/Heavy - CBE,2016.0,0
2,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,Dodge Caravan/Grand Caravan,2016.0,0
3,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,Ford Mustang/Mustang II,2005.0,0
4,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Dawn,Clear,Nissan/Datsun 810/Maxima,2010.0,0
...,...,...,...,...,...,...,...
174139,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",April,Daylight,Clear,,,0
174140,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",August,Daylight,Clear,,,0
174141,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",January,Dusk,Cloudy,,,0
174142,"Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)",June,Daylight,Clear,,,0


In [5]:
# One-hot encode every feature column; only "outcome" (the target) is left
# untouched. "year" holds numeric model years but is encoded as a category
# as well, producing columns such as "year_2018.0".
categorical_columns = [
    "region",
    "month",
    "light_condtion",
    "weather",
    "make_and_model",
    "year",
]
dummy_df = pd.get_dummies(data=accident_df, columns=categorical_columns)
dummy_df.head()

Unnamed: 0,outcome,"region_Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, IA, MO, KS)","region_Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)","region_South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA, FL, AL, MS, LA, AR, OK, TX)","region_West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, WY, AK, HI)",month_April,month_August,month_December,month_February,month_January,...,year_2012.0,year_2013.0,year_2014.0,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0
0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Split the encoded frame into the feature matrix (X) and the target vector (y).
target_column = "outcome"
X = dummy_df.drop(columns=[target_column])
y = dummy_df[target_column]

In [7]:
# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's default split applies. `stratify=y` keeps the proportion of
# each outcome class the same in the train and test sets, which matters here
# because the positive class is rare. `random_state` makes the split
# repeatable.
# (train_test_split is already imported in the notebook's import cell, so it
# is not re-imported here.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)
X_train.shape

(130608, 1161)

In [8]:
# Shape of the held-out test split as (rows, feature columns).
X_test.shape

(43536, 1161)

In [11]:
# Fit a BalancedRandomForestClassifier on the training split. The training
# frame itself is not resampled in this cell; any class balancing is done by
# the estimator during fitting (see the imbalanced-learn documentation).
# `random_state` is fixed so the fitted forest is repeatable.
forest_params = {"n_estimators": 100, "random_state": 1}
random_forest_model = BalancedRandomForestClassifier(**forest_params)
random_forest_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [12]:
# Predict outcomes for the held-out test set, then compute the balanced
# accuracy score of those predictions against the true labels.
y_pred = random_forest_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6229943557252144

In [13]:
# Show the confusion matrix for the test set: rows are the actual classes,
# columns are the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[28402, 14262],
       [  366,   506]], dtype=int64)

In [14]:
# Build the imbalanced-learn classification report for the test predictions.
# The report is a pre-formatted string, so it is printed rather than
# displayed as the cell's value.
imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.67      0.58      0.80      0.62      0.39     42664
          1       0.03      0.58      0.67      0.06      0.62      0.38       872

avg / total       0.97      0.66      0.58      0.78      0.62      0.39     43536

