## Predicting Fatal Accidents - Testing Models

In [1]:
import json

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

In [2]:
from config import db_password

In [3]:
# Local resources directory. No other visible cell reads this value.
# NOTE(review): absolute, machine-specific Windows path — consider a configurable relative path.
file_dir = 'C://Users/sd0066/Documents/GitHub/Final/Resources'

In [4]:
# PostgreSQL connection URL for the local fatal_accident_db database.
# The password is interpolated from `db_password`, imported from the `config` module.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/fatal_accident_db"

In [5]:
# SQLAlchemy engine built from the connection URL; used below to run the data query.
engine = create_engine(db_string)

In [6]:
# Fetch the modelling rows from the database.
# Each year's accident table is LEFT JOINed to its vehicle table on "CASENUM",
# so accidents with no matching vehicle row are kept with NULL vehicle-side
# columns, and the 2020 and 2019 results are stacked with UNION ALL.
# The statement is one triple-quoted string so that every clause is separated
# by whitespace, and it runs on an explicit connection because
# Engine.execute() was removed in SQLAlchemy 2.0.
accident_query = text("""
    SELECT "REGIONNAME" as "region", "MONTHNAME" as "month",
           "LGT_CONDNAME" as "light_condition", "WEATHERNAME" as "weather",
           "MAK_MODNAME" as "makmod", "MOD_YEAR" as "year", "Outcome" as "outcome"
    FROM accident2020
    LEFT JOIN vehicle2020 ON accident2020."CASENUM" = vehicle2020."CASENUM"
    UNION ALL
    SELECT "REGIONNAME" as "region", "MONTHNAME" as "month",
           "LGT_CONDNAME" as "light_condition", "WEATHERNAME" as "weather",
           "MAK_MODNAME" as "makmod", "MOD_YEAR" as "year", "Outcome" as "outcome"
    FROM accident2019
    LEFT JOIN vehicle2019 ON accident2019."CASENUM" = vehicle2019."CASENUM"
""")

# The context manager returns the connection to the pool once the rows are read.
with engine.connect() as connection:
    cursor = connection.execute(accident_query).fetchall()

In [7]:
# Wrap the fetched rows in a DataFrame. Labels are assigned by position, so they
# take the place of the SQL aliases (for example "makmod" becomes "make_and_model").
# NOTE: "light_condtion" is misspelt, but the encoding cell below refers to that
# exact label, so it is kept unchanged here.
accident_df = pd.DataFrame(
    data=cursor,
    columns=[
        "region",
        "month",
        "light_condtion",
        "weather",
        "make_and_model",
        "year",
        "outcome",
    ],
)
accident_df

Unnamed: 0,region,month,light_condtion,weather,make_and_model,year,outcome
0,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Clear,Nissan/Datsun Versa,2018.0,0
1,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,GMC Medium/Heavy - CBE,2016.0,0
2,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,Dodge Caravan/Grand Caravan,2016.0,0
3,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,Ford Mustang/Mustang II,2005.0,0
4,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Dawn,Clear,Nissan/Datsun 810/Maxima,2010.0,0
...,...,...,...,...,...,...,...
174139,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",April,Daylight,Clear,,,0
174140,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",August,Daylight,Clear,,,0
174141,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",January,Dusk,Cloudy,,,0
174142,"Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)",June,Daylight,Clear,,,0


In [8]:
# One-hot encode every feature column; "outcome" is not listed, so it passes
# through unchanged and remains available as the target.
dummy_df = pd.get_dummies(
    data=accident_df,
    columns=[
        "region",
        "month",
        "light_condtion",
        "weather",
        "make_and_model",
        "year",
    ],
)
dummy_df.head()

Unnamed: 0,outcome,"region_Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, IA, MO, KS)","region_Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)","region_South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA, FL, AL, MS, LA, AR, OK, TX)","region_West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, WY, AK, HI)",month_April,month_August,month_December,month_February,month_January,...,year_2012.0,year_2013.0,year_2014.0,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0
0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Separate the target vector (y) from the feature matrix (X).
y = dummy_df["outcome"]
X = dummy_df.drop(columns=["outcome"])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out a test partition, stratified on the target so that both partitions
# keep the same share of the rare positive class. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
X_train.shape

(130608, 1161)

In [11]:
# Shape of the held-out test partition (rows, encoded feature columns).
X_test.shape

(43536, 1161)

In [12]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression; it is trained on the original, un-resampled
# training split in the next cell.
classifier = LogisticRegression(solver="lbfgs", max_iter=200, random_state=1)

In [13]:
# Train the baseline model on the original (un-resampled) training split.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [14]:
# Predict on the test split and line the predictions up against the true labels.
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
# Drop the original row labels carried over from y_test.
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [15]:
from sklearn.metrics import confusion_matrix, classification_report

In [16]:
from sklearn.metrics import accuracy_score

# Plain accuracy of the baseline model. With a heavily imbalanced target this
# number is dominated by the majority class, so read it alongside the
# confusion matrix that follows.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9799705990444689


In [17]:
# Confusion matrix for the baseline model (rows: actual class, columns: predicted class).
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[42664     0]
 [  872     0]]


In [18]:
# Per-class precision, recall and F1 for the baseline model.
# The baseline never predicts class 1 (see the confusion matrix), so precision
# for that class is 0/0. zero_division=0 reports it explicitly as 0.00 instead
# of emitting an UndefinedMetricWarning for each affected metric.
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     42664
           1       0.00      0.00      0.00       872

    accuracy                           0.98     43536
   macro avg       0.49      0.50      0.49     43536
weighted avg       0.96      0.98      0.97     43536



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Naive Random Oversampling

In [19]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [20]:
# Balance the training split with RandomOverSampler; only the training data is
# resampled, so the test split stays untouched.
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X=X_train, y=y_train)

# Class counts after resampling.
Counter(y_resampled)

Counter({0: 127993, 1: 127993})

In [None]:
# Train the Logistic Regression model using the randomly oversampled data.
# max_iter=200 matches the baseline classifier so that every model in this
# notebook gets the same optimisation budget (the scikit-learn default is 100).
from sklearn.linear_model import LogisticRegression

random_over_model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
random_over_model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score of the oversampled model on the test split
from sklearn.metrics import balanced_accuracy_score

y_pred = random_over_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the current y_pred (rows: actual class, columns: predicted class)
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Imbalance-aware classification report for the current y_pred
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

### SMOTE Oversampling

In [None]:
# Oversample the training split with SMOTE; the test split is left as-is.
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(
    sampling_strategy='auto',
    random_state=1,
).fit_resample(X_train, y_train)

# Class counts after resampling.
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the SMOTE-resampled data.
# max_iter=200 matches the baseline classifier so that every model in this
# notebook gets the same optimisation budget (the scikit-learn default is 100).
smote_model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
smote_model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score of the SMOTE model on the test split
from sklearn.metrics import balanced_accuracy_score

y_pred = smote_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the current y_pred (rows: actual class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Imbalance-aware classification report for the current y_pred
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

### Random Undersampling

In [None]:
# Resample the training data with the RandomUnderSampler.
# The sampler gets its own name (`rus`) rather than reusing `ros`, which an
# earlier cell bound to the RandomOverSampler.
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Class counts after resampling.
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the undersampled data.
# max_iter=200 matches the baseline classifier so that every model in this
# notebook gets the same optimisation budget (the scikit-learn default is 100).
from sklearn.linear_model import LogisticRegression

random_under_model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
random_under_model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score of the undersampled model on the test split
from sklearn.metrics import balanced_accuracy_score

y_pred = random_under_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the current y_pred (rows: actual class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Imbalance-aware classification report for the current y_pred
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

### Combination (Over and Under) Sampling

In [None]:
# Resample the training data with SMOTEENN.
# Only the training split is resampled: fitting the sampler on the full X, y
# would put test rows (and synthetic points derived from them) into the
# training set and inflate the scores later computed on X_test.
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Class counts after resampling.
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the SMOTEENN-resampled data.
# max_iter=200 matches the baseline classifier so that every model in this
# notebook gets the same optimisation budget (the scikit-learn default is 100).
from sklearn.linear_model import LogisticRegression

combo_model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
combo_model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score of the SMOTEENN model on the test split
from sklearn.metrics import balanced_accuracy_score

y_pred = combo_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the current y_pred (rows: actual class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Imbalance-aware classification report for the current y_pred
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

### Balanced Random Forest Classifier

In [None]:
# Fit a BalancedRandomForestClassifier directly on the original training split;
# no separate resampling step is run in this section.
from imblearn.ensemble import BalancedRandomForestClassifier

random_forest_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)
random_forest_model.fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score of the balanced random forest on the test split
y_pred = random_forest_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the current y_pred (rows: actual class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Imbalance-aware classification report for the current y_pred
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

### BorderlineSMOTE Oversampling

In [None]:
# Resample the training data with BorderlineSMOTE.
# Only the training split is resampled: fitting the sampler on the full X, y
# would put test rows (and synthetic points derived from them) into the
# training set and inflate the scores later computed on X_test.
from imblearn.over_sampling import BorderlineSMOTE

borderline_smote = BorderlineSMOTE(random_state=0)
X_resampled, y_resampled = borderline_smote.fit_resample(X_train, y_train)

# Class counts after resampling.
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the BorderlineSMOTE-resampled data.
# max_iter=200 matches the baseline classifier so that every model in this
# notebook gets the same optimisation budget (the scikit-learn default is 100).
from sklearn.linear_model import LogisticRegression

borderline_smote_model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
borderline_smote_model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score of the BorderlineSMOTE model on the test split
from sklearn.metrics import balanced_accuracy_score

y_pred = borderline_smote_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the current y_pred (rows: actual class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Imbalance-aware classification report for the current y_pred
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))