## Predicting Fatal Accidents - Testing Models

In [12]:
import json
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [13]:
from config import db_password

In [19]:
file_dir = 'C://Users/sd0066/Documents/GitHub/Final/Resources'

In [20]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/fatal_accident_db"

In [21]:
engine = create_engine(db_string)

In [23]:
# fetch data from database
cursor = engine.execute('SELECT "REGIONNAME" as "region", "MONTHNAME" as "month", "LGT_CONDNAME" as "light_condition", "WEATHERNAME" as "weather", "MAK_MODNAME" as "makmod", "MOD_YEAR" as "year", "Outcome" as "outcome"\
FROM accident2020 LEFT JOIN vehicle2020 ON accident2020."CASENUM" = vehicle2020."CASENUM" UNION ALL \
SELECT "REGIONNAME" as "region",  "MONTHNAME" as "month", "LGT_CONDNAME" as "light_condition", "WEATHERNAME" as "weather","MAK_MODNAME" as "makmod", "MOD_YEAR" as "year","Outcome" as "outcome" FROM accident2019 \
LEFT JOIN vehicle2019 ON accident2019."CASENUM" = vehicle2019."CASENUM"').fetchall()

In [24]:
accident_df = pd.DataFrame(cursor, columns=["region","month","light_condtion","weather","make_and_model","year","outcome"])
accident_df

Unnamed: 0,region,month,light_condtion,weather,make_and_model,year,outcome
0,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Clear,Chrysler PT Cruiser,9999,0
1,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Clear,Nissan/Datsun Versa,2018,0
2,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,GMC Medium/Heavy - CBE,2016,0
3,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,Dodge Caravan/Grand Caravan,2016,0
4,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Rain,Ford Mustang/Mustang II,2005,0
...,...,...,...,...,...,...,...
179275,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",July,Daylight,Clear,Honda 125-349 cc,2019,0
179276,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",July,Dark - Lighted,Clear,Not Reported Not Reported,9998,0
179277,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",February,Daylight,Clear,Ford F-Series pickup,2011,0
179278,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",February,Daylight,Clear,Dodge Ram Pickup,2016,0


In [25]:
# make dummies for categorical data
dummy_df = pd.get_dummies(accident_df, columns=["region","month","light_condtion","weather","make_and_model","year"])
dummy_df.head()

Unnamed: 0,outcome,"region_Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, IA, MO, KS)","region_Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)","region_South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA, FL, AL, MS, LA, AR, OK, TX)","region_West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, WY, AK, HI)",month_April,month_August,month_December,month_February,month_January,...,year_2014,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021,year_9998,year_9999
0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
y = dummy_df ["outcome"]
X = dummy_df .drop(columns="outcome")

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(134460, 1178)

In [28]:
X_test.shape

(44820, 1178)

In [29]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [30]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

In [31]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [32]:
from sklearn.metrics import confusion_matrix, classification_report

In [33]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9803435966086569


In [34]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[43939     0]
 [  881     0]]


In [35]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     43939
           1       0.00      0.00      0.00       881

    accuracy                           0.98     44820
   macro avg       0.49      0.50      0.50     44820
weighted avg       0.96      0.98      0.97     44820



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Naive Random Oversampling

In [36]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [37]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 131818, 1: 131818})

In [38]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

random_over_model = LogisticRegression(solver='lbfgs', random_state=1)
random_over_model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [39]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_over_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6363310434063487

In [40]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[30783, 13156],
       [  377,   504]], dtype=int64)

In [41]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.70      0.57      0.82      0.63      0.41     43939
          1       0.04      0.57      0.70      0.07      0.63      0.40       881

avg / total       0.97      0.70      0.57      0.81      0.63      0.41     44820



### SMOTE Oversampling

In [42]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 131818, 1: 131818})

In [43]:
# Train the Logistic Regression model using the resampled data
smote_model = LogisticRegression(solver='lbfgs', random_state=1)
smote_model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [44]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = smote_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5

In [45]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[43939,     0],
       [  881,     0]], dtype=int64)

In [46]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00     43939
          1       0.00      0.00      1.00      0.00      0.00      0.00       881

avg / total       0.96      0.98      0.02      0.97      0.00      0.00     44820



  _warn_prf(average, modifier, msg_start, len(result))


# Undersampling

In [41]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 2462, 1: 2462})

In [42]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
random_under_model = LogisticRegression(solver='lbfgs', random_state=1)
random_under_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [43]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_under_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6131662860396255

In [44]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[29519, 10551],
       [  419,   402]], dtype=int64)

In [45]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.74      0.49      0.84      0.60      0.37     40070
          1       0.04      0.49      0.74      0.07      0.60      0.35       821

avg / total       0.97      0.73      0.49      0.83      0.60      0.37     40891



# Combination (Over and Under) Sampling

In [46]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 55706, 1: 60692})

In [47]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
combo_model = LogisticRegression(solver='lbfgs', random_state=1)
combo_model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [48]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = combo_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5896766681449972

In [49]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[25001, 15069],
       [  365,   456]], dtype=int64)

In [50]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.62      0.56      0.76      0.59      0.35     40070
          1       0.03      0.56      0.62      0.06      0.59      0.34       821

avg / total       0.97      0.62      0.56      0.75      0.59      0.35     40891



### Balanced Random Forest Classifier

In [51]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_forest_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [None]:
# Calculated the balanced accuracy score
y_pred = random_forest_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

## BorderlineSMOTE

In [55]:
# Resample the training data with BorderlineSMOTE
from imblearn.over_sampling import BorderlineSMOTE 
borderline_smote = BorderlineSMOTE(random_state=0)
X_resampled, y_resampled = borderline_smote.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 160280, 1: 160280})

In [56]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
borderline_smote_model = LogisticRegression(solver='lbfgs', random_state=1)
borderline_smote_model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [57]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = borderline_smote_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6186543068509524

In [58]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[31423,  8647],
       [  449,   372]], dtype=int64)

In [59]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.78      0.45      0.87      0.60      0.37     40070
          1       0.04      0.45      0.78      0.08      0.60      0.34       821

avg / total       0.97      0.78      0.46      0.86      0.60      0.37     40891



In [60]:
import pickle
pickle_out = open("model.pkl", "wb")
pickle.dump(random_forest_model, pickle_out)
pickle_out.close()