## Predicting Fatal Accidents

In [22]:
import json
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [23]:
from config import db_password

In [24]:
file_dir = 'C://Users/sd0066/Documents/GitHub/Final/Resources'

In [25]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/accident_db"

In [26]:
engine = create_engine(db_string)

In [27]:
# fetch data from database
cursor = engine.execute('SELECT "REGIONNAME" as "region", "MONTHNAME" as "month", "LGT_CONDNAME" as "light_condition", "WEATHERNAME" as "weather", "MAK_MODNAME" as "makmod", "MOD_YEAR" as "year", "OUTCOME" as "outcome"\
FROM accident2020 LEFT JOIN vehicle2020 ON accident2020."CASENUM" = vehicle2020."CASENUM" UNION ALL \
SELECT "REGIONNAME" as "region",  "MONTHNAME" as "month", "LGT_CONDNAME" as "light_condition", "WEATHER1NAME" as "weather","MAK_MODNAME" as "makmod", "MOD_YEAR" as "year","OUTCOME" as "outcome" FROM accident2019 \
LEFT JOIN vehicle2019 ON accident2019."CASENUM" = vehicle2019."CASENUM"').fetchall()

In [28]:
accident_df = pd.DataFrame(cursor, columns=["region","month","light_condtion","weather","make_and_model","year","outcome"])
accident_df

Unnamed: 0,region,month,light_condtion,weather,make_and_model,year,outcome
0,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",March,Not Reported,Not Reported,,,0
1,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Not Reported,Clear,,,0
2,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",January,Daylight,Clear,,,0
3,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",January,Daylight,Snow,,,0
4,"Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)",January,Dark - Not Lighted,Snow,,,0
...,...,...,...,...,...,...,...
163558,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",December,Dark - Lighted,Cloudy,,,0
163559,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",July,Dark - Lighted,Clear,,,0
163560,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",July,Daylight,Clear,,,0
163561,"Midwest (OH, IN, IL, MI, WI, MN, ND, SD, NE, I...",July,Daylight,Clear,,,0


In [37]:
# make dummies for categorical data
dummy_df = pd.get_dummies(accident_df, columns=["region","month","light_condtion","weather","make_and_model","year"])

In [38]:
y = dummy_df ["outcome"]
X = dummy_df .drop(columns="outcome")

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(122672, 38)

In [40]:
X_test.shape

(40891, 38)

In [41]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [42]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [43]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [44]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.979922232276051


In [45]:
from sklearn.metrics import confusion_matrix, classification_report

In [46]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[40070     0]
 [  821     0]]


In [47]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     40070
           1       0.00      0.00      0.00       821

    accuracy                           0.98     40891
   macro avg       0.49      0.50      0.49     40891
weighted avg       0.96      0.98      0.97     40891



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
