In [37]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
import psycopg2 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [38]:
# Load in password
from secret import secret, database, username, host

# Establish connection to database
conn = psycopg2.connect(
    host = host,
    database = database,
    user =username,
    password = secret
)

In [39]:
# Load the data
query = 'SELECT * FROM clark_co_traffic'
traffic_df = pd.read_sql_query(query, conn)

  traffic_df = pd.read_sql_query(query, conn)


In [40]:
# Drop unnecessary columns
traffic_df.drop(columns = ['objectid','crash_severity','property_damage_only','accident_rec_num','primary_street', 'secondary_street'], inplace = True)

In [41]:
# Change target values to numeric
injury_num = {
    'K': 4,
    'A': 3,
    'B': 2,
    'C': 1,
    'U': 0,    
}

traffic_df['injury_type'] = traffic_df['injury_type'].apply(lambda x: injury_num[x])

traffic_df['injury_type'].head()

0    0
1    0
2    0
3    0
4    0
Name: injury_type, dtype: int64

In [42]:
# Extract datetime values into separate columns
traffic_df['crash_month'] = pd.DatetimeIndex(traffic_df['crash_date']).month
traffic_df['crash_day'] = pd.DatetimeIndex(traffic_df['crash_date']).day
traffic_df['crash_hour'] = pd.DatetimeIndex(traffic_df['crash_time']).hour


In [43]:
# Drop original datetime columns
traffic_df.drop(columns = ['crash_date', 'crash_time'], inplace = True)

In [44]:
traffic_df.dtypes

x                        float64
y                        float64
county                    object
crash_year                 int64
weather                   object
fatalities                 int64
injured                    int64
injury_type                int64
crash_type                object
total_vehicles             int64
v1_type                   object
v1_driver_age              int64
v1_action                 object
v1_driver_factors         object
v1_driver_distracted      object
v1_vehicle_factors        object
v1_most_harmful_event     object
v1_all_events             object
v2_type                   object
v2_driver_age              int64
v2_action                 object
v2_driver_factors         object
v2_driver_distracted      object
v2_vehicle_factors        object
v2_most_harmful_event     object
v2_all_events             object
nonmotorist_factors       object
factors_roadway           object
lighting                  object
hwy_factors               object
pedalcycli

In [45]:
# Get names of all columns with dtypes of "object"
sel_cols = list(traffic_df.select_dtypes(include='object'))
sel_cols

['county',
 'weather',
 'crash_type',
 'v1_type',
 'v1_action',
 'v1_driver_factors',
 'v1_driver_distracted',
 'v1_vehicle_factors',
 'v1_most_harmful_event',
 'v1_all_events',
 'v2_type',
 'v2_action',
 'v2_driver_factors',
 'v2_driver_distracted',
 'v2_vehicle_factors',
 'v2_most_harmful_event',
 'v2_all_events',
 'nonmotorist_factors',
 'factors_roadway',
 'lighting',
 'hwy_factors',
 'pedalcyclist',
 'pedestrian',
 'motorcyclist',
 'animaltype']

In [46]:
# Preprocessing to convert all strings to numeric values
traffic_df_encoded = pd.get_dummies(traffic_df, columns = sel_cols)
traffic_df_encoded.head()

Unnamed: 0,x,y,crash_year,fatalities,injured,injury_type,total_vehicles,v1_driver_age,v2_driver_age,crash_month,...,animaltype_BIG HORN SHEEP,animaltype_BURRO,animaltype_CATTLE,animaltype_DEER,animaltype_DOG/COYOTE,animaltype_ELK,animaltype_HORSE,animaltype_NO,animaltype_OTHER ANIMAL,animaltype_UNKNOWN
0,-115.106709,36.236043,2016,0,0,0,2,35,41,1,...,0,0,0,0,0,0,0,1,0,0
1,-119.673794,39.626433,2016,0,0,0,1,39,41,1,...,0,0,0,0,0,0,0,1,0,0
2,-115.101063,36.2402,2016,0,0,0,2,31,53,1,...,0,0,0,0,0,0,0,1,0,0
3,-119.633092,39.516952,2016,0,0,0,1,18,41,1,...,0,0,0,0,0,0,0,1,0,0
4,-115.14057,36.19268,2016,0,0,0,1,56,41,1,...,0,0,0,0,0,0,0,1,0,0


In [47]:
# Separate data into features and target
y = traffic_df_encoded['injury_type'].values
X = traffic_df_encoded.drop(columns = 'injury_type')

In [48]:
# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)
X_train.shape

(169251, 8006)

In [49]:
# Scale the model
scaler = StandardScaler() 

# Fit the scaler with the training data
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [50]:
# Create and fit the Random Forest model
rf_model = RandomForestClassifier(n_estimators = 128)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [51]:
# Create predictions
predictions = rf_model.predict(X_test_scaled)
results = pd.DataFrame({
    "Prediction": predictions,
    "Actual": y_test
}).reset_index(drop = True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,1,1
3,0,0
4,0,0


In [52]:
# Assess accuracy score
accuracy_score(y_test, predictions)

0.8719534891965188

In [53]:
# Generate confusion matrix
cm = confusion_matrix(y_test, predictions)

cm
                  

array([[32081,    27,    15,     0,     2],
       [  693, 16107,   580,     9,     1],
       [  358,  4512,   762,    16,     1],
       [   59,   603,   276,    20,     1],
       [   56,    14,     1,     0,   223]])

In [54]:
# Display results
print('Confusion Matrix')
display(cm)
print(f'Accuracy Score: {accuracy_score}')
print('Classification Report')
print(classification_report(y_test, predictions))

Confusion Matrix


array([[32081,    27,    15,     0,     2],
       [  693, 16107,   580,     9,     1],
       [  358,  4512,   762,    16,     1],
       [   59,   603,   276,    20,     1],
       [   56,    14,     1,     0,   223]])

Accuracy Score: <function accuracy_score at 0x123ef3a60>
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     32125
           1       0.76      0.93      0.83     17390
           2       0.47      0.13      0.21      5649
           3       0.44      0.02      0.04       959
           4       0.98      0.76      0.85       294

    accuracy                           0.87     56417
   macro avg       0.72      0.57      0.58     56417
weighted avg       0.84      0.87      0.84     56417



In [55]:
# Calculate importance of features
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse = True)

[(0.2953527260218005, 'injured'),
 (0.03597520895758143, 'y'),
 (0.03562527813842288, 'v1_driver_age'),
 (0.03535969501845756, 'x'),
 (0.031070133926235362, 'v2_driver_age'),
 (0.030636369357163393, 'crash_day'),
 (0.028865874113330556, 'crash_hour'),
 (0.026553531508055755, 'crash_month'),
 (0.016341058117840153, 'crash_year'),
 (0.015699160144839734, 'factors_roadway_DRY'),
 (0.014396874936647722, 'lighting_UNKNOWN'),
 (0.011549391765678397, 'hwy_factors_NONE'),
 (0.010142222631983585, 'hwy_factors_UNKNOWN'),
 (0.009914940400982107, 'lighting_DAYLIGHT'),
 (0.009517522325722878, 'factors_roadway_UNKNOWN'),
 (0.008508083247497817, 'total_vehicles'),
 (0.00738048783406023, 'v1_type_SEDAN, 4 DOOR'),
 (0.006154132252616592, 'v1_action_GOING STRAIGHT'),
 (0.006122035497131916, 'crash_type_ANGLE'),
 (0.005953899993978655, 'v1_vehicle_factors_FAILED TO YIELD RIGHT OF WAY'),
 (0.005898389631464833, 'v2_type_SEDAN, 4 DOOR'),
 (0.005803593986538579, 'v1_driver_factors_APPARENTLY NORMAL'),
 (0.0