In [1]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
import psycopg2 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load in password
from secret import secret, database, username, host

# Establish connection to database
conn = psycopg2.connect(
    host = host,
    database = database,
    user =username,
    password = secret
)

In [3]:
# Load the data
query = 'SELECT * FROM clark_co_traffic'
traffic_df = pd.read_sql_query(query, conn)

In [4]:
# Drop unnecessary columns
traffic_df.drop(columns = ['objectid','crash_severity','property_damage_only','accident_rec_num'], inplace = True)

In [5]:
# Change target values to numeric
le = LabelEncoder()
traffic_df['injury_type'] = le.fit_transform(traffic_df['injury_type'])

In [6]:
# Extract datetime values into separate columns
traffic_df['crash_month'] = pd.DatetimeIndex(traffic_df['crash_date']).month
traffic_df['crash_day'] = pd.DatetimeIndex(traffic_df['crash_date']).day
traffic_df['crash_hour'] = pd.DatetimeIndex(traffic_df['crash_time']).hour


In [7]:
# Drop original datetime columns
traffic_df.drop(columns = ['crash_date', 'crash_time'], inplace = True)

In [8]:
traffic_df.dtypes

x                        float64
y                        float64
county                    object
crash_year                 int64
weather                   object
fatalities                 int64
injured                    int64
injury_type                int32
crash_type                object
total_vehicles             int64
v1_type                   object
v1_driver_age              int64
v1_action                 object
v1_driver_factors         object
v1_driver_distracted      object
v1_vehicle_factors        object
v1_most_harmful_event     object
v1_all_events             object
v2_type                   object
v2_driver_age              int64
v2_action                 object
v2_driver_factors         object
v2_driver_distracted      object
v2_vehicle_factors        object
v2_most_harmful_event     object
v2_all_events             object
nonmotorist_factors       object
factors_roadway           object
lighting                  object
hwy_factors               object
pedalcycli

In [9]:
# Get names of all columns with dtypes of "object"
sel_cols = list(traffic_df.select_dtypes(include='object'))
sel_cols

['county',
 'weather',
 'crash_type',
 'v1_type',
 'v1_action',
 'v1_driver_factors',
 'v1_driver_distracted',
 'v1_vehicle_factors',
 'v1_most_harmful_event',
 'v1_all_events',
 'v2_type',
 'v2_action',
 'v2_driver_factors',
 'v2_driver_distracted',
 'v2_vehicle_factors',
 'v2_most_harmful_event',
 'v2_all_events',
 'nonmotorist_factors',
 'factors_roadway',
 'lighting',
 'hwy_factors',
 'pedalcyclist',
 'pedestrian',
 'motorcyclist',
 'animaltype']

In [10]:
# Preprocessing to convert all strings to numeric values
traffic_df_encoded = pd.get_dummies(traffic_df, columns = sel_cols)
traffic_df_encoded.head()

Unnamed: 0,x,y,crash_year,fatalities,injured,injury_type,total_vehicles,v1_driver_age,v2_driver_age,crash_month,...,animaltype_BIG HORN SHEEP,animaltype_BURRO,animaltype_CATTLE,animaltype_DEER,animaltype_DOG/COYOTE,animaltype_ELK,animaltype_HORSE,animaltype_No,animaltype_OTHER ANIMAL,animaltype_UNKNOWN
0,-115.106709,36.236043,2016,0,0,4,2,35,41,1,...,0,0,0,0,0,0,0,1,0,0
1,-119.673794,39.626433,2016,0,0,4,1,39,41,1,...,0,0,0,0,0,0,0,1,0,0
2,-115.101063,36.2402,2016,0,0,4,2,31,53,1,...,0,0,0,0,0,0,0,1,0,0
3,-119.633092,39.516952,2016,0,0,4,1,18,41,1,...,0,0,0,0,0,0,0,1,0,0
4,-115.14057,36.19268,2016,0,0,4,1,56,41,1,...,0,0,0,0,0,0,0,1,0,0


In [11]:
# Separate data into features and target
y = traffic_df_encoded['injury_type'].values
X = traffic_df_encoded.drop(columns = 'injury_type')

In [12]:
# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)
X_train.shape

(169251, 8018)

In [14]:
# Scale the model
scaler = StandardScaler() 

# Fit the scaler with the training data
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

MemoryError: Unable to allocate 10.1 GiB for an array with shape (8018, 169251) and data type float64

In [None]:
# Create and fit the Random Forest model
rf_model = RandomForestClassifier(n_estimators = 64)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Create predictions
predictions = rf_model.predict(X_test_scaled)
results = pd.DataFrame({
    "Prediction": predictions,
    "Actual": y_test
}).reset_index(drop = True)
results.head()

In [None]:
# Assess accuracy score
accuracy_score(y_test, predictions)

In [None]:
# Generate confusion matrix
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(cm, index['Actual 0', 'Actual 1'], columns = ['Predicted 0', 'Predicted 1'])
cm_df
                               

In [None]:
# Display results
print('Confusion Matrix')
display(cm_df)
print(f'Accuracy Score: {accuracy_score}')
print('Classification Report')
print(classification_report(y_test, predictions))

In [None]:
# Calculate importance of features
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse = True)