# Statewide Data with Injury Type as Target

## Imports and Setup 

In [1]:
# Initial imports
import pandas as pd
import numpy as np
import psycopg2 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load in password
from secret import secret, database, username, host

# Establish connection to database
conn = psycopg2.connect(
    host = host,
    database = database,
    user =username,
    password = secret
)

In [3]:
# Load the data
query = 'SELECT * FROM clark_co_traffic'
traffic_df = pd.read_sql_query(query, conn)

  traffic_df = pd.read_sql_query(query, conn)


In [4]:
# Drop unnecessary columns
traffic_df.drop(columns = ['objectid','crash_severity','accident_rec_num','primary_street', 'secondary_street', 'fatalities', 'injured' ], inplace = True)

In [5]:
# Move PDO values from property_damage_only column info to injury_type column
traffic_df['injury_type'] = np.where(traffic_df['property_damage_only'] == 'PDO', traffic_df['property_damage_only'], traffic_df['injury_type'])

In [6]:
# Drop property damage only column
traffic_df.drop(columns = ['property_damage_only'], inplace = True)

In [7]:
# Split single column with multiple data points into separate columns
traffic_df[['v1_driver_factors_1', 'v1_driver_factors_2','v1_driver_factors_3', 'v1_driver_factors_4']] = traffic_df['v1_driver_factors'].str.split(':', expand=True)

traffic_df[['v1_vehicle_factors_1', 'v1_vehicle_factors_2','v1_vehicle_factors_3', 'v1_vehicle_factors_4', 'v1_vehicle_factors_5']] = traffic_df['v1_vehicle_factors'].str.split(':', expand=True)

traffic_df[['v1_all_events_1', 'v1_all_events_2','v1_all_events_3', 'v1_all_events_4', 'v1_all_events_5']] = traffic_df['v1_all_events'].str.split(':', expand=True)

traffic_df[['v2_driver_factors_1', 'v2_driver_factors_2']] = traffic_df['v2_driver_factors'].str.split(':', expand=True)

traffic_df[['v2_vehicle_factors_1', 'v2_vehicle_factors_2','v2_vehicle_factors_3', 'v2_vehicle_factors_4', 'v2_vehicle_factors_5']] = traffic_df['v2_vehicle_factors'].str.split(':', expand=True)

traffic_df[['v2_all_events_1', 'v2_all_events_2','v2_all_events_3', 'v2_all_events_4', 'v2_all_events_5']] = traffic_df['v1_all_events'].str.split(':', expand=True)

traffic_df[['nonmotorist_factors_1', 'nonmotorist_factors_2','nonmotorist_factors_3', 'nonmotorist_factors_4', 'nonmotorist_factors_5']] = traffic_df['nonmotorist_factors'].str.split(':', expand=True)

traffic_df[['factors_roadway_1', 'factors_roadway_2','factors_roadways_3']] = traffic_df['factors_roadway'].str.split(':', expand=True)

traffic_df[['hwy_factors_1', 'hwy_factors_2','hwy_factors_3', 'hwy_factors_4']] = traffic_df['hwy_factors'].str.split(':', expand=True)

traffic_df.drop(columns = ['v1_driver_factors','v1_vehicle_factors','v1_all_events','v2_driver_factors','v2_vehicle_factors', 'v2_all_events', 'nonmotorist_factors', 'factors_roadway', 'hwy_factors',], inplace = True)

In [8]:
traffic_df['injury_type'].value_counts()

PDO    128261
C       69561
B       22594
A        3836
K        1175
U         241
Name: injury_type, dtype: int64

In [9]:
# Change target values to numeric
# Values are based on Nevada Traffic Records Coordinating Committee Data Dictionary,
# https://zerofatalitiesnv.com/app/uploads/2021/04/2021-01-NV-TRCC-Data-Dictionary.pdf
injury_num = {
    'K': 5,
    'A': 4,
    'B': 3,
    'C': 2,
    'PDO': 1,
    'U': 0,    
}

traffic_df['injury_type'] = traffic_df['injury_type'].apply(lambda x: injury_num[x])

traffic_df['injury_type'].value_counts()

1    128261
2     69561
3     22594
4      3836
5      1175
0       241
Name: injury_type, dtype: int64

In [10]:
# Extract datetime values into separate columns
traffic_df['crash_month'] = pd.DatetimeIndex(traffic_df['crash_date']).month
traffic_df['crash_day'] = pd.DatetimeIndex(traffic_df['crash_date']).day
traffic_df['crash_day_of_week'] = pd.DatetimeIndex(traffic_df['crash_date']).dayofweek
traffic_df['crash_hour'] = pd.DatetimeIndex(traffic_df['crash_time']).hour


In [11]:
# Drop original datetime columns
traffic_df.drop(columns = ['crash_date', 'crash_time'], inplace = True)

In [12]:
traffic_df.dtypes

x                    float64
y                    float64
county                object
crash_year             int64
weather               object
                      ...   
hwy_factors_4         object
crash_month            int64
crash_day              int64
crash_day_of_week      int64
crash_hour             int64
Length: 65, dtype: object

In [13]:
# Get names of all columns with dtypes of "object"
sel_cols = list(traffic_df.select_dtypes(include='object'))


In [14]:
# Preprocessing to convert all strings to numeric values
traffic_df_encoded = pd.get_dummies(traffic_df, columns = sel_cols)
traffic_df_encoded.head()

Unnamed: 0,x,y,crash_year,injury_type,total_vehicles,v1_driver_age,v2_driver_age,crash_month,crash_day,crash_day_of_week,...,hwy_factors_3_ OTHER HIGHWAY,hwy_factors_3_ ROAD OBSTRUCTION,"hwy_factors_3_ RUTS, HOLES, BUMPS",hwy_factors_3_ SHOULDERS,hwy_factors_3_ VISUAL OBSTRUCTION(S),hwy_factors_3_ WEATHER,"hwy_factors_3_ WET, ICY, SNOW, SLUSH",hwy_factors_3_ WORK ZONE (CONST. MAINT. UTILITY),hwy_factors_3_ WORN TRAFFIC SURFACE,"hwy_factors_4_ WET, ICY, SNOW, SLUSH"
0,-115.106709,36.236043,2016,1,2,35,41,1,4,0,...,0,0,0,0,0,0,0,0,0,0
1,-119.673794,39.626433,2016,1,1,39,41,1,7,3,...,0,0,0,0,0,0,0,0,0,0
2,-115.101063,36.2402,2016,1,2,31,53,1,4,0,...,0,0,0,0,0,0,0,0,0,0
3,-119.633092,39.516952,2016,1,1,18,41,1,6,2,...,0,0,0,0,0,0,0,0,0,0
4,-115.14057,36.19268,2016,1,1,56,41,1,3,6,...,0,0,0,0,0,0,0,0,0,0


## Model with Injury Type Crash Severity as Target and All Other Columns as Features

In [15]:
# Separate data into features and target
y = traffic_df_encoded['injury_type'].values
X = traffic_df_encoded.drop(columns = 'injury_type')

In [16]:
# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)
X_train.shape

(169251, 1345)

In [17]:
# Scale the model
scaler = StandardScaler() 

# Fit the scaler with the training data
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Create and fit the Random Forest model
rf_model = RandomForestClassifier(n_estimators = 128)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [19]:
# Create predictions
predictions = rf_model.predict(X_test_scaled)
results = pd.DataFrame({
    "Prediction": predictions,
    "Actual": y_test
}).reset_index(drop = True)
results.head()

Unnamed: 0,Prediction,Actual
0,2,2
1,2,3
2,1,1
3,1,1
4,1,1


In [20]:
# Assess accuracy score
accuracy = accuracy_score(y_test, predictions)
accuracy

0.6549444316429445

In [21]:
# Generate confusion matrix and dataframe
cm = confusion_matrix(y_test, predictions)

index_values = ['Actual: Unknown', 
                'Actual: Property Damage Only', 
                'Actual: Possible Injury', 
                'Actual: Suspected Minor Injury', 
                'Actual: Suspected Serious Injury', 
                'Actual: Fatal Injury']

column_values = ['Predicted: Unknown', 
                'Predicted: Property Damage Only', 
                'Predicted: Possible Injury', 
                'Predicted: Suspected Minor Injury', 
                'Predicted: Suspected Serious Injury', 
                'Predicted: Fatal Injury']

cm_df = pd.DataFrame(cm, index = index_values, columns = column_values)
cm_df
                  

Unnamed: 0,Predicted: Unknown,Predicted: Property Damage Only,Predicted: Possible Injury,Predicted: Suspected Minor Injury,Predicted: Suspected Serious Injury,Predicted: Fatal Injury
Actual: Unknown,0,36,13,10,0,1
Actual: Property Damage Only,0,27151,4796,112,5,1
Actual: Possible Injury,0,7802,9109,461,17,1
Actual: Suspected Minor Injury,0,2194,2794,630,24,7
Actual: Suspected Serious Injury,0,329,369,222,31,8
Actual: Fatal Injury,0,123,60,64,18,29


## Results with Injury Type as Target and All Other Columns as Features

In [22]:
# Display results
print('Statewide Data with Injury Type as Target, All Other Columns as Features')
print('Confusion Matrix')
display(cm_df)
print(f'\nAccuracy Score: {accuracy}\n')
print('Classification Report')
print(classification_report(y_test, predictions))

Statewide Data with Injury Type as Target, All Other Columns as Features
Confusion Matrix


Unnamed: 0,Predicted: Unknown,Predicted: Property Damage Only,Predicted: Possible Injury,Predicted: Suspected Minor Injury,Predicted: Suspected Serious Injury,Predicted: Fatal Injury
Actual: Unknown,0,36,13,10,0,1
Actual: Property Damage Only,0,27151,4796,112,5,1
Actual: Possible Injury,0,7802,9109,461,17,1
Actual: Suspected Minor Injury,0,2194,2794,630,24,7
Actual: Suspected Serious Injury,0,329,369,222,31,8
Actual: Fatal Injury,0,123,60,64,18,29



Accuracy Score: 0.6549444316429445

Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        60
           1       0.72      0.85      0.78     32065
           2       0.53      0.52      0.53     17390
           3       0.42      0.11      0.18      5649
           4       0.33      0.03      0.06       959
           5       0.62      0.10      0.17       294

    accuracy                           0.65     56417
   macro avg       0.44      0.27      0.29     56417
weighted avg       0.62      0.65      0.62     56417



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Importances with Injury Type as Target and All Other Columns as Features

In [23]:
# Calculate importance of features
importances = rf_model.feature_importances_
important = sorted(zip(importances, X.columns), reverse = True)
important

[(0.0536998625511074, 'y'),
 (0.053190194662320105, 'x'),
 (0.05068146743509144, 'v1_driver_age'),
 (0.045665923775227456, 'v2_driver_age'),
 (0.04505989953218323, 'crash_day'),
 (0.04199681487042551, 'crash_hour'),
 (0.03859809616266888, 'crash_month'),
 (0.03404867940580348, 'crash_day_of_week'),
 (0.022574923192223834, 'crash_year'),
 (0.020611757757959794, 'factors_roadway_1_UNKNOWN'),
 (0.01790707291917942, 'factors_roadway_1_DRY'),
 (0.016662772610195404, 'lighting_UNKNOWN'),
 (0.012390619787925666, 'total_vehicles'),
 (0.012127385305322393, 'lighting_DAYLIGHT'),
 (0.012003814764723768, 'hwy_factors_1_NONE'),
 (0.011394615228787527, 'v1_type_SEDAN, 4 DOOR'),
 (0.011020758956292582, 'hwy_factors_1_UNKNOWN'),
 (0.008777743757306845, 'v1_type_CARRY-ALL'),
 (0.008560421327553159, 'v2_type_SEDAN, 4 DOOR'),
 (0.008400816353888362, 'v1_action_GOING STRAIGHT'),
 (0.008225204670009584, 'v1_driver_factors_1_APPARENTLY NORMAL'),
 (0.007897217502443033, 'v1_type_PICKUP'),
 (0.007853024129108

# Second Version of Model: Only Top 50% of Importances as Features

### Importances to exclude

In [24]:
low_priority = important[772:]


In [25]:
imp_only_list = []
listy_list = []

# iterate using index with enumerate function
for index, tuple in enumerate(low_priority):
       
    # access through index by appending to list
    imp_only_list.append(low_priority[index])
     
# iterate through the list
for x in imp_only_list:
    for y in x:
         listy_list.append(y)

final_list = listy_list[1::2]



### Statewide Dataframe without Low-Value Importances

In [26]:
important_df = traffic_df_encoded.drop(columns = final_list)
important_df.head()

Unnamed: 0,x,y,crash_year,injury_type,total_vehicles,v1_driver_age,v2_driver_age,crash_month,crash_day,crash_day_of_week,...,hwy_factors_2_ NON-ROADWAY WORK,hwy_factors_2_ OTHER ENVIRONMENTAL,hwy_factors_2_ OTHER HIGHWAY,"hwy_factors_2_ RUTS, HOLES, BUMPS","hwy_factors_2_ WET, ICY, SNOW, SLUSH",hwy_factors_2_ WORK ZONE (CONST. MAINT. UTILITY),hwy_factors_3_ BACKUP DUE TO PRIOR CRASH,hwy_factors_3_ BACKUP DUE TO REGULAR CONGESTION,hwy_factors_3_ OTHER HIGHWAY,hwy_factors_3_ WORK ZONE (CONST. MAINT. UTILITY)
0,-115.106709,36.236043,2016,1,2,35,41,1,4,0,...,0,0,0,0,0,0,0,0,0,0
1,-119.673794,39.626433,2016,1,1,39,41,1,7,3,...,0,0,0,0,0,0,0,0,0,0
2,-115.101063,36.2402,2016,1,2,31,53,1,4,0,...,0,0,0,0,0,0,0,0,0,0
3,-119.633092,39.516952,2016,1,1,18,41,1,6,2,...,0,0,0,0,0,0,0,0,0,0
4,-115.14057,36.19268,2016,1,1,56,41,1,3,6,...,0,0,0,0,0,0,0,0,0,0


## Model with Injury Type as Target and Only Top 50% of Importances as Features

In [27]:
# Separate data into features and target
y_imp = important_df['injury_type'].values
X_imp = important_df.drop(columns = 'injury_type')

In [28]:
X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(X_imp, y_imp)
X_train_imp.shape

(169251, 772)

In [29]:
# Scale the model
scaler_imp = StandardScaler() 

# Fit the scaler with the training data
X_scaler_imp = scaler_imp.fit(X_train_imp)
X_train_scaled_imp = X_scaler_imp.transform(X_train_imp)
X_test_scaled_imp = X_scaler_imp.transform(X_test_imp)

In [30]:
# Create and fit the Random Forest model
rf_model_imp = RandomForestClassifier(n_estimators = 128)
rf_model_imp = rf_model_imp.fit(X_train_scaled_imp, y_train_imp)

In [31]:
# Create predictions
predictions_imp = rf_model_imp.predict(X_test_scaled_imp)
results_imp = pd.DataFrame({
    "Prediction": predictions_imp,
    "Actual": y_test_imp
}).reset_index(drop = True)
results_imp.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,1,2
2,1,2
3,1,2
4,1,1


In [32]:
# Assess accuracy score
accuracy_imp = accuracy_score(y_test_imp, predictions_imp)
accuracy_imp

0.6579222574755836

In [33]:
# Generate confusion matrix and dataframe
cm_imp = confusion_matrix(y_test_imp, predictions_imp)

index_values_imp = ['Actual: Unknown', 
                'Actual: Property Damage Only', 
                'Actual: Possible Injury', 
                'Actual: Suspected Minor Injury', 
                'Actual: Suspected Serious Injury', 
                'Actual: Fatal Injury']

column_values_imp = ['Predicted: Unknown', 
                'Predicted: Property Damage Only', 
                'Predicted: Possible Injury', 
                'Predicted: Suspected Minor Injury', 
                'Predicted: Suspected Serious Injury', 
                'Predicted: Fatal Injury']

cm_df_imp = pd.DataFrame(cm_imp, index = index_values_imp, columns = column_values_imp)
cm_df_imp

Unnamed: 0,Predicted: Unknown,Predicted: Property Damage Only,Predicted: Possible Injury,Predicted: Suspected Minor Injury,Predicted: Suspected Serious Injury,Predicted: Fatal Injury
Actual: Unknown,0,35,17,9,0,1
Actual: Property Damage Only,0,27177,4828,141,4,0
Actual: Possible Injury,0,7621,9207,459,11,2
Actual: Suspected Minor Injury,1,2217,2762,662,32,3
Actual: Suspected Serious Injury,0,308,366,213,34,3
Actual: Fatal Injury,0,124,59,64,19,38


## Statewide Results with Injury Type as Target and High-Priority Columns as Features

In [34]:
# Display results
print('Statewide Data with Injury Type as Target, High-Priority Columns as Features')
print('Confusion Matrix')
display(cm_df_imp)
print(f'\nAccuracy Score: {accuracy_imp}\n')
print('Classification Report')
print(classification_report(y_test_imp, predictions_imp))

Statewide Data with Injury Type as Target, High-Priority Columns as Features
Confusion Matrix


Unnamed: 0,Predicted: Unknown,Predicted: Property Damage Only,Predicted: Possible Injury,Predicted: Suspected Minor Injury,Predicted: Suspected Serious Injury,Predicted: Fatal Injury
Actual: Unknown,0,35,17,9,0,1
Actual: Property Damage Only,0,27177,4828,141,4,0
Actual: Possible Injury,0,7621,9207,459,11,2
Actual: Suspected Minor Injury,1,2217,2762,662,32,3
Actual: Suspected Serious Injury,0,308,366,213,34,3
Actual: Fatal Injury,0,124,59,64,19,38



Accuracy Score: 0.6579222574755836

Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        62
           1       0.73      0.85      0.78     32150
           2       0.53      0.53      0.53     17300
           3       0.43      0.12      0.18      5677
           4       0.34      0.04      0.07       924
           5       0.81      0.12      0.22       304

    accuracy                           0.66     56417
   macro avg       0.47      0.28      0.30     56417
weighted avg       0.63      0.66      0.63     56417



### Calculate importance of features

In [35]:

importances_high_priority = rf_model_imp.feature_importances_
sorted(zip(importances_high_priority, X_imp.columns), reverse = True)

[(0.05450589985544467, 'y'),
 (0.05389074042781289, 'x'),
 (0.05219383239829924, 'v1_driver_age'),
 (0.046059792378634695, 'v2_driver_age'),
 (0.045261420167148056, 'crash_day'),
 (0.04203915222978382, 'crash_hour'),
 (0.038437734508913315, 'crash_month'),
 (0.03393654530040478, 'crash_day_of_week'),
 (0.022334158784459702, 'crash_year'),
 (0.0198979638275505, 'lighting_UNKNOWN'),
 (0.01676572599247091, 'factors_roadway_1_DRY'),
 (0.014619675751889716, 'factors_roadway_1_UNKNOWN'),
 (0.0133895961979308, 'hwy_factors_1_UNKNOWN'),
 (0.01244247022188486, 'hwy_factors_1_NONE'),
 (0.012240966015117679, 'total_vehicles'),
 (0.012019903160036514, 'lighting_DAYLIGHT'),
 (0.011667158135798989, 'v1_type_SEDAN, 4 DOOR'),
 (0.009006672363505159, 'v1_type_CARRY-ALL'),
 (0.008469916537633031, 'v2_type_SEDAN, 4 DOOR'),
 (0.00826852750091677, 'v1_driver_factors_1_APPARENTLY NORMAL'),
 (0.00805283643698691, 'v1_action_GOING STRAIGHT'),
 (0.00790574524947342, 'v1_type_PICKUP'),
 (0.007753780057153597, '

# Third Version of Model with Injury Type as Target and Only Importances >= 0.01

### Importances to Exclude

In [36]:
lowest_priority = important[17:]

In [37]:
imp_low_list = []
exclude_list = []

# iterate using index with enumerate function
for index, tuple in enumerate(lowest_priority):
       
    # access through index by appending to list
    imp_low_list.append(lowest_priority[index])
     
# iterate through the list
for x in imp_low_list:
    for y in x:
        exclude_list.append(y)

final_exclude_list = exclude_list[1::2]


In [38]:
imp_top_df = traffic_df_encoded.drop(columns = final_exclude_list)
imp_top_df.head()

Unnamed: 0,x,y,crash_year,injury_type,total_vehicles,v1_driver_age,v2_driver_age,crash_month,crash_day,crash_day_of_week,crash_hour,"v1_type_SEDAN, 4 DOOR",lighting_DAYLIGHT,lighting_UNKNOWN,factors_roadway_1_DRY,factors_roadway_1_UNKNOWN,hwy_factors_1_NONE,hwy_factors_1_UNKNOWN
0,-115.106709,36.236043,2016,1,2,35,41,1,4,0,10,0,1,0,1,0,1,0
1,-119.673794,39.626433,2016,1,1,39,41,1,7,3,5,1,0,1,0,1,0,1
2,-115.101063,36.2402,2016,1,2,31,53,1,4,0,7,0,1,0,1,0,0,0
3,-119.633092,39.516952,2016,1,1,18,41,1,6,2,0,0,0,0,1,0,0,0
4,-115.14057,36.19268,2016,1,1,56,41,1,3,6,2,1,0,0,0,0,1,0


## Model with Injury Type as Target and Only Importances >= 0.01 as Features

In [39]:
# Separate data into features and target
y_imp_top = imp_top_df['injury_type'].values
X_imp_top = imp_top_df.drop(columns = 'injury_type')

In [40]:
X_train_imp_top, X_test_imp_top, y_train_imp_top, y_test_imp_top = train_test_split(X_imp_top, y_imp_top)
X_train_imp_top.shape

(169251, 17)

In [41]:
# Scale the model
scaler_imp_top = StandardScaler() 

# Fit the scaler with the training data
X_scaler_imp_top = scaler_imp_top.fit(X_train_imp_top)
X_train_scaled_imp_top = X_scaler_imp_top.transform(X_train_imp_top)
X_test_scaled_imp_top = X_scaler_imp_top.transform(X_test_imp_top)

In [42]:
# Create and fit the Random Forest model
rf_model_imp_top = RandomForestClassifier(n_estimators = 128)
rf_model_imp_top = rf_model_imp_top.fit(X_train_scaled_imp_top, y_train_imp_top)

In [43]:
# Create predictions
predictions_imp_top = rf_model_imp_top.predict(X_test_scaled_imp_top)
results_imp_top = pd.DataFrame({
    "Prediction": predictions_imp_top,
    "Actual": y_test_imp_top
}).reset_index(drop = True)
results_imp_top.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,2
3,1,1
4,2,1


In [44]:
# Assess accuracy score
accuracy_imp_top = accuracy_score(y_test_imp_top, predictions_imp_top)
accuracy_imp_top

0.6143361043657054

In [45]:
# Generate confusion matrix and dataframe
cm_imp_top = confusion_matrix(y_test_imp_top, predictions_imp_top)

index_values_imp = ['Actual: Unknown', 
                'Actual: Property Damage Only', 
                'Actual: Possible Injury', 
                'Actual: Suspected Minor Injury', 
                'Actual: Suspected Serious Injury', 
                'Actual: Fatal Injury']

column_values_imp = ['Predicted: Unknown', 
                'Predicted: Property Damage Only', 
                'Predicted: Possible Injury', 
                'Predicted: Suspected Minor Injury', 
                'Predicted: Suspected Serious Injury', 
                'Predicted: Fatal Injury']

cm_df_imp_top = pd.DataFrame(cm_imp_top, index = index_values_imp, columns = column_values_imp)
cm_df_imp_top

Unnamed: 0,Predicted: Unknown,Predicted: Property Damage Only,Predicted: Possible Injury,Predicted: Suspected Minor Injury,Predicted: Suspected Serious Injury,Predicted: Fatal Injury
Actual: Unknown,0,46,12,3,0,0
Actual: Property Damage Only,0,26997,4770,109,2,1
Actual: Possible Injury,0,9871,7489,118,1,1
Actual: Suspected Minor Injury,0,3451,2169,169,0,1
Actual: Suspected Serious Injury,0,595,280,34,4,6
Actual: Fatal Injury,0,224,48,9,7,0


## Statewide Results with Injury Type as Target and Only Importances >= 0.01 as Features

In [46]:
# Display results
print('Statewide Data with Injury Type as Target, Only Top 17 as Features')
print('Confusion Matrix')
display(cm_df_imp_top)
print(f'\nAccuracy Score: {accuracy_imp_top}\n')
print('Classification Report')
print(classification_report(y_test_imp_top, predictions_imp_top))

Statewide Data with Injury Type as Target, Only Top 17 as Features
Confusion Matrix


Unnamed: 0,Predicted: Unknown,Predicted: Property Damage Only,Predicted: Possible Injury,Predicted: Suspected Minor Injury,Predicted: Suspected Serious Injury,Predicted: Fatal Injury
Actual: Unknown,0,46,12,3,0,0
Actual: Property Damage Only,0,26997,4770,109,2,1
Actual: Possible Injury,0,9871,7489,118,1,1
Actual: Suspected Minor Injury,0,3451,2169,169,0,1
Actual: Suspected Serious Injury,0,595,280,34,4,6
Actual: Fatal Injury,0,224,48,9,7,0



Accuracy Score: 0.6143361043657054

Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        61
           1       0.66      0.85      0.74     31879
           2       0.51      0.43      0.46     17480
           3       0.38      0.03      0.05      5790
           4       0.29      0.00      0.01       919
           5       0.00      0.00      0.00       288

    accuracy                           0.61     56417
   macro avg       0.31      0.22      0.21     56417
weighted avg       0.57      0.61      0.57     56417



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
