In [1]:
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Clustering
from sklearn.cluster import KMeans

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Load and understand the data

In [2]:
DATA = "data/crash_data.csv"
# Load the dataset
data = pd.read_csv(DATA)

# Display the first few rows
data.head()

Unnamed: 0,OBJECTID,Document_Nbr,Crash_Year,Crash_Severity,Driver_VehicleNumber,Driver_InjuryType,Driver_Age,Driver_Gender,Driver_Action_Type_Cd,Driver_Airbag_Deployment,...,Ped_Action,Ped_Drink,Ped_Cond,Ped_Al_Test,Ped_Drug,Ped_Rflct,Numberoflane,Carspeedlimit,Local Case CD,Route or Street Name
0,1,100200959,2009,B,1,PDO,48,Male,1. No Improper Action,2. Not Deployed,...,1. Crossing At Intersection With Signal,1. No Drinking,1. No Defects,4. No Test,3. Unknown,2. No,4.0,35.0,20093440243,RT 7LEESBURG PIKE
1,2,93010657,2009,O,1;2,PDO;PDO,45;57,Male;Male,22. Disregarded Stop or Yield Sign;1. No Impro...,2. Not Deployed;2. Not Deployed,...,,,,,,,4.0,,R-2385,WARWICK BLVD
2,3,92440184,2009,O,1;2,NA;PDO,47,Male;Male,34. Hit and Run;1. No Improper Action,2. Not Deployed;2. Not Deployed,...,,,,,,,4.0,,P09040800226,1800 BLK E BRAMBELTON
3,4,92870200,2009,O,1,PDO,17,Female,1. No Improper Action,2. Not Deployed,...,,,,,,,2.0,55.0,,I 66 EAST
4,5,93400162,2009,C,1;2,C;PDO,53;63,Female;Male,11. Did Not Have Right-of-Way;1. No Improper A...,1. Deployed - Front;3. n/a,...,,,,,,,,,200909150302,DISTRIBUTOR DRIVE


In [3]:
data.info()

# Print out the columns that arent objects
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 844692 entries, 0 to 844691
Data columns (total 65 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   OBJECTID                     844692 non-null  int64  
 1   Document_Nbr                 844692 non-null  int64  
 2   Crash_Year                   844692 non-null  int64  
 3   Crash_Severity               844692 non-null  object 
 4   Driver_VehicleNumber         844692 non-null  object 
 5   Driver_InjuryType            840732 non-null  object 
 6   Driver_Age                   829887 non-null  object 
 7   Driver_Gender                844692 non-null  object 
 8   Driver_Action_Type_Cd        844692 non-null  object 
 9   Driver_Airbag_Deployment     844692 non-null  object 
 10  Driver_Alcohol_Test_Type_Cd  844692 non-null  object 
 11  Driver_Condition_Type_Cd     844692 non-null  object 
 12  Driver_Distraction_Type_Cd   844692 non-null  object 
 13 

Unnamed: 0,OBJECTID,Document_Nbr,Crash_Year,Numberoflane,Carspeedlimit
count,844692.0,844692.0,844692.0,630024.0,373565.0
mean,422346.5,123273000.0,2012.042972,3.214603,48.608248
std,243841.721129,19468740.0,1.997255,1.374737,11.252441
min,1.0,90135100.0,2009.0,0.0,15.0
25%,211173.75,103640300.0,2010.0,2.0,40.0
50%,422346.5,122265200.0,2012.0,3.0,45.0
75%,633519.25,141305000.0,2014.0,4.0,55.0
max,844692.0,160550000.0,2015.0,10.0,70.0


In [4]:
# Columns and number of missing values from high to low
print(f"Number of missing values (high to low): \n {data.isnull().sum().sort_values(ascending=False)}")

# Drop columsn with more than 90% missing values
data = data.dropna(thresh=0.7*data.shape[0], axis=1)

print(f"Number of missing values after the drop (high to low): \n {data.isnull().sum().sort_values(ascending=False)}")
print(f"Number of columns: {len(data.columns)}")

Number of missing values (high to low): 
 Bike_Age                      839787
Bike_InjuryType               839707
Bike_VehicleNumber            839684
Bike_Gender                   839684
PED_Age                       833367
                               ...  
Fourth_Crash_Event_Cd              0
Direction_Of_Travel_Cd             0
Initial_Veh_Impact_Area_Cd         0
Second_Crash_Event_Cd              0
Third_Crash_Event_Cd               0
Length: 65, dtype: int64
Number of missing values after the drop (high to low): 
 Numberoflane                   214668
Local Case CD                  175653
Route or Street Name           129771
Driver_Age                      14805
Vehicle_Model_Nm                11801
Speed_Before                     7707
Vehicle_Year_Nbr                 4312
Driver_InjuryType                3960
Vehicle_Make_Nm                  2310
Speed_Max_Safe                   2145
Speed_Posted                     1266
Driver_Action_Type_Cd               0
Driver_Gender

## Encode categorical data

In [5]:
# Encode categorical values
def split_string(string):
    # If the string is empty or nan, return an empty list
    if string == "" or pd.isnull(string):
        return []
    # Break down each string into a list if there is a semicolon
    if ";" in str(string):
        parts = string.split(";")                
        return parts
    else:
        return [string]
    
def find_id(string):
    string = str(string).strip()
    result = ""
    for char in string:
        if char.isdigit():
            result += char
        elif char == ".":
            return result
        else:
            break
    return None

def clean_values(string):
    individual_strings: list = split_string(string)
    output: list = []
    for string in individual_strings:
        if find_id(string):
            output.append(find_id(string))
        else:
            output.append(string)
    return output

# Apply clean_values to all of the categorical columns
for column in data.columns:
    if data[column].dtype == "object":
        data[column] = data[column].apply(clean_values)

In [8]:
# One hot encode the categorical columns
# Encode categorical values
# Identify categorical variables
categorical_vars = data.select_dtypes(include=['object']).columns.tolist()
print("Categorical variables:")
print(categorical_vars)
# Initialize the encoder
encoder = OrdinalEncoder()

# Encode categorical variables
data[categorical_vars] = data[categorical_vars].fillna('Missing')
data[categorical_vars] = encoder.fit_transform(data[categorical_vars])

Categorical variables:
['Crash_Severity', 'Driver_VehicleNumber', 'Driver_InjuryType', 'Driver_Age', 'Driver_Gender', 'Driver_Action_Type_Cd', 'Driver_Airbag_Deployment', 'Driver_Alcohol_Test_Type_Cd', 'Driver_Condition_Type_Cd', 'Driver_Distraction_Type_Cd', 'Driver_Drinking_Type_Cd', 'Driver_Drug_Use_Cd', 'Driver_Ejected_From_Vehicle', 'Driver_Ems_Transport_Ind', 'Driver_Fled_Scene_Ind', 'Driver_Safety_Equip_Used', 'Driver_Vis_Obscured_Type_Cd', 'Summons_Issued_Cd', 'Vehiclenumber', 'Vehicle_Body_Type_Cd', 'Vehicle_Make_Nm', 'Vehicle_Maneuver_Type_Cd', 'Vehicle_Model_Nm', 'Vehicle_Year_Nbr', 'Comm_Cargo_Body_Type_Cd', 'Comm_Vehicle_Body_Type_Cd', 'Speed_Before', 'Speed_Posted', 'Speed_Max_Safe', 'First_Crash_Event_Cd', 'Second_Crash_Event_Cd', 'Third_Crash_Event_Cd', 'Fourth_Crash_Event_Cd', 'Most_Harmful_Crash_Event_Cd', 'Initial_Veh_Impact_Area_Cd', 'Direction_Of_Travel_Cd', 'Local Case CD', 'Route or Street Name']


TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['list']

## Look into data correlation now that most of the data is cleaned

In [7]:
# Drop rows with missing 'CRASH_SEVERITY'
data = data.dropna(subset=['Crash_Severity'])

# Identify numerical and categorical columns
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

# # Fill missing values
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].mean())
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

# One-Hot Encode categorical variables
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Compute the correlation matrix
corr_matrix = data_encoded.corr()

# Get correlations with 'CRASH_SEVERITY'
crash_severity_corr = corr_matrix['Crash_Severity'].sort_values(ascending=False)

# Visualize correlations with 'CRASH_SEVERITY'
plt.figure(figsize=(8, len(crash_severity_corr) / 2))
crash_severity_corr.drop('Crash_Severity').plot(kind='barh')
plt.title('Correlation of Features with CRASH_SEVERITY')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Features')
plt.gca().invert_yaxis()
plt.show()

TypeError: unhashable type: 'list'

In [15]:
# Print out the list where the correlation coefficient is greater than abs(0.2)
print(crash_severity_corr[abs(crash_severity_corr) > 0.2])

Crash_Severity              1.000000
Driver_InjuryType           0.736833
Driver_Ems_Transport_Ind   -0.525539
Name: Crash_Severity, dtype: float64
