In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load the cleaned dataset
df = pd.read_csv("cleaned_traffic_violations.csv")

# Ensure binary target
df = df[df['accident'].isin([True, False])]
df['accident'] = df['accident'].astype(int)

# Encode high-cardinality fields
for col in ['make', 'model']:
    if col in df.columns:
        df[col] = df[col].astype(str).fillna("Unknown")
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Verified features
safe_features = [
    'hour', 'month', 'day_of_week',
    'gender', 'race', 'driver_state',
    'vehicle_type', 'arrest_type',
    'belts', 'personal_injury', 'property_damage', 'fatal',
    'alcohol', 'commercial_vehicle', 'hazmat',
    'make', 'model'
]

# Filter available features
features = [f for f in safe_features if f in df.columns]
categorical = ['gender', 'race', 'driver_state', 'vehicle_type', 'arrest_type', 'day_of_week']
categorical = [c for c in categorical if c in df.columns]
numerical = list(set(features) - set(categorical))

# One-hot encode low-cardinality categoricals
X_encoded = pd.get_dummies(df[categorical], drop_first=True)
X_full = pd.concat([df[numerical], X_encoded], axis=1)
y = df['accident']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, random_state=42, stratify=y
)

# Drop completely empty columns in training set
X_train = X_train.dropna(axis=1, how='all')
X_test = X_test[X_train.columns]  # ensure shape matches

# Impute remaining missing values using most frequent values
imputer = SimpleImputer(strategy="most_frequent")
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Standard scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE for class balancing
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Train a Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluation
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98    387943
           1       0.45      0.74      0.56     10934

    accuracy                           0.97    398877
   macro avg       0.72      0.86      0.77    398877
weighted avg       0.98      0.97      0.97    398877

Confusion Matrix:
 [[378000   9943]
 [  2799   8135]]
ROC AUC Score: 0.8591897288577374


In [31]:
def predict_accident_risk(input_dict):
    df_input = pd.DataFrame([input_dict])
    df_input_encoded = pd.get_dummies(df_input)
    df_input_encoded = df_input_encoded.reindex(columns=X_train.columns, fill_value=0)
    df_imputed = pd.DataFrame(imputer.transform(df_input_encoded), columns=X_train.columns)
    df_scaled = scaler.transform(df_imputed)
    prob = model.predict_proba(df_scaled)[0][1]
    return round(prob * 100, 2)

def simplified_accident_risk(user_input):
    base = {
        'hour': 12,
        'month': 6,
        'day_of_week': 'Monday',
        'gender': 'Male',
        'race': 'Unknown',
        'driver_state': 'MD',
        'vehicle_type': 'PASSENGER CAR',
        'arrest_type': 'Citation',
        'belts': True,
        'personal_injury': False,
        'property_damage': True,
        'fatal': False,
        'alcohol': False,
        'commercial_vehicle': False,
        'hazmat': False,
        'make': 12,
        'model': 103
    }
    base.update(user_input)
    return predict_accident_risk(base)

simple_input = {
    'hour': 18,
    'day_of_week': 'Saturday',
    'vehicle_type': 'MOTORCYCLE',
    'alcohol': True,
    'belts': False
}

likelihood = simplified_accident_risk(simple_input)
print(f"Estimated accident risk: {likelihood}%")

Estimated accident risk: 32.09%
