In [1]:
# Week 3 - Feature Engineering
#
# Loads the cleaned accident dataset, one-hot encodes the categorical
# predictors, and produces a stratified 80/20 train/test split.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Label column. It is used exactly as stored in the CSV: no cleaning or
# re-mapping of its values happens in this cell.
TARGET_COL = 'Accident_severity'

# Load the dataset
df = pd.read_csv('../data/cleaned.csv')

print(f"\n🎯 Unique values in '{TARGET_COL}':", df[TARGET_COL].unique())

# Rows with a missing label cannot serve as training targets, so drop them
# if any are present (no-op otherwise).
if df[TARGET_COL].isna().any():
    df = df.dropna(subset=[TARGET_COL])

# Categorical predictors to one-hot encode
categorical_cols = [
    'Age_band_of_driver', 'Sex_of_driver', 'Educational_level',
    'Driving_experience', 'Lanes_or_Medians', 'Types_of_Junction',
    'Road_surface_type', 'Light_conditions', 'Weather_conditions',
    'Type_of_collision', 'Vehicle_movement', 'Pedestrian_movement',
    'Cause_of_accident',
]

# drop_first=True omits the first level of every encoded column, so each
# categorical with k levels contributes k-1 indicator columns.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Split features and labels
X = df_encoded.drop(columns=[TARGET_COL])
y = df_encoded[TARGET_COL]

print(f"\n✅ Dataset ready. Samples: {X.shape[0]} | Features: {X.shape[1]}")

# 80/20 split. stratify=y keeps the per-class proportions the same in both
# partitions; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n✅ Train-test split complete:")
print(f"X_train: {X_train.shape} | y_train: {y_train.shape}")
print(f"X_test:  {X_test.shape} | y_test:  {y_test.shape}")

# Class distribution of the training labels. .tolist() converts the NumPy
# scalars to built-in Python values so the dict prints as {0: 127, ...}
# rather than {np.int64(0): np.int64(127), ...} under NumPy 2.
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique.tolist(), counts.tolist())))


🎯 Unique values in 'Accident_severity': [2 1 0]

✅ Dataset ready. Samples: 12316 | Features: 97

✅ Train-test split complete:
X_train: (9852, 97) | y_train: (9852,)
X_test:  (2464, 97) | y_test:  (2464,)
{np.int64(0): np.int64(127), np.int64(1): np.int64(1394), np.int64(2): np.int64(8331)}
