# Experiment 1

The "911 Dispatch Priority" System (Classification) Goal: Predict the Severity (1-4) of an accident immediately when it is reported, before police arrive.

Why? To help emergency services decide whether to send a tow truck (Severity 2) or an ambulance and fire truck (Severity 4).

The ML Task: Multi-class Classification.

Critical Engineering Step (Based on your EDA):

Avoid Data Leakage: You MUST DROP Distance_km and End_Time from your training features. These are outcomes of the accident. At the moment the accident happens (t=0), you don't know how long the traffic jam will be.

Feature Engineering: You must create an Is_Rush_Hour feature (since you proved 5 PM is high volume) and an Is_Intersection feature (since you proved signals reduce severity).

Imbalance Handling: Your data is heavily skewed toward Severity 2. You will need to use Class Weights or Undersampling (downsample Severity 2) so the model learns to spot the rare Severity 4 cases.

In [0]:
!pip install torch torchvision torchaudio torchmetrics --quiet

In [0]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [0]:
# Load silver.csv, the dataset that the rest of Experiment 1 works on.
df = pd.read_csv("/content/silver.csv")

In [0]:
# Columns that are only known once the accident is over; keeping them as
# features would leak the outcome into the model.
leakage_cols = [
    "End_Time",
    "Distance_km",
    "accident_duration_hours",
    "End_Lat",
    "End_Lng",
]
# errors="ignore" skips any listed column that is not present in df.
df = df.drop(columns=leakage_cols, errors="ignore")

In [0]:
# Parse the report timestamp; values that cannot be parsed become NaT
# instead of raising.
df["Start_Time"] = pd.to_datetime(df["Start_Time"], errors="coerce")
df["hour"] = df["Start_Time"].dt.hour
# 1 for reports in the 07-09 and 16-18 hour buckets, 0 otherwise.
df["Is_Rush_Hour"] = df["hour"].isin([7, 8, 9, 16, 17, 18]).astype(int)

# Road-feature flags: a missing value is treated as 0 before the integer cast.
for col in ("Traffic_Signal", "Crossing", "Junction"):
    df[col] = df[col].fillna(0).astype(int)

# Sum of the three flags, capped to the 0..1 range.
df["Is_Intersection"] = (
    df[["Traffic_Signal", "Crossing", "Junction"]]
    .sum(axis=1)
    .clip(lower=0, upper=1)
)

In [0]:
# Columns used as numeric model inputs. They come first in `features`
# (built below), so they occupy the leading columns of X.
numeric_cols = [
    "Start_Lat", "Start_Lng", "Humidity_%", "Visibility_km",
    "Temperature_C", "Wind_Speed_kmh", "Precipitation_mm",
    "Is_Rush_Hour", "Is_Intersection", "is_weekend", "is_bad_weather", "hour"
]

# Columns that are integer-encoded with LabelEncoder below.
categorical_cols = ["start_weekday", "County", "State", "Weather_Condition"]

# Discard rows that miss any model input or the Severity target.
df = df.dropna(subset=numeric_cols + ["Severity"] + categorical_cols)

# Replace each categorical column with integer codes. `le` is rebound on every
# iteration, so only the last column's encoder is still available afterwards.
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# NOTE(review): the integer codes are stored in X as plain float32 values, so
# the network treats them as ordered magnitudes - confirm this is intended.
features = numeric_cols + categorical_cols
X = df[features].values.astype(np.float32)
# Shift Severity down by one to get zero-based class ids
# (assumes Severity takes the values 1-4, as the experiment description says).
y = (df["Severity"].values - 1).astype(np.int64)

In [0]:
# Hold out 20% of the rows. stratify=y keeps the class proportions equal in
# both splits; random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [0]:
# Standardise only the leading len(numeric_cols) columns of X (the numeric
# features). The statistics are learned from the training split alone and then
# applied in place to both splits.
scaler = StandardScaler()
scaler.fit(X_train[:, :len(numeric_cols)])
for split in (X_train, X_test):
    split[:, :len(numeric_cols)] = scaler.transform(split[:, :len(numeric_cols)])

In [0]:
class CrashDataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels."""

    def __init__(self, X, y):
        """Copy ``X`` into a float32 tensor and ``y`` into an int64 tensor."""
        self.X, self.y = (
            torch.tensor(X, dtype=torch.float32),
            torch.tensor(y, dtype=torch.long),
        )

    def __len__(self):
        """Return the number of stored labels, i.e. the number of samples."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        features, label = self.X[idx], self.y[idx]
        return features, label

# Wrap each split in a Dataset so it can be handed to a DataLoader.
train_ds = CrashDataset(X_train, y_train)
test_ds  = CrashDataset(X_test, y_test)

In [0]:
# Inverse-frequency class weights, total / (4 * count): the rarer a class is
# in y_train, the larger its weight (4 presumably being the number of
# severity classes).
classes, counts = np.unique(y_train, return_counts=True)
total = counts.sum()
class_weights = torch.from_numpy(total / (4 * counts)).to(torch.float32)

# Every training row gets the weight of its class; the sampler then draws
# rows (with replacement) in proportion to those weights.
samples_weights = class_weights[y_train]
sampler = WeightedRandomSampler(
    weights=samples_weights,
    num_samples=samples_weights.shape[0],
    replacement=True,
)

# Training batches come from the weighted sampler; test batches are read
# in their stored order.
train_loader = DataLoader(train_ds, batch_size=512, sampler=sampler)
test_loader = DataLoader(test_ds, batch_size=512)

In [0]:
class CrashNet(nn.Module):
    """Fully connected classifier.

    Two hidden stages of Linear -> ReLU -> BatchNorm1d followed by a final
    Linear layer that emits one score per output class.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=4):
        super().__init__()
        # Layers are created in forward order and passed positionally, so the
        # Sequential indices (and the state_dict keys) run from 0 to 6.
        layers = []
        for in_features in (input_dim, hidden_dim):
            layers += [
                nn.Linear(in_features, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
            ]
        layers.append(nn.Linear(hidden_dim, output_dim))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised class scores (logits) for batch ``x``."""
        logits = self.model(x)
        return logits

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One input unit per column of X_train; hidden and output sizes use the
# CrashNet defaults.
model = CrashNet(X_train.shape[1]).to(device)

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Multi-class focal loss: ``-alpha_t * (1 - p_t) ** gamma * log(p_t)``.

    ``p_t`` is the softmax probability the model assigns to the true class.

    Args:
        alpha: Optional per-class weights indexed by class id. Either a 1-D
            tensor or any sequence accepted by ``torch.as_tensor`` (e.g. a
            list of floats). ``None`` disables class weighting.
        gamma: Focusing exponent; with ``gamma=0`` the unweighted loss equals
            cross-entropy.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            per-sample losses unreduced.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        # Accept plain sequences as well as tensors for the class weights.
        if alpha is not None and not torch.is_tensor(alpha):
            alpha = torch.as_tensor(alpha, dtype=torch.float32)
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for logits ``inputs`` and class ids ``targets``.

        ``inputs`` is indexed as (sample, class) and ``targets`` holds one
        class id per sample.
        """
        # Gather the true-class log-probability first, then exponentiate only
        # those N values instead of the whole (N, C) matrix.
        logpt = F.log_softmax(inputs, dim=1)
        logpt = logpt.gather(1, targets.unsqueeze(1)).squeeze(1)
        pt = torch.exp(logpt)

        if self.alpha is not None:
            # alpha is a plain attribute (not a buffer), so Module.to() does
            # not move it; align its device with the batch here.
            at = self.alpha.to(logpt.device).gather(0, targets)
            loss = -at * ((1 - pt) ** self.gamma) * logpt
        else:
            loss = -((1 - pt) ** self.gamma) * logpt

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

In [0]:
# Reuse the inverse-frequency class weights as the per-class alpha of the
# focal loss, moved to the same device as the model.
# NOTE(review): the training batches are already rebalanced by the
# WeightedRandomSampler built from these same weights, so rare classes are
# up-weighted twice - confirm this is intended.
alpha = class_weights.to(device)
loss_fn = FocalLoss(alpha=alpha, gamma=2.0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [0]:
class EarlyStopping:
    """Signal when a monitored loss has stopped improving.

    Call the instance once per epoch with the validation loss. ``early_stop``
    becomes ``True`` after ``patience`` consecutive calls without improvement.

    Args:
        patience: Number of consecutive non-improving calls that are tolerated.
        min_delta: Margin by which the loss must fall below the best value
            seen so far to count as an improvement.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss`` and update ``counter`` and ``early_stop``."""
        # Phrased as a positive "improved" test: every comparison with NaN is
        # False, so a NaN loss counts as a non-improving epoch instead of
        # overwriting best_loss and resetting the counter.
        improved = (
            self.best_loss is None
            or val_loss <= self.best_loss - self.min_delta
        )
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

In [0]:
# Stop once the validation loss has failed to improve by at least 1e-4 for
# 5 epochs in a row.
early_stopper = EarlyStopping(patience=5, min_delta=1e-4)

# Train for at most 50 epochs.
for epoch in range(50):
    # ---- training pass ----
    model.train()
    running_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        optimizer.step()
        # loss_fn returns a batch mean; weight it by the batch size so the
        # epoch figure below is a per-sample average.
        running_loss += loss.item() * xb.size(0)
    train_loss = running_loss / len(train_ds)

    # ---- validation pass (no gradient tracking) ----
    # NOTE(review): validation runs on test_loader, the same data the final
    # classification report is computed on - confirm that a separate
    # validation split is not wanted.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item() * xb.size(0)
    val_loss /= len(test_ds)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Feed the epoch's validation loss to the stopper and leave the loop
    # as soon as it raises its flag.
    early_stopper(val_loss)
    if early_stopper.early_stop:
        print(f"Early stopping at epoch {epoch+1}")
        break

In [0]:
# Collect hard predictions for the whole test split and print the
# per-class precision/recall/F1 report.
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        logits = model(batch_x.to(device))
        # The index of the largest logit is the predicted class id.
        all_preds.append(logits.argmax(dim=1).cpu().numpy())
        all_labels.append(batch_y.numpy())

y_pred, y_true = (np.concatenate(chunks) for chunks in (all_preds, all_labels))

print(classification_report(y_true, y_pred, digits=4))

This model is a failure: our classes are extremely imbalanced, and there is no way to properly train the model for good precision.

# Experiment 2

The "Waze/Google Maps" ETA Corrector (Regression) Goal: Predict accident_duration_hours.

Why? To update navigation ETAs. If a crash happens on I-95, will it clear in 30 minutes or 4 hours?

The ML Task: Regression (predicting a continuous number).

Critical Engineering Step:

Outlier Removal: You cannot train on the raw data because of the "Louisiana/Active Ticket" issue. You must filter training data to only include closed accidents (e.g., duration < 24 hours).

Model Choice: Gradient Boosted Trees (XGBoost/LightGBM) usually perform best here because they handle non-linear relationships between "Weather" and "Time of Day" well.

In [0]:
!pip install lightgbm --quiet

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [0]:
# Reload silver.csv from scratch so Experiment 2 starts from the unmodified data.
df = pd.read_csv("/content/silver.csv")

In [0]:
# Keep only accidents shorter than 24 hours. Rows with a missing duration are
# dropped as well, because a NaN comparison evaluates to False.
# .copy() makes df an independent frame, so the column assignments in the
# following cells do not raise pandas' SettingWithCopyWarning.
df = df[df["accident_duration_hours"] < 24].copy()

In [0]:
# Parse the report timestamp; values that cannot be parsed become NaT
# instead of raising.
df["Start_Time"] = pd.to_datetime(df["Start_Time"], errors="coerce")
df["hour"] = df["Start_Time"].dt.hour
# 1 for reports in the 07-09 and 16-18 hour buckets, 0 otherwise.
df["Is_Rush_Hour"] = df["hour"].isin([7, 8, 9, 16, 17, 18]).astype(int)

# Road-feature flags: a missing value is treated as 0 before the integer cast.
for col in ("Traffic_Signal", "Crossing", "Junction"):
    df[col] = df[col].fillna(0).astype(int)

# Sum of the three flags, capped to the 0..1 range.
df["Is_Intersection"] = (
    df[["Traffic_Signal", "Crossing", "Junction"]]
    .sum(axis=1)
    .clip(lower=0, upper=1)
)

In [0]:
# Inputs that are already numeric.
numeric_cols = [
    "Start_Lat",
    "Start_Lng",
    "Humidity_%",
    "Visibility_km",
    "Temperature_C",
    "Wind_Speed_kmh",
    "Precipitation_mm",
    "Is_Rush_Hour",
    "Is_Intersection",
    "is_weekend",
    "is_bad_weather",
    "hour",
]

# Inputs that are turned into integer codes below.
categorical_cols = ["start_weekday", "County", "State", "Weather_Condition"]

# Full model input, numeric columns first.
features = numeric_cols + categorical_cols

# Rows missing any input feature or the regression target are discarded.
df = df.dropna(subset=features + ["accident_duration_hours"])

# Integer-encode each categorical column with its own freshly fitted encoder.
for col in categorical_cols:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])

# Store the two flag columns as integers.
for col in ("is_weekend", "is_bad_weather"):
    df[col] = df[col].astype(int)

X = df.loc[:, features]
y = df.loc[:, "accident_duration_hours"]

In [0]:
# Hold out 20% of the rows for evaluation; random_state=42 makes the shuffle
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [0]:
# Wrap both splits as LightGBM datasets; the held-out set references the
# training set via reference=train_data.
train_data = lgb.Dataset(X_train, label=y_train)
test_data  = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [0]:
# LightGBM settings: gradient-boosted trees with the "regression" objective,
# reporting RMSE.
params = dict(
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    verbose=-1,
)

# Fit 500 boosting rounds, passing both the training and the held-out
# dataset as validation sets.
gbm = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=500,
    valid_sets=[train_data, test_data],
)

In [0]:
# Predict durations for the held-out rows.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# mean_squared_error returns the MSE; take its square root so the value
# printed as "RMSE" really is the root-mean-squared error, in hours.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}")

In [0]:
# Side-by-side table of true and predicted durations, followed by a random
# sample of 10 rows for a quick visual check.
results_df = pd.DataFrame(
    {"Actual": y_test.to_numpy(), "Predicted": y_pred}
)

results_df.sample(n=10)