# **05 - Submit on Kaggle**

In [None]:
import sys
sys.path.insert(0, "../src")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from dataset import Dataset

# Never truncate wide DataFrames when displaying them: None removes the column limit.
pd.options.display.max_columns = None

## **Normalization**

The test split must produce exactly the canonical column set for every table (characteristics, locations, vehicles, users), just as each year does regardless of its format group.

In [None]:
from dataset import (
    CANONICAL_CHARACTERISTICS,
    CANONICAL_LOCATIONS,
    CANONICAL_VEHICLES,
    CANONICAL_USERS,
)


# Load the "test" split and compare each table's columns with its canonical list.
ds = Dataset("test")

# (attribute on the dataset, expected column list, message raised on mismatch)
schema_checks = (
    ("characteristics", CANONICAL_CHARACTERISTICS, "test: characteristics columns mismatch"),
    ("locations", CANONICAL_LOCATIONS, "test: locations columns mismatch"),
    ("vehicles", CANONICAL_VEHICLES, "test: vehicles columns mismatch"),
    ("users", CANONICAL_USERS, "test : users columns mismatch"),
)
for table_name, expected_columns, failure_message in schema_checks:
    # List equality is order-sensitive: names AND positions must match.
    assert getattr(ds, table_name).columns.tolist() == expected_columns, failure_message

# Number of distinct accident identifiers in the characteristics table.
n_acc = ds.characteristics["Num_Acc"].nunique()

print(f"\nTest normalized successfully. Found {n_acc:,} unique accidents.")

## **Build test set**

In [None]:
# Build the merged test frame that every following cell works on.
test_dataset = Dataset("test")
test = test_dataset.merged()

row_count, column_count = test.shape
print(f"Test set: {row_count:,} rows x {column_count} cols")

accident_count = test["Num_Acc"].nunique()
print(f"Unique accidents: {accident_count:,}")

# The target is unknown for the test split, so every `grav` value is expected to be missing.
grav_is_empty = test["grav"].isna().all()
print(f"grav column (should be all NaN): {grav_is_empty}")

# Preview the first rows as the cell output.
test.head()

## **Feature Engineering**

### **Accident-level aggregation features**

Count features derived from `groupby("Num_Acc")` (and from `groupby(["Num_Acc", "num_veh"])` for the per-vehicle count):
- `nb_vehicules` — number of distinct vehicles involved
- `nb_usagers` — total number of users (drivers, passengers, pedestrians)
- `nb_pietons` — number of pedestrians (`catu == 3`)
- `nb_occupants_vehicule` — number of non-pedestrian users per vehicle (`catu != 3`), to avoid counting pedestrians who are linked to a vehicle only because the accident involves them

In [None]:
# --- Accident-level counts, broadcast back onto every row of the accident ---
accident_groups = test.groupby("Num_Acc")
vehicle_counts = accident_groups["num_veh"].transform("nunique")
user_counts = accident_groups["catu"].transform("count")
pedestrian_counts = accident_groups["catu"].transform(lambda roles: (roles == 3).sum())

test["nb_vehicules"] = vehicle_counts
test["nb_usagers"] = user_counts
test["nb_pietons"] = pedestrian_counts

# --- Vehicle-level count: users that are not pedestrians (catu != 3) ---
vehicle_keys = ["Num_Acc", "num_veh"]
non_pedestrians = test.loc[test["catu"] != 3]
occupants = non_pedestrians.groupby(vehicle_keys).size().rename("nb_occupants_vehicule")

# Left join keeps every test row; (accident, vehicle) pairs with no
# non-pedestrian user come back as NaN and are turned into 0 below.
test = test.merge(occupants, how="left", on=vehicle_keys)
test["nb_occupants_vehicule"] = test["nb_occupants_vehicule"].fillna(0).astype(int)

### **Individual-level features**

Derive per-user attributes from existing columns:
- `age` — age at the time of the accident (`an - an_nais`). Values outside [0, 120] are set to NaN as data entry errors.

In [None]:
# Age = accident year minus birth year.
test["age"] = test["an"] - test["an_nais"]

# Anything below 0 or above 120 is treated as a data-entry error and blanked out.
implausible_age = (test["age"] < 0) | (test["age"] > 120)
test.loc[implausible_age, "age"] = np.nan

# Report how many ages are missing (count and share of rows).
age_missing = test["age"].isna()
print(f"age NaN: {age_missing.sum():,} ({age_missing.mean():.2%})")

### **Drop low-value columns**

Remove free-text, identifier, and high-cardinality string columns that carry no direct predictive signal for a tree-based model:

| Column | Reason |
|--------|--------|
| `voie` | Road name/number — free-text, very high cardinality |
| `v1`, `v2` | Alphanumeric road index points — free-text identifiers |
| `pr`, `pr1` | Milestone references (bornes kilométriques) — numeric road markers, not meaningful as features |
| `com` | Commune code — inconsistent format across groups (3-digit vs full INSEE), high cardinality (~36k unique). Geographic info already captured by `dep` |

Note: `adr`, `lat`, `long` were already dropped during normalization (free-text address and incompatible coordinate systems).

In [None]:
# Columns removed before modelling (reasons are listed in the table above this cell).
drop_cols = [
    "voie",  # road name/number
    "v1",    # road index points
    "v2",
    "pr",    # milestone references
    "pr1",
    "com",   # commune code
]
test.drop(labels=drop_cols, axis="columns", inplace=True)

### **Variable transformations**

- `hrmn` (integer HHMM) → extract `hour` (0-23), then drop `hrmn`
- `lartpc`, `larrout` — road widths stored as strings → convert to numeric
- `an_nais` — redundant with `age` → drop

In [None]:
# Hour of day: integer-divide the HHMM value by 100 to keep only the HH part.
hour_of_day = test["hrmn"] // 100
test["hour"] = hour_of_day

# Road widths: parse to numbers; anything unparseable becomes NaN (errors="coerce").
width_columns = ["lartpc", "larrout"]
for width_col in width_columns:
    parsed_width = pd.to_numeric(test[width_col], errors="coerce")
    test[width_col] = parsed_width

# `hrmn` is now represented by `hour`, and `an_nais` by `age`.
redundant_columns = ["hrmn", "an_nais"]
test.drop(columns=redundant_columns, inplace=True)

### **Drop identifiers**

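`Num_Acc` and `num_veh` are accident and vehicle identifiers with no predictive value; they are dropped only after all groupby-based features have been computed. `Num_Acc` is saved first so that predictions can later be aggregated per accident, and the `grav` target column (expected to be all NaN in the test set) is dropped as well.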

In [None]:
# Keep each row's accident id aside: it is needed later to aggregate the
# row-level predictions into one value per accident.
submission_ids = test["Num_Acc"].copy()

# Identifiers and the target column are not model inputs.
non_feature_columns = ["Num_Acc", "num_veh", "grav"]
test.drop(columns=non_feature_columns, inplace=True)

feature_count = test.shape[1]
print(f"Test features: {feature_count} cols")
print(f"Submission IDs: {submission_ids.nunique():,} unique accidents")

## **Load model & predict**

Load the saved LightGBM model, cast categorical features to the same dtype used during training, and predict `P(GRAVE=1)` for each row.

In [None]:
import lightgbm as lgb

# Columns handled as categorical (this list must match the one used at training time).
CAT_FEATURES = [
    "mois",
    "jour",
    "lum",
    "agg",
    "int",
    "atm",
    "col",
    "dep",
    "catr",
    "circ",
    "vosp",
    "prof",
    "plan",
    "surf",
    "infra",
    "situ",
    "senc",
    "catv",
    "obs",
    "obsm",
    "choc",
    "manv",
    "place",
    "catu",
    "sexe",
    "trajet",
    "secu1",
    "locp",
    "actp",
    "etatp",
    "hour",
]

# Cast every categorical feature to pandas' "category" dtype in one pass.
test = test.astype({feature: "category" for feature in CAT_FEATURES})

# Restore the trained booster from its text dump.
model = lgb.Booster(model_file="../models/lgbm_grave.txt")

# Score each row, feeding the columns in the order the booster reports for its features.
feature_order = model.feature_name()
row_probabilities = model.predict(test[feature_order])
test["y_prob"] = row_probabilities

scores = test["y_prob"]
print(f"Predictions: min={scores.min():.4f}  max={scores.max():.4f}  mean={scores.mean():.4f}")

## **Aggregate to accident level & export**

The model predicts at the **user level** (one row per person). For submission we need one prediction per accident (`Num_Acc`). We take the **max probability** across all users of the same accident — consistent with the target definition (GRAVE=1 if *at least one* user is killed or hospitalized).

In [None]:
# Pair every row-level probability with the accident id saved earlier.
row_scores = pd.DataFrame({"Num_Acc": submission_ids, "y_prob": test["y_prob"]})

# One value per accident: the highest probability among its rows.
accident_max = row_scores.groupby("Num_Acc")["y_prob"].max()

# Back to a flat two-column frame, with the score column named as the submission expects.
submission = accident_max.reset_index().rename(columns={"y_prob": "GRAVE"})

accident_total = submission.shape[0]
print(f"Submission: {accident_total:,} accidents")
print(f"GRAVE mean: {submission['GRAVE'].mean():.4f}")

# Preview the first ten rows as the cell output.
submission.head(10)

In [None]:
from pathlib import Path

# Destination of the submission file, relative to the notebook's working directory.
output_path = "../submissions/lgbm_grave.csv"

# `to_csv` does not create missing folders and fails with an OSError if the
# target directory is absent, so make sure it exists before writing.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Write one row per accident, without the DataFrame index.
submission.to_csv(output_path, index=False)
print(f"Saved to {output_path} ({submission.shape[0]:,} rows)")