# **03 - Feature Engineering**

In [1]:
import sys
sys.path.insert(0, "../src")

import pandas as pd
import numpy as np
from dataset import Dataset

## **Normalization**

### **Verify normalization across all years**

Each year must produce exactly the same columns, in the same order, regardless of its format group.

In [2]:
from dataset import (
    CANONICAL_CHARACTERISTICS,
    CANONICAL_LOCATIONS,
    CANONICAL_VEHICLES,
    CANONICAL_USERS,
)

# Dataset table attribute -> the canonical column list it has to match.
expected_columns = {
    "characteristics": CANONICAL_CHARACTERISTICS,
    "locations": CANONICAL_LOCATIONS,
    "vehicles": CANONICAL_VEHICLES,
    "users": CANONICAL_USERS,
}

for y in [*range(2010, 2023), "test"]:
    ds = Dataset(y)
    for table_name, canonical_cols in expected_columns.items():
        # Ordered list comparison: both the names and their positions must agree.
        actual_cols = getattr(ds, table_name).columns.tolist()
        assert actual_cols == canonical_cols, f"{y}: {table_name} columns mismatch"
    n_acc = ds.characteristics["Num_Acc"].nunique()
    print(f"{str(y):5s} (group {ds.group}) : {n_acc:>6,} accidents  OK")

print("\nAll years normalized successfully.")

2010  (group A) : 69,379 accidents  OK
2011  (group A) : 66,974 accidents  OK
2012  (group B) : 56,025 accidents  OK
2013  (group B) : 52,558 accidents  OK
2014  (group B) : 53,869 accidents  OK
2015  (group B) : 52,789 accidents  OK
2016  (group B) : 53,489 accidents  OK
2017  (group B) : 54,631 accidents  OK
2018  (group B) : 52,005 accidents  OK
2019  (group C) : 52,956 accidents  OK
2020  (group C) : 42,970 accidents  OK
2021  (group C) : 50,867 accidents  OK
2022  (group C) : 49,772 accidents  OK
test  (group D) : 63,544 accidents  OK

All years normalized successfully.


### **Build full training set**

In [3]:
# Stack the merged() output of every training year (2010 through 2022) into one frame.
yearly_frames = [Dataset(year).merged() for year in range(2010, 2023)]
train = pd.concat(yearly_frames, ignore_index=True)
del yearly_frames  # only the concatenated frame is needed from here on

n_rows, n_cols = train.shape
n_accidents = train["Num_Acc"].nunique()
print(f"Full training set: {n_rows:,} rows x {n_cols} cols")
print(f"Unique accidents:  {n_accidents:,}")
train.head()

Full training set: 1,583,848 rows x 48 cols
Unique accidents:  708,284


Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,201000000001,2010,6,12,1930,1,2,1,1,6,...,3,1,1976,5,2.0,1.0,-1,0,0,0
1,201000000002,2010,8,7,1000,1,2,1,1,6,...,1,2,1983,5,1.0,1.0,-1,0,0,0
2,201000000002,2010,8,7,1000,1,2,1,1,6,...,3,1,1965,0,,,-1,4,3,1
3,201000000003,2010,9,11,1600,1,1,1,1,5,...,3,1,1979,5,2.0,3.0,-1,0,0,0
4,201000000003,2010,9,11,1600,1,1,1,1,5,...,1,2,1959,5,1.0,1.0,-1,0,0,0


## **Feature Engineering**

### **Target variable: GRAVE**

Create a binary target at the **accident level**: `GRAVE = 1` if at least one user was killed (`grav=2`) or hospitalized (`grav=3`), `0` otherwise. We use `groupby("Num_Acc").transform("max")` to propagate the label to every row of the same accident.

In [4]:
# Distribution of the raw severity codes, inspected before building the target.
train["grav"].value_counts()

grav
1    653242
4    586816
3    301107
2     42683
Name: count, dtype: Int64

In [5]:
# Severity codes that make an accident "grave": 2 = killed, 3 = hospitalized.
SEVERE_GRAV_CODES = [2, 3]

# Guarded so the cell can be re-run: once `grav` has been dropped, the existing
# GRAVE column is kept as is instead of the first line raising KeyError: 'grav'.
if "grav" in train.columns:
    # Row-level flag first; the per-accident max then spreads a 1 to every row
    # of an accident that has at least one severe user.
    train["GRAVE"] = train["grav"].isin(SEVERE_GRAV_CODES).astype(int)
    train["GRAVE"] = train.groupby("Num_Acc")["GRAVE"].transform("max")
    # Reassign instead of mutating in place.
    train = train.drop(columns=["grav"])
train["GRAVE"].value_counts()

GRAVE
0    930831
1    653017
Name: count, dtype: int64

### **Accident-level aggregation features**

Count features derived from `groupby` aggregations, per accident (`Num_Acc`) or per vehicle (`Num_Acc`, `num_veh`):
- `nb_vehicules` — number of distinct vehicles involved
- `nb_usagers` — total number of users (drivers, passengers, pedestrians)
- `nb_pietons` — number of pedestrians (`catu == 3`)
- `nb_occupants_vehicule` — number of non-pedestrian users per vehicle (`catu != 3`), to avoid counting pedestrians who are linked to a vehicle only because the accident involves them

In [6]:
# Per accident: each transform broadcasts the accident-level count back onto
# every row of that accident.
train["nb_vehicules"] = train.groupby("Num_Acc")["num_veh"].transform("nunique")
# "count" tallies non-null `catu` values only, so rows with a missing `catu`
# (if any) are not counted as users.
train["nb_usagers"]   = train.groupby("Num_Acc")["catu"].transform("count")
# Summing a boolean mask with the built-in "sum" yields the same count as a
# per-group lambda, without one Python call per accident.
train["nb_pietons"]   = train["catu"].eq(3).groupby(train["Num_Acc"]).transform("sum")

# Per vehicle: count only non-pedestrians (catu != 3)
occupants = (
    train[train["catu"] != 3]
    .groupby(["Num_Acc", "num_veh"])
    .size()
    .rename("nb_occupants_vehicule")
)
# Drop any column left by a previous run before merging; otherwise the merge
# emits suffixed nb_occupants_vehicule_x / _y columns and the next line fails.
train = (
    train
    .drop(columns=["nb_occupants_vehicule"], errors="ignore")
    .merge(occupants, on=["Num_Acc", "num_veh"], how="left")
)
# (accident, vehicle) pairs with no non-pedestrian row have no entry in
# `occupants`, so the left merge leaves NaN there; treat that as 0 occupants.
train["nb_occupants_vehicule"] = train["nb_occupants_vehicule"].fillna(0).astype(int)

train[["nb_vehicules", "nb_usagers", "nb_pietons", "nb_occupants_vehicule"]].describe()

Unnamed: 0,nb_vehicules,nb_usagers,nb_pietons,nb_occupants_vehicule
count,1583848.0,1583848.0,1583848.0,1583848.0
mean,1.881821,2.85573,0.1864282,1.551775
std,0.9473748,2.301026,0.4873079,1.747358
min,1.0,1.0,0.0,0.0
25%,1.0,2.0,0.0,1.0
50%,2.0,2.0,0.0,1.0
75%,2.0,3.0,0.0,2.0
max,54.0,68.0,25.0,68.0
