# **02 - Preprocessing & Feature Engineering**

In [1]:
import sys
sys.path.insert(0, "../src")

import pandas as pd
import numpy as np
from dataset import Dataset

## **Verify normalization across all years**

Each year (and the test split) must produce exactly the same columns, in the same order, regardless of its format group.

In [2]:
from dataset import (
    CANONICAL_CHARACTERISTICS,
    CANONICAL_LOCATIONS,
    CANONICAL_VEHICLES,
    CANONICAL_USERS,
)

# Expected column layout for each normalized table. The keys are the Dataset
# attribute names read in the loop below.
expected_columns = {
    "characteristics": CANONICAL_CHARACTERISTICS,
    "locations": CANONICAL_LOCATIONS,
    "vehicles": CANONICAL_VEHICLES,
    "users": CANONICAL_USERS,
}

for y in list(range(2010, 2023)) + ["test"]:
    ds = Dataset(y)
    for table, expected in expected_columns.items():
        actual = getattr(ds, table).columns.tolist()
        # List-to-list comparison: both the names and their order must match.
        # The message lists the differing columns so a failure can be
        # diagnosed without re-running the cell.
        assert actual == expected, (
            f"{y}: {table} columns mismatch "
            f"(missing: {[c for c in expected if c not in actual]}, "
            f"unexpected: {[c for c in actual if c not in expected]})"
        )
    n_acc = ds.characteristics["Num_Acc"].nunique()
    print(f"{str(y):5s} (group {ds.group}) : {n_acc:>6,} accidents  OK")

print("\nAll years normalized successfully.")

2010  (group A) : 69,379 accidents  OK
2011  (group A) : 66,974 accidents  OK
2012  (group B) : 56,025 accidents  OK
2013  (group B) : 52,558 accidents  OK
2014  (group B) : 53,869 accidents  OK
2015  (group B) : 52,789 accidents  OK
2016  (group B) : 53,489 accidents  OK
2017  (group B) : 54,631 accidents  OK
2018  (group B) : 52,005 accidents  OK
2019  (group C) : 52,956 accidents  OK
2020  (group C) : 42,970 accidents  OK
2021  (group C) : 50,867 accidents  OK
2022  (group C) : 49,772 accidents  OK
test  (group D) : 63,544 accidents  OK

All years normalized successfully.


## **Build full training set**

In [3]:
# Years that make up the training set: 2010 through 2022 inclusive.
train_years = range(2010, 2023)

# Merge each year's tables, then stack the yearly frames under a fresh
# 0..n-1 index (ignore_index=True discards the per-year indices).
train = pd.concat(
    [Dataset(year).merged() for year in train_years],
    ignore_index=True,
)

# Row/column counts plus the number of distinct accident identifiers.
print(f"Full training set: {train.shape[0]:,} rows x {train.shape[1]} cols")
print(f"Unique accidents:  {train['Num_Acc'].nunique():,}")
train.head()

Full training set: 1,584,127 rows x 48 cols
Unique accidents:  708,284


Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,201000000001,2010,6,12,1930,1,2,1,1.0,6.0,...,3,1,1976.0,5.0,2.0,1.0,-1,0.0,0.0,0.0
1,201000000002,2010,8,7,1000,1,2,1,1.0,6.0,...,1,2,1983.0,5.0,1.0,1.0,-1,0.0,0.0,0.0
2,201000000002,2010,8,7,1000,1,2,1,1.0,6.0,...,3,1,1965.0,0.0,,,-1,4.0,3.0,1.0
3,201000000003,2010,9,11,1600,1,1,1,1.0,5.0,...,3,1,1979.0,5.0,2.0,3.0,-1,0.0,0.0,0.0
4,201000000003,2010,9,11,1600,1,1,1,1.0,5.0,...,1,2,1959.0,5.0,1.0,1.0,-1,0.0,0.0,0.0


## **Build test set**

In [4]:
# Load the "test" split and merge its tables into a single frame.
test = Dataset("test").merged()
print(f"Test set: {test.shape[0]:,} rows x {test.shape[1]} cols")
print(f"Unique accidents: {test['Num_Acc'].nunique():,}")
print(f"grav column (should be all NaN): {test['grav'].isna().all()}")
# The print above only reports the check. The assert enforces it, so a full
# re-run stops here if any test row carries a non-missing grav value.
assert test["grav"].isna().all(), "test: grav column is expected to be all NaN"
test.head()

Test set: 142,422 rows x 48 cols
Unique accidents: 63,544
grav column (should be all NaN): True


Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,201200049538,2012,8,12,1320,1,1,1,1.0,4.0,...,,2,1954.0,1.0,1.0,1.0,-1.0,0.0,0.0,0.0
1,201200049538,2012,8,12,1320,1,1,1,1.0,4.0,...,,2,1968.0,1.0,1.0,1.0,-1.0,0.0,0.0,0.0
2,201200049538,2012,8,12,1320,1,1,1,1.0,4.0,...,,2,1984.0,2.0,1.0,1.0,-1.0,0.0,0.0,0.0
3,201200004221,2012,6,22,615,2,1,1,1.0,6.0,...,,1,1973.0,0.0,2.0,1.0,-1.0,0.0,0.0,0.0
4,201200002457,2012,2,3,2245,3,1,1,1.0,1.0,...,,1,1984.0,0.0,1.0,1.0,-1.0,0.0,0.0,0.0
