In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance


# Chargement du dataset
df = pd.read_csv("../../data/raw/DpeClean_enriched.csv") # Attention au chemin d'accès avec les push Git

# Normalise les noms de colonnes (espaces -> _)
df.columns = (df.columns
              .str.strip()
              .str.replace(r"\s+", "_", regex=True))

print("Dataset shape:", df.shape)

Dataset shape: (10000, 82)


In [5]:
# Target

TARGET = "conso_5_usages_par_m2_ep"

if TARGET not in df.columns:
    raise ValueError(f"Target '{TARGET}' introuvable dans le dataset.")

y = pd.to_numeric(df[TARGET], errors="coerce")

# Colonnes à exclure (anti-fuite + anti-mémorisation)

leak_cols = [
    # Identifiants / mémorisation
    "numero_dpe",
    "adresse_ban",

    # Labels/scores dérivés de la performance
    "etiquette_dpe", "etiquette_ges",
    "classe_dpe", "classe_ges", "score_dpe", "score_ges",

    # La target (EXCLUSION OBLIGATOIRE)
    "conso_5_usages_par_m2_ep",

    # Autres sorties perf (fuite directe / très corrélé)
    "conso_5_usages_ep",
    "conso_5_usages_ef",
    "conso_5_usages_par_m2_ef",
    "emission_ges_5_usages",
    "emission_ges_5_usages_par_m2",
    "cout_total_5_usages",
]

In [6]:
# Construction de X (sans fuite) + filtrage des y manquants

X = df.drop(columns=[c for c in leak_cols if c in df.columns], errors="ignore")

mask = y.notna()
X = X.loc[mask].copy()
y = y.loc[mask].copy()

# Garde-fous anti-fuite
if TARGET in X.columns:
    raise RuntimeError("Fuite détectée : la target est encore dans X.")
for c in ["conso_5_usages_par_m2_ef", "emission_ges_5_usages_par_m2", "cout_total_5_usages"]:
    if c in X.columns:
        raise RuntimeError(f"Fuite détectée : '{c}' est encore dans X.")

print("Rows kept:", len(y))
print("X shape:", X.shape)


# Split train/test/validation (70% train, 15% test, 15% validation)

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)

X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

# Détection num/cat (à partir du train)
num_cols = X_train.select_dtypes(include=[np.number]).columns
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns


Rows kept: 10000
X shape: (10000, 71)


In [7]:
# Prétraitement (dense car HGBR n'accepte pas sparse)

try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
        ]), num_cols),

        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe),
        ]), cat_cols),
    ],
    remainder="drop",
    sparse_threshold=0.0  # force sortie dense globale
)

# Modèle + pipeline

model = HistGradientBoostingRegressor(random_state=42)

pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", model),
])


In [8]:

# Entraînement + évaluation

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)

mae  = mean_absolute_error(y_test, pred)
rmse = mean_squared_error(y_test, pred, squared=False)
r2   = r2_score(y_test, pred)

print("\n=== Evaluation (sans fuite) ===")
print("TARGET:", TARGET)
print(f"MAE  = {mae:.2f}")
print(f"RMSE = {rmse:.2f}")
print(f"R²   = {r2:.3f}")

# Importance des features (permutation importance sur X brut)

imp = permutation_importance(
    pipe,
    X_test, y_test,
    n_repeats=10,
    random_state=42,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

imp_df = pd.DataFrame({
    "feature": X_test.columns,
    "importance_mean": imp.importances_mean,
    "importance_std": imp.importances_std,
}).sort_values("importance_mean", ascending=False)

print("\n=== Top 25 features (Permutation Importance) ===")
print(imp_df.head(25).to_string(index=False))

[WinError 2] Le fichier spécifié est introuvable
  File "c:\Users\rapha\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\rapha\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\rapha\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\rapha\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



=== Evaluation (sans fuite) ===
TARGET: conso_5_usages_par_m2_ep
MAE  = 26.50
RMSE = 41.23
R²   = 0.899

=== Top 25 features (Permutation Importance) ===
                                   feature  importance_mean  importance_std
                          besoin_chauffage        86.132966        1.730793
        surface_habitable_logement_imputee        49.010456        1.557136
           deperditions_renouvellement_air        15.539065        0.606866
         type_energie_principale_chauffage         8.598712        0.311210
surface_chauffee_installation_chauffage_n1         7.549106        0.390379
         type_energie_generateur_n1_ecs_n1         5.468178        0.367209
       type_generateur_chauffage_principal         4.856237        0.317609
                annee_construction_imputee         4.584153        0.616592
                   methode_application_dpe         3.685641        0.313907
               qualite_isolation_enveloppe         3.476908        0.331167
         

In [9]:

# Option : réduire le dataset aux Top-K features et ré-évaluer

TOP_K = 25
top_features = imp_df["feature"].head(TOP_K).tolist()

X_train_top = X_train[top_features]
X_test_top  = X_test[top_features]

# Rebuild pipeline (préprocess recalculé sur subset)
num_cols_top = X_train_top.select_dtypes(include=[np.number]).columns
cat_cols_top = X_train_top.select_dtypes(exclude=[np.number]).columns

preprocess_top = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
        ]), num_cols_top),

        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe),
        ]), cat_cols_top),
    ],
    remainder="drop",
    sparse_threshold=0.0
)

pipe_top = Pipeline(steps=[
    ("prep", preprocess_top),
    ("model", HistGradientBoostingRegressor(random_state=42)),
])

pipe_top.fit(X_train_top, y_train)
pred_top = pipe_top.predict(X_test_top)

mae_top  = mean_absolute_error(y_test, pred_top)
rmse_top = mean_squared_error(y_test, pred_top, squared=False)
r2_top   = r2_score(y_test, pred_top)

print("\n=== Evaluation avec Top-K features ===")
print(f"TOP_K = {TOP_K}")
print(f"MAE  = {mae_top:.2f}")
print(f"RMSE = {rmse_top:.2f}")
print(f"R²   = {r2_top:.3f}")

# affichage des features utilisées avec score
print("\nFeatures utilisées:")
for feat in top_features:
    print(f"- {feat}")  



=== Evaluation avec Top-K features ===
TOP_K = 25
MAE  = 26.65
RMSE = 41.85
R²   = 0.895

Features utilisées:
- besoin_chauffage
- surface_habitable_logement_imputee
- deperditions_renouvellement_air
- type_energie_principale_chauffage
- surface_chauffee_installation_chauffage_n1
- type_energie_generateur_n1_ecs_n1
- type_generateur_chauffage_principal
- annee_construction_imputee
- methode_application_dpe
- qualite_isolation_enveloppe
- deperditions_murs
- deperditions_planchers_hauts
- zone_climatique
- qualite_isolation_murs
- type_installation_ecs
- classe_altitude
- type_emetteur_installation_chauffage_n1
- qualite_isolation_menuiseries
- deperditions_baies_vitrees
- coordonnee_cartographique_y_ban
- type_energie_principale_ecs
- type_generateur_n1_ecs_n1
- surface_habitable_immeuble
- volume_stockage_generateur_n1_ecs_n1
- deperditions_ponts_thermiques


