In [18]:
# Installation de la bibliothèque 'mord'

import sys
!{sys.executable} -m pip install mord



# Prédiction de la consommation (conso_5_usages_ef) via **régression logistique ordinale**

Objectif : prédire une **consommation estimée** à partir des caractéristiques du logement, en utilisant une **régression logistique ordinale**.

Comme l’ordinal logit prédit des **classes ordonnées**, on procède en 3 étapes :
1. Discrétiser la conso en **K bins ordonnés** (calculés **sur le train uniquement**).
2. Apprendre un modèle **ordinal logit** sur ces classes.
3. Reconstruire une conso continue via une valeur **robuste par classe** (médiane par bin, côté train).


In [19]:
import pandas as pd
import numpy as np
import mord

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance


# Chargement du dataset
df = pd.read_csv("../../data/processed/DPE.csv") # Attention au chemin d'accès avec les push Git

print("Dataset shape:", df.shape)

Dataset shape: (8787, 14)


In [20]:
# Target
TARGET = "conso_5_usages_ef"

if TARGET not in df.columns:
    # debug utile
    cols_like = [c for c in df.columns if "conso" in c.lower()]
    raise ValueError(f"Target '{TARGET}' introuvable. Colonnes conso: {cols_like[:50]}")

y = pd.to_numeric(df[TARGET], errors="coerce")


# Colonnes à exclure (anti-fuite + anti-mémorisation)
leak_cols = [
    # Labels / sorties dérivées de la performance
    "etiquette_dpe",
    "etiquette_ges",

    # Autres sorties perf très corrélées (à exclure)
    "conso_5_usages_ep",
    "cout_total_5_usages_energie_n1",

    # La target (EXCLUSION OBLIGATOIRE)
    TARGET,
]

# garde uniquement celles qui existent vraiment dans le CSV
leak_cols = [c for c in leak_cols if c in df.columns]

In [21]:
# Construction de X (sans fuite) + filtrage des y manquants

X = df.drop(columns=[c for c in leak_cols if c in df.columns], errors="ignore")

mask = y.notna()
X = X.loc[mask].copy()
y = y.loc[mask].copy()


# Split train/test/validation (70% train, 15% test, 15% validation)

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)
print("Train:", X_train.shape, "Test:", X_test.shape, "Val:", X_val.shape)

# Détection num/cat (à partir du train)
num_cols = X_train.select_dtypes(include=[np.number]).columns
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns


Train: (6150, 9) Test: (1318, 9) Val: (1319, 9)


In [22]:
# Prétraitement (dense car HGBR n'accepte pas sparse)

try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
        ]), num_cols),

        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", ohe),
        ]), cat_cols),
    ],
    remainder="drop",
    sparse_threshold=0.0  # force sortie dense globale
)

Xtr = preprocess.fit_transform(X_train)
Xte = preprocess.transform(X_test)
Xva = preprocess.transform(X_val)

print("Xtr:", Xtr.shape, "Xte:", Xte.shape, "Xva:", Xva.shape)


Xtr: (6150, 29) Xte: (1318, 29) Xva: (1319, 29)


In [23]:
# Choix du nombre de classes ordinales
K = 5  # à tester: 5 / 7 / 10

# Binning sur TRAIN uniquement (quantiles) -> pas de fuite
y_train_cls, bin_edges = pd.qcut(
    y_train, q=K, labels=False, retbins=True, duplicates="drop"
)
y_train_cls = y_train_cls.astype(int)

# Appliquer les mêmes bornes sur test/val
y_test_cls = pd.cut(y_test, bins=bin_edges, labels=False, include_lowest=True)
y_val_cls  = pd.cut(y_val,  bins=bin_edges, labels=False, include_lowest=True)

# Filtrer les NaN éventuels (valeurs hors bornes, rare)
test_mask = ~y_test_cls.isna()
val_mask  = ~y_val_cls.isna()

Xte2 = Xte[test_mask.values]
y_test2 = y_test.loc[test_mask].copy()
y_test_cls = y_test_cls.loc[test_mask].astype(int)

Xva2 = Xva[val_mask.values]
y_val2 = y_val.loc[val_mask].copy()
y_val_cls = y_val_cls.loc[val_mask].astype(int)

print("Bins réellement utilisés:", len(bin_edges)-1)
print("Test kept:", len(y_test2), "Val kept:", len(y_val2))

# Reconstruction continue robuste: médiane par classe (sur train)
train_bins = pd.cut(y_train, bins=bin_edges, labels=False, include_lowest=True)
class_median = y_train.groupby(train_bins).median().values  # taille = nb_bins

# Modèle ordinal logit
ord_model = mord.LogisticAT(alpha=1.0)  # alpha = régularisation (à tuner)
ord_model.fit(Xtr, y_train_cls)

# Prédiction classes
cls_pred_test = ord_model.predict(Xte2)
cls_pred_val  = ord_model.predict(Xva2)

# Prédiction conso (continue) via médiane de bin
y_pred_test = class_median[cls_pred_test]
y_pred_val  = class_median[cls_pred_val]


Bins réellement utilisés: 5
Test kept: 1316 Val kept: 1319


In [24]:
def eval_reg(y_true, y_pred, name=""):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{name} MAE  = {mae:.2f}")
    print(f"{name} RMSE = {rmse:.2f}")
    print(f"{name} R²   = {r2:.3f}")
    return mae, rmse, r2

print("=== Ordinal logit sur conso binned ===")
print("TARGET:", TARGET)
print("K bins:", len(bin_edges)-1)

print("\n--- Régression (reconstruction continue) ---")
eval_reg(y_test2, y_pred_test, name="TEST")
eval_reg(y_val2,  y_pred_val,  name="VAL ")

print("\n--- Ordinal (qualité de classement) ---")
mae_rank_test = np.mean(np.abs(y_test_cls.values - cls_pred_test))
acc_pm1_test  = np.mean(np.abs(y_test_cls.values - cls_pred_test) <= 1)

mae_rank_val = np.mean(np.abs(y_val_cls.values - cls_pred_val))
acc_pm1_val  = np.mean(np.abs(y_val_cls.values - cls_pred_val) <= 1)

print(f"TEST MAE rang  = {mae_rank_test:.3f}")
print(f"TEST Acc ±1    = {acc_pm1_test:.3f}")
print(f"VAL  MAE rang  = {mae_rank_val:.3f}")
print(f"VAL  Acc ±1    = {acc_pm1_val:.3f}")

print("\n--- Détails bins ---")
print("bin_edges:", bin_edges)
print("class_median:", class_median)


=== Ordinal logit sur conso binned ===
TARGET: conso_5_usages_ef
K bins: 5

--- Régression (reconstruction continue) ---
TEST MAE  = 5929.68
TEST RMSE = 16804.63
TEST R²   = 0.247
VAL  MAE  = 5397.88
VAL  RMSE = 13508.85
VAL  R²   = 0.393

--- Ordinal (qualité de classement) ---
TEST MAE rang  = 0.540
TEST Acc ±1    = 0.938
VAL  MAE rang  = 0.535
VAL  Acc ±1    = 0.939

--- Détails bins ---
bin_edges: [6.774000e+02 4.702220e+03 7.962080e+03 1.276604e+04 2.179290e+04
 8.982664e+05]
class_median: [ 3357.4   6094.95 10204.55 16409.8  31702.6 ]


## Modèle idiot pour comparrer les résultats

In [25]:
from sklearn.metrics import mean_absolute_error, r2_score
y_pred_base_test = np.full_like(y_test2.values, y_train.median(), dtype=float)
y_pred_base_val  = np.full_like(y_val2.values,  y_train.median(), dtype=float)

print("BASE TEST MAE =", mean_absolute_error(y_test2, y_pred_base_test))
print("BASE TEST R²  =", r2_score(y_test2, y_pred_base_test))
print("BASE VAL  MAE =", mean_absolute_error(y_val2,  y_pred_base_val))
print("BASE VAL  R²  =", r2_score(y_val2,  y_pred_base_val))


BASE TEST MAE = 9491.242629179333
BASE TEST R²  = -0.06782236006637854
BASE VAL  MAE = 9196.973654283549
BASE VAL  R²  = -0.052654650886298615
