In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Project root: works whether the kernel was started in /notebooks or at the
# repository root (step up one level only in the former case).
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

# Input and output locations, all derived from ROOT.
DATA_RAW = ROOT / "data" / "raw"
FIG_DIR = ROOT / "reports" / "figures"
METRICS_DIR = ROOT / "reports" / "metrics"
MODELS_DIR = ROOT / "models"

# Create the output folders up front; no-op when they already exist.
for out_dir in (FIG_DIR, METRICS_DIR, MODELS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Experiment settings shared by the cells below.
TARGET = "SalePrice"
RANDOM_STATE = 42
TEST_SIZE = 0.2
CV_FOLDS = 5

# Quick sanity check that the notebook is pointed at the right folder.
print("ROOT:", ROOT)
print("train.csv exists:", (DATA_RAW / "train.csv").exists())


ROOT: d:\DeveloppementWeb\Hexagone\Programmation avancee\ProgrammationAvancee_IA_M1_HousePrice
train.csv exists: True


In [2]:
# Build the CSV path from DATA_RAW (derived from the project ROOT) instead of a
# path relative to the current directory, so the load works whether the kernel
# was started in /notebooks or at the repository root.
path = DATA_RAW / "train.csv"
df_house = pd.read_csv(path)

# Preview the first rows to confirm the file was parsed as expected.
display(df_house.head())

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Split and preprocess

80% des données pour apprendre (train) et 20% pour tester (test)

random_state = pour que la séparation soit reproductible

In [None]:
#split
from sklearn.model_selection import train_test_split

# Separate the target column from the explanatory variables.
y = df_house[TARGET]
X = df_house.drop(columns=TARGET)
# NOTE(review): the row identifier column `Id` stays in X and is later treated
# as a numeric feature -- confirm this is intended.

# Hold out TEST_SIZE of the rows for evaluation; the fixed seed makes the
# partition reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)


In [7]:
# Report the dimensions of each piece of the split.
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape :", X_test),
    ("y_train shape:", y_train),
    ("y_test shape :", y_test),
):
    print(label, part.shape)

# Preview the first rows of the training features and of the training target.
display(X_train.head(), y_train.head())


X_train shape: (1168, 80)
X_test shape : (292, 80)
y_train shape: (1168,)
y_test shape : (292,)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
254,255,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
1066,1067,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2009,WD,Normal
638,639,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,5,2008,WD,Normal
799,800,50,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,6,2007,WD,Normal
380,381,50,RL,50.0,5000,Pave,Pave,Reg,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal


254     145000
1066    178000
638      85000
799     175000
380     127000
Name: SalePrice, dtype: int64

In [None]:
#preprocess
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the columns into "numeric" and "everything else" so that every
# column of X_train lands in exactly one branch. Listing exact dtypes
# (int64/float64/object) would leave columns of any other dtype in neither
# group, and ColumnTransformer drops unlisted columns by default
# (remainder="drop").
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

# Numeric branch: fill missing values with the column median, then standardise.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" lets transform() accept categories
# that were not present when the encoder was fitted.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch and concatenate the outputs.
preprocess = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])


In [8]:
# Show how many columns fall in each group, plus the first 20 names of each.
numeric_names = list(num_cols)
categorical_names = list(cat_cols)

print("Nombre de variables numériques:", len(numeric_names))
print(numeric_names[:20])
print("\nNombre de variables catégorielles:", len(categorical_names))
print(categorical_names[:20])


Nombre de variables numériques: 37
['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath']

Nombre de variables catégorielles: 43
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond']


In [9]:
# Learn the imputation / scaling / encoding parameters on the training split
# only, then apply the fitted transformer to the test split.
X_train_prep = preprocess.fit_transform(X_train)
X_test_prep = preprocess.transform(X_test)

print("Après preprocess:")
for label, matrix in (
    ("X_train_prep shape:", X_train_prep),
    ("X_test_prep shape :", X_test_prep),
):
    print(label, matrix.shape)


Après preprocess:
X_train_prep shape: (1168, 286)
X_test_prep shape : (292, 286)


Modèles, métriques et cible brute vs log1p

3 modèles de régression sont testés pour prédire une valeur numérique, ici SalePrice par exemple :


**Ridge (sklearn.linear_model.Ridge)**

C’est une régression linéaire avec régularisation L2 (ça pénalise les coefficients pour éviter le sur-apprentissage).

Bon quand la relation est plutôt “linéaire” et quand on veut un modèle stable.


**RandomForestRegressor (sklearn.ensemble.RandomForestRegressor)**

Une forêt d’arbres de décision (ici n_estimators=400 → 400 arbres).

Très bon pour capturer des relations non linéaires et des interactions entre variables, sans trop de réglages.


**GradientBoostingRegressor (sklearn.ensemble.GradientBoostingRegressor)**

Du boosting : des arbres construits les uns après les autres, chaque nouvel arbre corrige les erreurs du précédent.

Souvent très performant, mais plus sensible aux hyperparamètres que RandomForest.

In [6]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

def metrics(y_true, y_pred):
    """Compute the regression scores used to compare models.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same units as ``y_true``.

    Returns
    -------
    dict
        ``{"RMSE": ..., "MAE": ..., "R2": ...}``.
    """
    # Take the square root of the MSE explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from recent scikit-learn
    # releases, where passing it raises a TypeError.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    return {
        "RMSE": rmse,
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }

from sklearn.base import clone

# Candidate regressors; the seed is fixed wherever the estimator accepts one.
models = {
    "Ridge": Ridge(random_state=RANDOM_STATE),
    "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE, n_estimators=400),
    "GradientBoosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
}

# Log-transformed target for the second family of fits.
y_train_log = np.log1p(y_train)

# For each target variant: (values to fit on, function mapping predictions back
# to prices). None means the predictions are already prices.
target_variants = {
    "raw": (y_train, None),
    "log1p": (y_train_log, np.expm1),
}

rows = []
fitted_pipes = {}  # (target variant, model name) -> fitted pipeline

for target_name, (y_fit, to_price) in target_variants.items():
    for name, model in models.items():
        # clone() gives every pipeline its own unfitted preprocess and model.
        # Without it the same estimator objects are refit in place on each
        # iteration, so earlier pipelines end up pointing at the state of the
        # latest fit.
        pipe = Pipeline([("preprocess", clone(preprocess)), ("model", clone(model))])
        pipe.fit(X_train, y_fit)
        preds = pipe.predict(X_test)
        if to_price is not None:
            preds = to_price(preds)  # back to price units before scoring
        fitted_pipes[(target_name, name)] = pipe
        rows.append({"Target": target_name, "Model": name, **metrics(y_test, preds)})

# One row per (target variant, model), best RMSE first within each variant.
results = pd.DataFrame(rows).sort_values(["Target", "RMSE"])
results




Unnamed: 0,Target,Model,RMSE,MAE,R2
3,log1p,Ridge,23879.545484,15754.37704,0.925657
4,log1p,RandomForest,29733.186611,17547.820195,0.884743
5,log1p,GradientBoosting,29749.235499,16933.78828,0.884618
2,raw,GradientBoosting,27062.168987,16576.292256,0.90452
1,raw,RandomForest,28683.447668,17444.557449,0.892737
0,raw,Ridge,29844.251463,19006.271311,0.88388


Interprétation

Le meilleur modèle est : **Ridge avec target log1p**

RMSE = 23 879 le plus bas de tous

MAE = 15 754 le plus bas de tous

R² = 0,9257 le plus haut de tous

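Il commet donc les erreurs les plus faibles et explique la plus grande part de la variance de la variable cible. Attention : cette comparaison est faite sur le jeu de test ; pour choisir un modèle sans biais d'optimisme, il vaut mieux comparer les modèles par validation croisée sur le jeu d'entraînement (CV_FOLDS) et réserver le jeu de test à l'évaluation finale.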