### Import des librairies
---

In [74]:

import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import xgboost as xgb
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, r2_score, mean_absolute_error, median_absolute_error


import matplotlib.pyplot as plt
import seaborn as sns

### Chargement des donn√©es
---

In [75]:
path = '../data/get_around_pricing_project.csv'
df = pd.read_csv(path, encoding='utf-8')
df.head()

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citro√´n,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citro√´n,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citro√´n,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citro√´n,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citro√´n,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [76]:
df = df.drop(columns=['Unnamed: 0'])

### Enregsitrement dans ML Flow 
---

In [103]:
#Initialisation MLflow
# Pointez vers le serveur MLflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Getaround_Model_Comparison")


<Experiment: artifact_location='mlflow-artifacts:/956791146109077519', creation_time=1761668958082, experiment_id='956791146109077519', last_update_time=1761668958082, lifecycle_stage='active', name='Getaround_Model_Comparison', tags={'mlflow.experimentKind': 'custom_model_development'}>

### Mod√©le Lin√©aire Regression Baseline
---

In [105]:
# S√©paration des donn√©es
target = "rental_price_per_day"
X = df.drop(columns=target)
y = df[target]

# Split du jeu de donn√©es
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Pipeline de pr√©traitement 
# numerical_columns = ['mileage', 'engine_power']
# categorical_columns = ['model_key', 'fuel', 'paint_color', 'car_type',
#                         'private_parking_available', 'has_gps', 'has_air_conditioning',
#                         'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']

# numerical_pipeline = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("standardization", StandardScaler())
# ])

# categorical_pipeline = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("encoder", OneHotEncoder(drop="first"))
# ])

# feature_encoder = ColumnTransformer(transformers=[
#     ("num", numerical_pipeline, numerical_columns),
#     ("cat", categorical_pipeline, categorical_columns)
# ])

# # Transformation des donn√©es
# X_train = feature_encoder.fit_transform(X_train)
# X_test = feature_encoder.transform(X_test)

# # Entra√Ænement des mod√®les de r√©gression 
# lin_reg = LinearRegression() 
# lin_reg.fit(X_train, y_train)

# # Pr√©dictions
# y_train_pred = lin_reg.predict(X_train)
# y_test_pred = lin_reg.predict(X_test)

# Pipeline de pr√©traitement
numerical_columns = ['mileage', 'engine_power']
categorical_columns = ['model_key', 'fuel', 'paint_color', 'car_type',
                        'private_parking_available', 'has_gps', 'has_air_conditioning',
                        'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("standardization", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_columns),
        ("cat", categorical_transformer, categorical_columns)
    ])

# Cr√©ation du pipeline complet
# n_estimators=100 ‚Üí assez d‚Äôarbres pour stabiliser le mod√®le.
# max_depth=9 ‚Üí limite la complexit√© des arbres pour mieux g√©n√©raliser sur le test.
# random_state=42 ‚Üí assure que les r√©sultats sont toujours identiques.
model = LinearRegression() 
pipeline_lr = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", model) 
])

# Entra√Ænement
pipeline_lr.fit(X_train, y_train)

# Pr√©diction 
y_pred = pipeline_lr.predict(X_test)

y_train_pred = pipeline_lr.predict(X_train)
y_test_pred = pipeline_lr.predict(X_test)

# √âvaluation
print("--- Evaluation ---\n")
print(f"R2 Score (Train): {r2_score(y_train, y_train_pred):.4f}")
print(f"R2 Score (Test) : {r2_score(y_test, y_test_pred):.4f}")
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print(f"RMSE : {rmse :.2f}")
mae = mean_absolute_error(y_test, y_test_pred) # moy des ecarts (valeurs absolues) entre le prix predit et le prix r√©el
print(f"MAE : {mae:.2f}")
mean_price = df["rental_price_per_day"].mean()
print(f"Prix moyen : {mean_price:.2f} ‚Ç¨")





--- Evaluation ---

R2 Score (Train): 0.7140
R2 Score (Test) : 0.6937
RMSE : 17.96
MAE : 12.12
Prix moyen : 121.21 ‚Ç¨


In [106]:
# enregistrement dans ML FLow
with mlflow.start_run(run_name="LinearRegression"):
    # Log du pipeline complet
    mlflow.sklearn.log_model(pipeline_lr, "linear_model")
   # Log des m√©triques
    mlflow.log_metric("R2_train", r2_score(y_train, y_train_pred))
    mlflow.log_metric("R2_test", r2_score(y_test, y_test_pred))
    mlflow.log_metric("RMSE", np.sqrt(mean_squared_error(y_test, y_test_pred)))
    mlflow.log_metric("MAE", mean_absolute_error(y_test, y_test_pred))
    
    print("Mod√®le et m√©triques enregistr√©s dans MLflow")




Mod√®le et m√©triques enregistr√©s dans MLflow
üèÉ View run LinearRegression at: http://127.0.0.1:5000/#/experiments/956791146109077519/runs/0fad1d54c43f400196582866709053f6
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/956791146109077519


### Mod√©le RandomForestRegressor
---
#### RandomForestRegressor pr√©dit un prix en construisant plusieurs arbres de d√©cision ind√©pendants et en moyennant leurs pr√©dictions, ce qui rend la pr√©diction plus stable et robuste.

In [107]:

target = "rental_price_per_day"
X = df.drop(columns=target)
y = df[target]

# Split du jeu de donn√©es ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline de pr√©traitement ===
numerical_columns = ['mileage', 'engine_power']
categorical_columns = ['model_key', 'fuel', 'paint_color', 'car_type',
                        'private_parking_available', 'has_gps', 'has_air_conditioning',
                        'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("standardization", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_columns),
        ("cat", categorical_transformer, categorical_columns)
    ])

# Cr√©ation du pipeline complet
# n_estimators=100 ‚Üí assez d‚Äôarbres pour stabiliser le mod√®le.
# max_depth=9 ‚Üí limite la complexit√© des arbres pour mieux g√©n√©raliser sur le test.
# random_state=42 ‚Üí assure que les r√©sultats sont toujours identiques.
model = RandomForestRegressor(n_estimators=100, max_depth=12, random_state=42) #100 arbres dans la foret - chaque arbre √† max 9 niveaux de decisions - 
pipeline_rf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", model) 
])

# Entra√Ænement
pipeline_rf.fit(X_train, y_train)

# Pr√©diction 
y_pred = pipeline_rf.predict(X_test)

y_train_pred = pipeline_rf.predict(X_train)
y_test_pred = pipeline_rf.predict(X_test)

# √âvaluation
print("--- Evaluation ---\n")
print(f"R2 Score (Train): {r2_score(y_train, y_train_pred):.4f}")
print(f"R2 Score (Test) : {r2_score(y_test, y_test_pred):.4f}")
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print(f"RMSE : {rmse :.2f}")
mae = mean_absolute_error(y_test, y_test_pred) # moy des ecarts (valeurs absolues) entre le prix predit et le prix r√©el
print(f"MAE : {mae:.2f}")
mean_price = df["rental_price_per_day"].mean()
print(f"Prix moyen : {mean_price:.2f} ‚Ç¨")

print("\nLe mod√®le pr√©dit tr√®s bien la tendance g√©n√©rale (R¬≤ test = 0,74).\n" \
"Le RMSE de 16,6 ‚Ç¨ indique la dispersion typique des erreurs de pr√©diction autour des valeurs r√©elles.\n" \
" MAE de 10,75 ‚Ç¨, nos pr√©dictions sont en moyenne √† environ 11 ‚Ç¨ pr√®s du prix r√©el.")


--- Evaluation ---

R2 Score (Train): 0.9273
R2 Score (Test) : 0.7386
RMSE : 16.59
MAE : 10.75
Prix moyen : 121.21 ‚Ç¨

Le mod√®le pr√©dit tr√®s bien la tendance g√©n√©rale (R¬≤ test = 0,74).
Le RMSE de 16,6 ‚Ç¨ indique la dispersion typique des erreurs de pr√©diction autour des valeurs r√©elles.
 MAE de 10,75 ‚Ç¨, nos pr√©dictions sont en moyenne √† environ 11 ‚Ç¨ pr√®s du prix r√©el.


In [108]:
# enregistrement dans ML FLow
with mlflow.start_run(run_name="RandomForestRegressor"):
    # Log du pipeline complet
    mlflow.sklearn.log_model(pipeline_rf, "RandomForest_model")
   # Log des m√©triques
    mlflow.log_metric("R2_train", r2_score(y_train, y_train_pred))
    mlflow.log_metric("R2_test", r2_score(y_test, y_test_pred))
    mlflow.log_metric("RMSE", np.sqrt(mean_squared_error(y_test, y_test_pred)))
    mlflow.log_metric("MAE", mean_absolute_error(y_test, y_test_pred))
    
    print("Mod√®le et m√©triques enregistr√©s dans MLflow")



Mod√®le et m√©triques enregistr√©s dans MLflow
üèÉ View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/956791146109077519/runs/7d6fa534a8454c3bad395f18918ab0de
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/956791146109077519


### Mod√©le XGBoost
---
#### XGBoost (eXtreme Gradient Boosting) est un algorithme de Gradient Boosting bas√© sur des arbres de d√©cision, optimis√© pour la vitesse et la performance, o√π chaque nouvel arbre corrige les erreurs des arbres pr√©c√©dents.

In [109]:
# Target et features
target = "rental_price_per_day"
X = df.drop(columns=target)
y = df[target]

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Colonnes num√©riques et cat√©gorielles
numerical_columns = ['mileage', 'engine_power']
categorical_columns = ['model_key', 'fuel', 'paint_color', 'car_type',
                       'private_parking_available', 'has_gps', 'has_air_conditioning',
                       'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']

# Pr√©traitement num√©rique
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Pr√©traitement cat√©goriel
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine transformations
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns)
])

# Mod√®le XGBoost
xgb_model = xgb.XGBRegressor(
    n_estimators=200,      # nombre d'arbres
    max_depth=9,           # profondeur maximale
    learning_rate=0.1,     # taux d'apprentissage : Contr√¥le combien chaque arbre corrige l‚Äôerreur du pr√©c√©dent. Plus petit = apprentissage plus lent et stable.
    subsample=0.8,         # √©chantillonnage pour r√©gularisation : fraction des √©chantillons utilis√©s pour chaque arbre.Introduit un peu d‚Äôal√©atoire pour r√©duire l‚Äôoverfitting.
)

# Pipeline complet
pipeline_xg = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", xgb_model)
])

# Entra√Ænement
pipeline_xg.fit(X_train, y_train)

# Pr√©diction
y_train_pred = pipeline_xg.predict(X_train)
y_test_pred = pipeline_xg.predict(X_test)

# √âvaluation
print("--- Evaluation ---\n")
print(f"R2 Score (Train): {r2_score(y_train, y_train_pred):.4f}")
print(f"R2 Score (Test) : {r2_score(y_test, y_test_pred):.4f}")
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print(f"RMSE : {rmse :.2f}")
mae = mean_absolute_error(y_test, y_test_pred) # moy des ecarts (valeurs absolues) entre le prix predit et le prix r√©el
print(f"MAE : {mae:.2f}")
mean_price = df["rental_price_per_day"].mean()
print(f"Prix moyen : {mean_price:.2f} ‚Ç¨")


print("\nLe mod√®le XGBoost am√©liore le R2 test avec 0.74, cependant il overfite avec un train √† 0.98 (√©cart √† 0.16)")



--- Evaluation ---

R2 Score (Train): 0.9886
R2 Score (Test) : 0.7406
RMSE : 16.53
MAE : 10.36
Prix moyen : 121.21 ‚Ç¨

Le mod√®le XGBoost am√©liore le R2 test avec 0.74, cependant il overfite avec un train √† 0.98 (√©cart √† 0.16)


In [110]:
# enregistrement dans ML FLow
with mlflow.start_run(run_name="xgb.XGBRegressor"):
    # Log du pipeline complet
    mlflow.sklearn.log_model(pipeline_xg, "XGBoost_model")
   # Log des m√©triques
    mlflow.log_metric("R2_train", r2_score(y_train, y_train_pred))
    mlflow.log_metric("R2_test", r2_score(y_test, y_test_pred))
    mlflow.log_metric("RMSE", np.sqrt(mean_squared_error(y_test, y_test_pred)))
    mlflow.log_metric("MAE", mean_absolute_error(y_test, y_test_pred))



üèÉ View run xgb.XGBRegressor at: http://127.0.0.1:5000/#/experiments/956791146109077519/runs/014571c1864d4982a0bd09f0b761ab04
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/956791146109077519


### Mod√©le XGBoost + GridSearch
---
#### GridSearchCV est un outil qui permet de chercher les meilleures combinaisons de param√®tres pour un mod√®le, afin d‚Äôoptimiser sa performance. Apr√®s avoir entra√Æn√© le GridSearch, je r√©cup√®re best_estimator_, qui contient le pipeline complet optimis√©, et je le sauvegarde imm√©diatement avec joblib.dump. Cela me permet de le recharger plus tard pour pr√©dire sur de nouvelles donn√©es sans refaire le pr√©traitement ni r√©entra√Æner le mod√®le

In [111]:
# Target et features
target = "rental_price_per_day"
X = df.drop(columns=target)
y = df[target]

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Colonnes num√©riques et cat√©gorielles
numerical_columns = ['mileage', 'engine_power']
categorical_columns = ['model_key', 'fuel', 'paint_color', 'car_type',
                       'private_parking_available', 'has_gps', 'has_air_conditioning',
                       'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']

# Pr√©traitement num√©rique
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Pr√©traitement cat√©goriel
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine transformations
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns)
])

# Mod√®le XGBoost
xgb_model = xgb.XGBRegressor(
    random_state=42,
    tree_method="hist"
)

# Pipeline complet
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", xgb_model)
])

# Param√®tres pour GridSearch (r√©duits pour √©viter surcharge)
param_grid = {
    "regressor__max_depth": [3, 4],
    "regressor__learning_rate": [0.05, 0.1],
    "regressor__n_estimators": [100, 200],
    "regressor__subsample": [0.8],
}

# GridSearch avec 3-fold CV pour limiter le temps
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1,
    verbose=2
)

# Entra√Ænement GridSearch
grid_search.fit(X_train, y_train)

# Meilleurs param√®tres et score CV
print("Meilleurs param√®tres : ", grid_search.best_params_)
print("Meilleur R2 CV : ", grid_search.best_score_)

# Sauvegarde du pipeline complet
# On r√©cup√®re le pipeline optimis√© (pr√©traitement + XGBoost avec les meilleurs hyperparam√®tres)
best_model = grid_search.best_estimator_

# Sauvegarde au format .pkl
joblib.dump(best_model, "modele_xgb_getaround.pkl")
print(" Mod√®le sauvegard√© sous : modele_xgb_getaround.pkl")


# Pr√©dictions avec le meilleur mod√®le
y_train_pred = grid_search.best_estimator_.predict(X_train)
y_test_pred = grid_search.best_estimator_.predict(X_test)

# √âvaluation
print("--- Evaluation ---\n")
print(f"R2 Score (Train): {r2_score(y_train, y_train_pred):.4f}")
print(f"R2 Score (Test) : {r2_score(y_test, y_test_pred):.4f}")
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print(f"RMSE : {rmse :.2f}")
mae = mean_absolute_error(y_test, y_test_pred) # moy des ecarts (valeurs absolues) entre le prix predit et le prix r√©el
print(f"MAE : {mae:.2f}")
mean_price = df["rental_price_per_day"].mean()
print(f"Prix moyen : {mean_price:.2f} ‚Ç¨")


print("\nApr√®s optimisation via GridSearch, le mod√®le pr√©sente une meilleure performance avec un R¬≤ test de 0.75," \
"\ntout en r√©duisant l‚Äôoverfitting avec le R¬≤ train de 0.82 (√©cart de 0.07).Le mod√®le est ainsi plus robuste et stable.\n" \
"RMSE : 16, signifique que les predictions s'√©cratent en moy d'env 16 de la vraie valeur.\n" \
"MAE : le mod√©le se trompe en moy de 10.70 sur le prix journalier d'une voiture.")


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Meilleurs param√®tres :  {'regressor__learning_rate': 0.05, 'regressor__max_depth': 4, 'regressor__n_estimators': 200, 'regressor__subsample': 0.8}
Meilleur R2 CV :  0.7517865300178528
 Mod√®le sauvegard√© sous : modele_xgb_getaround.pkl
--- Evaluation ---

R2 Score (Train): 0.8234
R2 Score (Test) : 0.7470
RMSE : 16.32
MAE : 10.70
Prix moyen : 121.21 ‚Ç¨

Apr√®s optimisation via GridSearch, le mod√®le pr√©sente une meilleure performance avec un R¬≤ test de 0.75,
tout en r√©duisant l‚Äôoverfitting avec le R¬≤ train de 0.82 (√©cart de 0.07).Le mod√®le est ainsi plus robuste et stable.
RMSE : 16, signifique que les predictions s'√©cratent en moy d'env 16 de la vraie valeur.
MAE : le mod√©le se trompe en moy de 10.70 sur le prix journalier d'une voiture.


Enregistrement du mod√®le 
Apr√®s la recherche d‚Äôhyperparam√®tres avec GridSearch, j‚Äôai r√©cup√©r√© grid_search.best_estimator_, qui contient le pipeline complet (pr√©traitement + XGBoost optimis√©). Je l‚Äôai sauvegard√© avec joblib.dump. Ensuite, dans mon application, je recharge ce fichier avec joblib.load, ce qui me permet de pr√©dire directement sur de nouvelles donn√©es sans refaire tout le preprocessing. ¬ª

In [112]:

with mlflow.start_run(run_name="XGBoost_GridSearch"):
    # Enregistre le pipeline complet
    mlflow.sklearn.log_model(best_model, "xgb_grid_model")
    
    # Log des m√©triques calcul√©es
    mlflow.log_metric("R2_train", r2_score(y_train, y_train_pred))
    mlflow.log_metric("R2_test", r2_score(y_test, y_test_pred))
    mlflow.log_metric("RMSE", np.sqrt(mean_squared_error(y_test, y_test_pred)))
    mlflow.log_metric("MAE", mean_absolute_error(y_test, y_test_pred))
    
    print("Mod√®le et m√©triques enregistr√©s dans MLflow")




Mod√®le et m√©triques enregistr√©s dans MLflow
üèÉ View run XGBoost_GridSearch at: http://127.0.0.1:5000/#/experiments/956791146109077519/runs/35c59ed053b14748b6522a474b68cc4a
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/956791146109077519
