## Model selection


In [9]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from dataclasses import dataclass

Cette classe gère l'entraînement des modèles et le calcul des métriques de performance.

In [10]:
@dataclass
class ModelTrainer:
    """
    Classe pour entraîner des modèles et calculer leurs métriques de performance.
    """
    @staticmethod
    def train_model(model, X_train, y_train, X_test, y_test):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        return model, rmse, mae, r2

Cette classe configure et gère le suivi des expériences dans MLflow.

In [11]:
@dataclass
class MLFlowLogger:
    """
    Classe pour gérer le suivi des expériences dans MLflow.
    """
    experiment_name: str

    def __post_init__(self):
        mlflow.set_tracking_uri("http://127.0.0.1:5000")
        mlflow.set_experiment(self.experiment_name)

    def log_experiment(self, model_name, model, X_train, y_train, X_test, y_test, hyperparameters=None):
        with mlflow.start_run(run_name=model_name):
            mlflow.log_param("model_name", model_name)
            if hyperparameters:
                mlflow.log_params(hyperparameters)

            trainer = ModelTrainer()
            trained_model, rmse, mae, r2 = trainer.train_model(model, X_train, y_train, X_test, y_test)

            # Log des métriques
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)

            # Exemple d'entrée
            input_example = pd.DataFrame({
                "MedInc": [1.0], "HouseAge": [15.0], "AveRooms": [6.0],
                "AveBedrms": [2.0], "Population": [300.0], "AveOccup": [4.0],
                "Latitude": [37.0], "Longitude": [-122.0]
            })

            # Log du modèle
            mlflow.sklearn.log_model(trained_model, "model", input_example=input_example)

            return trained_model, rmse, mae, r2

Cette classe s'occupe d'optimiser les hyperparamètres des modèles en utilisant une recherche bayésienne.

In [12]:
@dataclass
class ModelOptimizer:
    """
    Classe pour optimiser les hyperparamètres des modèles avec recherche bayésienne.
    """
    @staticmethod
    def optimize_model(model, param_space, X_train, y_train):
        opt = BayesSearchCV(
            model,
            param_space,
            n_iter=20,
            scoring='neg_mean_squared_error',
            cv=3,
            n_jobs=-1,
            random_state=42
        )
        opt.fit(X_train, y_train)
        return opt.best_estimator_, opt.best_params_, -opt.best_score_

Cette classe compare différents modèles, les optimise, et suit leurs résultats avec MLflow

In [32]:
@dataclass
class ModelComparator:
    """
    Classe pour comparer différents modèles, les optimiser, et suivre les résultats.
    """
    logger: MLFlowLogger

    def compare_models(self, X_train, y_train, X_test, y_test):
        models = {
            "Linear Regression": LinearRegression(),
            "Random Forest": RandomForestRegressor(random_state=42),
            "Gradient Boosting": GradientBoostingRegressor(random_state=42)
        }

        param_spaces = {
            "Linear Regression": {
                'fit_intercept': [True, False]
            },
            "Random Forest": {
                'n_estimators': Integer(50, 150),
                'max_depth': Integer(5, 10),
                'min_samples_split': Integer(2, 10),
                'min_samples_leaf': Integer(1, 5)
            },
            "Gradient Boosting": {
                'n_estimators': Integer(50, 150),
                'learning_rate': Real(0.001, 0.01, prior='uniform'),
                'max_depth': Integer(3, 8),
                'min_samples_split': Integer(2, 10),
                'min_samples_leaf': Integer(1, 5)
            }
        }

        best_model = None
        best_rmse = float('inf')
        best_model_name = ""
        best_model_params = {}
        final_best_model = None  # Initialize it here

        for model_name, model in models.items():
            print(f"\nOptimizing {model_name}...")

            optimizer = ModelOptimizer()
            best_model, best_params, best_score = optimizer.optimize_model(model, param_spaces[model_name], X_train, y_train)

            print(f"Best parameters for {model_name}: {best_params}")
            print(f"Best score (RMSE) for {model_name}: {best_score}")

            self.logger.log_experiment(model_name, best_model, X_train, y_train, X_test, y_test, hyperparameters=best_params)

            if best_score < best_rmse:
                best_rmse = best_score
                best_model_name = model_name
                final_best_model = best_model  # Update the final_best_model
                best_model_params = best_params  # Update the best_model_params

        print(f"Best model: {best_model_name} with RMSE: {best_rmse}")
        print(f"\n \n Best model: {final_best_model} with RMSE: {best_model_params} \n \n ")
        return final_best_model, best_model_params

Cette classe permet d'enregistrer le meilleur modèle dans le registre MLflow.

In [15]:
@dataclass
class BestModelRegistry:
    """
    Classe pour enregistrer le meilleur modèle dans le registre MLflow.
    """
    @staticmethod
    def register_best_model(best_model, best_params):
        model_name = f"California_Housing_Best_Model_{best_model.__class__.__name__}"
        input_example = pd.DataFrame({
            "MedInc": [1.0], "HouseAge": [15.0], "AveRooms": [6.0],
            "AveBedrms": [2.0], "Population": [300.0], "AveOccup": [4.0],
            "Latitude": [37.0], "Longitude": [-122.0]
        })

        with mlflow.start_run(run_name=f"Best Model Registration: {model_name}"):
            mlflow.sklearn.log_model(
                sk_model=best_model,
                artifact_path="best_model",
                registered_model_name=model_name,
                input_example=input_example
            )
            mlflow.log_params(best_params)
            print(f"Le meilleur modèle a été enregistré avec le nom '{model_name}' et ses hyperparamètres.")


#### Compiler et voir les résultats dans MLflow

In [16]:
data_path = "../data/Std data"

X_train = pd.read_csv(f"{data_path}/X_train.csv")
X_test = pd.read_csv(f"{data_path}/X_test.csv")
y_train = pd.read_csv(f"{data_path}/y_train.csv").squeeze()  
y_test = pd.read_csv(f"{data_path}/y_test.csv").squeeze()

print("Les données sont chargées.")

Les données sont chargées.


In [33]:
# Initialiser le logger MLflow
logger = MLFlowLogger(experiment_name="California Housing Project")

# Initialiser le comparateur de modèles
comparator = ModelComparator(logger=logger)

print("Les classes sont initialisées.")

Les classes sont initialisées.


In [36]:
# Comparer les modèles
best_model, best_params = comparator.compare_models(X_train, y_train, X_test, y_test)

print("Comparaison des modèles terminée.")


Optimizing Linear Regression...




Best parameters for Linear Regression: OrderedDict([('fit_intercept', True)])
Best score (RMSE) for Linear Regression: 0.7014875053735343


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1362.80it/s]


🏃 View run Linear Regression at: http://127.0.0.1:5000/#/experiments/616656509998415542/runs/e81c6a073a284d4291a211e0f952754e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616656509998415542

Optimizing Random Forest...
Best parameters for Random Forest: OrderedDict([('max_depth', 10), ('min_samples_leaf', 4), ('min_samples_split', 2), ('n_estimators', 150)])
Best score (RMSE) for Random Forest: 0.2964069726506357


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1145.58it/s]


🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/616656509998415542/runs/b165759eae29478081832820d90400bb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616656509998415542

Optimizing Gradient Boosting...
Best parameters for Gradient Boosting: OrderedDict([('learning_rate', 0.01), ('max_depth', 8), ('min_samples_leaf', 5), ('min_samples_split', 10), ('n_estimators', 150)])
Best score (RMSE) for Gradient Boosting: 0.3558230178694675


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 501.63it/s]


🏃 View run Gradient Boosting at: http://127.0.0.1:5000/#/experiments/616656509998415542/runs/5988cbcfecad4a06aa636b08d92d602f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616656509998415542
Best model: Random Forest with RMSE: 0.2964069726506357

 
 Best model: RandomForestRegressor(max_depth=10, min_samples_leaf=4, n_estimators=150,
                      random_state=42) with RMSE: OrderedDict([('max_depth', 10), ('min_samples_leaf', 4), ('min_samples_split', 2), ('n_estimators', 150)]) 
 
 
Comparaison des modèles terminée.


In [37]:
# Enregistrer le meilleur modèle
registry = BestModelRegistry()
registry.register_best_model(best_model, best_params)

print("Le meilleur modèle a été enregistré avec succès.")

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 570.83it/s] 
Registered model 'California_Housing_Best_Model_RandomForestRegressor' already exists. Creating a new version of this model...
2025/01/11 12:45:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: California_Housing_Best_Model_RandomForestRegressor, version 4


Le meilleur modèle a été enregistré avec le nom 'California_Housing_Best_Model_RandomForestRegressor' et ses hyperparamètres.
🏃 View run Best Model Registration: California_Housing_Best_Model_RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/616656509998415542/runs/5416af88d0554ff3bb07d21a26631903
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616656509998415542
Le meilleur modèle a été enregistré avec succès.


Created version '4' of model 'California_Housing_Best_Model_RandomForestRegressor'.
