In [1]:
%reload_ext autoreload
%autoreload 2

import mlflow
import os
import numpy as np
import pandas as pd
import pendulum
import sys
import xgboost as xgb

from loguru import logger
from pathlib import Path
from pycaret import regression
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor


sys.path.append(str(Path.cwd().parent))
sys.path.append(os.path.join(str(Path.cwd().parent), "src"))
from settings.params import *
from metrics import eval_metrics
from pipeline import define_pipeline
from plot import prediction_error_plot, residual_plot
from tracking import mlflow_log_search

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

In [2]:
data = pd.read_csv(CLEANED_DATA)
TARGET_NAME = MODEL_PARAMS['TARGET_NAME']

In [3]:
mlflow.set_tracking_uri(uri="http://localhost:8080")

# Modeling


## Train/Test Split


In [4]:
x_train, x_test, y_train, y_test = train_test_split(data, data[TARGET_NAME], test_size=MODEL_PARAMS["TEST_SIZE"], random_state=SEED)

logger.info(f"\nX train: {x_train.shape}\nY train: {y_train.shape}\n"
            f"X test: {x_test.shape}\nY test: {y_test.shape}")

[32m2024-08-05 21:58:11.385[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1m
X train: (5205, 8)
Y train: (5205,)
X test: (1302, 8)
Y test: (1302,)[0m


## Training


In [5]:
df = x_train.copy()
df[TARGET_NAME] = np.log(y_train)

In [6]:
exp_reg = regression.setup(df, target=TARGET_NAME, max_encoding_ohe=200, log_experiment=True, experiment_name="building-energy-prediction-training", train_size=0.8)
regression.set_config('seed', SEED)

# Removing useless metrics improve training speed
regression.remove_metric('MAPE')
regression.remove_metric('MSE')
regression.remove_metric('RMSLE')

Unnamed: 0,Description,Value
0,Session id,3844
1,Target,SiteEnergyUse(kBtu)
2,Target type,Regression
3,Original data shape,"(5205, 8)"
4,Transformed data shape,"(5205, 67)"
5,Transformed train set shape,"(4164, 67)"
6,Transformed test set shape,"(1041, 67)"
7,Numeric features,5
8,Categorical features,2
9,Preprocess,True


In [7]:
best_threes_model = regression.compare_models(n_select=3)

Unnamed: 0,Model,MAE,RMSE,R2,TT (Sec)
et,Extra Trees Regressor,0.2506,0.4473,0.8281,0.365
rf,Random Forest Regressor,0.2906,0.4576,0.8207,0.451
xgboost,Extreme Gradient Boosting,0.3225,0.4699,0.8112,0.621
lightgbm,Light Gradient Boosting Machine,0.3503,0.5003,0.7864,0.481
gbr,Gradient Boosting Regressor,0.3739,0.5187,0.7706,0.193
dt,Decision Tree Regressor,0.3053,0.5447,0.7443,0.048
ada,AdaBoost Regressor,0.5051,0.6634,0.6255,0.113
ridge,Ridge Regression,0.5192,0.6874,0.5965,0.03
br,Bayesian Ridge,0.5193,0.6876,0.5963,0.031
knn,K Neighbors Regressor,0.5153,0.6886,0.5962,0.037




2024/08/05 21:58:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run Extra Trees Regressor at: http://localhost:8080/#/experiments/584040955558151400/runs/e48c49e8e85949af809f57a49e177cfe.


2024/08/05 21:58:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest Regressor at: http://localhost:8080/#/experiments/584040955558151400/runs/e03e0fb168444f87a23ba7572fc0348e.


2024/08/05 21:58:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run Extreme Gradient Boosting at: http://localhost:8080/#/experiments/584040955558151400/runs/985e7db404194bde9d6bf8c5dbda9241.


2024/08/05 21:58:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run Light Gradient Boosting Machine at: http://localhost:8080/#/experiments/584040955558151400/runs/a5d2d9079a50473c84b6a5dbe924c567.


2024/08/05 21:58:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run Gradient Boosting Regressor at: http://localhost:8080/#/experiments/584040955558151400/runs/cdb127a755974cb692bdca2d5610a73e.


2024/08/05 21:58:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree Regressor at: http://localhost:8080/#/experiments/584040955558151400/runs/0594aef9fd624db3bf29352a531bd440.


2024/08/05 21:58:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run AdaBoost Regressor at: http://localhost:8080/#/experiments/584040955558151400/runs/475e209b56d94c73a6a19f157a549822.


2024/08/05 21:58:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run Ridge Regression at: http://localhost:8080/#/experiments/584040955558151400/runs/648ea87e90b74400b45d8111460ec0c2.


2024/08/05 21:58:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run Bayesian Ridge at: http://localhost:8080/#/experiments/584040955558151400/runs/e4bef51910174feb974fe01a91668ae3.


2024/08/05 21:58:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run K Neighbors Regressor at: http://localhost:8080/#/experiments/584040955558151400/runs/d478b892cee345f1b8298584d3293d64.


2024/08/05 21:58:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run Linear Regression at: http://localhost:8080/#/experiments/584040955558151400/runs/53df74af133242f090b84c4f8207a471.


2024/08/05 21:58:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run Orthogonal Matching Pursuit at: http://localhost:8080/#/experiments/584040955558151400/runs/c088105afd8f462bac67a1b6091818cb.


2024/08/05 21:58:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run Lasso Regression at: http://localhost:8080/#/experiments/584040955558151400/runs/00199ac93430443d9bb44c082a0fc447.


2024/08/05 21:58:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Elastic Net at: http://localhost:8080/#/experiments/584040955558151400/runs/c6abc2103e93430f86e839bdecfdbd34.


2024/08/05 21:58:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Lasso Least Angle Regression at: http://localhost:8080/#/experiments/584040955558151400/runs/66de1b39c02947a7aff1669562779fdf.


2024/08/05 21:58:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Dummy Regressor at: http://localhost:8080/#/experiments/584040955558151400/runs/14e1e71cef3944fda7f389c263810178.


2024/08/05 21:58:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Huber Regressor at: http://localhost:8080/#/experiments/584040955558151400/runs/90fcb256721c40ff91b5a93c62dfeace.


2024/08/05 21:58:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run Passive Aggressive Regressor at: http://localhost:8080/#/experiments/584040955558151400/runs/4ef2a9bbe4f1440ca79165afb3668704.


2024/08/05 21:58:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.




2024/08/05 21:58:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run Least Angle Regression at: http://localhost:8080/#/experiments/584040955558151400/runs/f11264f6a96b40cba6a01e4afcaa32a6.


2024/08/05 21:58:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.


In [8]:
mlflow.end_run()

2024/08/05 21:58:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run Session Initialized c71a at: http://localhost:8080/#/experiments/584040955558151400/runs/03af506f74824c52b66a8512f9a120be.


2024/08/05 21:58:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/584040955558151400.


L'entrainement sur plusieurs types de modèles avec PyCaret montre que pour ce problème, les modèles: ExtraTreesRegressor, XGBRegressor et RandomForestRegressor sont les plus adaptés. Toutefois, nous pensons que les performances obtenus lors de ce premier entrainement peuvent être nettement améliorées. Nous allons attendre de faire le réglage de paramètres et d'obtenir les modèles finaux avant de faire un choix.


## Fine-Tuning


In [9]:
ESTIMATOR_PARAMS = {
    ExtraTreesRegressor.__name__: {
        "estimator": ExtraTreesRegressor(),
        "params": {
            'regressor__estimator__n_estimators': np.arange(10, 200, 5),
        }
    },
    RandomForestRegressor.__name__: {
        "estimator": RandomForestRegressor(),
        "params": {
            'regressor__estimator__n_estimators': np.arange(10, 200, 5),
        }
    },
    xgb.XGBRegressor.__name__: {
        "estimator": xgb.XGBRegressor(),
        "params": {
            'regressor__estimator__n_estimators': np.arange(10, 200, 5),
        }
    }
}

In [None]:
CURRENT_DATE = pendulum.now()

search_cvs = {}

def rmse(actual, predicted):
    return np.sqrt(mean_squared_error(actual, predicted))

scoring = {'r2': make_scorer(r2_score),
          'rmse': make_scorer(rmse, greater_is_better=False),
          'mae': make_scorer(mean_absolute_error, greater_is_better=False)}

# Create an experiment if not exists
exp_name = "building-energy-prediction-tuning-sklearn"
experiment = mlflow.get_experiment_by_name(exp_name)
if not experiment:
    experiment_id = mlflow.create_experiment(exp_name)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=f"Session-{CURRENT_DATE.strftime('%Y%m%d_%H%m%S')}", experiment_id=experiment_id) as parent_run:
    for estimator_name, settings in ESTIMATOR_PARAMS.items():
        with mlflow.start_run(run_name=estimator_name, nested=True, experiment_id=experiment_id):  
            estimator = settings["estimator"]
            param_grid = settings["params"]
            pipeline = define_pipeline(numerical_transformer=[SimpleImputer(strategy="median"), RobustScaler()],
                            categorical_transformer=[SimpleImputer(strategy="constant", fill_value="undefined"), OneHotEncoder(drop="if_binary", handle_unknown="ignore")],
                            target_transformer=True,
                            estimator=estimator
                        ) 
            grid_search = GridSearchCV(
                estimator=pipeline,  # Instantiate the estimator
                param_grid=param_grid,
                scoring=scoring,
                refit='r2',
                cv=5,  # Adjust the number of cross-validation folds as needed
                n_jobs=-1  # Use all available cores
            )
            grid_search.fit(x_train, y_train)
            search_cvs[estimator_name] = grid_search

            mlflow.log_param("Estimator", estimator_name)
            mlflow_log_search(grid_search)
mlflow.end_run()

## Model Evaluation on Test Data


We are going to evaluate the fine-tuned models to see which one we are going to pick as the final model.


In [None]:
# Obtaining best_models after fine-tuning
models = { f"{estimator_name}": search_cv.best_estimator_ for estimator_name, search_cv in search_cvs.items()}

In [None]:
exp_name = "building-energy-prediction-evaluation"
experiment = mlflow.get_experiment_by_name(exp_name)
if not experiment:
    experiment_id = mlflow.create_experiment(exp_name)
else:
    experiment_id = experiment.experiment_id

def evaluate_models(estimators, x_train, x_test, y_train, y_test):
    # Dict of R2 scores for the estimators
    r2_scores = {}
    with mlflow.start_run(run_name=f"Session-{CURRENT_DATE.strftime('%Y%m%d_%H%m%S')}", experiment_id=experiment_id):
        for estimator_name, estimator in estimators.items():
            with mlflow.start_run(run_name=estimator_name, nested=True, experiment_id=experiment_id): 
                y_train_pred = estimator.predict(x_train)
                y_test_pred = estimator.predict(x_test)

                train_metrics = eval_metrics(y_train, y_train_pred)
                test_metrics = eval_metrics(y_test, y_test_pred)

                # Add the R2 score of the model to the global dict
                r2_scores[estimator_name] = test_metrics['r2']

                # Log the regressor parameters
                mlflow.log_params(estimator.regressor.steps[-1][1].get_params())

                # Log the best metric
                mlflow.log_metrics(test_metrics)

                # Log the model
                mlflow.sklearn.log_model(estimator.best_estimator_, "model")

                logger.info(f"""{estimator_name} performance \n{pd.DataFrame({'train': train_metrics, 'test': test_metrics}).T}""")
    return max(r2_scores.items(), key=lambda item: item[1])

mlflow.end_run()

best_estimator, score = evaluate_models(models, x_train, x_test, y_train, y_test)

logger.info(f"""{best_estimator} is the best estimator found for this problem with an R2 score of {score}""")

### Prediction Error Plot


In [None]:
prediction_error_plot(models, x_train, x_test, y_train, y_test)

### Residual Plot


In [None]:
residual_plot(models, x_train, x_test, y_train, y_test)