In [1]:
import os
import sys
os.chdir('../')
sys.path.append(os.path.join(os.getcwd(), "src"))

In [2]:
from dataclasses import dataclass
from pathlib import Path
from electron.utils.helpers import *
from electron.utils.exception import *
from electron.constants import *
from electron import logger

In [3]:
@dataclass
class ModelTrainerConfig:
    root_dir: Path
    x_transform: Path
    y_transform: Path
    model_name: str
    scoring: str
    cv_folds: int
    n_jobs: int
    n_trials: int
    early_stopping_rounds: int

In [4]:
class ConfigurationManager:
    def __init__(self, config_filepath=CONFIG_PATH,
                       params_filepath=PARAMS_PATH,
                       schema_filepath=SCHEMA_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.model_trainer

        create_directories([config.root_dir])

        model_trainer_config =  ModelTrainerConfig(
            root_dir=Path(config.root_dir),
            x_transform=Path(config.x_transform),
            y_transform=Path(config.y_transform),
            model_name=config.model_name,
            scoring=params.scoring,
            cv_folds=params.cv_folds,
            n_jobs=params.n_jobs,
            n_trials=params.n_trials,
            early_stopping_rounds=params.early_stopping_rounds,
        )

        return model_trainer_config

In [5]:
import os
import sys
import mlflow
import optuna
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

        mlflow.set_tracking_uri("file:./mlruns")
        mlflow.set_experiment("Electricity Demand Prediction")
        logger.info("MLflow tracking setup complete.")

        self.models = {
            "XGBoost": {
                "class": XGBRegressor,
                "search_space": lambda trial: {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 3, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                },
                "mlflow_module": mlflow.xgboost,
            },
            "LightGBM": {
                "class": LGBMRegressor,
                "search_space": lambda trial: {
                    "num_leaves": trial.suggest_int("num_leaves", 20, 150),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                },
                "mlflow_module": mlflow.lightgbm,
            },
        }

    def train(self):
        train_x = np.load(self.config.x_transform, allow_pickle=True)
        train_y = np.load(self.config.y_transform, allow_pickle=True).squeeze()



        best_overall = {"model_name": None, "score": float("inf"), "params": None}

        for model_name, model_info in self.models.items():
            logger.info(f"Starting Optuna HPO for {model_name}")

            def objective(trial):
                params = model_info["search_space"](trial)
                model = model_info["class"](**params)

                # Train/val split for early stopping
                x_train, x_val, y_train, y_val = train_test_split(
                    train_x, train_y, test_size=0.2, shuffle=False
                )

                if model_name == "XGBoost":
                    model.set_params(early_stopping_rounds=0, eval_metric="rmse")
                    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)
                elif model_name == "LightGBM":
                    model.set_params(early_stopping_rounds=0)
                    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], eval_metric="rmse")

                preds = model.predict(x_val)
                rmse = root_mean_squared_error(y_val, preds)
                return rmse

            study = optuna.create_study(direction="minimize")
            study.optimize(objective, n_trials=self.config.n_trials)

            best_params = study.best_params
            logger.info(f"Best params for {model_name}: {best_params}")

            model = model_info["class"](**best_params)
            kf = KFold(n_splits=self.config.cv_folds, shuffle=False)
            scores = cross_val_score(model, train_x, train_y, cv=kf, scoring="neg_root_mean_squared_error")
            mean_score = -scores.mean()

            with mlflow.start_run(run_name=f"{model_name}_best"):
                mlflow.log_params(best_params)
                mlflow.log_metric("cv_rmse", mean_score)
                mlflow.set_tag("model_name", model_name)

            if mean_score < best_overall["score"]:
                best_overall.update({
                    "model_name": model_name,
                    "score": mean_score,
                    "params": best_params
                })

        best_model_class = self.models[best_overall["model_name"]]["class"]
        final_params = best_overall["params"]
        best_model = best_model_class(**final_params)
        best_model.fit(train_x, train_y)

        model_path = Path(self.config.root_dir) / self.config.model_name
        create_directories([model_path.parent])
        save_bin(best_model, model_path)

        # Log final model
        with mlflow.start_run(run_name=f"{best_overall['model_name']}_final"):
            mlflow.log_params(final_params)
            mlflow.log_metric("cv_rmse", best_overall["score"])
            mlflow.set_tag("stage", "final")

        logger.info(f"Best model: {best_overall}")
        return best_overall


In [6]:
try:    
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()

except Exception as e:
    raise CustomException(e, sys) from e

[2025-07-06 10:05:30,070: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-06 10:05:30,074: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-06 10:05:30,078: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-06 10:05:30,080: INFO: helpers: created directory at: artifacts]
[2025-07-06 10:05:30,080: INFO: helpers: created directory at: artifacts/model_trainer]
[2025-07-06 10:05:30,138: INFO: 1860240262: MLflow tracking setup complete.]
[2025-07-06 10:05:30,769: INFO: 1860240262: Starting Optuna HPO for XGBoost]


[I 2025-07-06 10:05:30,769] A new study created in memory with name: no-name-69b21feb-5316-496a-8b28-4a4531b62c08
[I 2025-07-06 10:05:51,188] Trial 0 finished with value: 28.86133972577451 and parameters: {'n_estimators': 74, 'max_depth': 6, 'learning_rate': 0.16310651485547192}. Best is trial 0 with value: 28.86133972577451.


[2025-07-06 10:05:51,203: INFO: 1860240262: Best params for XGBoost: {'n_estimators': 74, 'max_depth': 6, 'learning_rate': 0.16310651485547192}]
[2025-07-06 10:07:22,992: INFO: 1860240262: Starting Optuna HPO for LightGBM]


[I 2025-07-06 10:07:23,025] A new study created in memory with name: no-name-36764848-a3de-44a0-95e1-904b7c0eceba


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.215136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171625
[LightGBM] [Info] Number of data points in the train set: 58273, number of used features: 674
[LightGBM] [Info] Start training from score 1587.798294


[I 2025-07-06 10:07:48,209] Trial 0 finished with value: 26.900460717823087 and parameters: {'num_leaves': 53, 'learning_rate': 0.15758423542934524, 'n_estimators': 132}. Best is trial 0 with value: 26.900460717823087.


[2025-07-06 10:07:48,209: INFO: 1860240262: Best params for LightGBM: {'num_leaves': 53, 'learning_rate': 0.15758423542934524, 'n_estimators': 132}]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.208169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171624
[LightGBM] [Info] Number of data points in the train set: 58273, number of used features: 674
[LightGBM] [Info] Start training from score 1477.632849




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.191197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171625
[LightGBM] [Info] Number of data points in the train set: 58273, number of used features: 674
[LightGBM] [Info] Start training from score 1322.078201




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.207842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171625
[LightGBM] [Info] Number of data points in the train set: 58274, number of used features: 674
[LightGBM] [Info] Start training from score 1708.010331




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.197559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171625
[LightGBM] [Info] Number of data points in the train set: 58274, number of used features: 674
[LightGBM] [Info] Start training from score 1629.085235




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.232115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171625
[LightGBM] [Info] Number of data points in the train set: 58274, number of used features: 674
[LightGBM] [Info] Start training from score 1587.802090




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.299074 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171626
[LightGBM] [Info] Number of data points in the train set: 72842, number of used features: 674
[LightGBM] [Info] Start training from score 1544.922737
[2025-07-06 10:09:54,367: INFO: helpers: created directory at: artifacts\model_trainer]
[2025-07-06 10:09:54,386: INFO: helpers: binary file saved at: artifacts\model_trainer\model.joblib]
[2025-07-06 10:09:54,484: INFO: 1860240262: Best model: {'model_name': 'LightGBM', 'score': 317.30270711594756, 'params': {'num_leaves': 53, 'learning_rate': 0.15758423542934524, 'n_estimators': 132}}]
