In [1]:
import os
import sys
# Move the working directory one level up from the notebook's folder and make
# `<that directory>/src` importable.
#
# The PROJECT_ROOT sentinel keeps this cell idempotent: on a fresh kernel it
# behaves exactly like a plain `os.chdir('../')`, but re-running the cell in
# the same kernel no longer climbs one more directory each time, and the
# `src` entry is never appended to sys.path twice.
if "PROJECT_ROOT" not in globals():
    os.chdir('../')
    PROJECT_ROOT = os.getcwd()

_src_dir = os.path.join(PROJECT_ROOT, "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [None]:
import os
import sys
import optuna
import joblib
import pandas as pd
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from WattPredictor.utils.helpers import *
from WattPredictor.utils.ts_generator import features_and_target
from WattPredictor.config.model_config import ModelConfigurationManager
from WattPredictor.utils.feature import feature_store_instance
from WattPredictor.entity.config_entity import TrainerConfig
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error,root_mean_squared_error
from WattPredictor.utils.helpers import create_directories
from WattPredictor.utils.exception import CustomException
from WattPredictor.utils.logging import logger



class Trainer:
    """Tune, select, persist and register an electricity-demand regressor.

    For every candidate in ``self.models`` the trainer runs an Optuna study,
    scores the best hyper-parameters with 5-fold cross-validated RMSE, keeps
    the candidate with the lowest RMSE, refits it on all training rows, saves
    it under ``config.root_dir / config.model_name`` and registers it in the
    model registry of the feature store's project.
    """

    def __init__(self, config: TrainerConfig):
        """Store the configuration, open the feature store and declare candidates.

        Args:
            config: Trainer settings. The attributes read by this class are
                ``cutoff_date``, ``input_seq_len``, ``step_size``,
                ``n_trials``, ``root_dir`` and ``model_name``.
        """
        self.config = config
        self.feature_store = feature_store_instance()

        # Each candidate maps to its estimator class and a callable that
        # draws one hyper-parameter set from an Optuna trial.
        self.models = {
            "XGBoost": {
                "class": XGBRegressor,
                "search_space": lambda trial: {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 3, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                }
            },
            "LightGBM": {
                "class": LGBMRegressor,
                "search_space": lambda trial: {
                    "num_leaves": trial.suggest_int("num_leaves", 20, 150),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                }
            }
        }

    def load_training_data(self):
        """Fetch the training view and return the modelling columns sorted by date.

        Returns:
            A new DataFrame holding ``date``, ``demand``, ``sub_region_code``
            and ``temperature_2m``, ordered by ``date``. The frame returned by
            the feature store is not modified.

        Raises:
            CustomException: If fetching or selecting the data fails.
        """
        try:
            df, _ = self.feature_store.get_training_data("elec_wx_features_view")
            columns = ['date', 'demand', 'sub_region_code', 'temperature_2m']
            # Chained, non-mutating sort: avoids an in-place sort on a column
            # slice, which can trigger pandas' SettingWithCopyWarning.
            return df[columns].sort_values("date")
        except Exception as e:
            raise CustomException(e, sys) from e

    def _tune(self, model_info, x_tr, y_tr, x_val, y_val):
        """Run one Optuna study for a candidate and return its best parameters.

        ``model_info`` is passed in explicitly so the objective never depends
        on a loop variable of the caller. Each trial fits on ``x_tr``/``y_tr``
        and is scored by mean squared error on ``x_val``/``y_val``.
        """
        def objective(trial):
            params = model_info["search_space"](trial)
            model = model_info["class"](**params)
            model.fit(x_tr, y_tr)
            return mean_squared_error(y_val, model.predict(x_val))

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=self.config.n_trials)
        return study.best_params

    def _cross_validated_rmse(self, model, train_x, train_y):
        """Return the mean RMSE of ``model`` over an unshuffled 5-fold split."""
        # NOTE(review): KFold ignores any temporal ordering of the rows;
        # confirm whether a time-aware splitter suits these features better.
        fold_scores = cross_val_score(
            model, train_x, train_y,
            cv=KFold(n_splits=5),
            scoring="neg_root_mean_squared_error",
        )
        # The scorer yields negated RMSE, so flip the sign back.
        return -fold_scores.mean()

    def _register(self, model_path, best_overall, train_x, train_y):
        """Register the saved model file, with schema and metric, in the registry."""
        input_schema = Schema(train_x.head(10))
        output_schema = Schema(pd.DataFrame(train_y))
        model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

        model_registry = self.feature_store.project.get_model_registry()
        hops_model = model_registry.python.create_model(
            name="wattpredictor_" + best_overall["model_name"].lower(),
            # Report the winner's own RMSE, not the score of whichever
            # candidate happened to be evaluated last.
            metrics={'rmse': best_overall["score"]},
            input_example=train_x.head(10),
            model_schema=model_schema,
            description="Best model trained on electricity demand"
        )
        hops_model.save(model_path.as_posix())

    def train(self):
        """Tune every candidate, keep the best one, save it and register it.

        Returns:
            dict with keys ``model_name``, ``score`` (cross-validated RMSE of
            the winner) and ``params`` (its tuned hyper-parameters).

        Raises:
            CustomException: Wraps any failure during loading, tuning, saving
                or registration, including the case where no candidate
                produced a comparable score.
        """
        try:
            df = self.load_training_data()
            # Only rows strictly before the cutoff date are used for fitting.
            train_df = df[df['date'] < self.config.cutoff_date]

            train_x, train_y = features_and_target(train_df, self.config.input_seq_len, self.config.step_size)
            train_x = train_x.drop(columns=["date"], errors="ignore")

            # The unshuffled 80/20 hold-out split is deterministic, so compute
            # it once instead of once per Optuna trial.
            x_tr, x_val, y_tr, y_val = train_test_split(train_x, train_y, test_size=0.2, shuffle=False)

            best_overall = {"model_name": None, "score": float("inf"), "params": None}

            for model_name, model_info in self.models.items():
                logger.info(f"Running Optuna HPO for {model_name}")

                best_params = self._tune(model_info, x_tr, y_tr, x_val, y_val)
                candidate = model_info["class"](**best_params)
                score = self._cross_validated_rmse(candidate, train_x, train_y)

                if score < best_overall["score"]:
                    best_overall.update({
                        "model_name": model_name,
                        "score": score,
                        "params": best_params
                    })

            # A NaN score never compares below infinity, and an empty
            # candidate table never enters the loop; fail with a clear message
            # instead of an opaque KeyError on the lookup below.
            if best_overall["model_name"] is None:
                raise ValueError("No candidate model produced a comparable cross-validation score")

            final_model_class = self.models[best_overall["model_name"]]["class"]
            final_model = final_model_class(**best_overall["params"])
            final_model.fit(train_x, train_y)

            model_path = Path(self.config.root_dir) / self.config.model_name
            create_directories([model_path.parent])
            save_bin(final_model, model_path)

            self._register(model_path, best_overall, train_x, train_y)

            logger.info(f"Best model registered: {best_overall}")
            return best_overall

        except Exception as e:
            raise CustomException(e, sys) from e

In [3]:
# Driver cell: build the trainer configuration, construct the Trainer and run
# its tune / select / save / register pipeline.
# NOTE(review): Trainer.train() already re-raises failures as CustomException,
# so an error from train() ends up wrapped a second time by the handler below.
try:    
    config = ModelConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = Trainer(config=model_trainer_config)
    model_trainer.train()

except Exception as e:
    # Chain the original error so the full traceback stays visible.
    raise CustomException(e, sys) from e

[2025-07-17 16:56:24,982: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-17 16:56:24,987: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-17 16:56:24,991: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-17 16:56:24,992: INFO: helpers: created directory at: artifacts]
[2025-07-17 16:56:24,995: INFO: helpers: created directory at: artifacts/model_trainer]
[2025-07-17 16:56:25,002: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-17 16:56:25,006: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-17 16:56:25,009: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-17 16:56:25,012: INFO: external: Initializing external client]
[2025-07-17 16:56:25,014: INFO: external: Base URL: https://c.app.hopsworks.ai:443]
[2025-07-17 16:56:27,770: INFO: python: Python Engine initialized.]

Logged in to project, explore i

Generating TS features: 100%|██████████| 11/11 [00:00<00:00, 13.46it/s]

[2025-07-17 16:56:48,845: INFO: 2787390486: Running Optuna HPO for XGBoost]



[I 2025-07-17 16:56:48,846] A new study created in memory with name: no-name-ced89ad1-056c-48c4-94e1-02a6eb32b150
[I 2025-07-17 16:56:55,705] Trial 0 finished with value: 3549718.8096737126 and parameters: {'n_estimators': 231, 'max_depth': 3, 'learning_rate': 0.02016454481201762}. Best is trial 0 with value: 3549718.8096737126.


[2025-07-17 16:57:23,596: INFO: 2787390486: Running Optuna HPO for LightGBM]


[I 2025-07-17 16:57:23,598] A new study created in memory with name: no-name-c0fddb55-da41-4194-8bb8-cb13c2ec47b3


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171553
[LightGBM] [Info] Number of data points in the train set: 2138, number of used features: 674
[LightGBM] [Info] Start training from score 1211.671188


[I 2025-07-17 16:57:34,433] Trial 0 finished with value: 3226381.525077507 and parameters: {'num_leaves': 72, 'learning_rate': 0.13759360434821385, 'n_estimators': 77}. Best is trial 0 with value: 3226381.525077507.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171553
[LightGBM] [Info] Number of data points in the train set: 2138, number of used features: 674
[LightGBM] [Info] Start training from score 1574.366698
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171554
[LightGBM] [Info] Number of data points in the train set: 2138, number of used features: 674
[LightGBM] [Info] Start training from score 1742.909261
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171554
[LightGBM] [Info] Number of data points in the train set: 2138, number of used features: 674
[LightGBM] [Inf

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading f:\WattPredictor\artifacts/model_trainer/model.joblib: 0.000%|          | 0/519677 elapsed<00:00 rem…

Uploading f:\WattPredictor\input_example.json: 0.000%|          | 0/5256 elapsed<00:00 remaining<?

Uploading f:\WattPredictor\model_schema.json: 0.000%|          | 0/61417 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1240214/models/wattpredictor_lightgbm/2
[2025-07-17 16:58:25,150: INFO: 2787390486: Best model registered: {'model_name': 'LightGBM', 'score': 420.0868412047661, 'params': {'num_leaves': 72, 'learning_rate': 0.13759360434821385, 'n_estimators': 77}}]
