In [1]:
import os
import sys
# Point the working directory at the folder that contains `src/` and expose
# that `src/` folder on sys.path (presumably so the `WattPredictor` imports in
# the following cell resolve -- confirm against the repository layout).
#
# The relative chdir is guarded so the cell is idempotent: re-running it must
# not climb one more directory level each time, nor append duplicate entries
# to sys.path.
_src_dir = os.path.join(os.getcwd(), "src")
if not os.path.isdir(_src_dir):
    # No `src/` beside the current directory: step up to the parent once.
    os.chdir('../')
    _src_dir = os.path.join(os.getcwd(), "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [2]:
from dataclasses import dataclass
from pathlib import Path
from WattPredictor.utils.helpers import *
from WattPredictor.utils.exception import *
from WattPredictor.constants import *
from WattPredictor import logger

In [3]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Settings consumed by the model-training stage.

    Frozen (like ``FeatureStoreConfig``) so a configuration object cannot be
    modified by accident after it has been built from the YAML files.
    """
    root_dir: Path        # directory under which the trained model file is saved
    input_seq_len: int    # forwarded to `features_and_target` as its second argument
    step_size: int        # forwarded to `features_and_target` as its third argument
    n_trials: int         # number of Optuna trials run per candidate model
    cutoff_date: str      # rows with `date` strictly before this value are used for training
    model_name: Path      # file name (relative to `root_dir`) of the saved model

@dataclass(frozen=True)
class FeatureStoreConfig:
    """Immutable connection settings used to log in to Hopsworks."""
    # Passed as `project=` to `hopsworks.login` by `FeatureStore.connect`.
    hopsworks_project_name: str
    # Passed as `api_key_value=` to `hopsworks.login`; `ConfigurationManager`
    # fills it from the `hopsworks_api_key` environment variable.
    # NOTE(review): the default dataclass repr includes this value -- avoid
    # logging or displaying instances of this class.
    hopsworks_api_key: str

In [None]:
class ConfigurationManager:
    """Loads the YAML config, params and schema files and builds stage configs."""

    def __init__(self, config_filepath=CONFIG_PATH,
                       params_filepath=PARAMS_PATH,
                       schema_filepath=SCHEMA_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the main configuration YAML.
            params_filepath: path of the parameters YAML.
            schema_filepath: path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` and create its output directory.

        Paths come from the ``model_trainer`` section of the config file;
        tuning values come from the ``training`` section of the params file.
        """
        trainer_section = self.config.model_trainer
        training_params = self.params.training

        output_dir = trainer_section.root_dir
        create_directories([output_dir])

        return ModelTrainerConfig(
            root_dir=Path(output_dir),
            input_seq_len=training_params.input_seq_len,
            step_size=training_params.step_size,
            n_trials=training_params.n_trials,
            cutoff_date=training_params.cutoff_date,
            model_name=Path(trainer_section.model_name),
        )

    def get_feature_store_config(self) -> FeatureStoreConfig:
        """Build the ``FeatureStoreConfig``.

        The project name is read from the ``feature_store`` section of the
        config file; the API key is read from the ``hopsworks_api_key``
        environment variable.

        Raises:
            KeyError: if the ``hopsworks_api_key`` environment variable is unset.
        """
        store_section = self.config.feature_store
        project_name = store_section.hopsworks_project_name
        api_key = os.environ['hopsworks_api_key']

        return FeatureStoreConfig(
            hopsworks_project_name=project_name,
            hopsworks_api_key=api_key,
        )

In [5]:
import hopsworks
import pandas as pd
import sys
from WattPredictor.utils.exception import CustomException
from WattPredictor import logger

class FeatureStore:
    """Wrapper around a Hopsworks project for feature groups, feature views,
    training datasets, online lookups and file uploads.

    Every method re-raises failures as ``CustomException`` with the original
    exception chained as its cause. All feature groups and feature views are
    addressed at version 1.
    """

    def __init__(self, config):
        """Keep ``config`` and log in to Hopsworks immediately.

        Args:
            config: object exposing ``hopsworks_project_name`` and
                ``hopsworks_api_key`` attributes.
        """
        self.config = config
        self.connect()

    def connect(self):
        """Log in and cache the project, feature store and dataset API handles.

        Raises:
            CustomException: if the login or any handle lookup fails.
        """
        try:
            self.project = hopsworks.login(
                project=self.config.hopsworks_project_name,
                api_key_value=self.config.hopsworks_api_key
            )
            self.feature_store = self.project.get_feature_store()
            self.dataset_api = self.project.get_dataset_api()
            logger.info(f"Connected to Hopsworks Feature Store: {self.config.hopsworks_project_name}")
        except Exception as e:
            raise CustomException(e, sys) from e

    def create_feature_group(self, name, df, primary_key, event_time, description, online=False):
        """Insert ``df`` into feature group ``name``, creating the group if needed.

        Args:
            name: feature group name.
            df: data handed to ``fg.insert``.
            primary_key: forwarded to ``create_feature_group`` for a new group.
            event_time: forwarded to ``create_feature_group`` for a new group.
            description: forwarded to ``create_feature_group`` for a new group.
            online: forwarded as ``online_enabled`` for a new group.

        Raises:
            CustomException: if creation or insertion fails.
        """
        try:
            try:
                fg = self.feature_store.get_feature_group(name=name, version=1)
            except Exception as lookup_error:
                # Catch ordinary errors only: a bare ``except`` would also
                # swallow KeyboardInterrupt / SystemExit. Keep the reason
                # visible instead of silently discarding it.
                logger.info(f"Feature Group '{name}' lookup failed: {lookup_error}")
                fg = None

            # A missing group is handled whether the client signals it by
            # raising or by returning None (behaviour may differ between
            # client versions -- TODO confirm for the installed release).
            if fg is None:
                logger.info(f"Feature Group '{name}' not found. Creating new one...")
                fg = self.feature_store.create_feature_group(
                    name=name,
                    version=1,
                    primary_key=primary_key,
                    event_time=event_time,
                    description=description,
                    online_enabled=online
                )
            else:
                logger.info(f"Feature Group '{name}' already exists. Re-inserting data...")

            fg.insert(df, write_options={"wait_for_job": True})
            logger.info(f"Feature Group '{name}' is ready and data inserted.")

        except Exception as e:
            raise CustomException(e, sys) from e

    def create_feature_view(self, name, feature_group_name, features):
        """(Re)create feature view ``name`` over selected features of a group.

        Any existing view with that name is deleted first; a failure during
        that lookup/delete step is only logged as a warning.

        Args:
            name: feature view name.
            feature_group_name: feature group the view selects from.
            features: passed to ``fg.select`` to build the view's query.

        Raises:
            CustomException: if the feature group lookup or view creation fails.
        """
        try:
            try:
                existing_fv = self.feature_store.get_feature_view(name=name, version=1)
                existing_fv.delete()
                logger.info(f"Deleted existing Feature View '{name}' for clean recreation.")
            except Exception as delete_error:
                # Best effort: reaching here means the view could not be
                # fetched or deleted; creation below is attempted regardless.
                logger.warning(f"No existing Feature View to delete: {delete_error}")

            fg = self.feature_store.get_feature_group(name=feature_group_name, version=1)
            query = fg.select(features)
            self.feature_store.create_feature_view(
                name=name,
                version=1,
                query=query,
                description=f"Feature View for {feature_group_name}"
            )
            logger.info(f"Feature View '{name}' created successfully.")

        except Exception as e:
            raise CustomException(e, sys) from e

    def save_training_dataset(self, feature_view_name, version_description, output_format="csv"):
        """Create a training dataset from a feature view and wait for the job.

        Args:
            feature_view_name: name of the feature view to materialise.
            version_description: forwarded as ``description``.
            output_format: forwarded as ``data_format`` (default ``"csv"``).

        Raises:
            CustomException: if the lookup or the creation fails.
        """
        try:
            fv = self.feature_store.get_feature_view(name=feature_view_name, version=1)
            fv.create_training_data(
                description=version_description,
                data_format=output_format,
                write_options={"wait_for_job": True}
            )
            logger.info(f"Training dataset version for Feature View '{feature_view_name}' created.")
        except Exception as e:
            raise CustomException(e, sys) from e

    def load_latest_training_dataset(self, feature_view_name):
        """Return the result of ``training_data()`` on the named feature view.

        The caller in this notebook unpacks the result as a 2-tuple and keeps
        only the first element.

        Raises:
            CustomException: if the lookup or the read fails.
        """
        try:
            fv = self.feature_store.get_feature_view(name=feature_view_name, version=1)
            return fv.training_data()
        except Exception as e:
            raise CustomException(e, sys) from e

    def get_online_features(self, feature_view_name, key_dict: dict):
        """Return ``get_online_features(key_dict)`` from the named feature view.

        Raises:
            CustomException: if the lookup or the query fails.
        """
        try:
            fv = self.feature_store.get_feature_view(name=feature_view_name, version=1)
            return fv.get_online_features(key_dict)
        except Exception as e:
            raise CustomException(e, sys) from e

    def upload_file_safely(self, local_path: str, target_name: str):
        """Upload a local file under ``Resources/wattpredictor_artifacts/``.

        Existing content at the destination is overwritten.

        Args:
            local_path: path of the file to upload.
            target_name: last path component of the upload destination.

        Raises:
            CustomException: if the upload fails.
        """
        try:
            self.dataset_api.upload(
                local_path,
                f"Resources/wattpredictor_artifacts/{target_name}",
                overwrite=True
            )
            logger.info(f"Uploaded file to Feature Store: {target_name}")
        except Exception as e:
            raise CustomException(e, sys) from e

In [6]:
import os
import sys
import optuna
import joblib
import numpy as np
import pandas as pd
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from WattPredictor.utils.helpers import create_directories, save_bin
from WattPredictor.utils.ts_generator import features_and_target
from WattPredictor.utils.exception import CustomException
from WattPredictor import logger


class ModelTrainer:
    """Tunes candidate regressors with Optuna, keeps the best one, saves it
    locally and registers it in the Hopsworks model registry."""

    def __init__(self, config: ModelTrainerConfig, feature_store_config: FeatureStoreConfig):
        """Connect to the feature store and declare the candidate models.

        Args:
            config: training settings (sequence length, step size, trial
                count, cutoff date, output location).
            feature_store_config: credentials used to build ``FeatureStore``.
        """
        self.config = config
        self.feature_store = FeatureStore(feature_store_config)

        # Each candidate maps to its estimator class and a callable that
        # samples a hyper-parameter dict from an Optuna trial.
        self.models = {
            "XGBoost": {
                "class": XGBRegressor,
                "search_space": lambda trial: {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 3, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                }
            },
            "LightGBM": {
                "class": LGBMRegressor,
                "search_space": lambda trial: {
                    "num_leaves": trial.suggest_int("num_leaves", 20, 150),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                }
            }
        }

    def load_training_data(self):
        """Load the training data, keep the four modelling columns, sort by date.

        Returns:
            A new DataFrame with columns ``date``, ``demand``,
            ``sub_region_code`` and ``temperature_2m``, ordered by ``date``.

        Raises:
            CustomException: if loading or column selection fails.
        """
        try:
            df, _ = self.feature_store.load_latest_training_dataset("elec_wx_features_view")
            columns = ['date', 'demand', 'sub_region_code', 'temperature_2m']
            # Sort into a fresh frame: sorting the column slice in place can
            # trigger pandas' SettingWithCopyWarning.
            return df[columns].sort_values("date")
        except Exception as e:
            raise CustomException(e, sys) from e

    @staticmethod
    def _make_objective(model_info, x_tr, y_tr, x_val, y_val):
        """Build the Optuna objective (hold-out MSE) for one candidate model."""
        def objective(trial):
            params = model_info["search_space"](trial)
            model = model_info["class"](**params)
            model.fit(x_tr, y_tr)
            preds = model.predict(x_val)
            return mean_squared_error(y_val, preds)

        return objective

    def train(self):
        """Tune every candidate, refit the winner, save and register it.

        Steps: build features from rows dated before ``cutoff_date``; run
        ``n_trials`` Optuna trials per candidate on an unshuffled 80/20 split;
        compare candidates by 5-fold cross-validated RMSE; refit the best on
        all training rows; save it and register it in the model registry.

        Returns:
            dict with keys ``model_name``, ``score`` (cross-validated RMSE)
            and ``params`` describing the selected model.

        Raises:
            CustomException: wrapping any failure, including the case where
                no candidate produced a comparable score.
        """
        try:
            df = self.load_training_data()
            train_df = df[df['date'] < self.config.cutoff_date]

            train_x, train_y = features_and_target(train_df, self.config.input_seq_len, self.config.step_size)
            train_x = train_x.drop(columns=["date"], errors="ignore")

            # The split is deterministic (shuffle=False), so compute it once
            # rather than once per trial.
            x_tr, x_val, y_tr, y_val = train_test_split(train_x, train_y, test_size=0.2, shuffle=False)

            best_overall = {"model_name": None, "score": float("inf"), "params": None}

            for model_name, model_info in self.models.items():
                logger.info(f"Running Optuna HPO for {model_name}")

                objective = self._make_objective(model_info, x_tr, y_tr, x_val, y_val)
                # NOTE(review): neither the Optuna sampler nor the estimators
                # are seeded, so results vary between runs.
                study = optuna.create_study(direction="minimize")
                study.optimize(objective, n_trials=self.config.n_trials)

                best_params = study.best_params
                model = model_info["class"](**best_params)
                # NOTE(review): plain KFold also validates on folds that
                # precede some training rows; if the feature rows are
                # time-ordered, a time-series split may be more appropriate.
                score = -cross_val_score(model, train_x, train_y, cv=KFold(n_splits=5), scoring="neg_root_mean_squared_error").mean()

                if score < best_overall["score"]:
                    best_overall.update({
                        "model_name": model_name,
                        "score": score,
                        "params": best_params
                    })

            # A NaN score never compares below infinity, so nothing may have
            # been selected; fail with a clear message, not a KeyError on None.
            if best_overall["model_name"] is None:
                raise ValueError("No candidate model produced a comparable cross-validation score.")

            final_model_class = self.models[best_overall["model_name"]]["class"]
            final_model = final_model_class(**best_overall["params"])
            final_model.fit(train_x, train_y)

            model_path = Path(self.config.root_dir) / self.config.model_name
            create_directories([model_path.parent])
            save_bin(final_model, model_path)

            # Create schema from training data
            input_schema = Schema(train_x)
            output_schema = Schema(pd.DataFrame(train_y))
            model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

            model_registry = self.feature_store.project.get_model_registry()
            hops_model = model_registry.python.create_model(
                name="wattpredictor_" + best_overall["model_name"].lower(),
                input_example=train_x.head(2),
                model_schema=model_schema,
                description="Best model trained on electricity demand"
            )
            hops_model.save(model_path.as_posix())

            logger.info(f"Best model registered: {best_overall}")
            return best_overall

        except Exception as e:
            raise CustomException(e, sys) from e


In [7]:
# Driver cell: build the stage configurations, then run training end to end.
try:    
    # NOTE: `config` here is the ConfigurationManager instance itself.
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    feature_store_config = config.get_feature_store_config() 
    # Constructing ModelTrainer also logs in to Hopsworks (via FeatureStore).
    model_trainer = ModelTrainer(config=model_trainer_config,feature_store_config=feature_store_config)
    model_trainer.train()

except Exception as e:
    # ModelTrainer.train already wraps its own failures in CustomException,
    # so errors coming from it are wrapped a second time here; the original
    # exception stays reachable through the `from e` chain.
    raise CustomException(e, sys) from e

[2025-07-12 17:43:24,078: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]


[2025-07-12 17:43:24,101: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-12 17:43:24,103: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-12 17:43:24,109: INFO: helpers: created directory at: artifacts]
[2025-07-12 17:43:24,111: INFO: helpers: created directory at: artifacts/model_trainer]
[2025-07-12 17:43:24,117: INFO: external: Initializing external client]
[2025-07-12 17:43:24,118: INFO: external: Base URL: https://c.app.hopsworks.ai:443]
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'
]






[2025-07-12 17:43:26,911: INFO: python: Python Engine initialized.]

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1237149
[2025-07-12 17:43:28,676: INFO: 1043919477: Connected to Hopsworks Feature Store: JavithNaseem]
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (28.23s) 
]


Generating TS features: 100%|██████████| 11/11 [00:00<00:00, 21.48it/s]

[2025-07-12 17:44:04,343: INFO: 1015732353: Running Optuna HPO for XGBoost]



[I 2025-07-12 17:44:04,343] A new study created in memory with name: no-name-f11ae2b4-4e60-4bcb-9ab6-c2a3a8ee5aed
[I 2025-07-12 17:44:18,360] Trial 0 finished with value: 1744714.4885716715 and parameters: {'n_estimators': 110, 'max_depth': 7, 'learning_rate': 0.13984864958341217}. Best is trial 0 with value: 1744714.4885716715.
[I 2025-07-12 17:44:25,411] Trial 1 finished with value: 1673640.4811808087 and parameters: {'n_estimators': 219, 'max_depth': 4, 'learning_rate': 0.20278927198057162}. Best is trial 1 with value: 1673640.4811808087.
[I 2025-07-12 17:44:45,694] Trial 2 finished with value: 1874404.94928579 and parameters: {'n_estimators': 176, 'max_depth': 7, 'learning_rate': 0.05927084491512082}. Best is trial 1 with value: 1673640.4811808087.
[I 2025-07-12 17:44:51,184] Trial 3 finished with value: 1735456.4448541054 and parameters: {'n_estimators': 109, 'max_depth': 5, 'learning_rate': 0.09445412504580998}. Best is trial 1 with value: 1673640.4811808087.
[I 2025-07-12 17:45

[2025-07-12 17:45:41,049: INFO: 1015732353: Running Optuna HPO for LightGBM]


[I 2025-07-12 17:45:41,065] A new study created in memory with name: no-name-9c88847a-82fe-490c-8850-844f1499ea84


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171519
[LightGBM] [Info] Number of data points in the train set: 1610, number of used features: 674
[LightGBM] [Info] Start training from score 1557.878882


[I 2025-07-12 17:45:49,118] Trial 0 finished with value: 13104.74000242937 and parameters: {'num_leaves': 27, 'learning_rate': 0.141282004272215, 'n_estimators': 137}. Best is trial 0 with value: 13104.74000242937.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171519
[LightGBM] [Info] Number of data points in the train set: 1610, number of used features: 674
[LightGBM] [Info] Start training from score 1557.878882


[I 2025-07-12 17:45:52,958] Trial 1 finished with value: 21756.812617736417 and parameters: {'num_leaves': 134, 'learning_rate': 0.030979358721963363, 'n_estimators': 78}. Best is trial 0 with value: 13104.74000242937.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171519
[LightGBM] [Info] Number of data points in the train set: 1610, number of used features: 674
[LightGBM] [Info] Start training from score 1557.878882


[I 2025-07-12 17:46:08,167] Trial 2 finished with value: 13528.214995500874 and parameters: {'num_leaves': 68, 'learning_rate': 0.12944120293578049, 'n_estimators': 283}. Best is trial 0 with value: 13104.74000242937.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012386 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171519
[LightGBM] [Info] Number of data points in the train set: 1610, number of used features: 674
[LightGBM] [Info] Start training from score 1557.878882


[I 2025-07-12 17:46:19,867] Trial 3 finished with value: 13587.923236627577 and parameters: {'num_leaves': 114, 'learning_rate': 0.28338779994954005, 'n_estimators': 214}. Best is trial 0 with value: 13104.74000242937.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171519
[LightGBM] [Info] Number of data points in the train set: 1610, number of used features: 674
[LightGBM] [Info] Start training from score 1557.878882


[I 2025-07-12 17:46:26,191] Trial 4 finished with value: 12147.725883642057 and parameters: {'num_leaves': 55, 'learning_rate': 0.1713761891911118, 'n_estimators': 132}. Best is trial 4 with value: 12147.725883642057.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171518
[LightGBM] [Info] Number of data points in the train set: 1610, number of used features: 674
[LightGBM] [Info] Start training from score 1265.234161
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171519
[LightGBM] [Info] Number of data points in the train set: 1610, number of used features: 674
[LightGBM] [Info] Start training from score 1748.327329
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171519
[LightGBM] [Info] Number of data points in the train set: 1610, number of used features: 674
[LightGBM] [Inf

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading f:\WattPredictor\artifacts/model_trainer/model.joblib: 0.000%|          | 0/680299 elapsed<00:00 rem…

Uploading f:\WattPredictor\input_example.json: 0.000%|          | 0/5256 elapsed<00:00 remaining<?

Uploading f:\WattPredictor\model_schema.json: 0.000%|          | 0/61417 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1237149/models/wattpredictor_lightgbm/1
[2025-07-12 17:47:23,854: INFO: 1015732353: Best model registered: {'model_name': 'LightGBM', 'score': 417.9229396728917, 'params': {'num_leaves': 55, 'learning_rate': 0.1713761891911118, 'n_estimators': 132}}]
