In [1]:
import os
import sys
# Notebook bootstrap: step from the notebook's folder up to its parent
# directory, then register that directory's "src" sub-folder on the import
# path so the project packages can be imported by the cells below.
os.chdir('../')
project_src = os.path.join(os.getcwd(), "src")
sys.path.append(project_src)

In [2]:
import os
import sys
import optuna
import joblib
import pandas as pd
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from datetime import datetime, timedelta
from WattPredictor.utils.helpers import *
from WattPredictor.utils.ts_generator import features_and_target, get_pipeline, average_demand_last_4_weeks
from WattPredictor.config.model_config import ModelConfigurationManager
from WattPredictor.utils.feature import feature_store_instance
from WattPredictor.entity.config_entity import TrainerConfig
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from WattPredictor.utils.exception import CustomException
from WattPredictor.utils.logging import logger

class Trainer:
    """Tune, select, persist and register a demand-forecasting pipeline.

    For every entry in ``self.models`` an Optuna study searches that entry's
    hyper-parameter space, the best parameters are scored with K-fold
    cross-validation (RMSE), and the candidate with the lowest score is refit
    on all training features, written to disk and uploaded to the model
    registry of the feature store's project.
    """

    def __init__(self, config: TrainerConfig):
        """Store the configuration and connect to the feature store.

        Args:
            config: Trainer settings. This class reads ``input_seq_len``,
                ``step_size``, ``n_trials``, ``cv_folds``, ``root_dir`` and
                ``model_name`` from it.
        """
        self.config = config
        self.feature_store = feature_store_instance()
        # Candidate models keyed by the name passed to ``get_pipeline``.
        # Each "search_space" maps an Optuna trial to keyword arguments
        # that are forwarded to ``get_pipeline``.
        self.models = {
            "XGBoost": {
                "search_space": lambda trial: {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 3, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                }
            },
            "LightGBM": {
                "search_space": lambda trial: {
                    "num_leaves": trial.suggest_int("num_leaves", 20, 150),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                }
            }
        }

    def load_training_data(self):
        """Fetch the training view and return the modelling columns sorted by date.

        Returns:
            A DataFrame restricted to the columns used for modelling, ordered
            by ``date``.

        Raises:
            CustomException: If reading or preparing the data fails; the
                original error is attached as ``__cause__``.
        """
        try:
            df, _ = self.feature_store.get_training_data("elec_wx_features_view")
            columns = ['date', 'demand', 'sub_region_code', 'temperature_2m',
                       'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']
            # Sort into a new frame rather than in place: an in-place sort on
            # a column subset is the pattern that triggers pandas'
            # chained-assignment warnings.
            df = df[columns].sort_values("date")
            logger.info(f"Loaded training data with columns: {list(df.columns)}")
            logger.info(f"Data range: {df['date'].min()} to {df['date'].max()}")
            return df
        except Exception as e:
            raise CustomException(f"Failed to load data from elec_wx_features_view: {e}", sys) from e

    def _warn_on_short_regions(self, df):
        """Log a warning for each sub-region with fewer than ``input_seq_len + 1`` rows."""
        min_points = self.config.input_seq_len + 1
        # A single counting pass instead of re-filtering the frame per region.
        counts = df['sub_region_code'].value_counts(sort=False, dropna=False)
        for code, count in counts[counts < min_points].items():
            logger.warning(f"sub_region_code {code} has insufficient data points ({count} < {min_points})")

    @staticmethod
    def _non_numeric_columns(frame):
        """Return the labels of columns whose dtype is neither numeric nor boolean."""
        # ``is_numeric_dtype`` accepts every integer/float width (and bool), so
        # int32/float32 columns are no longer misreported as non-numeric.
        return [
            column
            for column, dtype in frame.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dtype)
        ]

    def _tune_and_score(self, model_name, search_space, train_x, train_y):
        """Run the Optuna search for one model and cross-validate its best parameters.

        Args:
            model_name: Value passed to ``get_pipeline`` as ``model_type``.
            search_space: Callable mapping an Optuna trial to pipeline kwargs.
            train_x: Training features.
            train_y: Training target.

        Returns:
            ``(best_params, score)`` where ``score`` is the mean RMSE over the
            ``cv_folds`` K-fold splits.
        """

        def objective(trial):
            params = search_space(trial)
            pipeline = get_pipeline(model_type=model_name, **params)
            # The split stays inside the trial so every fit receives freshly
            # sliced frames. NOTE(review): pipeline steps may modify their
            # input in place - confirm before hoisting this out of the trial.
            x_tr, x_val, y_tr, y_val = train_test_split(train_x, train_y, test_size=0.2, shuffle=False)
            pipeline.fit(x_tr, y_tr)
            preds = pipeline.predict(x_val)
            # Trials minimise MSE; model selection below compares RMSE.
            return mean_squared_error(y_val, preds)

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=self.config.n_trials)

        best_params = study.best_params
        pipeline = get_pipeline(model_type=model_name, **best_params)
        # NOTE(review): unshuffled KFold yields contiguous row blocks, not a
        # time-aware split - confirm this is intended for this data.
        fold_scores = cross_val_score(pipeline, train_x, train_y, cv=KFold(n_splits=self.config.cv_folds),
                                      scoring="neg_root_mean_squared_error")
        # The scorer returns negated RMSE; flip the sign back.
        return best_params, -fold_scores.mean()

    def _register_model(self, best_overall, model_path, input_example, train_y):
        """Build the model schema and upload the saved model to the registry."""
        input_schema = Schema(input_example)
        output_schema = Schema(pd.DataFrame(train_y))
        model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

        model_registry = self.feature_store.project.get_model_registry()
        hops_model = model_registry.python.create_model(
            name=f"wattpredictor_{best_overall['model_name'].lower()}",
            metrics={'rmse': best_overall["score"]},
            input_example=input_example,
            model_schema=model_schema,
            description=f"Best {best_overall['model_name']} model with feature engineering pipeline for electricity demand"
        )
        hops_model.save(model_path.as_posix())

    def train(self):
        """Run the full workflow: load, validate, tune, select, fit, save, register.

        Returns:
            A dict with keys ``model_name``, ``score`` (cross-validated RMSE)
            and ``params`` describing the selected model.

        Raises:
            CustomException: If any step fails; the original error is attached
                as ``__cause__``.
        """
        try:
            df = self.load_training_data()
            if df.empty:
                raise CustomException("Loaded DataFrame is empty", sys)

            # Validate data per sub_region_code
            self._warn_on_short_regions(df)

            # Rows from the last 90 days form test_df. Within this method it
            # is only logged and checked for emptiness; all fitting uses
            # train_df.
            cutoff_date = (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d")
            train_df, test_df = df[df['date'] < cutoff_date], df[df['date'] >= cutoff_date]

            logger.info(f"train_df shape: {train_df.shape}, date range: {train_df['date'].min()} to {train_df['date'].max()}")
            logger.info(f"test_df shape: {test_df.shape}, date range: {test_df['date'].min()} to {test_df['date'].max()}")

            if train_df.empty:
                raise CustomException("Training DataFrame is empty after applying cutoff_date", sys)
            if test_df.empty:
                raise CustomException("Test DataFrame is empty after applying cutoff_date", sys)

            train_x, train_y = features_and_target(train_df, self.config.input_seq_len, self.config.step_size)
            train_x = train_x.drop(columns=["date"], errors="ignore")

            # Validate dtypes
            non_numeric_cols = self._non_numeric_columns(train_x)
            if non_numeric_cols:
                raise CustomException(f"Non-numeric columns found in train_x: {non_numeric_cols}", sys)

            # Validate expected features
            expected_features = [f'demand_previous_{lag}_hour' for lag in range(self.config.input_seq_len, 0, -1)] + \
                                ['temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']
            missing_features = [col for col in expected_features if col not in train_x.columns]
            if missing_features:
                logger.warning(f"Missing expected features in train_x: {missing_features}. Proceeding with available features.")

            # Preprocess a copy of train_x for the registry's input schema and
            # example; the pipelines themselves are fitted on train_x.
            train_x_transformed = average_demand_last_4_weeks(train_x.copy())

            best_overall = {"model_name": None, "score": float("inf"), "params": None}

            for model_name, model_info in self.models.items():
                logger.info(f"Running Optuna HPO for {model_name}")
                best_params, score = self._tune_and_score(
                    model_name, model_info["search_space"], train_x, train_y
                )

                if score < best_overall["score"]:
                    best_overall.update({
                        "model_name": model_name,
                        "score": score,
                        "params": best_params
                    })

            # Nothing selected means no candidates or only NaN scores; fail
            # with a clear message instead of a TypeError from get_pipeline.
            if best_overall["model_name"] is None:
                raise CustomException("No candidate model produced a valid cross-validation score", sys)

            final_pipeline = get_pipeline(model_type=best_overall["model_name"], **best_overall["params"])
            final_pipeline.fit(train_x, train_y)

            model_path = Path(self.config.root_dir) / self.config.model_name
            create_directories([model_path.parent])
            save_bin(final_pipeline, model_path)

            self._register_model(best_overall, model_path, train_x_transformed.head(10), train_y)

            logger.info(f"Best model registered: {best_overall['model_name']} with RMSE {best_overall['score']}, params {best_overall['params']}")
            return best_overall

        except Exception as e:
            raise CustomException(f"Training failed: {e}", sys) from e

In [3]:
# Driver cell: build the trainer configuration, construct the Trainer and run
# its train() workflow. Any failure is re-raised as a CustomException chained
# to the original error.
try:
    config = ModelConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = Trainer(model_trainer_config)
    model_trainer.train()
except Exception as exc:
    raise CustomException(exc, sys) from exc

[2025-07-20 18:44:46,994: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-20 18:44:46,999: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-20 18:44:47,003: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-20 18:44:47,004: INFO: helpers: created directory at: artifacts]
[2025-07-20 18:44:47,005: INFO: helpers: created directory at: artifacts/trainer]
[2025-07-20 18:44:47,014: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-20 18:44:47,017: INFO: external: Initializing external client]
[2025-07-20 18:44:47,018: INFO: external: Base URL: https://c.app.hopsworks.ai:443]
[2025-07-20 18:44:49,902: INFO: python: Python Engine initialized.]

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1240214
[2025-07-20 18:44:52,592: INFO: feature_store: Connected to Hopsworks Feature Store: WattPredictor]
Finished: Reading data from Hopsworks, using Hopswor

Generating TS features:   0%|          | 0/11 [00:00<?, ?it/s]

[2025-07-20 18:45:08,512: INFO: ts_generator: Columns for sub_region_code 7: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:   9%|▉         | 1/11 [00:00<00:02,  4.00it/s]

[2025-07-20 18:45:08,761: INFO: ts_generator: Columns for sub_region_code 6: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:  18%|█▊        | 2/11 [00:00<00:01,  5.27it/s]

[2025-07-20 18:45:08,909: INFO: ts_generator: Columns for sub_region_code 1: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:  27%|██▋       | 3/11 [00:00<00:01,  5.54it/s]

[2025-07-20 18:45:09,081: INFO: ts_generator: Columns for sub_region_code 2: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:  36%|███▋      | 4/11 [00:00<00:01,  5.37it/s]

[2025-07-20 18:45:09,273: INFO: ts_generator: Columns for sub_region_code 0: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:  45%|████▌     | 5/11 [00:00<00:01,  5.83it/s]

[2025-07-20 18:45:09,423: INFO: ts_generator: Columns for sub_region_code 4: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:  55%|█████▍    | 6/11 [00:01<00:00,  5.61it/s]

[2025-07-20 18:45:09,610: INFO: ts_generator: Columns for sub_region_code 3: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:  64%|██████▎   | 7/11 [00:01<00:00,  5.90it/s]

[2025-07-20 18:45:09,761: INFO: ts_generator: Columns for sub_region_code 9: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:  73%|███████▎  | 8/11 [00:01<00:00,  5.92it/s]

[2025-07-20 18:45:09,932: INFO: ts_generator: Columns for sub_region_code 10: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:  82%|████████▏ | 9/11 [00:01<00:00,  6.09it/s]

[2025-07-20 18:45:10,082: INFO: ts_generator: Columns for sub_region_code 5: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features:  91%|█████████ | 10/11 [00:01<00:00,  5.92it/s]

[2025-07-20 18:45:10,262: INFO: ts_generator: Columns for sub_region_code 8: ['date', 'demand', 'temperature_2m', 'hour', 'day_of_week', 'month', 'is_weekend', 'is_holiday']]


Generating TS features: 100%|██████████| 11/11 [00:01<00:00,  5.65it/s]

[2025-07-20 18:45:10,454: INFO: ts_generator: Features columns: ['demand_previous_672_hour', 'demand_previous_671_hour', 'demand_previous_670_hour', 'demand_previous_669_hour', 'demand_previous_668_hour', 'demand_previous_667_hour', 'demand_previous_666_hour', 'demand_previous_665_hour', 'demand_previous_664_hour', 'demand_previous_663_hour', 'demand_previous_662_hour', 'demand_previous_661_hour', 'demand_previous_660_hour', 'demand_previous_659_hour', 'demand_previous_658_hour', 'demand_previous_657_hour', 'demand_previous_656_hour', 'demand_previous_655_hour', 'demand_previous_654_hour', 'demand_previous_653_hour', 'demand_previous_652_hour', 'demand_previous_651_hour', 'demand_previous_650_hour', 'demand_previous_649_hour', 'demand_previous_648_hour', 'demand_previous_647_hour', 'demand_previous_646_hour', 'demand_previous_645_hour', 'demand_previous_644_hour', 'demand_previous_643_hour', 'demand_previous_642_hour', 'demand_previous_641_hour', 'demand_previous_640_hour', 'demand_pre


[I 2025-07-20 18:45:10,477] A new study created in memory with name: no-name-bcb62c8f-9bcb-4caa-a00e-8e05f5a71053
[I 2025-07-20 18:45:52,355] Trial 0 finished with value: 1920.5953278229097 and parameters: {'n_estimators': 85, 'max_depth': 10, 'learning_rate': 0.22004247289851087}. Best is trial 0 with value: 1920.5953278229097.


[2025-07-20 18:48:18,471: INFO: 2453969390: Running Optuna HPO for LightGBM]


[I 2025-07-20 18:48:18,495] A new study created in memory with name: no-name-f67939cd-8c4f-4e62-b22e-09108079b80e


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012006 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171881
[LightGBM] [Info] Number of data points in the train set: 2270, number of used features: 678
[LightGBM] [Info] Start training from score 1657.170044


[I 2025-07-20 18:48:45,427] Trial 0 finished with value: 1303.827559258936 and parameters: {'num_leaves': 69, 'learning_rate': 0.04171679963012042, 'n_estimators': 256}. Best is trial 0 with value: 1303.827559258936.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171881
[LightGBM] [Info] Number of data points in the train set: 2270, number of used features: 678
[LightGBM] [Info] Start training from score 1745.764317
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171881
[LightGBM] [Info] Number of data points in the train set: 2270, number of used features: 678
[LightGBM] [Info] Start training from score 1553.943172
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171881
[LightGBM] [Info] Number of data points in the train set: 2270, number of used features: 678
[LightGBM] [Inf

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading f:\WattPredictor\artifacts/trainer/model.joblib: 0.000%|          | 0/1590937 elapsed<00:00 remainin…

Uploading f:\WattPredictor\input_example.json: 0.000%|          | 0/4744 elapsed<00:00 remaining<?

Uploading f:\WattPredictor\model_schema.json: 0.000%|          | 0/61796 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1240214/models/wattpredictor_lightgbm/2
[2025-07-20 18:51:00,427: INFO: 2453969390: Best model registered: LightGBM with RMSE 497.51533190325415, params {'num_leaves': 69, 'learning_rate': 0.04171679963012042, 'n_estimators': 256}]
