In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.component.feature_group_config as config
from comet_ml import Experiment




In [3]:
import hopsworks

# connect to the project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

2025-04-05 18:51:18,179 INFO: Initializing external client
2025-04-05 18:51:18,180 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2025-04-05 18:51:23,575 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1213640


In [4]:
# create feature view (if it doesn't exist yet)
# This feature view only uses on feature group, so the query is trivial
try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')


# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view already existed. Skip creation.


In [5]:
config.FEATURE_VIEW_NAME

'electricity_demand_feature_view'

In [6]:
data, _ = feature_view.training_data(
    description='Time-series hourly electricity demand values',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (65.49s) 



In [7]:
print(data.columns)


Index(['date', 'sub_region_code', 'demand', 'temperature_2m', 'seconds'], dtype='object')


In [8]:
# drop `date` column
data.drop('seconds', axis=1, inplace=True)

# sort by `pickup_location_id` and `pickup_hour`
data.sort_values(by=['sub_region_code', 'date'], inplace=True)
data

Unnamed: 0,date,sub_region_code,demand,temperature_2m
134351,2023-01-01 04:00:00+00:00,0,1537,15.808500
53740,2023-01-01 05:00:00+00:00,0,1477,14.608500
183640,2023-01-01 06:00:00+00:00,0,1428,14.458500
12337,2023-01-01 07:00:00+00:00,0,1392,14.458500
136041,2023-01-01 08:00:00+00:00,0,1366,14.608500
...,...,...,...,...
123264,2025-04-05 08:00:00+00:00,10,1669,15.408501
123195,2025-04-05 09:00:00+00:00,10,1675,16.608500
123239,2025-04-05 10:00:00+00:00,10,1718,18.958500
123243,2025-04-05 11:00:00+00:00,10,1789,20.008501


In [9]:
print("Min date:", data["date"].min())
print("Max date:", data["date"].max())


Min date: 2023-01-01 04:00:00+00:00
Max date: 2025-04-05 12:00:00+00:00


In [10]:
#transform the batch of data to features and target
from src.component.data_info import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    data,
    input_seq_len=24*28*1, # one month
    step_size=23,
)

features_and_target = features.copy()
features_and_target['target_demand_values_next_hour'] = targets

print(f'{features_and_target.shape=}')

100%|██████████| 11/11 [00:03<00:00,  3.18it/s]

features_and_target.shape=(9152, 676)





In [11]:
features_and_target

Unnamed: 0,demand_previous_672_hour,demand_previous_671_hour,demand_previous_670_hour,demand_previous_669_hour,demand_previous_668_hour,demand_previous_667_hour,demand_previous_666_hour,demand_previous_665_hour,demand_previous_664_hour,demand_previous_663_hour,...,demand_previous_6_hour,demand_previous_5_hour,demand_previous_4_hour,demand_previous_3_hour,demand_previous_2_hour,demand_previous_1_hour,date,sub_region_code,temperature_2m,target_demand_values_next_hour
0,1537.0,1477.0,1428.0,1392.0,1366.0,1353.0,1352.0,1361.0,1394.0,1429.0,...,1721.0,1786.0,1807.0,1776.0,1745.0,1732.0,2023-01-29 04:00:00+00:00,0,0.408500,1661.0
1,1572.0,1516.0,1450.0,1384.0,1351.0,1335.0,1334.0,1341.0,1375.0,1425.0,...,1716.0,1744.0,1796.0,1807.0,1789.0,1764.0,2023-01-30 03:00:00+00:00,0,2.158500,1719.0
2,1734.0,1664.0,1597.0,1530.0,1487.0,1455.0,1435.0,1437.0,1456.0,1510.0,...,1916.0,1910.0,1933.0,1983.0,1995.0,1968.0,2023-01-31 02:00:00+00:00,0,2.158500,1933.0
3,1826.0,1783.0,1718.0,1636.0,1549.0,1493.0,1459.0,1443.0,1440.0,1459.0,...,1916.0,1888.0,1862.0,1922.0,1994.0,2029.0,2023-02-01 01:00:00+00:00,0,3.458500,2004.0
4,1868.0,1816.0,1776.0,1731.0,1650.0,1556.0,1516.0,1481.0,1472.0,1471.0,...,1898.0,1882.0,1846.0,1873.0,1935.0,1994.0,2023-02-02 00:00:00+00:00,0,3.258500,2041.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9147,1946.0,2103.0,2262.0,2405.0,2368.0,2279.0,2150.0,1986.0,1832.0,1727.0,...,2099.0,2103.0,2057.0,2029.0,2115.0,2066.0,2025-03-31 21:00:00+00:00,10,10.358500,2037.0
9148,2245.0,2262.0,2315.0,2407.0,2484.0,2426.0,2336.0,2191.0,2019.0,1859.0,...,2079.0,2041.0,2037.0,2127.0,2226.0,2236.0,2025-04-01 20:00:00+00:00,10,9.608500,2231.0
9149,2188.0,2204.0,2234.0,2308.0,2415.0,2474.0,2424.0,2325.0,2182.0,2004.0,...,2221.0,2285.0,2343.0,2363.0,2393.0,2405.0,2025-04-02 19:00:00+00:00,10,10.058500,2418.0
9150,2049.0,2045.0,2018.0,2104.0,2232.0,2357.0,2436.0,2395.0,2314.0,2186.0,...,2175.0,2181.0,2134.0,2089.0,2020.0,2037.0,2025-04-03 18:00:00+00:00,10,11.508500,2006.0


In [12]:
features_and_target.date.min()

'2023-01-29 04:00:00+00:00'

In [13]:
features_and_target.date.max()

'2025-04-04 17:00:00+00:00'

In [14]:
#split the data
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.component.data_info import train_test_split


cutoff_date = pd.to_datetime(date.today() - timedelta(days=60), utc=True)

print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_demand_values_next_hour'   
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')
print(f"Training data range: {X_train['date'].min()} to {X_train['date'].max()}")
print(f"Testing data range: {X_test['date'].min()} to {X_test['date'].max()}")


cutoff_date=Timestamp('2025-02-04 00:00:00+0000', tz='UTC')
X_train.shape=(8459, 675)
y_train.shape=(8459,)
X_test.shape=(693, 675)
y_test.shape=(693,)
Training data range: 2023-01-29 04:00:00+00:00 to 2025-02-03 08:00:00+00:00
Testing data range: 2025-02-04 07:00:00+00:00 to 2025-04-04 17:00:00+00:00


In [15]:
x_tr=X_train.drop(['date'],axis=1)
x_ts=X_test.drop(['date'],axis=1)

In [16]:
print(x_ts.shape)
print(x_ts.head() if hasattr(x_ts, "head") else x_ts)

(693, 674)
   demand_previous_672_hour  demand_previous_671_hour  \
0                    1646.0                    1616.0   
1                    1584.0                    1540.0   
2                    1773.0                    1682.0   
3                    1872.0                    1798.0   
4                    1869.0                    1820.0   

   demand_previous_670_hour  demand_previous_669_hour  \
0                    1589.0                    1581.0   
1                    1520.0                    1513.0   
2                    1629.0                    1594.0   
3                    1712.0                    1667.0   
4                    1754.0                    1674.0   

   demand_previous_668_hour  demand_previous_667_hour  \
0                    1598.0                    1677.0   
1                    1515.0                    1540.0   
2                    1576.0                    1562.0   
3                    1629.0                    1607.0   
4                 

In [17]:
#basleline model linear regrisson
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create and train the linear regression model
model = LinearRegression()
model.fit(x_tr, y_train)

In [18]:
y_pred = model.predict(x_ts)
y_pred

array([1735.13378484, 1773.25898628, 1738.5095794 , 1752.57775746,
       1698.55803462, 1730.71697123, 1829.18526481, 2003.1150055 ,
       2031.88433324, 1904.43866105, 1999.08461142, 1915.41958798,
       1805.80853378, 1872.34102596, 1817.6055151 , 1828.52041794,
       1786.63279272, 1819.31421883, 1859.42199953, 1767.56784337,
       1758.14126749, 1593.64610198, 1577.30729536, 1499.03170023,
       1753.98827702, 1766.74412576, 1697.03765325, 1649.46335363,
       1722.35996668, 1738.20609234, 1824.98201883, 1857.29095868,
       1751.25680246, 1696.46445399, 1689.87737026, 1692.84544604,
       1709.39398881, 1587.09418543, 1570.26835066, 1627.68491474,
       1837.07902019, 1656.13546424, 1512.44024979, 1888.03702156,
       1845.70193074, 1674.56499833, 1701.94255038, 1714.23578488,
       1686.90536088, 1599.32659158, 1636.09968917, 1574.81807269,
       1700.68806348, 1829.01706685, 1815.79213852, 1802.27767884,
       1697.41432288, 1476.00496293, 1501.88480443, 1763.76671

In [19]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
def evaluate_model(y_test, y_pred):
    test_mae = mean_absolute_error(y_test, y_pred)
    test_mape = mean_absolute_percentage_error(y_test, y_pred)
    return f"MAE is {test_mae:.4f} and MAPE is: {test_mape:.4f}"

In [20]:
evaluate_model(y_test, y_pred)

'MAE is 16.2229 and MAPE is: 0.0137'

In [21]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.component.model_info import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100), 
        #"learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True)
  
    }
       
    tss = KFold(n_splits=5)
    scores = []
        
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [22]:
import warnings
warnings.filterwarnings("ignore")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=6)

[I 2025-04-05 18:52:59,969] A new study created in memory with name: no-name-55e8cf10-9468-4f21-bf66-39575406d6c8
[I 2025-04-05 18:53:28,218] Trial 0 finished with value: 357.29910339104987 and parameters: {'num_leaves': 100, 'feature_fraction': 0.5687202027191849, 'bagging_fraction': 0.7705502988047592, 'min_child_samples': 3}. Best is trial 0 with value: 357.29910339104987.
[I 2025-04-05 18:53:42,906] Trial 1 finished with value: 355.42635693702846 and parameters: {'num_leaves': 151, 'feature_fraction': 0.38851901159394225, 'bagging_fraction': 0.700560549723001, 'min_child_samples': 68}. Best is trial 1 with value: 355.42635693702846.
[I 2025-04-05 18:53:57,883] Trial 2 finished with value: 360.3768963375404 and parameters: {'num_leaves': 220, 'feature_fraction': 0.43624594142595374, 'bagging_fraction': 0.23039918197466047, 'min_child_samples': 88}. Best is trial 1 with value: 355.42635693702846.
[I 2025-04-05 18:54:13,388] Trial 3 finished with value: 351.92955213667625 and paramete

In [23]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 191, 'feature_fraction': 0.7125854464500763, 'bagging_fraction': 0.4837611212062033, 'min_child_samples': 53}


In [24]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train);

In [25]:
from src.component.model_info import evaluate_model
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=25.3364


In [26]:
#plot the result
from src.plot import plot_one_sample

plot_one_sample(
    example_id=1,
    features=X_test,
    targets=y_test,
    predictions=pd.Series(predictions)
)

In [27]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR /'LGB_model.pkl')

['D:\\projects\\electricity_demand_predictor-main\\models\\LGB_model.pkl']

In [28]:
#in order to save to model for model registoy we have to create scema first
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [29]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="electricity_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)

model.save(str(MODELS_DIR /'LGB_model.pkl'))

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1131174 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/5154 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/61601 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1213640/models/electricity_demand_predictor_next_hour/2


Model(name: 'electricity_demand_predictor_next_hour', version: 2)