In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import src.component.feature_group_config as config
from comet_ml import Experiment

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import hopsworks

# connect to the project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

2025-02-02 21:22:28,299 INFO: Initializing external client
2025-02-02 21:22:28,300 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-02 21:22:29,492 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212591


In [4]:
# create feature view (if it doesn't exist yet)
# This feature view only uses on feature group, so the query is trivial
try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')


# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1212591/fs/1200218/fv/electricity_demand_feature_view/version/1


In [5]:
data, _ = feature_view.training_data(
    description='Time-series hourly electricity demand values',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.98s) 




In [6]:
# drop `date` column
data.drop('seconds', axis=1, inplace=True)

# sort by `pickup_location_id` and `pickup_hour`
data.sort_values(by=['sub_region_code', 'date'], inplace=True)
data

Unnamed: 0,date,sub_region_code,demand,temperature_2m
41471,2023-01-01 05:00:00+00:00,0,1477,14.6085
28212,2023-01-01 06:00:00+00:00,0,1428,14.4585
40260,2023-01-01 07:00:00+00:00,0,1392,14.4585
31657,2023-01-01 08:00:00+00:00,0,1366,14.6085
164702,2023-01-01 09:00:00+00:00,0,1353,14.7085
...,...,...,...,...
53030,2025-01-01 20:00:00+00:00,10,2098,6.1585
185277,2025-01-01 21:00:00+00:00,10,2177,6.4585
171740,2025-01-01 22:00:00+00:00,10,2304,6.4585
64070,2025-01-01 23:00:00+00:00,10,2470,6.1585


In [7]:
#transform the batch of data to features and target
from src.component.data_info import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    data,
    input_seq_len=24*28*1, # one month
    step_size=23,
)

features_and_target = features.copy()
features_and_target['target_demand_values_next_hour'] = targets

print(f'{features_and_target.shape=}')

100%|██████████| 11/11 [00:03<00:00,  3.54it/s]

features_and_target.shape=(8085, 676)





In [8]:
features_and_target

Unnamed: 0,demand_previous_672_hour,demand_previous_671_hour,demand_previous_670_hour,demand_previous_669_hour,demand_previous_668_hour,demand_previous_667_hour,demand_previous_666_hour,demand_previous_665_hour,demand_previous_664_hour,demand_previous_663_hour,...,demand_previous_6_hour,demand_previous_5_hour,demand_previous_4_hour,demand_previous_3_hour,demand_previous_2_hour,demand_previous_1_hour,date,sub_region_code,temperature_2m,target_demand_values_next_hour
0,1477.0,1428.0,1392.0,1366.0,1353.0,1352.0,1361.0,1394.0,1429.0,1460.0,...,1786.0,1807.0,1776.0,1745.0,1732.0,1661.0,2023-01-29 05:00:00+00:00,0,0.5585,1599.0
1,1516.0,1450.0,1384.0,1351.0,1335.0,1334.0,1341.0,1375.0,1425.0,1486.0,...,1744.0,1796.0,1807.0,1789.0,1764.0,1719.0,2023-01-30 04:00:00+00:00,0,2.0585,1657.0
2,1664.0,1597.0,1530.0,1487.0,1455.0,1435.0,1437.0,1456.0,1510.0,1617.0,...,1910.0,1933.0,1983.0,1995.0,1968.0,1933.0,2023-01-31 03:00:00+00:00,0,2.5585,1874.0
3,1783.0,1718.0,1636.0,1549.0,1493.0,1459.0,1443.0,1440.0,1459.0,1516.0,...,1888.0,1862.0,1922.0,1994.0,2029.0,2004.0,2023-02-01 02:00:00+00:00,0,3.6085,1972.0
4,1816.0,1776.0,1731.0,1650.0,1556.0,1516.0,1481.0,1472.0,1471.0,1503.0,...,1882.0,1846.0,1873.0,1935.0,1994.0,2041.0,2023-02-02 01:00:00+00:00,0,3.8585,2017.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8080,2392.0,2516.0,2506.0,2467.0,2425.0,2352.0,2244.0,2121.0,2010.0,1926.0,...,2222.0,2273.0,2267.0,2248.0,2216.0,2253.0,2024-12-28 22:00:00+00:00,10,-2.3915,2384.0
8081,2295.0,2502.0,2667.0,2666.0,2625.0,2562.0,2450.0,2292.0,2138.0,2019.0,...,1911.0,1952.0,1969.0,1930.0,1995.0,2000.0,2024-12-29 21:00:00+00:00,10,1.5585,2020.0
8082,2170.0,2382.0,2608.0,2758.0,2762.0,2720.0,2635.0,2497.0,2322.0,2156.0,...,1985.0,2015.0,1991.0,1884.0,1802.0,1824.0,2024-12-30 20:00:00+00:00,10,1.5085,1918.0
8083,2149.0,2240.0,2351.0,2566.0,2735.0,2746.0,2709.0,2646.0,2514.0,2340.0,...,2025.0,2010.0,1923.0,1850.0,1863.0,1908.0,2024-12-31 19:00:00+00:00,10,1.8585,1885.0


In [9]:
features_and_target.date.min()

'2023-01-29 05:00:00+00:00'

In [10]:
features_and_target.date.max()

'2025-01-01 18:00:00+00:00'

In [11]:
#split the data
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.component.data_info import train_test_split


cutoff_date = pd.to_datetime(date.today() - timedelta(days=60), utc=True)

print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_demand_values_next_hour'   
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')
print(f"Training data range: {X_train['date'].min()} to {X_train['date'].max()}")
print(f"Testing data range: {X_test['date'].min()} to {X_test['date'].max()}")


cutoff_date=Timestamp('2024-12-04 00:00:00+0000', tz='UTC')
X_train.shape=(7744, 675)
y_train.shape=(7744,)
X_test.shape=(341, 675)
y_test.shape=(341,)
Training data range: 2023-01-29 05:00:00+00:00 to 2024-12-03 01:00:00+00:00
Testing data range: 2024-12-04 00:00:00+00:00 to 2025-01-01 18:00:00+00:00


In [12]:
x_tr=X_train.drop(['date'],axis=1)
x_ts=X_test.drop(['date'],axis=1)

In [13]:
#basleline model linear regrisson
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create and train the linear regression model
model = LinearRegression()
model.fit(x_tr, y_train)

In [14]:
y_pred = model.predict(x_ts)
y_pred

array([2036.13842821, 2077.94392848, 2074.4198632 , 2036.67092544,
       1951.926135  , 1815.02910986, 1957.86307341, 1869.69065062,
       1890.53944745, 1985.85948492, 2031.61186321, 1913.06795032,
       1730.07271616, 1692.44570325, 1620.43885559, 1635.07539977,
       1629.7751917 , 1733.87785132, 1705.55193425, 1826.43238695,
       1944.92512001, 1944.11676785, 1890.29461429, 1877.38538841,
       1903.96735008, 2006.47844514, 1774.11887438, 1681.8189346 ,
       1853.87429679, 1714.42272721, 1775.30232704, 1340.14346845,
       1384.51240636, 1371.30924102, 1283.27357344, 1239.84495044,
       1107.64990523, 1267.42553912, 1118.29667072, 1172.69077463,
       1267.33329852, 1328.46188649, 1190.20549988, 1079.22886512,
       1028.50408106,  956.79596124,  944.24924132,  974.44176724,
       1017.82585046, 1062.97704604, 1175.97882938, 1266.98686059,
       1274.86382577, 1155.078116  , 1172.53640792, 1361.00558894,
       1317.27066426, 1118.78051691, 1110.48059013, 1183.29950

In [15]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
def evaluate_model(y_test, y_pred):
    test_mae = mean_absolute_error(y_test, y_pred)
    test_mape = mean_absolute_percentage_error(y_test, y_pred)
    return f"MAE is {test_mae:.4f} and MAPE is: {test_mape:.4f}"

In [16]:
evaluate_model(y_test, y_pred)

'MAE is 18.4495 and MAPE is: 0.0139'

In [17]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.component.model_info import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100), 
        #"learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True)
  
    }
       
    tss = KFold(n_splits=5)
    scores = []
        
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [18]:
import warnings
warnings.filterwarnings("ignore")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=6)

[I 2025-02-02 21:22:49,458] A new study created in memory with name: no-name-fd874a45-299e-42db-a1d8-48c63550847f
  File "d:\poetry_virtualenvs\src-o1faXYQI-py3.12\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
[I 2025-02-02 21:23:41,370] Trial 0 finished with value: 352.97980817213016 and parameters: {'num_leaves': 227, 'feature_fraction': 0.9642195070738409, 'bagging_fraction': 0.41747309590795756, 'min_child_samples': 40}. Best is trial 0 with value: 352.97980817213016.
[I 2025-02-02 21:24:46,141] Trial 1 finished with value: 349.3995875903902 and parameters: {'num_leaves': 212, 'feature_fraction': 0.6739678996101689, 'bagging_fraction': 0.7436567665112206, 'min_child_samples': 22}. Best is trial 1 with value: 349.3995875903902.
[I 2025-02-02 21:25:35,668] Trial 2 finished with value: 354.24485445847455 and parameters: {'num_leaves': 177, 'feature_fraction': 0.9233

In [19]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 212, 'feature_fraction': 0.6739678996101689, 'bagging_fraction': 0.7436567665112206, 'min_child_samples': 22}


In [20]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train);

In [21]:
from src.component.model_info import evaluate_model
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=29.1044


In [22]:
#plot the result
from src.plot import plot_one_sample

plot_one_sample(
    example_id=1,
    features=X_test,
    targets=y_test,
    predictions=pd.Series(predictions)
)

In [23]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR /'LGB_model.pkl')

['D:\\Electricity_demand_predictor2\\models\\LGB_model.pkl']

In [24]:
#in order to save to model for model registoy we have to create scema first
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [25]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="electricity_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)

model.save(str(MODELS_DIR /'LGB_model.pkl'))

Uploading: 100.000%|██████████| 1870905/1870905 elapsed<00:03 remaining<00:001.28it/s]
Uploading: 100.000%|██████████| 4764/4764 elapsed<00:02 remaining<00:000:10,  2.52s/it]
Uploading: 100.000%|██████████| 61601/61601 elapsed<00:01 remaining<00:00
Model export complete: 100%|██████████| 6/6 [00:14<00:00,  2.45s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/1212591/models/electricity_demand_predictor_next_hour/2





Model(name: 'electricity_demand_predictor_next_hour', version: 2)