In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import src.component.feature_group_config as config

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import hopsworks

# connect to the project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

2025-01-28 21:05:33,431 INFO: Initializing external client
2025-01-28 21:05:33,435 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-28 21:05:35,275 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212583


In [4]:
# create feature view (if it doesn't exist yet)
# This feature view only uses on feature group, so the query is trivial
try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')


# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view already existed. Skip creation.


In [5]:
data, _ = feature_view.training_data(
    description='Time-series hourly electricity demand values',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (6.64s) 




In [6]:
# drop `date` column
data.drop('seconds', axis=1, inplace=True)

# sort by `pickup_location_id` and `pickup_hour`
data.sort_values(by=['sub_region_code', 'date'], inplace=True)
data

Unnamed: 0,date,sub_region_code,demand,temperature_2m
633483,2024-01-01 05:00:00+00:00,0,10761,3.9085
504954,2024-01-01 06:00:00+00:00,0,10579,3.7585
581933,2024-01-01 07:00:00+00:00,0,10326,3.4585
744174,2024-01-01 08:00:00+00:00,0,10083,3.9085
265932,2024-01-01 09:00:00+00:00,0,9904,4.9085
...,...,...,...,...
160799,2025-01-28 10:00:00+00:00,82,1841,5.5085
132787,2025-01-28 11:00:00+00:00,82,1962,6.8585
155991,2025-01-28 12:00:00+00:00,82,2153,7.5585
147408,2025-01-28 13:00:00+00:00,82,2297,7.9585


In [7]:
#transform the batch of data to features and target
from src.component.data_info import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    data,
    input_seq_len=24*28*1, # one month
    step_size=24,
)

features_and_target = features.copy()
features_and_target['target_demand_values_next_hour'] = targets

print(f'{features_and_target.shape=}')

100%|██████████| 83/83 [01:25<00:00,  1.03s/it]


features_and_target.shape=(30278, 676)


In [8]:
features_and_target.date.min()

'2024-01-29 05:00:00+00:00'

In [9]:
features_and_target.date.max()

'2025-01-28 10:00:00+00:00'

In [10]:
#split the data
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.component.data_info import train_test_split


cutoff_date = pd.to_datetime(date.today() - timedelta(days=28*1), utc=True)

print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_demand_values_next_hour'   
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')
print(f"Training data range: {X_train['date'].min()} to {X_train['date'].max()}")
print(f"Testing data range: {X_test['date'].min()} to {X_test['date'].max()}")


cutoff_date=Timestamp('2024-12-31 00:00:00+0000', tz='UTC')


ValueError: unconverted data remains when parsing with format "%Y-%m-%d %H:%M:%S": "+00:00", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [31]:
x_tr=X_train.drop(['date'],axis=1)
x_ts=X_test.drop(['date'],axis=1)

In [32]:
#basleline model linear regrisson
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create and train the linear regression model
model = LinearRegression()
model.fit(x_tr, y_train)

In [None]:
y_pred = model.predict(x_ts)
y_pred

array([11407.99970529, 11942.97524434, 12377.56911167, ...,
        1764.85387983,  1796.88051552,  1848.10665766])

In [34]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
def evaluate_model(y_test, y_pred):
    test_mae = mean_absolute_error(y_test, y_pred)
    test_mape = mean_absolute_percentage_error(y_test, y_pred)
    return f"MAE is {test_mae:.4f} and MAPE is: {test_mape:.4f}"

In [None]:
evaluate_model(y_test, y_pred)

'MAE is 36.2222 and MAPE is: 0.0153'

In [28]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

# from src.model_info import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100), 
        #"learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True)
  
    }
       
    tss = KFold(n_splits=6)
    scores = []
        
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [29]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=8)

[I 2025-01-28 20:55:57,465] A new study created in memory with name: no-name-39961181-3944-40a4-a68e-9f11345bfe82
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

KeyboardInterrupt: 

In [None]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 240, 'feature_fraction': 0.8542643833568271, 'bagging_fraction': 0.5402028804645826, 'min_child_samples': 20}


In [24]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train);

In [None]:
from src.component.model_info import evaluate_model
predictions = pipeline.predict(X_test)
test_results = evaluate_model(y_test, predictions)
print(test_results)

MAE is 5953.6442 and MAPE is: 76.6076


In [33]:
# Load the pipeline from the file
import joblib
from src.paths import MODELS_DIR
loaded_pipeline = joblib.load(MODELS_DIR/'pipeline_model.pkl')

In [None]:
predictions=loaded_pipeline.predict(X_test)
test_results = evaluate_model(y_test, predictions)
print(test_results)

MAE is 108.0006 and MAPE is: 0.0410


In [38]:
#plot the result
from src.plot import plot_one_sample

plot_one_sample(
    example_id=1,
    features=X_test,
    targets=y_test,
    predictions=pd.Series(predictions)
)