In this case we will be training the model using date from the feature store (Hopsworks) and not the local `parquet` files and save the model to the Hopsworks model registry instead of local disk.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')
import src.config as config

In [3]:
import hopsworks

# Connect to the project 
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# Connect to the feature store
feature_store = project.get_feature_store()

# Connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION
)

  from .autonotebook import tqdm as notebook_tqdm


2026-01-05 16:59:22,585 INFO: Initializing external client
2026-01-05 16:59:22,586 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-05 16:59:24,464 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1329302


In [4]:
# Create feature view (if it doesn't exist yet)
# This feature view only uses on feature group, so query is trivial
try:
    # Create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already exists. Skip creation.')

# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1329302/fs/1317957/fv/time_series_hourly_feature_view/version/1


In [5]:
ts_data, _ = feature_view.training_data(
    description='Time-series hourly taxi rides',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (80.49s) 




In [6]:
ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True)
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
632474,2023-01-01 00:00:00+00:00,0,1
1146060,2023-01-01 01:00:00+00:00,0,1
489380,2023-01-01 02:00:00+00:00,0,1
6362923,2023-01-01 03:00:00+00:00,0,1
6770651,2023-01-01 04:00:00+00:00,0,1
...,...,...,...
3666103,2026-01-05 19:00:00+00:00,1,265
2726651,2026-01-05 20:00:00+00:00,4,265
4091078,2026-01-05 21:00:00+00:00,1,265
3165740,2026-01-05 22:00:00+00:00,0,265


In [7]:
from src.data import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24*28, # one month
    step_size=23,
)

features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f'{features_and_target.shape=}')

100%|██████████| 263/263 [04:43<00:00,  1.08s/it]


features_and_target.shape=(292340, 675)


In [11]:
from datetime import date, timedelta, timezone
import pandas as pd
from src.data_split import train_test_split

# Convert pickup_hour to datetime if it's a string
features_and_target['pickup_hour'] = pd.to_datetime(features_and_target['pickup_hour'])

# training data - make cutoff_date timezone-aware (UTC) to match pickup_hour
cutoff_date = pd.to_datetime(date.today() - timedelta(days=28*2)).tz_localize('UTC')

print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

cutoff_date=Timestamp('2025-11-10 00:00:00+0000', tz='UTC')
X_train.shape=(279043, 674)
y_train.shape=(279043,)
X_test.shape=(13297, 674)
y_test.shape=(13297,)


In [12]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and 
    computes an average validation error based on a TimeSeriesSplit
    """

    # pick hyper-parameters
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    tss = TimeSeriesSplit(n_splits=2)
    scores = []

    for train_index, val_index in tss.split(X_train):

        # Split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)

    return np.array(scores).mean()

In [13]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

[I 2026-01-05 17:31:22,353] A new study created in memory with name: no-name-99a90d6f-d390-4c84-a806-e1cac8c422cd
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

In [14]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 81, 'feature_fraction': 0.3201254152687832, 'bagging_fraction': 0.60384981778599, 'min_child_samples': 10}


In [15]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('functiontransformer', ...), ('temporalfeaturesengineer', ...), ...]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"func  func: callable, default=None The callable to use for the transformation. This will be passed the same arguments as transform, with args and kwargs forwarded. If func is None, then func will be the identity function.",<function ave...001B783052DE0>
,"inverse_func  inverse_func: callable, default=None The callable to use for the inverse transformation. This will be passed the same arguments as inverse transform, with args and kwargs forwarded. If inverse_func is None, then inverse_func will be the identity function.",
,"validate  validate: bool, default=False Indicate that the input X array should be checked before calling ``func``. The possibilities are: - If False, there is no input validation. - If True, then X will be converted to a 2-dimensional NumPy array or  sparse matrix. If the conversion is not possible an exception is  raised. .. versionchanged:: 0.22  The default of ``validate`` changed from True to False.",False
,"accept_sparse  accept_sparse: bool, default=False Indicate that func accepts a sparse matrix as input. If validate is False, this has no effect. Otherwise, if accept_sparse is false, sparse matrix inputs will cause an exception to be raised.",False
,"check_inverse  check_inverse: bool, default=True Whether to check that or ``func`` followed by ``inverse_func`` leads to the original inputs. It can be used for a sanity check, raising a warning when the condition is not fulfilled. .. versionadded:: 0.20",True
,"feature_names_out  feature_names_out: callable, 'one-to-one' or None, default=None Determines the list of feature names that will be returned by the `get_feature_names_out` method. If it is 'one-to-one', then the output feature names will be equal to the input feature names. If it is a callable, then it must take two positional arguments: this `FunctionTransformer` (`self`) and an array-like of input feature names (`input_features`). It must return an array-like of output feature names. The `get_feature_names_out` method is only defined if `feature_names_out` is not None. See ``get_feature_names_out`` for more details. .. versionadded:: 1.1",
,"kw_args  kw_args: dict, default=None Dictionary of additional keyword arguments to pass to func. .. versionadded:: 0.18",
,"inv_kw_args  inv_kw_args: dict, default=None Dictionary of additional keyword arguments to pass to inverse_func. .. versionadded:: 0.18",

0,1,2
,boosting_type,'gbdt'
,num_leaves,81
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [16]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=4.1683


In [17]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['C:\\Users\\HOME\\Desktop\\Temp\\taxi_demand_predictor\\models\\model.pkl']

In [18]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [20]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name='taxi_demand_predictor_next_hour',
    metrics={'test_mae': test_mae},
    description='LightGBM regressor with a bit of hyper-parameter tuning',
    input_example=X_train.sample(),
    model_schema=model_schema
)

model.save(str(MODELS_DIR / 'model.pkl'))

Uploading C:\Users\HOME\Desktop\Temp\taxi_demand_predictor\models\model.pkl: 100.000%|██████████| 789586/789586 elapsed<00:02 remaining<00:00
Uploading c:\Users\HOME\Desktop\Temp\taxi_demand_predictor\notebooks\input_example.json: 100.000%|██████████| 3399/3399 elapsed<00:01 remaining<00:00
Uploading c:\Users\HOME\Desktop\Temp\taxi_demand_predictor\notebooks\model_schema.json: 100.000%|██████████| 60849/60849 elapsed<00:01 remaining<00:00
Model export complete: 100%|██████████| 6/6 [00:12<00:00,  2.13s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/1329302/models/taxi_demand_predictor_next_hour/1





Model(name: 'taxi_demand_predictor_next_hour', version: 1)