In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import sys
sys.path.append('../')

import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

In [3]:
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_756_hour,rides_previous_755_hour,rides_previous_754_hour,rides_previous_753_hour,rides_previous_752_hour,rides_previous_751_hour,rides_previous_750_hour,rides_previous_749_hour,rides_previous_748_hour,rides_previous_747_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,0.0,0.0,5.0,4.0,3.0,4.0,3.0,2022-02-01 12:00:00,4,7.0
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,2.0,12.0,12.0,4.0,2.0,7.0,2022-02-02 12:00:00,4,5.0
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,0.0,0.0,11.0,13.0,10.0,5.0,5.0,2022-02-03 12:00:00,4,3.0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,0.0,2.0,3.0,16.0,5.0,6.0,5.0,2022-02-04 12:00:00,4,4.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,1.0,3.0,0.0,2.0,5.0,4.0,12.0,2022-02-05 12:00:00,4,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-27 12:00:00,199,0.0
87504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-28 12:00:00,199,0.0
87505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-29 12:00:00,199,0.0
87506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-30 12:00:00,199,0.0


In [4]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(df, 
                                                    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
                                                    target_column_name='target_rides_next_hour')
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(31440, 758)
y_train.shape=(31440,)
X_test.shape=(56068, 758)
y_test.shape=(56068,)


In [5]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

In [6]:
def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    tss = TimeSeriesSplit(n_splits=5)
    scores = []
    for  train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
    
    # Return the mean score
    return np.array(scores).mean()

In [7]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2025-12-22 19:20:32,597] A new study created in memory with name: no-name-322ac5e3-8b71-477b-a18c-91763b6d40cc
[I 2025-12-22 19:21:56,193] Trial 0 finished with value: 3.4311866744365425 and parameters: {'num_leaves': 186, 'feature_fraction': 0.4205014580853468, 'bagging_fraction': 0.7912553295588478, 'min_child_samples': 10}. Best is trial 0 with value: 3.4311866744365425.
[I 2025-12-22 19:23:03,083] Trial 1 finished with value: 3.3286041191435216 and parameters: {'num_leaves': 113, 'feature_fraction': 0.7348038196248925, 'bagging_fraction': 0.9180853190937592, 'min_child_samples': 60}. Best is trial 1 with value: 3.3286041191435216.
[I 2025-12-22 19:23:57,022] Trial 2 finished with value: 3.3304526141336246 and parameters: {'num_leaves': 89, 'feature_fraction': 0.6740350192322497, 'bagging_fraction': 0.4916644754455153, 'min_child_samples': 69}. Best is trial 1 with value: 3.3286041191435216.
[I 2025-12-22 19:25:06,456] Trial 3 finished with value: 3.381493085910189 and parameters

In [8]:
best_params = study.best_trial.params
print(f"{best_params=}")

best_params={'num_leaves': 113, 'feature_fraction': 0.7348038196248925, 'bagging_fraction': 0.9180853190937592, 'min_child_samples': 60}


In [9]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('functiontransformer', ...), ('temporalfeaturesengineer', ...), ...]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"func  func: callable, default=None The callable to use for the transformation. This will be passed the same arguments as transform, with args and kwargs forwarded. If func is None, then func will be the identity function.",<function ave...001CC093CBA60>
,"inverse_func  inverse_func: callable, default=None The callable to use for the inverse transformation. This will be passed the same arguments as inverse transform, with args and kwargs forwarded. If inverse_func is None, then inverse_func will be the identity function.",
,"validate  validate: bool, default=False Indicate that the input X array should be checked before calling ``func``. The possibilities are: - If False, there is no input validation. - If True, then X will be converted to a 2-dimensional NumPy array or  sparse matrix. If the conversion is not possible an exception is  raised. .. versionchanged:: 0.22  The default of ``validate`` changed from True to False.",False
,"accept_sparse  accept_sparse: bool, default=False Indicate that func accepts a sparse matrix as input. If validate is False, this has no effect. Otherwise, if accept_sparse is false, sparse matrix inputs will cause an exception to be raised.",False
,"check_inverse  check_inverse: bool, default=True Whether to check that or ``func`` followed by ``inverse_func`` leads to the original inputs. It can be used for a sanity check, raising a warning when the condition is not fulfilled. .. versionadded:: 0.20",True
,"feature_names_out  feature_names_out: callable, 'one-to-one' or None, default=None Determines the list of feature names that will be returned by the `get_feature_names_out` method. If it is 'one-to-one', then the output feature names will be equal to the input feature names. If it is a callable, then it must take two positional arguments: this `FunctionTransformer` (`self`) and an array-like of input feature names (`input_features`). It must return an array-like of output feature names. The `get_feature_names_out` method is only defined if `feature_names_out` is not None. See ``get_feature_names_out`` for more details. .. versionadded:: 1.1",
,"kw_args  kw_args: dict, default=None Dictionary of additional keyword arguments to pass to func. .. versionadded:: 0.18",
,"inv_kw_args  inv_kw_args: dict, default=None Dictionary of additional keyword arguments to pass to inverse_func. .. versionadded:: 0.18",

0,1,2
,boosting_type,'gbdt'
,num_leaves,113
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [10]:
predictions = pipeline.predict(X_test)

test_mae = mean_absolute_error(y_test, predictions)

print(f'{test_mae=:.4f}')

test_mae=2.9645


In [12]:
from src.plot import plot_one_sample

plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=2979,
    predictions=pd.Series(predictions)
)

In [13]:
plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=3979,
    predictions=pd.Series(predictions)
)