In [76]:
# 8. LightGBM train
import lightgbm as lgb
import polars as pl


x_train = pl.read_parquet("../../data/favorita_dataset/subset/x_train.parquet")
y_train = pl.read_parquet("../../data/favorita_dataset/subset/y_train.parquet")
c_train = pl.read_parquet("../../data/favorita_dataset/subset/dates_train.parquet")

x_valid = pl.read_parquet("../../data/favorita_dataset/subset/x_valid.parquet")
y_valid = pl.read_parquet("../../data/favorita_dataset/subset/y_valid.parquet")
c_valid = pl.read_parquet("../../data/favorita_dataset/subset/dates_valid.parquet")



In [77]:
import polars.selectors as cs

# get all categorical columns
categorical_cols = x_train.select(cs.categorical(), cs.integer()).columns

In [78]:
categorical_cols

['family',
 'class',
 'store_id',
 'product_id',
 'is_on_promotion',
 'weekday',
 'month',
 'weekofyear',
 'dayofyear',
 'sum_next_3d_is_on_promotion',
 'sum_next_7d_is_on_promotion',
 'sum_next_14d_is_on_promotion',
 'perishable']

In [40]:
params = {
    'num_leaves': 50,
    'objective': 'regression',
    'min_data_in_leaf': 10, #200,
    'learning_rate': 0.02,
    'feature_fraction': 0.1,#0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
# val_pred = []
# test_pred = []
# cate_vars = []
# for i in range(16):
print("=" * 50)
# print("Step %d" % (i+1))
print("=" * 50)
dataset_train = lgb.Dataset(
    x_train.to_pandas(), label=y_train.get_column("log_units_sold").to_numpy(),
    categorical_feature=categorical_cols,
    # weight=pd.concat([items["perishable"]] * num_days) * 0.25 + 1
)

dataset_valid = lgb.Dataset(
    x_valid.to_pandas(), label=y_valid.get_column("log_units_sold").to_numpy(), 
    reference=dataset_train,
    # weight=items["perishable"] * 0.25 + 1,
    categorical_feature=categorical_cols,
    )


bst = lgb.train(
    params, dataset_train, num_boost_round=MAX_ROUNDS,
    valid_sets=[dataset_train, dataset_valid],
    callbacks=[lgb.early_stopping(125), lgb.log_evaluation(25)],
    # force_col_wise=True
)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10218
[LightGBM] [Info] Number of data points in the train set: 29059, number of used features: 64
[LightGBM] [Info] Start training from score 3.713340
Training until validation scores don't improve for 125 rounds
[25]	training's l2: 0.306271	valid_1's l2: 0.321648
[50]	training's l2: 0.176342	valid_1's l2: 0.193512
[75]	training's l2: 0.125771	valid_1's l2: 0.145189
[100]	training's l2: 0.101676	valid_1's l2: 0.123376
[125]	training's l2: 0.0892252	valid_1's l2: 0.113927
[150]	training's l2: 0.0823788	valid_1's l2: 0.109231
[175]	training's l2: 0.0784748	valid_1's l2: 0.107135
[200]	training's l2: 0.0760689	valid_1's l2: 0.105831
[225]	training's l2: 0.0728269	valid_1's l2: 0.103962
[250]	training's l2: 0.0702399	valid_1's l2: 0.102398
[275]	training's l2: 0.0684989	valid_1's l2: 0.101671
[300]	tr

In [42]:
import plotly.express as px

x_subset = x_valid.with_columns(
    date=x_dates_valid
).filter(
    (pl.col("store_id") == 3) 
    & (pl.col("product_id") == 213652)
)

y_subset = y_valid.with_columns(
    product_id=x_valid.get_column("product_id"),
    store_id=x_valid.get_column("store_id"),
).filter(
    (pl.col("store_id") == 3)
    & (pl.col("product_id") == 213652)
)

x_dates_subset = x_subset.drop_in_place("date")

# apply inverse of logp1 transformation to actuals and round to nearest integer
actuals = y_subset.get_column("log_units_sold")
actuals = (pl.Series(actuals).exp() - 1).round()

# apply inverse of logp1 transformation to predictions and round to nearest integer
predictions = bst.predict(x_subset.to_pandas())
predictions = (pl.Series(predictions).exp() - 1).round()

# plot time series: y_valid vs prediction
fig = px.line(
    x=x_dates_subset,
    # y=[y_subset, predictions],
    y=predictions,
    labels={"x": "Date", "value": "Sales"},
    title="Actual vs Predicted Sales",
    template="plotly_white"
)

fig.add_scatter(
    x=x_dates_subset,
    y=actuals,
    name="Actual Sales",
    line=dict(color="red", width=2)
)

# limit y-axis to 0-5
fig.update_yaxes(range=[0, 50])

fig.show()

In [79]:
from sklearn.base import BaseEstimator, RegressorMixin, clone
import lightgbm as lgbm

class DirectMultihorizonForecaster(BaseEstimator, RegressorMixin):
    def __init__(self, horizons: int, params=None):
        """Construct a gradient boosting model.

        Parameters
        ----------
        boosting_type : str, optional (default='gbdt')
            'gbdt', traditional Gradient Boosting Decision Tree.
            'dart', Dropouts meet Multiple Additive Regression Trees.
            'rf', Random Forest.
        num_leaves : int, optional (default=31)
            Maximum tree leaves for base learners.
        max_depth : int, optional (default=-1)
            Maximum tree depth for base learners, <=0 means no limit.
            If setting this to a positive value, consider also changing ``num_leaves`` to ``<= 2^max_depth``.
        learning_rate : float, optional (default=0.1)
            Boosting learning rate.
            You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
            in training using ``reset_parameter`` callback.
            Note, that this will ignore the ``learning_rate`` argument in training.
        n_estimators : int, optional (default=100)
            Number of boosted trees to fit.
        subsample_for_bin : int, optional (default=200000)
            Number of samples for constructing bins.
        objective : str, callable or None, optional (default=None)
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
        class_weight : dict, 'balanced' or None, optional (default=None)
            Weights associated with classes in the form ``{class_label: weight}``.
            Use this parameter only for multi-class classification task;
            for binary classification task you may use ``is_unbalance`` or ``scale_pos_weight`` parameters.
            Note, that the usage of all these parameters will result in poor estimates of the individual class probabilities.
            You may want to consider performing probability calibration
            (https://scikit-learn.org/stable/modules/calibration.html) of your model.
            The 'balanced' mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.
            If None, all classes are supposed to have weight one.
            Note, that these weights will be multiplied with ``sample_weight`` (passed through the ``fit`` method)
            if ``sample_weight`` is specified.
        min_split_gain : float, optional (default=0.)
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : float, optional (default=1e-3)
            Minimum sum of instance weight (Hessian) needed in a child (leaf).
        min_child_samples : int, optional (default=20)
            Minimum number of data needed in a child (leaf).
        subsample : float, optional (default=1.)
            Subsample ratio of the training instance.
        subsample_freq : int, optional (default=0)
            Frequency of subsample, <=0 means no enable.
        colsample_bytree : float, optional (default=1.)
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float, optional (default=0.)
            L1 regularization term on weights.
        reg_lambda : float, optional (default=0.)
            L2 regularization term on weights.
        random_state : int, RandomState object or None, optional (default=None)
            Random number seed.
            If int, this number is used to seed the C++ code.
            If RandomState or Generator object (numpy), a random integer is picked based on its state to seed the C++ code.
            If None, default seeds in C++ code are used.
        n_jobs : int or None, optional (default=None)
            Number of parallel threads to use for training (can be changed at prediction time by
            passing it as an extra keyword argument).

            For better performance, it is recommended to set this to the number of physical cores
            in the CPU.

            Negative integers are interpreted as following joblib's formula (n_cpus + 1 + n_jobs), just like
            scikit-learn (so e.g. -1 means using all threads). A value of zero corresponds the default number of
            threads configured for OpenMP in the system. A value of ``None`` (the default) corresponds
            to using the number of physical cores in the system (its correct detection requires
            either the ``joblib`` or the ``psutil`` util libraries to be installed).

            .. versionchanged:: 4.0.0

        importance_type : str, optional (default='split')
            The type of feature importance to be filled into ``feature_importances_``.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        **kwargs
            Other parameters for the model.
            Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.

            .. warning::

                \*\*kwargs is not supported in sklearn, it may cause unexpected issues.

        Note
        ----
        A custom objective function can be provided for the ``objective`` parameter.
        In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess``,
        ``objective(y_true, y_pred, weight) -> grad, hess``
        or ``objective(y_true, y_pred, weight, group) -> grad, hess``:

            y_true : numpy 1-D array of shape = [n_samples]
                The target values.
            y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                The predicted values.
                Predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task.
            weight : numpy 1-D array of shape = [n_samples]
                The weight of samples. Weights should be non-negative.
            group : numpy 1-D array
                Group/query data.
                Only used in the learning-to-rank task.
                sum(group) = n_samples.
                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
            grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                The value of the first order derivative (gradient) of the loss
                with respect to the elements of y_pred for each sample point.
            hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                The value of the second order derivative (Hessian) of the loss
                with respect to the elements of y_pred for each sample point.

        For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
        and grad and hess should be returned in the same format.
        """
        # boosting_type: str = "gbdt",
        # num_leaves: int = 31,
        # max_depth: int = -1,
        # learning_rate: float = 0.1,
        # n_estimators: int = 100,
        # subsample_for_bin: int = 200000,
        # objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
        # class_weight: Optional[Union[Dict, str]] = None,
        # min_split_gain: float = 0.0,
        # min_child_weight: float = 1e-3,
        # min_child_samples: int = 20,
        # subsample: float = 1.0,
        # subsample_freq: int = 0,
        # colsample_bytree: float = 1.0,
        # reg_alpha: float = 0.0,
        # reg_lambda: float = 0.0,
        # random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
        # n_jobs: Optional[int] = None,
        # importance_type: str = "split",
        self.params = params if params is not None else {}
        self.base_regressor = lgbm.LGBMRegressor(**params)
        self.horizons = horizons
        self.models_ = []
        self.is_fitted_ = False
        self.target_names_ = None

    def fit(self, X: pl.DataFrame, Y: pl.DataFrame, fit_params=None, target_cols=None):
        # X: _LGBM_ScikitMatrixLike,
        # y: _LGBM_LabelType,
        # sample_weight: Optional[_LGBM_WeightType] = None,
        # init_score: Optional[_LGBM_InitScoreType] = None,
        # eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
        # eval_names: Optional[List[str]] = None,
        # eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
        # eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
        # eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
        # feature_name: _LGBM_FeatureNameConfiguration = "auto",
        # categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
        # callbacks: Optional[List[Callable]] = None,
        # init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None,

        self.fit_params = fit_params if fit_params is not None else {}
        self.target_cols_ = Y.columns if target_cols is None else target_cols

        # For direct approach, Y should contain multiple columns, each representing a different horizon.
        # Each column should be sorted out according to the horizon it represents, e.g., 'h1', 'h2', etc.
        if len(self.target_cols_) != self.horizons:
            raise ValueError(
                f"Expected {self.horizons} target columns, but got {len(self.target_cols_)}."
            )
        
        X = X.to_pandas()
        self.models_ = [clone(self.base_regressor) for _ in range(self.horizons)]
        
        # Fit each model for each horizon
        for h in range(self.horizons):
            y_h = Y.get_column(self.target_cols_[h]).to_numpy()
            self.models_[h].fit(X, y_h, **self.fit_params)
        
        self.is_fitted_ = True
        return self

    def predict(self, X: pl.DataFrame):
        """Predict using the fitted models for each horizon.
        """
        if not self.is_fitted_:
            raise RuntimeError("You must fit the model before calling predict.")
    
        X = X.to_pandas()
        preds_df = pl.DataFrame(
            {
                f'pred_{self.target_cols_[horizon]}': model.predict(X)
                for horizon, model in enumerate(self.models_)
            }
        )
        return preds_df
    
    def feature_importances(self) -> pl.DataFrame:
        """Get feature importances from the fitted models.
        
        Parameters
        ----------
        importance_type : str, optional (default='split')
            The type of feature importance to be returned.
            Can be 'split' or 'gain'.
        """
        if not self.is_fitted_:
            raise RuntimeError("You must fit the model before calling feature_importances_.")

        return pl.DataFrame(
            {
                "feature_name": self.models_[0].feature_name_
            }
            |
            {
                f'{self.target_cols_[horizon]}': model.feature_importances_
                for horizon, model in enumerate(self.models_)
            }
        )

In [80]:
weights = 0.25 * x_train.get_column("perishable").to_numpy() + 1

In [82]:
# x_train = lgb.Dataset(
#     X.to_pandas(), label=y_train.get_column("log_units_sold").to_numpy(),
#     categorical_feature=categorical_cols,
#     # weight=pd.concat([items["perishable"]] * num_days) * 0.25 + 1
# )

# # Example usage:

# bst = lgb.train(
#     params, x_train, num_boost_round=MAX_ROUNDS,
#     valid_sets=[x_train, dataset_valid],
#     callbacks=[lgb.early_stopping(125), lgb.log_evaluation(25)],
#     # force_col_wise=True
# )

model_params = {
    # "boosting_type":"dart", # "gdbt"
    "num_leaves": 31, #31
    # "max_depth": 10, #-1,
    "learning_rate": 0.02,
    "n_estimators": 1000,
    # "subsample_for_bin": 200000,
    "objective": "regression",
    # class_weight: Optional[Union[Dict, str]] = None,
    "min_split_gain": 0.1, # 'feature_fraction': 0.1,#0.8,
    # min_child_weight: float = 1e-3,
    "min_child_samples": 20,
    "subsample": 0.4, #'bagging_fraction': 0.7,
    "subsample_freq": 1, # 'bagging_freq': 1,
    # colsample_bytree: float = 1.0,
    # reg_alpha: float = 0.0,
    # reg_lambda: float = 0.0,
    # random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
    "n_jobs": 8, # 'num_threads': 16
    # importance_type: str = "split",
}

dataset_valid = lgb.Dataset(
    x_valid.to_pandas(), label=y_valid.get_column("h1_log_units_sold").to_numpy(), 
    # reference=x_train,
    # weight=items["perishable"] * 0.25 + 1,
    # categorical_feature=categorical_cols,
)

fit_params = {
    "eval_metric": "l2",
    "eval_set": [
        # (x_train.to_pandas(), y_train.get_column("h1_log_units_sold").to_numpy()),
        (x_valid.to_pandas(), y_valid.get_column("h1_log_units_sold").to_numpy())
    ],
    "callbacks": [lgbm.early_stopping(125), lgbm.log_evaluation(25)],
    "categorical_feature": categorical_cols,
    "sample_weight": weights,
    # "feature_name": "auto",
}

forecaster = DirectMultihorizonForecaster(
    horizons=7,
    params=model_params
)

forecaster.fit(x_train, y_train, fit_params)#, target_cols=["h1_log_units_sold", "h2_log_units_sold"])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10353
[LightGBM] [Info] Number of data points in the train set: 29059, number of used features: 54
[LightGBM] [Info] Start training from score 3.696507
Training until validation scores don't improve for 125 rounds
[25]	valid_0's l2: 0.448413
[50]	valid_0's l2: 0.323265
[75]	valid_0's l2: 0.269319
[100]	valid_0's l2: 0.239044
[125]	valid_0's l2: 0.220354
[150]	valid_0's l2: 0.207423
[175]	valid_0's l2: 0.197617
[200]	valid_0's l2: 0.188377
[225]	valid_0's l2: 0.181637
[250]	valid_0's l2: 0.175132
[275]	valid_0's l2: 0.170572
[300]	valid_0's l2: 0.167124
[325]	valid_0's l2: 0.164378
[350]	valid_0's l2: 0.162361
[375]	valid_0's l2: 0.160686
[400]	valid_0's l2: 0.159262
[425]	valid_0's l2: 0.158139
[450]	valid_0's l2: 0.157128
[475]	valid_0's l2: 0.156632
[500]	valid_0's l2: 0.155995
[525]	valid_0's l2

0,1,2
,horizons,7
,params,"{'learning_rate': 0.02, 'min_child_samples': 20, 'min_split_gain': 0.1, 'n_estimators': 1000, ...}"


In [83]:
importances_df = forecaster.feature_importances()
importances_df

feature_name,h1_log_units_sold,h2_log_units_sold,h3_log_units_sold,h4_log_units_sold,h5_log_units_sold,h6_log_units_sold,h7_log_units_sold
str,i32,i32,i32,i32,i32,i32,i32
"""store_id""",90,41,34,41,23,21,20
"""product_id""",202,72,66,72,70,43,44
"""log_units_sold""",503,58,23,26,7,13,92
"""is_on_promotion""",26,29,4,1,0,26,1
"""weekday""",1003,542,494,470,433,437,400
"""month""",414,3,6,2,1,1,0
"""weekofyear""",2717,379,300,333,315,433,425
"""dayofyear""",7256,518,319,340,307,313,350
"""mean_3d_log_units_sold""",347,25,16,16,75,237,70
"""median_3d_log_units_sold""",296,14,6,13,17,82,49


In [84]:
# modify max limit of df display
pl.Config.set_tbl_rows(60)

polars.config.Config

In [106]:
importances_df.select(
    pl.col("feature_name"),
    pl.concat_list(cs.starts_with("h")).list.mean().alias("importance")
).sort("importance", descending=True)


feature_name,importance
str,f64
"""dayofyear""",1343.285714
"""weekofyear""",700.285714
"""weekday""",539.857143
"""diff_mean_3d_log_units_sold""",214.0
"""diff_mean_7d_log_units_sold""",208.0
"""diff_mean_28d_log_units_sold""",184.0
"""mean_4w_log_units_sold""",183.714286
"""std_7d_log_units_sold""",183.285714
"""diff_mean_14d_log_units_sold""",178.0
"""std_14d_log_units_sold""",149.285714


In [107]:
preds_df = forecaster.predict(x_valid)

In [113]:
import plotly.express as px


target_col = "h4_log_units_sold"


x_subset = x_valid.with_columns(
    (y_valid.get_column(target_col).exp() + 1).round(),
    (forecaster.predict(x_valid).get_column(f'pred_{target_col}').exp() + 1).round(),
    c_valid.get_column("date"),
).filter(
    (pl.col("store_id") == 3) 
    & (pl.col("product_id") == 213652)
)

y_true = x_subset.drop_in_place(target_col)
y_preds = x_subset.drop_in_place(f"pred_{target_col}")
x_dates_subset = x_subset.drop_in_place("date")

# apply inverse of logp1 transformation to actuals and round to nearest integer
# actuals = y_subset.get_column("log_units_sold")
# actuals = (pl.Series(actuals).exp() - 1).round()

# apply inverse of logp1 transformation to predictions and round to nearest integer
# predictions = bst.predict(x_subset.to_pandas())
# predictions = (pl.Series(predictions).exp() - 1).round()

# plot time series: y_valid vs prediction
fig = px.line(
    x=x_dates_subset,
    # y=[y_subset, predictions],
    y=y_preds,
    labels={"x": "date", "y": "units"},
    title="Actual vs Predicted Sales",
    template="plotly_white"
)

fig.add_scatter(
    x=x_dates_subset,
    y=y_true,
    name="sales",
    line=dict(color="green", width=2)
)

# limit y-axis to 0-5
fig.update_yaxes(range=[0, 100])
                           
fig.show()