In [17]:
# 8. LightGBM train
import lightgbm as lgb
import polars as pl


x_train = pl.read_parquet("../../data/favorita_dataset/output/train_input.parquet")
y_train = pl.read_parquet("../../data/favorita_dataset/output/train_target.parquet")
c_train = pl.read_parquet("../../data/favorita_dataset/output/train_dates.parquet")

# x_valid = pl.read_parquet("../../data/favorita_dataset/output/x_valid.parquet")
# y_valid = pl.read_parquet("../../data/favorita_dataset/output/y_valid.parquet")
# c_valid = pl.read_parquet("../../data/favorita_dataset/output/dates_valid.parquet")

In [18]:
x_train = x_train.with_columns(
    pl.col.product_group.cast(pl.Categorical)
)

In [19]:
import polars.selectors as cs

# get all categorical columns
categorical_cols = x_train.select(cs.integer(), cs.categorical()).columns

In [20]:
categorical_cols

['product_id',
 'store_id',
 'next_1d_event_id',
 'next_2d_event_id',
 'next_3d_event_id',
 'next_4d_event_id',
 'next_5d_event_id',
 'next_6d_event_id',
 'next_7d_event_id',
 'dayofweek',
 'dayofmonth',
 'dayofyear',
 'weekofyear',
 'month',
 'year',
 'product_group']

In [40]:
params = {
    'num_leaves': 50,
    'objective': 'regression',
    'min_data_in_leaf': 10, #200,
    'learning_rate': 0.02,
    'feature_fraction': 0.1,#0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
# val_pred = []
# test_pred = []
# cate_vars = []
# for i in range(16):
print("=" * 50)
# print("Step %d" % (i+1))
print("=" * 50)
dataset_train = lgb.Dataset(
    x_train.to_pandas(), label=y_train.get_column("log_units_sold").to_numpy(),
    categorical_feature=categorical_cols,
    # weight=pd.concat([items["perishable"]] * num_days) * 0.25 + 1
)

dataset_valid = lgb.Dataset(
    x_valid.to_pandas(), label=y_valid.get_column("log_units_sold").to_numpy(), 
    reference=dataset_train,
    # weight=items["perishable"] * 0.25 + 1,
    categorical_feature=categorical_cols,
    )


bst = lgb.train(
    params, dataset_train, num_boost_round=MAX_ROUNDS,
    valid_sets=[dataset_train, dataset_valid],
    callbacks=[lgb.early_stopping(125), lgb.log_evaluation(25)],
    # force_col_wise=True
)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10218
[LightGBM] [Info] Number of data points in the train set: 29059, number of used features: 64
[LightGBM] [Info] Start training from score 3.713340
Training until validation scores don't improve for 125 rounds
[25]	training's l2: 0.306271	valid_1's l2: 0.321648
[50]	training's l2: 0.176342	valid_1's l2: 0.193512
[75]	training's l2: 0.125771	valid_1's l2: 0.145189
[100]	training's l2: 0.101676	valid_1's l2: 0.123376
[125]	training's l2: 0.0892252	valid_1's l2: 0.113927
[150]	training's l2: 0.0823788	valid_1's l2: 0.109231
[175]	training's l2: 0.0784748	valid_1's l2: 0.107135
[200]	training's l2: 0.0760689	valid_1's l2: 0.105831
[225]	training's l2: 0.0728269	valid_1's l2: 0.103962
[250]	training's l2: 0.0702399	valid_1's l2: 0.102398
[275]	training's l2: 0.0684989	valid_1's l2: 0.101671
[300]	tr

In [42]:
import plotly.express as px

x_subset = x_valid.with_columns(
    date=x_dates_valid
).filter(
    (pl.col("store_id") == 3) 
    & (pl.col("product_id") == 213652)
)

y_subset = y_valid.with_columns(
    product_id=x_valid.get_column("product_id"),
    store_id=x_valid.get_column("store_id"),
).filter(
    (pl.col("store_id") == 3)
    & (pl.col("product_id") == 213652)
)

x_dates_subset = x_subset.drop_in_place("date")

# apply inverse of logp1 transformation to actuals and round to nearest integer
actuals = y_subset.get_column("log_units_sold")
actuals = (pl.Series(actuals).exp() - 1).round()

# apply inverse of logp1 transformation to predictions and round to nearest integer
predictions = bst.predict(x_subset.to_pandas())
predictions = (pl.Series(predictions).exp() - 1).round()

# plot time series: y_valid vs prediction
fig = px.line(
    x=x_dates_subset,
    # y=[y_subset, predictions],
    y=predictions,
    labels={"x": "Date", "value": "Sales"},
    title="Actual vs Predicted Sales",
    template="plotly_white"
)

fig.add_scatter(
    x=x_dates_subset,
    y=actuals,
    name="Actual Sales",
    line=dict(color="red", width=2)
)

# limit y-axis to 0-5
fig.update_yaxes(range=[0, 50])

fig.show()

In [21]:
from sklearn.base import BaseEstimator, RegressorMixin, clone
import lightgbm as lgbm

class DirectMultihorizonForecaster(BaseEstimator, RegressorMixin):
    def __init__(self, horizons: int, params=None):
        """Construct a gradient boosting model.

        Parameters
        ----------
        boosting_type : str, optional (default='gbdt')
            'gbdt', traditional Gradient Boosting Decision Tree.
            'dart', Dropouts meet Multiple Additive Regression Trees.
            'rf', Random Forest.
        num_leaves : int, optional (default=31)
            Maximum tree leaves for base learners.
        max_depth : int, optional (default=-1)
            Maximum tree depth for base learners, <=0 means no limit.
            If setting this to a positive value, consider also changing ``num_leaves`` to ``<= 2^max_depth``.
        learning_rate : float, optional (default=0.1)
            Boosting learning rate.
            You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
            in training using ``reset_parameter`` callback.
            Note, that this will ignore the ``learning_rate`` argument in training.
        n_estimators : int, optional (default=100)
            Number of boosted trees to fit.
        subsample_for_bin : int, optional (default=200000)
            Number of samples for constructing bins.
        objective : str, callable or None, optional (default=None)
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
        class_weight : dict, 'balanced' or None, optional (default=None)
            Weights associated with classes in the form ``{class_label: weight}``.
            Use this parameter only for multi-class classification task;
            for binary classification task you may use ``is_unbalance`` or ``scale_pos_weight`` parameters.
            Note, that the usage of all these parameters will result in poor estimates of the individual class probabilities.
            You may want to consider performing probability calibration
            (https://scikit-learn.org/stable/modules/calibration.html) of your model.
            The 'balanced' mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.
            If None, all classes are supposed to have weight one.
            Note, that these weights will be multiplied with ``sample_weight`` (passed through the ``fit`` method)
            if ``sample_weight`` is specified.
        min_split_gain : float, optional (default=0.)
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : float, optional (default=1e-3)
            Minimum sum of instance weight (Hessian) needed in a child (leaf).
        min_child_samples : int, optional (default=20)
            Minimum number of data needed in a child (leaf).
        subsample : float, optional (default=1.)
            Subsample ratio of the training instance.
        subsample_freq : int, optional (default=0)
            Frequency of subsample, <=0 means no enable.
        colsample_bytree : float, optional (default=1.)
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float, optional (default=0.)
            L1 regularization term on weights.
        reg_lambda : float, optional (default=0.)
            L2 regularization term on weights.
        random_state : int, RandomState object or None, optional (default=None)
            Random number seed.
            If int, this number is used to seed the C++ code.
            If RandomState or Generator object (numpy), a random integer is picked based on its state to seed the C++ code.
            If None, default seeds in C++ code are used.
        n_jobs : int or None, optional (default=None)
            Number of parallel threads to use for training (can be changed at prediction time by
            passing it as an extra keyword argument).

            For better performance, it is recommended to set this to the number of physical cores
            in the CPU.

            Negative integers are interpreted as following joblib's formula (n_cpus + 1 + n_jobs), just like
            scikit-learn (so e.g. -1 means using all threads). A value of zero corresponds the default number of
            threads configured for OpenMP in the system. A value of ``None`` (the default) corresponds
            to using the number of physical cores in the system (its correct detection requires
            either the ``joblib`` or the ``psutil`` util libraries to be installed).

            .. versionchanged:: 4.0.0

        importance_type : str, optional (default='split')
            The type of feature importance to be filled into ``feature_importances_``.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        **kwargs
            Other parameters for the model.
            Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.

            .. warning::

                \*\*kwargs is not supported in sklearn, it may cause unexpected issues.

        Note
        ----
        A custom objective function can be provided for the ``objective`` parameter.
        In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess``,
        ``objective(y_true, y_pred, weight) -> grad, hess``
        or ``objective(y_true, y_pred, weight, group) -> grad, hess``:

            y_true : numpy 1-D array of shape = [n_samples]
                The target values.
            y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                The predicted values.
                Predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task.
            weight : numpy 1-D array of shape = [n_samples]
                The weight of samples. Weights should be non-negative.
            group : numpy 1-D array
                Group/query data.
                Only used in the learning-to-rank task.
                sum(group) = n_samples.
                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
            grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                The value of the first order derivative (gradient) of the loss
                with respect to the elements of y_pred for each sample point.
            hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                The value of the second order derivative (Hessian) of the loss
                with respect to the elements of y_pred for each sample point.

        For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
        and grad and hess should be returned in the same format.
        """
        # boosting_type: str = "gbdt",
        # num_leaves: int = 31,
        # max_depth: int = -1,
        # learning_rate: float = 0.1,
        # n_estimators: int = 100,
        # subsample_for_bin: int = 200000,
        # objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
        # class_weight: Optional[Union[Dict, str]] = None,
        # min_split_gain: float = 0.0,
        # min_child_weight: float = 1e-3,
        # min_child_samples: int = 20,
        # subsample: float = 1.0,
        # subsample_freq: int = 0,
        # colsample_bytree: float = 1.0,
        # reg_alpha: float = 0.0,
        # reg_lambda: float = 0.0,
        # random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
        # n_jobs: Optional[int] = None,
        # importance_type: str = "split",
        self.params = params if params is not None else {}
        self.base_regressor = lgbm.LGBMRegressor(**params)
        self.horizons = horizons
        self.models_ = []
        self.is_fitted_ = False
        self.target_names_ = None

    def fit(self, X: pl.DataFrame, Y: pl.DataFrame, fit_params=None, target_cols=None):
        # X: _LGBM_ScikitMatrixLike,
        # y: _LGBM_LabelType,
        # sample_weight: Optional[_LGBM_WeightType] = None,
        # init_score: Optional[_LGBM_InitScoreType] = None,
        # eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
        # eval_names: Optional[List[str]] = None,
        # eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
        # eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
        # eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
        # feature_name: _LGBM_FeatureNameConfiguration = "auto",
        # categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
        # callbacks: Optional[List[Callable]] = None,
        # init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None,

        self.fit_params = fit_params if fit_params is not None else {}
        self.target_cols_ = Y.columns if target_cols is None else target_cols

        # For direct approach, Y should contain multiple columns, each representing a different horizon.
        # Each column should be sorted out according to the horizon it represents, e.g., 'h1', 'h2', etc.
        if len(self.target_cols_) != self.horizons:
            raise ValueError(
                f"Expected {self.horizons} target columns, but got {len(self.target_cols_)}."
            )
        
        X = X.to_pandas()
        self.models_ = [clone(self.base_regressor) for _ in range(self.horizons)]
        
        # Fit each model for each horizon
        for h in range(self.horizons):
            y_h = Y.get_column(self.target_cols_[h]).to_numpy()
            self.models_[h].fit(X, y_h, **self.fit_params)
        
        self.is_fitted_ = True
        return self

    def predict(self, X: pl.DataFrame):
        """Predict using the fitted models for each horizon.
        """
        if not self.is_fitted_:
            raise RuntimeError("You must fit the model before calling predict.")
    
        X = X.to_pandas()
        preds_df = pl.DataFrame(
            {
                f'pred_{self.target_cols_[horizon]}': model.predict(X)
                for horizon, model in enumerate(self.models_)
            }
        )
        return preds_df
    
    def feature_importances(self) -> pl.DataFrame:
        """Get feature importances from the fitted models.
        
        Parameters
        ----------
        importance_type : str, optional (default='split')
            The type of feature importance to be returned.
            Can be 'split' or 'gain'.
        """
        if not self.is_fitted_:
            raise RuntimeError("You must fit the model before calling feature_importances_.")

        return pl.DataFrame(
            {
                "feature_name": self.models_[0].feature_name_
            }
            |
            {
                f'{self.target_cols_[horizon]}': model.feature_importances_
                for horizon, model in enumerate(self.models_)
            }
        )

In [22]:
weights = 0.25 * x_train.get_column("perishable").to_numpy() + 1

ColumnNotFoundError: "perishable" not found

In [None]:
# x_train = lgb.Dataset(
#     X.to_pandas(), label=y_train.get_column("log_units_sold").to_numpy(),
#     categorical_feature=categorical_cols,
#     # weight=pd.concat([items["perishable"]] * num_days) * 0.25 + 1
# )

# # Example usage:

# bst = lgb.train(
#     params, x_train, num_boost_round=MAX_ROUNDS,
#     valid_sets=[x_train, dataset_valid],
#     callbacks=[lgb.early_stopping(125), lgb.log_evaluation(25)],
#     # force_col_wise=True
# )

model_params = {
    # "boosting_type":"dart", # "gdbt"
    "num_leaves": 31, #31
    # "max_depth": 10, #-1,
    "learning_rate": 0.02,
    "n_estimators": 10000,
    # "subsample_for_bin": 200000,
    "objective": "regression",
    # class_weight: Optional[Union[Dict, str]] = None,
    "min_split_gain": 0.01, # 'feature_fraction': 0.1,#0.8,
    # min_child_weight: float = 1e-3,
    "min_child_samples": 10,
    "subsample": 0.4, #'bagging_fraction': 0.7,
    "subsample_freq": 10, # 'bagging_freq': 1,
    # colsample_bytree: float = 1.0,
    # reg_alpha: float = 0.0,
    # reg_lambda: float = 0.0,
    # random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
    "n_jobs": 16, # 'num_threads': 16
    # importance_type: str = "split",
}

# dataset_valid = lgb.Dataset(
#     # x_valid.to_pandas(), label=y_valid.get_column("h1_log_units_sold").to_numpy(),
#     # reference=x_train,
#     # weight=items["perishable"] * 0.25 + 1,
#     # categorical_feature=categorical_cols,
# )

fit_params = {
    "eval_metric": "l2",
    "eval_set": [
        (x_train.to_pandas(), y_train.get_column("h1_log_units_sold").to_numpy()),
        # (x_valid.to_pandas(), y_valid.get_column("h1_log_units_sold").to_numpy())
    ],
    "callbacks": [lgbm.early_stopping(125), lgbm.log_evaluation(25)],
    "categorical_feature": categorical_cols,
    # "sample_weight": weights,
    # "feature_name": "auto",
}

forecaster = DirectMultihorizonForecaster(
    horizons=7,
    params=model_params
)

forecaster.fit(x_train, y_train, fit_params) #, target_cols=["h1_log_units_sold", "h2_log_units_sold"])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004403 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12395
[LightGBM] [Info] Number of data points in the train set: 46039, number of used features: 71
[LightGBM] [Info] Start training from score 3.606862
Training until validation scores don't improve for 125 rounds
[25]	valid_0's l2: 0.255361
[50]	valid_0's l2: 0.144549
[75]	valid_0's l2: 0.10025
[100]	valid_0's l2: 0.0814864
[125]	valid_0's l2: 0.0722764
[150]	valid_0's l2: 0.067358
[175]	valid_0's l2: 0.0643582
[200]	valid_0's l2: 0.0623735
[225]	valid_0's l2: 0.0609436
[250]	valid_0's l2: 0.0595133
[275]	valid_0's l2: 0.0582818
[300]	valid_0's l2: 0.0574224
[325]	valid_0's l2: 0.0566501
[350]	valid_0's l2: 0.056037
[375]	valid_0's l2: 0.0553311
[400]	valid_0's l2: 0.0547586
[425]	valid_0's l2: 0.0541929
[450]	valid_0's l2: 0.0536918
[475]	valid_0's l2: 0.0531929
[500]	valid_0's l2: 0.0527277
[525

0,1,2
,horizons,7
,params,"{'learning_rate': 0.02, 'min_child_samples': 10, 'min_split_gain': 0.01, 'n_estimators': 10000, ...}"


In [7]:
importances_df = forecaster.feature_importances()
importances_df

feature_name,h1_log_units_sold,h2_log_units_sold,h3_log_units_sold,h4_log_units_sold,h5_log_units_sold,h6_log_units_sold,h7_log_units_sold
str,i32,i32,i32,i32,i32,i32,i32
"""product_id""",3800,182,133,130,100,51,121
"""store_id""",1505,37,20,15,11,3,12
"""log_units_sold""",2693,18,9,7,1,1,69
"""product_group""",462,61,67,56,58,18,32
"""next_1d_event_id""",1159,2,0,0,0,0,0
…,…,…,…,…,…,…,…
"""h3_ewm_3y_log_units_sold""",5650,296,49,1,0,5,4
"""h4_ewm_3y_log_units_sold""",5491,32,262,52,5,1,0
"""h5_ewm_3y_log_units_sold""",5472,7,26,250,44,6,4
"""h6_ewm_3y_log_units_sold""",5518,10,4,16,253,31,5


In [8]:
# modify max limit of df display
pl.Config.set_tbl_rows(60)

polars.config.Config

In [None]:
importances_df.select(
    pl.col("feature_name"),
    pl.concat_list(cs.starts_with("h")).list.mean().alias("importance")
).sort("importance", descending=True)

feature_name,importance
str,f64
"""dayofyear""",11670.142857
"""weekofyear""",2133.714286
"""diff_mean_14d_log_units_sold""",1024.428571
"""diff_mean_7d_log_units_sold""",1023.428571
"""diff_mean_28d_log_units_sold""",985.428571
"""diff_mean_21d_log_units_sold""",967.285714
"""std_7d_log_units_sold""",913.428571
"""std_28d_log_units_sold""",885.428571
"""h2_ewm_3y_log_units_sold""",879.0
"""max_mean_ratio_7d_log_units_so…",869.428571


In [13]:
import plotly.express as px


def foo(x_valid, y_valid, c_valid, target_col, pid, sid):

    x_subset = x_valid.with_columns(
        (y_valid.get_column(target_col).exp() + 1).round(),
        (forecaster.predict(x_valid).get_column(f'pred_{target_col}').exp() + 1).round(),
        c_valid.get_column("c_date"),
    ).filter(store_id=sid, product_id=pid)
    
    y_true = x_subset.drop_in_place(target_col)
    y_preds = x_subset.drop_in_place(f"pred_{target_col}")
    x_dates_subset = x_subset.drop_in_place("c_date")

    # apply inverse of logp1 transformation to actuals and round to nearest integer
    # actuals = y_subset.get_column("log_units_sold")
    # actuals = (pl.Series(actuals).exp() - 1).round()

    # apply inverse of logp1 transformation to predictions and round to nearest integer
    # predictions = bst.predict(x_subset.to_pandas())
    # predictions = (pl.Series(predictions).exp() - 1).round()

    # plot time series: y_valid vs prediction
    fig = px.line(
        x=x_dates_subset,
        # y=[y_subset, predictions],
        y=y_preds,
        labels={"x": "date", "y": "units"},
        title="Actual vs Predicted Sales",
        template="plotly_white"
    )

    fig.add_scatter(
        x=x_dates_subset,
        y=y_true,
        name="sales",
        line=dict(color="green", width=2)
    )

    # limit y-axis to 0-5
    # fig.update_yaxes(range=[0, 100])
                            
    fig.show()

In [None]:
# f = x_train.filter(
#     (pl.col("store_id") == 1) 
#     & (pl.col("product_id") == 1)
# )

In [10]:
import plotly.express as px

def plot_sales_forecast(x_train, y_train, c_train, p_train, product_id, store_id, target_col, step):
    # target_col = "h7_log_units_sold"
    true_col = f'h{step}_{target_col}'
    pred_col = f'pred_h{step}_{target_col}'

    x = x_train.with_columns(
        c_train.get_column("c_date"),
        y_train.get_column(true_col).exp() - 1,
        p_train.get_column(pred_col).exp() - 1,
    ).filter(product_id=product_id, store_id=store_id)

    # plot with series named "actual" and "predicted"
    px.line(
        x=x.get_column("c_date"),
        y=[x.get_column(true_col), x.get_column(pred_col)],
        labels={"x": "date", "value": "units sold"},
        title=f"{product_id=} | {store_id=}",
        template="plotly_white"
    ).show()

In [25]:
p_train = forecaster.predict(x_train)

In [26]:
y_train - p_train

h1_log_units_sold,h2_log_units_sold,h3_log_units_sold,h4_log_units_sold,h5_log_units_sold,h6_log_units_sold,h7_log_units_sold
f64,f64,f64,f64,f64,f64,f64
0.436216,-0.123037,-0.122787,-0.207052,-0.045426,-0.301117,0.289787
0.181881,-0.282704,-0.491609,-0.269593,-0.495712,0.00756,-0.029678
-0.112953,-0.297852,-0.112157,-0.329798,0.140091,0.132045,-0.630921
-0.134164,-0.039607,-0.302315,0.173057,0.183324,-0.702462,-0.373942
0.003399,-0.216908,0.212212,0.225412,-0.659903,-0.341107,-0.431738
-0.053694,0.29916,0.286597,-0.692641,-0.510921,-0.524354,0.063615
0.136983,0.389337,-0.681674,-0.496475,-0.543042,0.104447,0.100844
0.168987,-0.658118,-0.465173,-0.643194,-0.04131,0.028537,-0.287489
-0.295559,-0.397665,-0.572081,-0.001995,-0.045227,-0.314467,0.132671
-0.023684,-0.500302,0.070106,0.056417,-0.257502,0.156355,-0.280469


In [27]:
plot_sales_forecast(x_train, y_train, c_train, p_train, 1, 2, "log_units_sold", 1)

In [40]:
importances_df.select(
    pl.col("feature_name"),
    pl.col.h7_log_units_sold.alias("h1"),
).sort("h1", descending=True)

feature_name,h1
str,i32
"""dayofyear""",538
"""dayofweek""",509
"""weekofyear""",358
"""mean_28d_log_units_sold""",334
"""ewm_7d_log_units_sold""",201
"""dayofmonth""",181
"""ewm_3d_log_units_sold""",151
"""median_28d_log_units_sold""",149
"""mean_14d_log_units_sold""",136
"""product_id""",115


In [28]:
# save model with todays date
from datetime import datetime
import joblib
joblib.dump(forecaster, f"../demand_prediction/demand_forecaster_{datetime.now().strftime('%Y%m%d')}.pkl")

['../demand_prediction/demand_forecaster_20250717.pkl']

In [86]:
import plotly.graph_objects as go

stores = [
    (-0.18876285785586663, -78.51363094389696),
    (-0.27163083765277934, -78.53891111061955),
    (-0.30421205149991803, -78.54096621059904),
    # (-0.3059071837532822, -78.551673589868),
    (-0.2405121435369437, -78.524529890494),
    (-0.2996537534864276, -78.48578502608005),
]

workshops = [
    (-0.3234604530111714, -78.49425052229458),
    # (-0.3045530623630387, -78.46982632206792),
    (-0.27849168817274433, -78.52270672581136),
    # (-0.29468642829993047, -78.48558770305867),
    (-0.08396566401722372, -78.50741401444287),
    (-0.13905359564145664, -78.46905793453739),
    (-0.20517916260301633, -78.43217662590378),
    (-0.22520129297668467, -78.49007088786165)
]

store_latitudes = [store[0] for store in stores]
store_longitudes = [store[1] for store in stores]
workshop_latitudes = [workshop[0] for workshop in workshops]
workshop_longitudes = [workshop[1] for workshop in workshops]
# 25.56168367447657, -103.49397899075461
# 25.578001288335397, -103.39173110558576
# 25.612244741529203, -103.39757596689567
# 25.534546490556515, -103.4176698981004

# 19.44663516973258, -99.20680859489939
# 19.42904079345824, -99.1416591829526
# 19.424714015472688, -99.15787006949334
# 19.386921926074262, -99.13278907522275
# 19.34825529138672, -99.16674017722318

fig = go.Figure(go.Scattermap(
    mode = "markers+text",
    # 25.561781422559218, -103.43062899263603
    lat = store_latitudes+workshop_latitudes, #[25.561781422559218, 25.534546490556515, 25.612244741529203, 25.578001288335397],
    lon = store_longitudes+workshop_longitudes,#[-103.43062899263603, -103.4176698981004, -103.39757596689567, -103.39173110558576], 
    marker = dict(size=14, symbol=["commercial"] * len(stores) + ["industry"] * len(workshops)),
    text = ["Bus", "Harbor", "airport"], textposition = "bottom right",
    textfont = dict(size=8, color="black", weight=100)
    ))

# 25.560655745859822, -103.45157344672441
# 19.395106657297603, -99.12075148945637
-0.19079953512165104, -78.481357106267
center = dict(
    lat=-0.19079953512165104,
    lon=-78.481357106267
)
fig.update_layout(
    title=dict(text='Nuclear Waste Sites on Campus'),
    autosize=True,
    # hovermode='closest',
    map = dict(
        bearing=-80,
        center=center,
        pitch=0,
        # zoom=10
        # style="light",
        zoom=10.5
        ),
    showlegend = False,)

fig.show()

In [1]:
import mlflow
import numpy as np
from sklearn.linear_model import LinearRegression

import os
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "password"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:10000"
os.environ["AWS_REGION"] = "us-east-1"


X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# y = 1 * x_0 + 2 * x_1 + 3
y = np.dot(X, np.array([1, 2])) + 3

mlflow.set_tracking_uri("http://localhost:5000")

has_exp = mlflow.get_experiment_by_name("linear_regression")
# If the experiment does not exist, create it
if has_exp is None:
    experiment_id = mlflow.create_experiment("linear_regression")
else:
    experiment_id = has_exp.experiment_id
# experiment_id = mlflow.create_experiment("linear_regression_example")

# x_train, y_train = load_pickle(train_bundle_path)

mlflow.sklearn.autolog(log_datasets=False, log_models=True)

with mlflow.start_run(experiment_id=experiment_id) as run:
    # X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    # mlflow.sklearn.log_model(scaler, artifact_path="scaler")
    reg = LinearRegression().fit(X, y)
    reg.score(X, y)
    reg.predict(np.array([[3, 5]]))

    # mlflow.log_metrics(
    #     {
    #         "intercept_": reg.intercept_,
    #         "model_size": reg.__sizeof__(),
    #     }
    # )
    # mlflow.sklearn.log_model(reg, name="h1")



🏃 View run defiant-cow-306 at: http://localhost:5000/#/experiments/1/runs/4b3cecc814e14708b46ed082deb88bda
🧪 View experiment at: http://localhost:5000/#/experiments/1
