In [17]:
# 8. LightGBM train
import lightgbm as lgb
import polars as pl


# Read the training features, targets and date keys; the three parquet files
# are loaded in the same order as the names they are unpacked into.
x_train, y_train, c_train = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        "../../data/favorita_dataset/output/train_input.parquet",
        "../../data/favorita_dataset/output/train_target.parquet",
        "../../data/favorita_dataset/output/train_dates.parquet",
    )
)

# NOTE(review): the validation loads below are disabled, yet later cells use
# x_valid / y_valid -- confirm where those frames are supposed to come from.
# x_valid = pl.read_parquet("../../data/favorita_dataset/output/x_valid.parquet")
# y_valid = pl.read_parquet("../../data/favorita_dataset/output/y_valid.parquet")
# c_valid = pl.read_parquet("../../data/favorita_dataset/output/dates_valid.parquet")

In [18]:
# Re-type product_group as Categorical so the categorical selector used when
# building the feature list picks it up.
x_train = x_train.with_columns(
    product_group=pl.col("product_group").cast(pl.Categorical),
)

In [19]:
import polars.selectors as cs

# Collect the columns handed to LightGBM as categorical features: every
# integer-typed column first, followed by every Categorical-typed column.
# NOTE(review): this treats *all* integer columns (ids and calendar fields
# alike) as categorical -- confirm that is intended.
categorical_cols = x_train.select(cs.integer(), cs.categorical()).columns

In [20]:
# Display the selected categorical column names.
categorical_cols

['product_id',
 'store_id',
 'next_1d_event_id',
 'next_2d_event_id',
 'next_3d_event_id',
 'next_4d_event_id',
 'next_5d_event_id',
 'next_6d_event_id',
 'next_7d_event_id',
 'dayofweek',
 'dayofmonth',
 'dayofyear',
 'weekofyear',
 'month',
 'year',
 'product_group']

In [40]:
# LightGBM settings for a single L2 regression booster.
params = dict(
    num_leaves=50,
    objective='regression',
    min_data_in_leaf=10,  # 200
    learning_rate=0.02,
    feature_fraction=0.1,  # 0.8
    bagging_fraction=0.7,
    bagging_freq=1,
    metric='l2',
    num_threads=16,
)

# Upper bound on boosting rounds; early stopping normally ends training sooner.
MAX_ROUNDS = 5000

# Visual separator in the cell output (two identical rules).
print("=" * 50, "=" * 50, sep="\n")

# NOTE(review): x_valid / y_valid are not created in any visible cell (their
# loading is commented out) -- this cell relies on them already existing.
dataset_train = lgb.Dataset(
    data=x_train.to_pandas(),
    label=y_train["log_units_sold"].to_numpy(),
    categorical_feature=categorical_cols,
)

# The validation set reuses the binning of the training set via `reference`.
dataset_valid = lgb.Dataset(
    data=x_valid.to_pandas(),
    label=y_valid["log_units_sold"].to_numpy(),
    reference=dataset_train,
    categorical_feature=categorical_cols,
)

# Train with early stopping (patience 125) and a log line every 25 rounds.
bst = lgb.train(
    params=params,
    train_set=dataset_train,
    num_boost_round=MAX_ROUNDS,
    valid_sets=[dataset_train, dataset_valid],
    callbacks=[
        lgb.early_stopping(stopping_rounds=125),
        lgb.log_evaluation(period=25),
    ],
)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10218
[LightGBM] [Info] Number of data points in the train set: 29059, number of used features: 64
[LightGBM] [Info] Start training from score 3.713340
Training until validation scores don't improve for 125 rounds
[25]	training's l2: 0.306271	valid_1's l2: 0.321648
[50]	training's l2: 0.176342	valid_1's l2: 0.193512
[75]	training's l2: 0.125771	valid_1's l2: 0.145189
[100]	training's l2: 0.101676	valid_1's l2: 0.123376
[125]	training's l2: 0.0892252	valid_1's l2: 0.113927
[150]	training's l2: 0.0823788	valid_1's l2: 0.109231
[175]	training's l2: 0.0784748	valid_1's l2: 0.107135
[200]	training's l2: 0.0760689	valid_1's l2: 0.105831
[225]	training's l2: 0.0728269	valid_1's l2: 0.103962
[250]	training's l2: 0.0702399	valid_1's l2: 0.102398
[275]	training's l2: 0.0684989	valid_1's l2: 0.101671
[300]	tr

In [42]:
import plotly.express as px

# Restrict the validation rows to one (store, product) series. The dates are
# attached first so they go through the same filter as the features.
# NOTE(review): x_dates_valid is not defined in any visible cell -- presumably
# the date column matching x_valid row-for-row; confirm.
x_subset = x_valid.with_columns(
    date=x_dates_valid
).filter(
    (pl.col("store_id") == 3)
    & (pl.col("product_id") == 213652)
)

# The targets carry no id columns, so borrow them from x_valid to apply the
# same (store, product) filter.
y_subset = y_valid.with_columns(
    product_id=x_valid.get_column("product_id"),
    store_id=x_valid.get_column("store_id"),
).filter(
    (pl.col("store_id") == 3)
    & (pl.col("product_id") == 213652)
)

# Pull the dates back out so x_subset again holds only the columns the model
# was trained on (drop_in_place mutates x_subset and returns the column).
x_dates_subset = x_subset.drop_in_place("date")

# Undo the log1p transformation (exp(x) - 1) and round to whole units.
actuals = (y_subset.get_column("log_units_sold").exp() - 1).round()

# Same inverse transformation for the model output.
predictions = (pl.Series(bst.predict(x_subset.to_pandas())).exp() - 1).round()

# Time series plot: predictions as the base line, actuals overlaid below.
fig = px.line(
    x=x_dates_subset,
    y=predictions,
    # With a single `y` series the axis key is "y" ("value" only applies to
    # wide-form input), so this is the key that actually relabels the axis.
    labels={"x": "Date", "y": "Sales"},
    title="Actual vs Predicted Sales",
    template="plotly_white"
)

fig.add_scatter(
    x=x_dates_subset,
    y=actuals,
    name="Actual Sales",
    line=dict(color="red", width=2)
)

# Limit the y-axis to 0-50 units.
fig.update_yaxes(range=[0, 50])

fig.show()

In [7]:
# Fetch the feature-importance table from the fitted forecaster and display it.
# NOTE(review): `forecaster` is not created in any visible cell above -- this
# cell only works in a kernel where it already exists.
importances_df = forecaster.feature_importances()
importances_df

feature_name,h1_log_units_sold,h2_log_units_sold,h3_log_units_sold,h4_log_units_sold,h5_log_units_sold,h6_log_units_sold,h7_log_units_sold
str,i32,i32,i32,i32,i32,i32,i32
"""product_id""",3800,182,133,130,100,51,121
"""store_id""",1505,37,20,15,11,3,12
"""log_units_sold""",2693,18,9,7,1,1,69
"""product_group""",462,61,67,56,58,18,32
"""next_1d_event_id""",1159,2,0,0,0,0,0
…,…,…,…,…,…,…,…
"""h3_ewm_3y_log_units_sold""",5650,296,49,1,0,5,4
"""h4_ewm_3y_log_units_sold""",5491,32,262,52,5,1,0
"""h5_ewm_3y_log_units_sold""",5472,7,26,250,44,6,4
"""h6_ewm_3y_log_units_sold""",5518,10,4,16,253,31,5


In [8]:
# Raise the polars display limit so rendered tables show up to 60 rows.
pl.Config.set_tbl_rows(60)

polars.config.Config

In [None]:
# Average each feature's importance over every column whose name starts with
# "h" (the per-horizon columns) and rank features from highest to lowest.
importances_df.select(
    "feature_name",
    importance=pl.concat_list(cs.starts_with("h")).list.mean(),
).sort(by="importance", descending=True)

feature_name,importance
str,f64
"""dayofyear""",11670.142857
"""weekofyear""",2133.714286
"""diff_mean_14d_log_units_sold""",1024.428571
"""diff_mean_7d_log_units_sold""",1023.428571
"""diff_mean_28d_log_units_sold""",985.428571
"""diff_mean_21d_log_units_sold""",967.285714
"""std_7d_log_units_sold""",913.428571
"""std_28d_log_units_sold""",885.428571
"""h2_ewm_3y_log_units_sold""",879.0
"""max_mean_ratio_7d_log_units_so…",869.428571


In [13]:
import plotly.express as px


def foo(x_valid, y_valid, c_valid, target_col, pid, sid):
    """Plot actual vs. predicted unit sales for one product/store pair.

    Args:
        x_valid: polars DataFrame of model inputs; must contain ``store_id``
            and ``product_id`` columns.
        y_valid: polars DataFrame holding the target column ``target_col``.
        c_valid: polars DataFrame with a ``c_date`` column aligned row-for-row
            with ``x_valid``.
        target_col: name of the target column; predictions are read from
            ``pred_<target_col>`` in the forecaster output.
        pid: product id to plot.
        sid: store id to plot.

    Returns:
        None. The figure is displayed with ``fig.show()``.

    Relies on the notebook-level ``forecaster`` object for predictions.
    """
    pred_col = f"pred_{target_col}"

    # The targets are described in this notebook as log1p-transformed, so the
    # inverse is exp(x) - 1 (not exp(x) + 1), rounded to whole units.
    x_subset = x_valid.with_columns(
        (y_valid.get_column(target_col).exp() - 1).round(),
        (forecaster.predict(x_valid).get_column(pred_col).exp() - 1).round(),
        c_valid.get_column("c_date"),
    ).filter(store_id=sid, product_id=pid)

    # drop_in_place removes each helper column from x_subset and returns it.
    y_true = x_subset.drop_in_place(target_col)
    y_preds = x_subset.drop_in_place(pred_col)
    x_dates_subset = x_subset.drop_in_place("c_date")

    # Time series plot: predictions as the base line, actuals overlaid.
    fig = px.line(
        x=x_dates_subset,
        y=y_preds,
        labels={"x": "date", "y": "units"},
        title="Actual vs Predicted Sales",
        template="plotly_white"
    )

    fig.add_scatter(
        x=x_dates_subset,
        y=y_true,
        name="sales",
        line=dict(color="green", width=2)
    )

    fig.show()

In [10]:
import plotly.express as px

def plot_sales_forecast(x_train, y_train, c_train, p_train, product_id, store_id, target_col, step):
    """Plot actual and predicted units sold for one product/store at one horizon.

    Args:
        x_train: polars DataFrame of model inputs; must contain ``product_id``
            and ``store_id`` columns.
        y_train: polars DataFrame with the true column ``h<step>_<target_col>``.
        c_train: polars DataFrame with a ``c_date`` column aligned with ``x_train``.
        p_train: polars DataFrame with the prediction column
            ``pred_h<step>_<target_col>``.
        product_id: product to plot.
        store_id: store to plot.
        target_col: base target name, e.g. ``"log_units_sold"``.
        step: forecast horizon index used to build the column names.

    Returns:
        None. The figure is displayed with ``.show()``.
    """
    true_col = f'h{step}_{target_col}'
    pred_col = f'pred_h{step}_{target_col}'

    # Attach dates plus the back-transformed (exp(x) - 1) actuals and
    # predictions, then keep only the requested product/store rows.
    x = x_train.with_columns(
        c_train.get_column("c_date"),
        y_train.get_column(true_col).exp() - 1,
        p_train.get_column(pred_col).exp() - 1,
    ).filter(product_id=product_id, store_id=store_id)

    fig = px.line(
        x=x.get_column("c_date"),
        y=[x.get_column(true_col), x.get_column(pred_col)],
        labels={"x": "date", "value": "units sold"},
        title=f"{product_id=} | {store_id=}",
        template="plotly_white"
    )

    # px.line does not label the two traces "actual"/"predicted" on its own,
    # so name them explicitly (same order as the `y` list above).
    for trace, series_name in zip(fig.data, ("actual", "predicted")):
        trace.name = series_name

    fig.show()

In [27]:
# Plot the step-1 forecast of log_units_sold for product_id=1, store_id=2.
# NOTE(review): p_train is not defined in any visible cell -- presumably the
# forecaster's predictions on x_train; confirm.
plot_sales_forecast(x_train, y_train, c_train, p_train, 1, 2, "log_units_sold", 1)

In [40]:
# Rank features by their importance for the horizon-7 (h7) model. The column
# is labelled "h7" so the table header matches the data being shown.
importances_df.select(
    pl.col("feature_name"),
    pl.col.h7_log_units_sold.alias("h7"),
).sort("h7", descending=True)

feature_name,h1
str,i32
"""dayofyear""",538
"""dayofweek""",509
"""weekofyear""",358
"""mean_28d_log_units_sold""",334
"""ewm_7d_log_units_sold""",201
"""dayofmonth""",181
"""ewm_3d_log_units_sold""",151
"""median_28d_log_units_sold""",149
"""mean_14d_log_units_sold""",136
"""product_id""",115


In [28]:
# Persist the forecaster with joblib; the file name is stamped with today's
# date in YYYYMMDD form, so a second save on the same day overwrites the first.
from datetime import datetime
import joblib
joblib.dump(forecaster, f"../demand_prediction/demand_forecaster_{datetime.now().strftime('%Y%m%d')}.pkl")

['../demand_prediction/demand_forecaster_20250717.pkl']

In [23]:
import plotly.graph_objects as go
import polars as pl
def plot_map(df_store: pl.DataFrame, df_workshop: pl.DataFrame, title: str = "Stores and workshops"):
    """
    Plot a map with stores and workshops locations.

    Args:
        df_store: store table with columns ``s_latitudes``, ``s_longitudes``
            and ``s_name``.
        df_workshop: workshop table with columns ``w_latitudes``,
            ``w_longitudes`` and ``w_name``.
        title: figure title (the previous hard-coded title was unrelated to
            the plotted data).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    store_latitudes = df_store.get_column("s_latitudes").to_list()
    store_longitudes = df_store.get_column("s_longitudes").to_list()
    workshop_latitudes = df_workshop.get_column("w_latitudes").to_list()
    workshop_longitudes = df_workshop.get_column("w_longitudes").to_list()

    store_names = df_store.get_column("s_name").to_list()
    workshop_names = df_workshop.get_column("w_name").to_list()

    # One trace holds both kinds of points: stores first, then workshops. The
    # per-point symbol list follows the same order to tell them apart.
    fig = go.Figure(go.Scattermap(
        mode="markers+text",
        lat=store_latitudes + workshop_latitudes,
        lon=store_longitudes + workshop_longitudes,
        marker=dict(
            size=14,
            symbol=["commercial"] * df_store.height + ["industry"] * df_workshop.height,
        ),
        text=store_names + workshop_names,
        textposition="bottom right",
        textfont=dict(size=8, color="black", weight=100),
    ))

    # Hard-coded view centre; it is not derived from the plotted points.
    center = dict(
        lat=-0.19079953512165104,
        lon=-78.481357106267
    )

    fig.update_layout(
        title=dict(text=title),
        autosize=True,
        map=dict(
            bearing=-80,
            center=center,
            pitch=0,
            style="light",
            zoom=10.5
        ),
        showlegend=False,
    )

    fig.show()

In [1]:
import mlflow
import numpy as np
from sklearn.linear_model import LinearRegression

import os
# Configure the S3-compatible endpoint and credentials through environment
# variables. These assignments overwrite any values already set for this process.
# NOTE(review): credentials are hard-coded -- presumably local development
# defaults; confirm, and never commit real secrets this way.
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "password"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:10000"
os.environ["AWS_REGION"] = "us-east-1"


# Toy regression problem: y = 1 * x_0 + 2 * x_1 + 3
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
y = X @ np.array([1, 2]) + 3

mlflow.set_tracking_uri("http://localhost:5000")

# Reuse the "linear_regression" experiment when it exists, otherwise create it.
has_exp = mlflow.get_experiment_by_name("linear_regression")
experiment_id = (
    mlflow.create_experiment("linear_regression")
    if has_exp is None
    else has_exp.experiment_id
)

# Autologging is enabled before the run so the fit below is logged by MLflow
# (models included, datasets excluded).
mlflow.sklearn.autolog(log_datasets=False, log_models=True)

with mlflow.start_run(experiment_id=experiment_id) as run:
    # fit / score / predict are all invoked inside the run; the return values
    # of score and predict are intentionally not kept.
    reg = LinearRegression()
    reg.fit(X, y)
    reg.score(X, y)
    reg.predict(np.array([[3, 5]]))



🏃 View run languid-wolf-196 at: http://localhost:5000/#/experiments/1/runs/9d6393e1a5c34f6e8215be2c4f000484
🧪 View experiment at: http://localhost:5000/#/experiments/1


In [20]:
def load_model_from_registry(timestamp: str) -> "mlflow.pyfunc.PyFuncModel":
    """
    Load the model from the MLflow Model Registry.

    Args:
        timestamp: suffix of the registered model name; the model loaded is
            ``demand_predictor_<timestamp>`` at version ``latest``.

    Returns:
        The object returned by ``mlflow.pyfunc.load_model`` (a pyfunc wrapper,
        not the raw forecaster instance).

    Side effects:
        Prepends ``<cwd>/../../dags`` to ``sys.path`` if it is not already
        there, so modules under that directory can be imported while the
        model is being loaded.
    """
    import sys

    dags_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'dags'))
    if dags_path not in sys.path:
        sys.path.insert(0, dags_path)

    model_name = f"demand_predictor_{timestamp}"
    model_version = "latest"

    # Registry URI of the form models:/<name>/<version>.
    model_uri = f"models:/{model_name}/{model_version}"
    return mlflow.pyfunc.load_model(model_uri)

In [21]:
# Load the latest registered model for the "2016-08-15" timestamp and show its summary.
load_model_from_registry("2016-08-15")

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 135.05it/s] 
 - cloudpickle (current: 3.1.1, required: cloudpickle==3.0.0)
 - graphviz (current: uninstalled, required: graphviz==0.21)
 - pyarrow (current: 20.0.0, required: pyarrow==18.1.0)
 - scikit-learn (current: 1.7.1, required: scikit-learn==1.7.0)
 - scipy (current: 1.15.3, required: scipy==1.16.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
  param_names = _check_func_signature(func, "predict")
  func_info = _get_func_info_if_type_hint_supported(predict_attr)


mlflow.pyfunc.loaded_model:
  artifact_path: s3://mlflow-artifacts/4/models/m-6a0f30d33f5142f2bbbccd08d8104120/artifacts
  flavor: mlflow.pyfunc.model
  run_id: ce4cdbecdf5b4ce5a5f30919f589a414

In [18]:
import sys
from pathlib import Path
import numpy as np
import polars as pl
import polars.selectors as cs
import mlflow
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope

# Add the parent directory (services) to the import path
# current_file = Path(__file__)
# services_dir = current_file.parent.parent
sys.path.insert(0, "..")


import os
# Configure the S3-compatible endpoint and credentials through environment
# variables. These assignments overwrite any values already set for this process.
# NOTE(review): credentials are hard-coded -- presumably local development
# defaults; confirm, and never commit real secrets this way.
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "password"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:10000"
os.environ["AWS_REGION"] = "us-east-1"

# Import the project-local helpers; ".." was put on sys.path above so the
# `shared` package can be resolved.
try:
    from shared.forecasters import DirectMultihorizonForecaster
    from shared.s3config import get_s3_params
except ImportError as exc:
    # Either import may be the one that failed, so report the underlying
    # error instead of always blaming shared.s3config, and keep it chained.
    raise ImportError(
        f"Could not import from the 'shared' package ({exc}). Ensure the path is correct."
    ) from exc


def run_model_fit(timestamp: str) -> None:
    """
    Run the model fitting process.

    Loads the train/validation parquet files from the location returned by
    ``get_s3_params(timestamp)``, then runs a 30-trial hyperopt TPE search.
    Each trial opens an MLflow run in the ``demand_predictor_<timestamp>``
    experiment, fits a ``DirectMultihorizonForecaster`` with 7 horizons, and
    logs the sampled parameters plus per-horizon train/validation scores.

    Args:
        timestamp: identifies both the S3 data location and the MLflow
            experiment name.

    Returns:
        None. The result of ``fmin`` is discarded; outcomes are only
        available through the MLflow runs.
    """
    # Load training data from S3
    # Assuming the S3 path and storage options are correctly set up
    s3_path, s3_storage_options = get_s3_params(timestamp)

    x_train = pl.read_parquet(s3_path + "/train_input.parquet", storage_options=s3_storage_options)
    y_train = pl.read_parquet(s3_path + "/train_target.parquet", storage_options=s3_storage_options)

    x_valid = pl.read_parquet(s3_path + "/valid_input.parquet", storage_options=s3_storage_options)
    y_valid = pl.read_parquet(s3_path + "/valid_target.parquet", storage_options=s3_storage_options)

    # Integer columns first, then Categorical columns; passed to the fit call
    # as the categorical feature list.
    categorical_cols = x_train.select(cs.integer(), cs.categorical()).columns

    # NOTE(review): this dict is never used -- the `model_params` argument of
    # `objective` below shadows it and receives samples from `search_space`.
    model_params = {
        # "boosting_type":"dart", # "gdbt"
        "num_leaves": 31,  # 31
        # "max_depth": 10, #-1,
        "learning_rate": 0.05,
        "n_estimators": 100,
        # "subsample_for_bin": 200000,
        "objective": "regression",
        # class_weight: Optional[Union[Dict, str]] = None,
        "min_split_gain": 0.01,  # 'feature_fraction': 0.1,#0.8,
        # min_child_weight: float = 1e-3,
        "min_child_samples": 10,
        "subsample": 0.7,  #'bagging_fraction': 0.7,
        "subsample_freq": 1,  # 'bagging_freq': 1,
        # colsample_bytree: float = 1.0,
        # reg_alpha: float = 0.0,
        # reg_lambda: float = 0.0,
        # random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
        "n_jobs": 16,  # 'num_threads': 16
        # importance_type: str = "split",
    }

    # Options forwarded to the forecaster's fit call for every trial.
    fit_params = {
        "eval_metric": "l2",
        "early_stopping_rounds": 10,
        "log_evaluation": 100,
        "categorical_feature": categorical_cols,
        # "sample_weight": weights,
        # "feature_name": "auto",
    }

    mlflow.set_tracking_uri("http://localhost:5000")

    experiment_name = f"demand_predictor_{timestamp}"
    has_exp = mlflow.get_experiment_by_name(experiment_name)
    # If the experiment does not exist, create it
    if has_exp is None:
        experiment_id = mlflow.create_experiment(experiment_name)
    else:
        experiment_id = has_exp.experiment_id

    mlflow.lightgbm.autolog(disable=True)  # Disable automatic logging to avoid conflicts

    def objective(model_params):
        # One hyperopt trial == one MLflow run. `model_params` is the sample
        # drawn from `search_space`.
        with mlflow.start_run(experiment_id=experiment_id):

            # Create and fit the forecaster
            forecaster = DirectMultihorizonForecaster(horizons=7, params=model_params)
            forecaster.fit(x_train, y_train, x_valid, y_valid, fit_params)

            print(forecaster.models_[0].best_score_)

            # train_l2 = {
            #     f"train_h{h}_l2": model.best_score_["train"]["l2"]
            #     for h, model in enumerate(forecaster.models_, 1)
            # }

            # valid_l2 = {
            #     f"valid_h{h}_l2": model.best_score_["valid_1"]["l2"]
            #     for h, model in enumerate(forecaster.models_, 1)
            # }

            # Per-horizon `model.score(...)` on the training data, keyed
            # train_h1_score, train_h2_score, ... (presumably R^2, given the
            # variable name -- confirm against the model class).
            train_r2 = {
                f"train_h{h}_score": model.score(x_train.to_pandas(), y_train.get_column(f"h{h}_log_units_sold").to_numpy())
                for h, model in enumerate(forecaster.models_, 1)
            }

            # Same metric on the validation data, keyed valid_h<h>_score.
            valid_r2 = {
                f"valid_h{h}_score": model.score(x_valid.to_pandas(), y_valid.get_column(f"h{h}_log_units_sold").to_numpy())
                for h, model in enumerate(forecaster.models_, 1)
            }


            # Trial loss: validation l2 summed over all horizon models.
            val_l2 = sum([model.best_score_["valid_1"]["l2"] for model in forecaster.models_])

            mlflow.log_params(model_params)
            mlflow.log_metrics(train_r2)
            mlflow.log_metrics(valid_r2)
            mlflow.log_metric("valid_score", sum(valid_r2.values()))

        return {"loss": val_l2, "status": STATUS_OK}

    # search_space = {
    #     "max_depth": scope.int(hp.quniform("max_depth", 1, 20, 1)),
    #     "n_estimators": scope.int(hp.quniform("n_estimators", 10, 50, 1)),
    #     "min_samples_split": scope.int(hp.quniform("min_samples_split", 2, 10, 1)),
    #     "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 1, 4, 1)),
    #     "random_state": 42,
    # }

    # Plain values are held fixed for every trial; hp.* entries are sampled.
    search_space = {
        "num_leaves": 100,
        "max_depth": 50,
        "learning_rate": hp.uniform("learning_rate", 1e-5, 0.2),
        "n_estimators": 100,
        "min_child_samples": scope.int(hp.quniform("min_child_samples", 5, 50, 5)),
        "reg_alpha": hp.uniform("reg_alpha", 0.0, 1.0),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample": hp.uniform("subsample", 0.5, 1.0),
        "subsample_freq": scope.int(hp.quniform("subsample_freq", 1, 10, 1)),
        # "min_split_gain": hp.uniform("min_split_gain", 1e-4, 0.1),
        "feature_fraction": 0.6,
        # "bagging_fraction": hp.uniform("bagging_fraction", 0.5, 1.0),
        "n_jobs": 16,
        "verbosity": -1,
    }
    num_trials = 30  # Number of trials for hyperparameter optimization
    rstate = np.random.default_rng(42)  # for reproducible results
    # The best-parameter dict returned by fmin is not captured.
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate,
    )

        # mlflow.lightgbm.log_model()
    # return push_model_to_s3(forecaster, timestamp)
    # return forecaster, x_valid, y_valid

In [2]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

EXPERIMENT_NAME = "demand_predictor"

def create_or_get_experiment(timestamp: str) -> str:
    """
    Return the id of the MLflow experiment ``demand_predictor_<timestamp>``.

    The tracking URI is set to the local server first. An existing experiment
    with that name is reused; otherwise a new one is created and its id returned.
    """
    mlflow.set_tracking_uri("http://localhost:5000")

    experiment_name = f"demand_predictor_{timestamp}"
    existing = mlflow.get_experiment_by_name(experiment_name)
    if existing is not None:
        return existing.experiment_id
    # No experiment under that name yet: create it.
    return mlflow.create_experiment(experiment_name)

def parse_params(params: dict) -> dict:
    """Transform string parameters to their appropriate types.

    Each string value is tried as an ``int`` first, then as a finite
    ``float``; anything else is kept as the original string. This covers
    negative integers (``"-1"``) and scientific notation (``"1e-05"``), and a
    dotted non-number such as ``"gbdt.v2"`` stays a string instead of raising.

    Args:
        params: mapping of parameter name to value. Values are expected to be
            strings; non-string values are passed through unchanged.

    Returns:
        A new dict with the same keys and converted values.
    """
    transformed_params = {}
    for key, value in params.items():
        # Default: keep the value as-is unless a numeric parse succeeds.
        transformed_params[key] = value
        if not isinstance(value, str):
            continue

        text = value.strip()
        try:
            transformed_params[key] = int(text)
            continue
        except ValueError:
            pass

        try:
            number = float(text)
        except ValueError:
            continue

        # Keep "nan" / "inf" spellings as strings rather than turning them
        # into non-finite floats.
        if number == number and number not in (float("inf"), float("-inf")):
            transformed_params[key] = number
    return transformed_params

def get_best_params(timestamp: str) -> dict:
    """
    Return the parsed parameters of the top run in the HPO experiment.

    Active runs of the experiment for ``timestamp`` are ordered by
    ``valid_h1_score`` (highest first) and only the first one is fetched.
    An empty dict is returned when the search yields no run.
    """
    experiment_id = create_or_get_experiment(timestamp)

    top_runs = MlflowClient().search_runs(
        experiment_ids=experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.valid_h1_score DESC"],
    )
    if not top_runs:
        return {}
    # MLflow stores parameters as strings; convert them back to numbers.
    return parse_params(top_runs[0].data.params)


def train_and_register_best_model(timestamp: str) -> "DirectMultihorizonForecaster":
    """
    Train the best model using the best hyperparameters from the HPO experiment with all the data

    The train and validation parquet files are concatenated, a 7-horizon
    ``DirectMultihorizonForecaster`` is fitted on the combined data inside a
    new MLflow run, and the fitted object is logged as a pyfunc model under
    the registered name ``demand_predictor_<timestamp>``.

    Args:
        timestamp: identifies the S3 data location, the MLflow experiment and
            the registered model name.

    Returns:
        The fitted forecaster (the function returns the object itself, not a
        string).
    """
    best_params = get_best_params(timestamp)

    # Load training data from S3
    s3_path, s3_storage_options = get_s3_params(timestamp)

    x_train = pl.read_parquet(s3_path + "/train_input.parquet", storage_options=s3_storage_options)
    y_train = pl.read_parquet(s3_path + "/train_target.parquet", storage_options=s3_storage_options)

    x_valid = pl.read_parquet(s3_path + "/valid_input.parquet", storage_options=s3_storage_options)
    y_valid = pl.read_parquet(s3_path + "/valid_target.parquet", storage_options=s3_storage_options)

    # Final fit uses every available row: train followed by validation.
    x_total = pl.concat([x_train, x_valid], how="vertical")
    y_total = pl.concat([y_train, y_valid], how="vertical")

    fit_params = {
        "eval_metric": "l2",
        "early_stopping_rounds": 10,
        "log_evaluation": 100,
        "categorical_feature": x_total.select(cs.integer(), cs.categorical()).columns,
    }

    experiment_id = create_or_get_experiment(timestamp)
    mlflow.lightgbm.autolog(disable=True)  # Disable automatic logging to avoid conflicts

    with mlflow.start_run(experiment_id=experiment_id):
        forecaster = DirectMultihorizonForecaster(horizons=7, params=best_params)
        # No separate validation frames are passed for the final fit.
        forecaster.fit(x_total, y_total, fit_params=fit_params)

        mlflow.pyfunc.log_model(
            python_model=forecaster,
            registered_model_name=f"demand_predictor_{timestamp}",
        )

    return forecaster

def register_model_to_mlflow(forecaster: DirectMultihorizonForecaster, timestamp: str) -> str:
    """
    Log ``forecaster`` to MLflow in a new run and return its ``runs:/`` URI.

    Args:
        forecaster: the fitted forecaster to log.
        timestamp: selects the ``demand_predictor_<timestamp>`` experiment
            the run is created in.

    Returns:
        ``"runs:/<run_id>/model"`` for the run that logged the model.
    """
    # Resolve the experiment from `timestamp`; the previous code referenced an
    # undefined `experiment_name` and would raise NameError.
    experiment_id = create_or_get_experiment(timestamp)

    with mlflow.start_run(experiment_id=experiment_id) as run:
        # NOTE(review): the forecaster is logged with the lightgbm flavor
        # here, while train_and_register_best_model uses mlflow.pyfunc --
        # confirm which flavor is intended.
        mlflow.lightgbm.log_model(forecaster, artifact_path="model")
        run_id = run.info.run_id

    return f"runs:/{run_id}/model"

In [3]:
# Retrain on train+validation data with the best HPO parameters and register
# the result as demand_predictor_2016-08-15.
train_and_register_best_model("2016-08-15")





[100]	training's l2: 0.0397755




[100]	training's l2: 0.0409896




[100]	training's l2: 0.041504




[100]	training's l2: 0.0412016




[100]	training's l2: 0.0417372




[100]	training's l2: 0.0420305




[100]	training's l2: 0.0416717


  signature_from_type_hints = _infer_signature_from_type_hints(
Successfully registered model 'demand_predictor_2016-08-15'.
2025/07/27 18:56:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: demand_predictor_2016-08-15, version 1


🏃 View run respected-fawn-765 at: http://localhost:5000/#/experiments/3/runs/1a7e235797a441c7b20b69e600d92d4c
🧪 View experiment at: http://localhost:5000/#/experiments/3


Created version '1' of model 'demand_predictor_2016-08-15'.


0,1,2
,horizons,7
,params,"{'feature_fraction': 0.6, 'learning_rate': 0.07286936603794487, 'max_depth': 50, 'min_child_samples': 20, ...}"


In [15]:
def load_model_from_registry2(timestamp: str) -> "mlflow.pyfunc.PyFuncModel":
    """
    Load the model from the MLflow Model Registry.

    Unlike ``load_model_from_registry``, this variant does not modify
    ``sys.path`` before loading.

    Args:
        timestamp: suffix of the registered model name; the model loaded is
            ``demand_predictor_<timestamp>`` at version ``latest``.

    Returns:
        The object returned by ``mlflow.pyfunc.load_model`` (a pyfunc wrapper,
        not the raw forecaster instance).
    """
    model_name = f"demand_predictor_{timestamp}"
    model_version = "latest"

    # Registry URI of the form models:/<name>/<version>.
    model_uri = f"models:/{model_name}/{model_version}"
    return mlflow.pyfunc.load_model(model_uri)

In [16]:
# Load the latest registered model for the "2016-08-15" timestamp.
# NOTE(review): the saved output of this cell ends in
# "ModuleNotFoundError: No module named 'src'" -- presumably the logged model
# references a module that is not importable from this kernel; confirm.
load_model_from_registry2("2016-08-15")

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 152.71it/s]  
 - cloudpickle (current: 3.1.1, required: cloudpickle==3.0.0)
 - graphviz (current: uninstalled, required: graphviz==0.21)
 - pyarrow (current: 20.0.0, required: pyarrow==18.1.0)
 - scikit-learn (current: 1.7.1, required: scikit-learn==1.7.0)
 - scipy (current: 1.15.3, required: scipy==1.16.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


ModuleNotFoundError: No module named 'src'

In [11]:
# Predict on the training inputs and display the resulting prediction table.
# NOTE(review): `forecaster2` is not defined in any visible cell -- this cell
# only works in a kernel where it already exists.
forecaster2.predict(x_train)

pred_h1_log_units_sold,pred_h2_log_units_sold,pred_h3_log_units_sold,pred_h4_log_units_sold,pred_h5_log_units_sold,pred_h6_log_units_sold,pred_h7_log_units_sold
f64,f64,f64,f64,f64,f64,f64
3.309255,3.173378,3.222193,3.395044,3.053878,2.811761,3.116119
3.291964,3.47564,3.645023,3.259734,3.14042,3.230164,3.122138
3.468076,3.544937,3.274762,3.16133,3.287343,3.12852,3.130051
3.47592,3.241271,3.203577,3.386892,3.142384,3.062042,3.174866
3.20636,3.034716,3.251202,2.990802,3.002681,3.091977,3.168236
…,…,…,…,…,…,…
2.838238,2.889545,3.22411,2.725694,2.910411,3.321557,3.44366
2.891283,3.254554,2.812742,2.81206,3.250133,3.415243,3.255129
3.251512,2.861001,2.878701,3.29583,3.484017,3.280034,2.996319
2.826369,2.874329,3.35551,3.583826,3.209576,3.01825,3.12282
