# train 2 models
milk_price_prediction/notebooks/01_train_model_selection.ipynb

✅ Objective: Load the preprocessed full dataset and train two models:
- XGBoost with Hyperopt
- Random Forest with Hyperopt
Then compare their performance (RMSE) and decide which one should be promoted.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import joblib

  import pkg_resources


In [2]:
# --- 1. Load data ---
df = pd.read_parquet("../data/processed/full_dataset.parquet")
df = df.dropna(subset=["Precio", "Precio_lag1", "Precio_mean7"])

In [3]:
df

Unnamed: 0,Fecha,Estado,Ciudad,Tipo,Canal,Precio,año,mes,dia,dia_semana,Precio_lag1,Precio_mean7
1,2024-01-31,Aguascalientes,Aguascalientes,Pasteurizada,Autoservicios,23.62,2024,1,31,Wednesday,23.62,23.620000
2,2024-02-02,Aguascalientes,Aguascalientes,Pasteurizada,Autoservicios,23.62,2024,2,2,Friday,23.62,23.620000
3,2024-02-07,Aguascalientes,Aguascalientes,Pasteurizada,Autoservicios,23.62,2024,2,7,Wednesday,23.62,23.620000
4,2024-02-09,Aguascalientes,Aguascalientes,Pasteurizada,Autoservicios,24.15,2024,2,9,Friday,23.62,23.726000
5,2024-02-12,Aguascalientes,Aguascalientes,Pasteurizada,Autoservicios,24.15,2024,2,12,Monday,24.15,23.796667
...,...,...,...,...,...,...,...,...,...,...,...,...
42138,2025-07-21,Zacatecas,Zacatecas,Ultrapasteurizada,Tiendas,33.50,2025,7,21,Monday,33.50,33.500000
42139,2025-07-23,Zacatecas,Zacatecas,Ultrapasteurizada,Tiendas,33.50,2025,7,23,Wednesday,33.50,33.500000
42140,2025-07-25,Zacatecas,Zacatecas,Ultrapasteurizada,Tiendas,33.50,2025,7,25,Friday,33.50,33.500000
42141,2025-07-28,Zacatecas,Zacatecas,Ultrapasteurizada,Tiendas,33.50,2025,7,28,Monday,33.50,33.500000


In [4]:
# --- 2. Feature preparation ---
categorical = ["Estado", "Ciudad", "Tipo", "Canal", "dia_semana"]
numerical = ["Precio_lag1", "Precio_mean7", "mes", "dia", "año"]
df[categorical] = df[categorical].astype(str)

In [5]:
# Convert to dictionaries and vectorize
feature_dicts = df[categorical + numerical].to_dict(orient="records")
dv = DictVectorizer()
X = dv.fit_transform(feature_dicts)
y = df["Precio"].values

In [6]:
# --- 3. Define objective functions ---
def objective_xgb(params):
    model = XGBRegressor(
        max_depth=int(params["max_depth"]),
        learning_rate=params["learning_rate"],
        n_estimators=int(params["n_estimators"]),
        min_child_weight=params["min_child_weight"],
        gamma=params["gamma"],
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        random_state=42,
        n_jobs=-1
    )
    score = cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=3)
    return {"loss": -score.mean(), "status": STATUS_OK}



In [7]:
def objective_rf(params):
    model = RandomForestRegressor(
        n_estimators=int(params["n_estimators"]),
        max_depth=int(params["max_depth"]),
        min_samples_split=int(params["min_samples_split"]),
        min_samples_leaf=int(params["min_samples_leaf"]),
        max_features=params["max_features"],
        random_state=42,
        n_jobs=-1
    )
    score = cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=3)
    return {"loss": -score.mean(), "status": STATUS_OK}

In [8]:
# --- 4. Define search spaces ---
search_space_xgb = {
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "learning_rate": hp.loguniform("learning_rate", -4, 0),
    "n_estimators": hp.quniform("n_estimators", 50, 300, 10),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "gamma": hp.uniform("gamma", 0, 1),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
}



In [9]:
search_space_rf = {
    "n_estimators": hp.quniform("n_estimators", 50, 300, 10),
    "max_depth": hp.quniform("max_depth", 5, 20, 1),
    "min_samples_split": hp.quniform("min_samples_split", 2, 10, 1),
    "min_samples_leaf": hp.quniform("min_samples_leaf", 1, 5, 1),
    "max_features": hp.choice("max_features", ["sqrt", "log2"]),
}

In [10]:
# --- 5. Run optimizations ---
trials_xgb = Trials()
best_xgb = fmin(fn=objective_xgb, space=search_space_xgb, algo=tpe.suggest, max_evals=25, trials=trials_xgb)

trials_rf = Trials()
best_rf = fmin(fn=objective_rf, space=search_space_rf, algo=tpe.suggest, max_evals=25, trials=trials_rf)


100%|██████████| 25/25 [00:11<00:00,  2.24trial/s, best loss: 2.022260687576108] 
100%|██████████| 25/25 [03:00<00:00,  7.21s/trial, best loss: 2.5678998580695773]


In [11]:
# --- 6. Retrain best models ---
final_xgb = XGBRegressor(
    max_depth=int(best_xgb["max_depth"]),
    learning_rate=best_xgb["learning_rate"],
    n_estimators=int(best_xgb["n_estimators"]),
    min_child_weight=best_xgb["min_child_weight"],
    gamma=best_xgb["gamma"],
    subsample=best_xgb["subsample"],
    colsample_bytree=best_xgb["colsample_bytree"],
    random_state=42,
    n_jobs=-1
)
final_xgb.fit(X, y)
y_pred_xgb = final_xgb.predict(X)
rmse_xgb = root_mean_squared_error(y, y_pred_xgb)



In [12]:
final_rf = RandomForestRegressor(
    n_estimators=int(best_rf["n_estimators"]),
    max_depth=int(best_rf["max_depth"]),
    min_samples_split=int(best_rf["min_samples_split"]),
    min_samples_leaf=int(best_rf["min_samples_leaf"]),
    max_features=["sqrt", "log2"][best_rf["max_features"]],
    random_state=42,
    n_jobs=-1
)
final_rf.fit(X, y)
y_pred_rf = final_rf.predict(X)
rmse_rf = root_mean_squared_error(y, y_pred_rf)

In [13]:
# --- 7. Compare and save models ---
print("\nRMSE XGBoost:", round(rmse_xgb, 4))
print("RMSE Random Forest:", round(rmse_rf, 4))

best_model_name = "XGBoost" if rmse_xgb < rmse_rf else "RandomForest"
print(f"\n✨ Best model: {best_model_name}")


RMSE XGBoost: 0.3061
RMSE Random Forest: 0.3639

✨ Best model: XGBoost


# Olvidé armar el pipeline, a continuación se ajusta y se separa por modelo

In [16]:
# milk_price_prediction/notebooks/01_train_model_selection.ipynb

# ✅ Objective: Load the preprocessed full dataset and train an XGBoost model with Hyperopt,
# using a pipeline to persist the vectorizer and reuse it later during inference.

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.pipeline import Pipeline
import joblib

# --- 1. Load data ---
df = pd.read_parquet("../data/processed/full_dataset.parquet")
df = df.dropna(subset=["Precio", "Precio_lag1", "Precio_mean7"])

# --- 2. Feature preparation ---
categorical = ["Estado", "Ciudad", "Tipo", "Canal", "dia_semana"]
numerical = ["Precio_lag1", "Precio_mean7", "mes", "dia", "año"]
df[categorical] = df[categorical].astype(str)

feature_dicts = df[categorical + numerical].to_dict(orient="records")
y = df["Precio"].values

# --- 3. Define objective function ---
def objective_xgb(params):
    pipeline = Pipeline([
        ("vectorizer", DictVectorizer()),
        ("regressor", XGBRegressor(
            max_depth=int(params["max_depth"]),
            learning_rate=params["learning_rate"],
            n_estimators=int(params["n_estimators"]),
            min_child_weight=params["min_child_weight"],
            gamma=params["gamma"],
            subsample=params["subsample"],
            colsample_bytree=params["colsample_bytree"],
            random_state=42,
            n_jobs=-1
        ))
    ])
    score = cross_val_score(pipeline, feature_dicts, y, scoring="neg_root_mean_squared_error", cv=3)
    return {"loss": -score.mean(), "status": STATUS_OK}

# --- 4. Define search space ---
search_space_xgb = {
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "learning_rate": hp.loguniform("learning_rate", -4, 0),
    "n_estimators": hp.quniform("n_estimators", 50, 300, 10),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "gamma": hp.uniform("gamma", 0, 1),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
}

# --- 5. Run optimization ---
trials_xgb = Trials()
best_xgb = fmin(fn=objective_xgb, space=search_space_xgb, algo=tpe.suggest, max_evals=25, trials=trials_xgb)

# --- 6. Retrain best model with full data ---
dv = DictVectorizer()
X = dv.fit_transform(feature_dicts)

final_xgb = XGBRegressor(
    max_depth=int(best_xgb["max_depth"]),
    learning_rate=best_xgb["learning_rate"],
    n_estimators=int(best_xgb["n_estimators"]),
    min_child_weight=best_xgb["min_child_weight"],
    gamma=best_xgb["gamma"],
    subsample=best_xgb["subsample"],
    colsample_bytree=best_xgb["colsample_bytree"],
    random_state=42,
    n_jobs=-1
)
final_xgb.fit(X, y)
y_pred = final_xgb.predict(X)
rmse = root_mean_squared_error(y, y_pred)
print(f"✅ Final RMSE XGBoost: {rmse:.4f}")

# --- 7. Save artifacts ---
joblib.dump(final_xgb, "models/xgb_model.pkl")
joblib.dump(dv, "models/dv.pkl")


100%|██████████| 25/25 [00:23<00:00,  1.06trial/s, best loss: 2.0443562515608567]
✅ Final RMSE XGBoost: 0.3021


['models/dv.pkl']

success, now lets introduce mlflow

In [17]:
# milk_price_prediction/notebooks/01_train_model_selection.ipynb

# ✅ Objective: Load the preprocessed full dataset and train an XGBoost model with Hyperopt,
# using a pipeline to persist the vectorizer and reuse it later during inference. Track training with MLflow.

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn
from datetime import datetime

# --- 1. Load data ---
df = pd.read_parquet("../data/processed/full_dataset.parquet")
df = df.dropna(subset=["Precio", "Precio_lag1", "Precio_mean7"])

# --- 2. Feature preparation ---
categorical = ["Estado", "Ciudad", "Tipo", "Canal", "dia_semana"]
numerical = ["Precio_lag1", "Precio_mean7", "mes", "dia", "año"]
df[categorical] = df[categorical].astype(str)

feature_dicts = df[categorical + numerical].to_dict(orient="records")
y = df["Precio"].values

# --- 3. Define objective function ---
def objective_xgb(params):
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)

        pipeline = Pipeline([
            ("vectorizer", DictVectorizer()),
            ("regressor", XGBRegressor(
                max_depth=int(params["max_depth"]),
                learning_rate=params["learning_rate"],
                n_estimators=int(params["n_estimators"]),
                min_child_weight=params["min_child_weight"],
                gamma=params["gamma"],
                subsample=params["subsample"] ,
                colsample_bytree=params["colsample_bytree"],
                random_state=42,
                n_jobs=-1
            ))
        ])

        score = cross_val_score(pipeline, feature_dicts, y, scoring="neg_root_mean_squared_error", cv=3)
        rmse = -score.mean()
        mlflow.log_metric("rmse", rmse)
        return {"loss": rmse, "status": STATUS_OK}

# --- 4. Define search space ---
search_space_xgb = {
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "learning_rate": hp.loguniform("learning_rate", -4, 0),
    "n_estimators": hp.quniform("n_estimators", 50, 300, 10),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "gamma": hp.uniform("gamma", 0, 1),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
}

# --- 5. Run optimization ---
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("milk-price-xgboost-pipeline")

run_date = datetime.today()
run_name = f"xgb-milk-predictor-{run_date.year}-{run_date.month:02d}"

with mlflow.start_run(run_name=run_name) as run:
    trials_xgb = Trials()
    best_xgb = fmin(fn=objective_xgb, space=search_space_xgb, algo=tpe.suggest, max_evals=25, trials=trials_xgb)

    # --- 6. Retrain best model with full data ---
    dv = DictVectorizer()
    X = dv.fit_transform(feature_dicts)

    final_xgb = XGBRegressor(
        max_depth=int(best_xgb["max_depth"]),
        learning_rate=best_xgb["learning_rate"],
        n_estimators=int(best_xgb["n_estimators"]),
        min_child_weight=best_xgb["min_child_weight"],
        gamma=best_xgb["gamma"],
        subsample=best_xgb["subsample"],
        colsample_bytree=best_xgb["colsample_bytree"],
        random_state=42,
        n_jobs=-1
    )

    pipeline = Pipeline([
        ("vectorizer", dv),
        ("regressor", final_xgb)
    ])
    pipeline.fit(feature_dicts, y)
    y_pred = pipeline.predict(feature_dicts)
    rmse = root_mean_squared_error(y, y_pred)

    mlflow.log_metric("final_rmse", rmse)
    mlflow.sklearn.log_model(pipeline, artifact_path="model")

    print(f"✅ Final RMSE XGBoost: {rmse:.4f}")
    print(f"📌 Model logged to MLflow with run ID: {run.info.run_id}")


🏃 View run gaudy-dolphin-923 at: http://127.0.0.1:5000/#/experiments/1/runs/821e7f3eb5964e3aba57a78a585b0d9f

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

🏃 View run shivering-carp-496 at: http://127.0.0.1:5000/#/experiments/1/runs/3630c5c93cea4e0f9166cd4a3abcd853

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                  

🏃 View run secretive-jay-377 at: http://127.0.0.1:5000/#/experiments/1/runs/f26d3ea6b48e4e4791070adb91e04525

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

🏃 View run selective-ox-970 at: http://127.0.0.1:5000/#/experiments/1/runs/b355825621b9436e8a1fed64290746b0

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

🏃 View run valuable-duck-364 at: http://127.0.0.1:5000/#/experiments/1/runs/f14d6b52df084e24b86f6ac34544870b

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

🏃 View run blushing-jay-347 at: http://127.0.0.1:5000/#/experiments



✅ Final RMSE XGBoost: 0.3000
📌 Model logged to MLflow with run ID: 2db6aa866e2740749b9f45262f4f2f18
🏃 View run xgb-milk-predictor-2025-07 at: http://127.0.0.1:5000/#/experiments/1/runs/2db6aa866e2740749b9f45262f4f2f18
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


olvidé el registro, ahora va completo

In [18]:
# milk_price_prediction/notebooks/01_train_model_selection.ipynb

# ✅ Objective: Load the preprocessed full dataset and train an XGBoost model with Hyperopt,
# using a pipeline to persist the vectorizer and reuse it later during inference. Track training with MLflow.

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn
from datetime import datetime

# --- 1. Load data ---
df = pd.read_parquet("../data/processed/full_dataset.parquet")
df = df.dropna(subset=["Precio", "Precio_lag1", "Precio_mean7"])

# --- 2. Feature preparation ---
categorical = ["Estado", "Ciudad", "Tipo", "Canal", "dia_semana"]
numerical = ["Precio_lag1", "Precio_mean7", "mes", "dia", "año"]
df[categorical] = df[categorical].astype(str)

feature_dicts = df[categorical + numerical].to_dict(orient="records")
y = df["Precio"].values

# --- 3. Define objective function ---
def objective_xgb(params):
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)

        pipeline = Pipeline([
            ("vectorizer", DictVectorizer()),
            ("regressor", XGBRegressor(
                max_depth=int(params["max_depth"]),
                learning_rate=params["learning_rate"],
                n_estimators=int(params["n_estimators"]),
                min_child_weight=params["min_child_weight"],
                gamma=params["gamma"],
                subsample=params["subsample"] ,
                colsample_bytree=params["colsample_bytree"],
                random_state=42,
                n_jobs=-1
            ))
        ])

        score = cross_val_score(pipeline, feature_dicts, y, scoring="neg_root_mean_squared_error", cv=3)
        rmse = -score.mean()
        mlflow.log_metric("rmse", rmse)
        return {"loss": rmse, "status": STATUS_OK}

# --- 4. Define search space ---
search_space_xgb = {
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "learning_rate": hp.loguniform("learning_rate", -4, 0),
    "n_estimators": hp.quniform("n_estimators", 50, 300, 10),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "gamma": hp.uniform("gamma", 0, 1),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
}

# --- 5. Run optimization ---
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("milk-price-xgboost-pipeline")

run_date = datetime.today()
run_name = f"xgb-milk-predictor-{run_date.year}-{run_date.month:02d}"
model_name = "milk-price-predictor"

with mlflow.start_run(run_name=run_name) as run:
    trials_xgb = Trials()
    best_xgb = fmin(fn=objective_xgb, space=search_space_xgb, algo=tpe.suggest, max_evals=25, trials=trials_xgb)

    # --- 6. Retrain best model with full data ---
    dv = DictVectorizer()
    X = dv.fit_transform(feature_dicts)

    final_xgb = XGBRegressor(
        max_depth=int(best_xgb["max_depth"]),
        learning_rate=best_xgb["learning_rate"],
        n_estimators=int(best_xgb["n_estimators"]),
        min_child_weight=best_xgb["min_child_weight"],
        gamma=best_xgb["gamma"],
        subsample=best_xgb["subsample"],
        colsample_bytree=best_xgb["colsample_bytree"],
        random_state=42,
        n_jobs=-1
    )

    pipeline = Pipeline([
        ("vectorizer", dv),
        ("regressor", final_xgb)
    ])
    pipeline.fit(feature_dicts, y)
    y_pred = pipeline.predict(feature_dicts)
    rmse = root_mean_squared_error(y, y_pred)

    mlflow.log_metric("final_rmse", rmse)
    mlflow.sklearn.log_model(pipeline, artifact_path="model")

    # --- 7. Register model ---
    run_id = run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri=model_uri, name=model_name)

    print(f"✅ Final RMSE XGBoost: {rmse:.4f}")
    print(f"📌 Model logged to MLflow with run ID: {run_id}")
    print(f"📌 Model registered in MLflow Model Registry as '{model_name}'")


🏃 View run nimble-steed-808 at: http://127.0.0.1:5000/#/experiments/1/runs/1911b05b0903441b8b8c38bf0974ce07

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

🏃 View run puzzled-lynx-927 at: http://127.0.0.1:5000/#/experiments/1/runs/4362167cad1741309385121aaef3c763

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

🏃 View run marvelous-wolf-885 at: http://127.0.0.1:5000/#/experiments/1/runs/509f1c528233450d9275041aa2ec7791

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

🏃 View run learned-asp-598 at: http://127.0.0.1:5000/#/experiments/1/runs/09f39f3698f64112ba8aaa104898a84e

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

🏃 View run ambitious-croc-870 at: http://127.0.0.1:5000/#/experiments/1/runs/0ff11bff129345a79fc3fbfef72ee1a4

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                    

🏃 View run masked-kite-811 at: http://127.0.0.1:5000/#/experiments/

Registered model 'milk-price-predictor' already exists. Creating a new version of this model...
2025/07/31 11:26:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: milk-price-predictor, version 38


✅ Final RMSE XGBoost: 0.2909
📌 Model logged to MLflow with run ID: 9d9704f49df14c4db307fd5552dbf298
📌 Model registered in MLflow Model Registry as 'milk-price-predictor'
🏃 View run xgb-milk-predictor-2025-07 at: http://127.0.0.1:5000/#/experiments/1/runs/9d9704f49df14c4db307fd5552dbf298
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '38' of model 'milk-price-predictor'.


cambié la estructura en mlflow para tener dos modelos, y un solo experimento, posterior se relizará la evaluación y selección para promoción

# Entrenamiento de modelo xgboost funcionando correctamente con:

* actualización de registros diarios
* features adicionales
* optimización de parametros con hyperopt
* tracking con mlflow
* registro de modelo en s3


In [19]:
# milk_price_prediction/notebooks/01_train_model_selection.ipynb

# ✅ Objective: Load the preprocessed full dataset and train an XGBoost model with Hyperopt,
# using a pipeline to persist the vectorizer and reuse it later during inference. Track training with MLflow.

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn
from datetime import datetime

# --- 1. Load data ---
df = pd.read_parquet("../data/processed/full_dataset.parquet")
df = df.dropna(subset=["Precio", "Precio_lag1", "Precio_mean7"])

# --- 2. Feature preparation ---
categorical = ["Estado", "Ciudad", "Tipo", "Canal", "dia_semana"]
numerical = ["Precio_lag1", "Precio_mean7", "mes", "dia", "año"]
df[categorical] = df[categorical].astype(str)

feature_dicts = df[categorical + numerical].to_dict(orient="records")
y = df["Precio"].values

# --- 3. Define objective function ---
def objective_xgb(params):
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        mlflow.set_tags({"model_type": "xgboost"})

        pipeline = Pipeline([
            ("vectorizer", DictVectorizer()),
            ("regressor", XGBRegressor(
                max_depth=int(params["max_depth"]),
                learning_rate=params["learning_rate"],
                n_estimators=int(params["n_estimators"]),
                min_child_weight=params["min_child_weight"],
                gamma=params["gamma"],
                subsample=params["subsample"] ,
                colsample_bytree=params["colsample_bytree"],
                random_state=42,
                n_jobs=-1
            ))
        ])

        score = cross_val_score(pipeline, feature_dicts, y, scoring="neg_root_mean_squared_error", cv=3)
        rmse = -score.mean()
        mlflow.log_metric("rmse", rmse)
        return {"loss": rmse, "status": STATUS_OK}

# --- 4. Define search space ---
search_space_xgb = {
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "learning_rate": hp.loguniform("learning_rate", -4, 0),
    "n_estimators": hp.quniform("n_estimators", 50, 300, 10),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "gamma": hp.uniform("gamma", 0, 1),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
}

# --- 5. Run optimization ---
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("milk-price-predictor")

run_date = datetime.today()
run_name = f"xgb-milk-predictor-{run_date.year}-{run_date.month:02d}"
model_name = "milk-price-predictor-xgb"

with mlflow.start_run(run_name=run_name) as run:
    trials_xgb = Trials()
    best_xgb = fmin(fn=objective_xgb, space=search_space_xgb, algo=tpe.suggest, max_evals=25, trials=trials_xgb)

    # --- 6. Retrain best model with full data ---
    dv = DictVectorizer()
    X = dv.fit_transform(feature_dicts)

    final_xgb = XGBRegressor(
        max_depth=int(best_xgb["max_depth"]),
        learning_rate=best_xgb["learning_rate"],
        n_estimators=int(best_xgb["n_estimators"]),
        min_child_weight=best_xgb["min_child_weight"],
        gamma=best_xgb["gamma"],
        subsample=best_xgb["subsample"],
        colsample_bytree=best_xgb["colsample_bytree"],
        random_state=42,
        n_jobs=-1
    )

    pipeline = Pipeline([
        ("vectorizer", dv),
        ("regressor", final_xgb)
    ])
    pipeline.fit(feature_dicts, y)
    y_pred = pipeline.predict(feature_dicts)
    rmse = root_mean_squared_error(y, y_pred)

    mlflow.log_metric("final_rmse", rmse)
    mlflow.set_tags({"model_type": "xgboost"})
    mlflow.sklearn.log_model(pipeline, artifact_path="model")

    # --- 7. Register model ---
    run_id = run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri=model_uri, name=model_name)

    print(f"✅ Final RMSE XGBoost: {rmse:.4f}")
    print(f"📌 Model logged to MLflow with run ID: {run_id}")
    print(f"📌 Model registered in MLflow Model Registry as '{model_name}'")


2025/07/31 11:32:58 INFO mlflow.tracking.fluent: Experiment with name 'milk-price-predictor' does not exist. Creating a new experiment.


🏃 View run vaunted-asp-689 at: http://127.0.0.1:5000/#/experiments/2/runs/501ce85d9a49415ea45ccf5ec99f86d8

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2

🏃 View run handsome-croc-722 at: http://127.0.0.1:5000/#/experiments/2/runs/66583bec62334981b9f2ce3dc768c9c1

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2                    

🏃 View run unruly-gnat-376 at: http://127.0.0.1:5000/#/experiments/2/runs/2d64281bb73b4216826888ce7acc6da2

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2                    

🏃 View run caring-sheep-22 at: http://127.0.0.1:5000/#/experiments/2/runs/7ab574de74fa4deb899afebf53bd732c

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2                   

🏃 View run big-moth-435 at: http://127.0.0.1:5000/#/experiments/2/runs/264dc294775d4d42b15f76add70e6f83

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2                   

🏃 View run fortunate-perch-756 at: http://127.0.0.1:5000/#/experiments/2/runs/

Successfully registered model 'milk-price-predictor-xgb'.
2025/07/31 11:33:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: milk-price-predictor-xgb, version 1
Created version '1' of model 'milk-price-predictor-xgb'.


✅ Final RMSE XGBoost: 0.2554
📌 Model logged to MLflow with run ID: 14b70c6581874d61bb95b077cfb88291
📌 Model registered in MLflow Model Registry as 'milk-price-predictor-xgb'
🏃 View run xgb-milk-predictor-2025-07 at: http://127.0.0.1:5000/#/experiments/2/runs/14b70c6581874d61bb95b077cfb88291
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2


# replica pero con random forest

In [20]:
# milk_price_prediction/notebooks/01_train_model_selection.ipynb

# ✅ Objective: Load the preprocessed full dataset and train both XGBoost and Random Forest models with Hyperopt,
# using a pipeline to persist the vectorizer and reuse it later during inference. Track training with MLflow.

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import mlflow
import mlflow.sklearn
from datetime import datetime

# --- 1. Load data ---
df = pd.read_parquet("../data/processed/full_dataset.parquet")
df = df.dropna(subset=["Precio", "Precio_lag1", "Precio_mean7"])

# --- 2. Feature preparation ---
categorical = ["Estado", "Ciudad", "Tipo", "Canal", "dia_semana"]
numerical = ["Precio_lag1", "Precio_mean7", "mes", "dia", "año"]
df[categorical] = df[categorical].astype(str)

feature_dicts = df[categorical + numerical].to_dict(orient="records")
y = df["Precio"].values

# --- 3. Define objective function ---
def objective_rf(params):
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        mlflow.set_tags({"model_type": "random_forest"})

        pipeline = Pipeline([
            ("vectorizer", DictVectorizer()),
            ("regressor", RandomForestRegressor(
                n_estimators=int(params["n_estimators"]),
                max_depth=int(params["max_depth"]),
                min_samples_split=int(params["min_samples_split"]),
                min_samples_leaf=int(params["min_samples_leaf"]),
                random_state=42,
                n_jobs=-1
            ))
        ])

        score = cross_val_score(pipeline, feature_dicts, y, scoring="neg_root_mean_squared_error", cv=3)
        rmse = -score.mean()
        mlflow.log_metric("rmse", rmse)
        return {"loss": rmse, "status": STATUS_OK}

# --- 4. Define search space ---
search_space_rf = {
    "n_estimators": hp.quniform("n_estimators", 50, 300, 10),
    "max_depth": hp.quniform("max_depth", 5, 20, 1),
    "min_samples_split": hp.quniform("min_samples_split", 2, 10, 1),
    "min_samples_leaf": hp.quniform("min_samples_leaf", 1, 5, 1)
}

# --- 5. Run optimization ---
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("milk-price-predictor")

run_date = datetime.today()
run_name = f"rf-milk-predictor-{run_date.year}-{run_date.month:02d}"
model_name = "milk-price-predictor-rf"

with mlflow.start_run(run_name=run_name) as run:
    trials_rf = Trials()
    best_rf = fmin(fn=objective_rf, space=search_space_rf, algo=tpe.suggest, max_evals=25, trials=trials_rf)

    # --- 6. Retrain best model with full data ---
    dv = DictVectorizer()
    X = dv.fit_transform(feature_dicts)

    final_rf = RandomForestRegressor(
        n_estimators=int(best_rf["n_estimators"]),
        max_depth=int(best_rf["max_depth"]),
        min_samples_split=int(best_rf["min_samples_split"]),
        min_samples_leaf=int(best_rf["min_samples_leaf"]),
        random_state=42,
        n_jobs=-1
    )

    pipeline = Pipeline([
        ("vectorizer", dv),
        ("regressor", final_rf)
    ])
    pipeline.fit(feature_dicts, y)
    y_pred = pipeline.predict(feature_dicts)
    rmse = root_mean_squared_error(y, y_pred)

    mlflow.log_metric("final_rmse", rmse)
    mlflow.set_tags({"model_type": "random_forest"})
    mlflow.sklearn.log_model(pipeline, artifact_path="model")

    # --- 7. Register model ---
    run_id = run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri=model_uri, name=model_name)

    print(f"✅ Final RMSE Random Forest: {rmse:.4f}")
    print(f"📌 Model logged to MLflow with run ID: {run_id}")
    print(f"📌 Model registered in MLflow Model Registry as '{model_name}'")

🏃 View run sincere-bat-175 at: http://127.0.0.1:5000/#/experiments/2/runs/a93304a78c5c4d928c81036c11c3c989

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2

🏃 View run tasteful-cow-840 at: http://127.0.0.1:5000/#/experiments/2/runs/d6e999241e0141c587b8b9bb8fadddf3

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2                   

🏃 View run stately-worm-770 at: http://127.0.0.1:5000/#/experiments/2/runs/0e3e5461b95347b0925496ba469f9e4e

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2                  

🏃 View run sassy-goose-434 at: http://127.0.0.1:5000/#/experiments/2/runs/fba598d8f4da44f1b26db9bfbb5f4d2b

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2                    

🏃 View run defiant-mouse-5 at: http://127.0.0.1:5000/#/experiments/2/runs/1cc741c3a80a44658bb7be32e31d553f

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2                    

🏃 View run rebellious-mule-879 at: http://127.0.0.1:5000/#/experiments/2/run

Successfully registered model 'milk-price-predictor-rf'.
2025/07/31 11:49:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: milk-price-predictor-rf, version 1


✅ Final RMSE Random Forest: 0.2710
📌 Model logged to MLflow with run ID: 05870e884bc84f9bb9336cd8044d13f5
📌 Model registered in MLflow Model Registry as 'milk-price-predictor-rf'
🏃 View run rf-milk-predictor-2025-07 at: http://127.0.0.1:5000/#/experiments/2/runs/05870e884bc84f9bb9336cd8044d13f5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2


Created version '1' of model 'milk-price-predictor-rf'.
