## Cambiar 

In [1]:
import os
import time
import pickle
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

import mlflow
from mlflow import MlflowClient

# === MLflow ===
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-challenger"
REGISTERED_MODEL_NAME = "nyc-taxi-model"   # ya existe en tu registry

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

client = MlflowClient(tracking_uri=TRACKING_URI)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


2025/10/21 15:40:09 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/21 15:40:09 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


In [2]:
def read_dataframe(path):
    df = pd.read_parquet(path)
    df["lpep_dropoff_datetime"] = pd.to_datetime(df["lpep_dropoff_datetime"])
    df["lpep_pickup_datetime"] = pd.to_datetime(df["lpep_pickup_datetime"])
    df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(str)
    return df

def build_features(df, dv=None, fit_dv=False):
    df = df.copy()
    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]
    categorical = ["PU_DO"]
    numerical = ["trip_distance"]
    dicts = df[categorical + numerical].to_dict(orient="records")

    if fit_dv:
        dv = DictVectorizer(sparse=True)
        X = dv.fit_transform(dicts)
    else:
        X = dv.transform(dicts)

    y = df["duration"].values
    return X, y, dv

# paths de entrenamiento/validación (mismo split que tu ejemplo)
train_path = "../data/green_tripdata_2025-01.parquet"
val_path   = "../data/green_tripdata_2025-02.parquet"

df_train = read_dataframe(train_path)
df_val   = read_dataframe(val_path)

# DictVectorizer (compartido por todos los experimentos de este notebook)
X_train, y_train, dv = build_features(df_train, dv=None, fit_dv=True)
X_val,   y_val,   _  = build_features(df_val, dv=dv, fit_dv=False)

# Guarda el preprocesador para adjuntarlo como artifact en cada run
os.makedirs("models", exist_ok=True)
with open("models/preprocessor.b", "wb") as f:
    pickle.dump(dv, f)


In [3]:
def densify_with_sample(X, y, max_rows=120_000):
    n = X.shape[0]
    if n > max_rows:
        idx = np.random.choice(n, size=max_rows, replace=False)
        X_sub = X[idx]
        y_sub = y[idx]
    else:
        X_sub = X
        y_sub = y
    X_dense = X_sub.toarray().astype("float32")  # reducir uso de memoria
    return X_dense, y_sub


In [None]:
"""# === PRECOMPUTE: un solo muestreo pequeño y densificación ===
Xtr_dense_small, ytr_small = densify_with_sample(X_train, y_train, max_rows=30_000)
Xva_dense_small, yva_small = densify_with_sample(X_val,   y_val,   max_rows=15_000)

import optuna
from sklearn.ensemble import GradientBoostingRegressor
import time

PARENT_NAME_GB = "GB_parent_fast"

def gb_objective_fast(trial):
    # Espacio reducido (más rápido)
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 60, 220),
        "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 6),
        "subsample": 0.8,  # fijo para acelerar
        "random_state": RANDOM_STATE,
    }

    with mlflow.start_run(nested=True, run_name=f"GB_child_fast_{trial.number}"):
        model = GradientBoostingRegressor(**params)
        tic = time.time()
        model.fit(Xtr_dense_small, ytr_small)
        fit_time = time.time() - tic

        y_pred = model.predict(Xva_dense_small)
        rmse = root_mean_squared_error(yva_small, y_pred)

        mlflow.log_params(params)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("fit_time_s", fit_time)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
        mlflow.sklearn.log_model(model, "model")

    return rmse

study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name=PARENT_NAME_GB):
    mlflow.set_tags({"role": "parent", "model_family": "GradientBoosting", "mode": "fast"})
    # Menos trials + límite de tiempo
    study.optimize(gb_objective_fast, n_trials=8, timeout=300, show_progress_bar=False)

best_gb_rmse = study.best_value
best_gb_params = study.best_params
best_gb_rmse, best_gb_params
"""

[I 2025-10-21 15:59:10,167] A new study created in memory with name: no-name-dea92eca-450a-47f1-8bc2-a297cdf622ba
[I 2025-10-21 16:05:59,817] Trial 0 finished with value: 5.477489002306669 and parameters: {'n_estimators': 196, 'learning_rate': 0.2964579088358797, 'max_depth': 5}. Best is trial 0 with value: 5.477489002306669.


(5.477489002306669,
 {'n_estimators': 196, 'learning_rate': 0.2964579088358797, 'max_depth': 5})

In [None]:
"""# === PRECOMPUTE: un solo muestreo pequeño y densificación (rápido) ===
Xtr_dense_small, ytr_small = densify_with_sample(X_train, y_train, max_rows=30_000)
Xva_dense_small, yva_small = densify_with_sample(X_val,   y_val,   max_rows=15_000)

import optuna
import time
from sklearn.ensemble import RandomForestRegressor

PARENT_NAME_RF = "RF_parent_fast"

def rf_objective_fast(trial):
    # Espacio reducido → mucho más rápido
    params = {
        "n_estimators":      trial.suggest_int("n_estimators", 100, 320),   # menos árboles
        "max_depth":         trial.suggest_int("max_depth", 4, 16),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 8),
        "min_samples_leaf":  trial.suggest_int("min_samples_leaf", 1, 6),
        "max_features":      trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "bootstrap":         True,   # fijo
        "n_jobs":            -1,
        "random_state":      RANDOM_STATE,
    }

    with mlflow.start_run(nested=True, run_name=f"RF_child_fast_{trial.number}"):
        model = RandomForestRegressor(**params)
        tic = time.time()
        model.fit(Xtr_dense_small, ytr_small)
        fit_time = time.time() - tic

        y_pred = model.predict(Xva_dense_small)
        rmse = root_mean_squared_error(yva_small, y_pred)

        # logging MLflow
        mlflow.log_params(params)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("fit_time_s", fit_time)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
        mlflow.sklearn.log_model(model, "model")

    return rmse

study_rf = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name=PARENT_NAME_RF):
    mlflow.set_tags({"role": "parent", "model_family": "RandomForest", "mode": "fast"})
    # Pocos trials + límite de tiempo (5 min). Sube/baja según tu máquina.
    study_rf.optimize(rf_objective_fast, n_trials=8, timeout=300, show_progress_bar=False)

best_rf_rmse   = study_rf.best_value
best_rf_params = study_rf.best_params
best_rf_rmse, best_rf_params
"""

[I 2025-10-21 17:21:45,886] A new study created in memory with name: no-name-b32be769-c253-49c7-82b9-33a327d0f034
[I 2025-10-21 17:21:49,411] Trial 0 finished with value: 8.909299537488362 and parameters: {'n_estimators': 173, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 0 with value: 8.909299537488362.
[I 2025-10-21 17:21:54,217] Trial 1 finished with value: 8.724166369406431 and parameters: {'n_estimators': 148, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 8.724166369406431.
[I 2025-10-21 17:22:05,931] Trial 2 finished with value: 8.366976511121173 and parameters: {'n_estimators': 269, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 2 with value: 8.366976511121173.
[I 2025-10-21 17:22:08,804] Trial 3 finished with value: 8.973510461656403 and parameters: {'n_estimators': 117, 'max_depth': 8, 'min_sample

(8.204351711306584,
 {'n_estimators': 111,
  'max_depth': 15,
  'min_samples_split': 6,
  'min_samples_leaf': 5,
  'max_features': 'sqrt'})

In [9]:
# === Selección robusta del mejor run por RMSE ===
exp = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    raise ValueError(f"Experimento '{EXPERIMENT_NAME}' no existe. Asegúrate de haber hecho mlflow.set_experiment('{EXPERIMENT_NAME}') antes de loguear.")

# Trae todo y luego filtramos en pandas (evita problemas con NULL en tags)
runs = mlflow.search_runs(
    experiment_ids=[exp.experiment_id],
    order_by=["metrics.rmse ASC", "start_time DESC"]
)

if runs.empty:
    raise ValueError(
        "No se encontraron runs en este experimento. "
        "Revisa que tus entrenamientos hayan corrido en este EXPERIMENT_NAME."
    )

# Quita explícitamente los parents (role == 'parent') y los runs sin rmse
mask_not_parent = runs["tags.role"].fillna("").ne("parent")
mask_has_rmse   = runs["metrics.rmse"].notna()
child_runs = runs[mask_not_parent & mask_has_rmse].copy()

if child_runs.empty:
    # Diagnóstico rápido
    print("Diagnóstico:")
    print("- Cantidad total de runs:", len(runs))
    print("- Cantidad con tag role='parent':", (runs['tags.role'] == 'parent').sum())
    print("- ¿Hay columna 'metrics.rmse'? ->", 'metrics.rmse' in runs.columns)
    print("- Cantidad con rmse no nulo:", runs['metrics.rmse'].notna().sum())
    raise ValueError(
        "No hay child runs con métrica 'rmse'. "
        "Asegúrate de hacer mlflow.log_metric('rmse', ...) dentro de cada child run "
        "y que no fallen/timeout antes de loguear."
    )

# Ordena y toma el mejor
child_runs = child_runs.sort_values(by=["metrics.rmse", "start_time"], ascending=[True, False])
best_run   = child_runs.iloc[0]

best_run_id = best_run.run_id
best_rmse   = float(best_run["metrics.rmse"])
best_model_family = best_run.get("tags.model_family", "desconocido")

best_run_id, best_model_family, best_rmse


('fa642cf4081e45e897fea88fa5219321', None, 5.477489002306669)

In [10]:
# Registra este mejor modelo en el mismo Registered Model y asígnale alias "challenger"
model_uri = f"runs:/{best_run_id}/model"
result = mlflow.register_model(model_uri=model_uri, name=REGISTERED_MODEL_NAME)

# Espera a que se materialice la versión (en sqlite suele ser inmediato)
time.sleep(2)

# Obtén la última versión (la que acabamos de crear)
versions = client.search_model_versions(f"name = '{REGISTERED_MODEL_NAME}'")
latest_version = max(int(v.version) for v in versions)
client.set_registered_model_alias(name=REGISTERED_MODEL_NAME, alias="challenger", version=latest_version)

latest_version

2025/10/21 19:47:38 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/21 19:47:38 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
Created version '7' of model 'nyc-taxi-model'.


7

In [12]:
test_path = "../data/green_tripdata_2024-03.parquet"
df_test = read_dataframe(test_path)

# Log as artifact + como input dataset dentro de un run "dataset-log"
with mlflow.start_run(run_name="log_2024-03_dataset"):
    # artifact
    mlflow.log_artifact(test_path, artifact_path="datasets")

    # input dataset (si tu versión de MLflow lo soporta)
    try:
        ds = mlflow.data.from_pandas(df_test, source=test_path, name="green_tripdata_2024-03")
        mlflow.log_input(ds, context="testing")
    except Exception as e:
        print("log_input no disponible / opcional:", e)


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


In [13]:
# Helpers para cargar modelo por alias + su preprocessor
def load_model_and_dv_by_alias(name, alias):
    mv = client.get_model_version_by_alias(name=name, alias=alias)
    # descarga su preprocessor
    dst = f"tmp/{alias}"
    os.makedirs(dst, exist_ok=True)
    try:
        client.download_artifacts(run_id=mv.run_id, path="preprocessor/preprocessor.b", dst_path=dst)
        with open(os.path.join(dst, "preprocessor", "preprocessor.b"), "rb") as f:
            dv_local = pickle.load(f)
    except Exception as e:
        print(f"[{alias}] No se encontró preprocessor en artifacts; usando el dv del notebook. Motivo:", e)
        dv_local = dv  # fallback (si ambos fueron entrenados con mismo dv)

    model = mlflow.pyfunc.load_model(f"models:/{name}@{alias}")
    return model, dv_local

def preprocess_with_dv(df, dv_):
    df = df.copy()
    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]
    categorical = ["PU_DO"]
    numerical   = ["trip_distance"]
    dicts = df[categorical + numerical].to_dict(orient="records")
    return dv_.transform(dicts)

# Construye X_test con cada dv
champion_model, champion_dv = load_model_and_dv_by_alias(REGISTERED_MODEL_NAME, "champion")
challenger_model, challenger_dv = load_model_and_dv_by_alias(REGISTERED_MODEL_NAME, "challenger")

X_test_champion = preprocess_with_dv(df_test, champion_dv)
X_test_challenger = preprocess_with_dv(df_test, challenger_dv)
y_test = df_test["duration"].values

# Predicción y métricas
def timed_predict(model, X):
    tic = time.time()
    y_pred = model.predict(X)
    latency = (time.time() - tic) / len(y_pred)  # s/ejemplo
    return y_pred, latency

y_pred_ch, lat_ch = timed_predict(champion_model, X_test_champion)
y_pred_cl, lat_cl = timed_predict(challenger_model, X_test_challenger)

rmse_champion   = root_mean_squared_error(y_test, y_pred_ch)
rmse_challenger = root_mean_squared_error(y_test, y_pred_cl)

print(f"Champion RMSE (2024-03):   {rmse_champion:.4f} | Latencia: {lat_ch*1000:.4f} ms/ejemplo")
print(f"Challenger RMSE (2024-03): {rmse_challenger:.4f} | Latencia: {lat_cl*1000:.4f} ms/ejemplo")
print(f"Mejora relativa RMSE: {(rmse_champion - rmse_challenger)/rmse_champion*100:.2f}%")


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


[champion] No se encontró preprocessor en artifacts; usando el dv del notebook. Motivo: Failed to download artifacts from path 'preprocessor.b', please ensure that the path is correct.


Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 2041.02it/s]


[challenger] No se encontró preprocessor en artifacts; usando el dv del notebook. Motivo: [Errno 2] No such file or directory: 'tmp/challenger/preprocessor/preprocessor.b'
Champion RMSE (2024-03):   5.4013 | Latencia: 0.0104 ms/ejemplo
Challenger RMSE (2024-03): 5.4334 | Latencia: 0.0017 ms/ejemplo
Mejora relativa RMSE: -0.59%


In [16]:
# Criterios ejemplo (ajústalos a tu contexto):
# 1) Mejora de RMSE >= 3%
# 2) Latencia no empeora > 20% (por ejemplo en producción)
# 3) Complejidad razonable (para árboles, número de estimadores <= 600, etc.)

improvement = (rmse_champion - rmse_challenger) / rmse_champion
latency_ratio = lat_cl / lat_ch

promote = (improvement >= 0.06) and (latency_ratio <= 1.2)

print(f"Mejora relativa: {improvement*100:.2f}% | Ratio latencia challenger/champion: {latency_ratio:.2f}")
print("¿Promover a champion?:", "SÍ ✅" if promote else "NO ❌")

# Si decides promover:
if promote:
    mv = client.get_model_version_by_alias(name=REGISTERED_MODEL_NAME, alias="challenger")
    client.set_registered_model_alias(name=REGISTERED_MODEL_NAME, alias="champion", version=mv.version)
    print(f"Promovido: challenger v{mv.version} ahora es 'champion'.")


Mejora relativa: -0.59% | Ratio latencia challenger/champion: 0.17
¿Promover a champion?: NO ❌


En los experimentos realizados con Gradient Boosting y Random Forest como modelos candidatos (challengers), el mejor modelo obtuvo un RMSE 0.59 % mayor que el modelo actualmente desplegado (champion).
Aunque presenta menor latencia de inferencia (~83 % más rápido), la pérdida de precisión es prioritaria en este caso.
Por tanto, no se promueve el modelo challenger a champion. Se mantendrá el modelo vigente y se realizarán nuevos experimentos ajustando los hiperparámetros del challenger para mejorar el trade-off entre precisión y velocidad.