### Imports

In [None]:
import os, mlflow
from dotenv import load_dotenv
import pathlib, time, requests
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from mlflow.models.signature import infer_signature
from sklearn.ensemble import RandomForestRegressor
import pandas as pd, pickle, pathlib
from mlflow import MlflowClient
from datetime import datetime
import mlflow.pyfunc
import pickle

### Crear un nuevo jupyter-notebook llamado challenger-experiments.ipynb en la rama creada anteriormente

In [None]:
# --- Environment setup and MLflow experiment for assignment P2 ---
# Load variables from a .env file; override=True lets them replace values that are
# already set in the process environment.
# NOTE(review): presumably this is where the Databricks host/token come from — confirm.
load_dotenv(override=True)
EXPERIMENT_NAME = "/Users/marianasgg19@gmail.com/nyc-taxi-experiments"  

# Naming acronyms reused by the run names and tags in the cells below.
# HW_PREFIX already ends in "-" and the f-strings that use it append "_...",
# so the resulting names look like "P2-F-_GBR_PARENT".
HW_PREFIX = "P2-F-"                 # Project / assignment 2
HW_TAG_PROJECT = "Tarea_P2"      # value stored under the "assignment" tag
HW_TAG_PURPOSE = "challenger_selection"  # value stored under the "purpose" tag

# Point MLflow at the Databricks tracking server, then make the experiment active.
mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# Echo the resolved configuration so the cell output records where runs are logged.
print("Tracking URI:", mlflow.get_tracking_uri())
print("Experiment:", experiment)


Tracking URI: databricks
Experiment: <Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/1544369421830463', creation_time=1761101032420, experiment_id='1544369421830463', last_update_time=1761619147784, lifecycle_stage='active', name='/Users/marianasgg19@gmail.com/nyc-taxi-experiments', tags={'mlflow.experiment.sourceName': '/Users/marianasgg19@gmail.com/nyc-taxi-experiments',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'marianasgg19@gmail.com',
 'mlflow.ownerId': '76345526162318'}>


### Descargue en la carpeta data el conjunto de datos correspondiente a marzo del 2025

In [None]:
# --- Download the March 2025 file into the data folder ---
DATA_DIR = pathlib.Path("../data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
fname = DATA_DIR / "green_tripdata_2025-03.parquet"
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2025-03.parquet"

MAX_ATTEMPTS = 3

# Download the file, or report that it is already present, or fail loudly.
if fname.exists():
    print("Archivo ya existe:", fname)
else:
    last_error = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            print(f"Descargando {url} -> {fname} (intento {attempt})")
            r = requests.get(url, timeout=60)
            r.raise_for_status()
            # Write to a sibling temp file and rename it afterwards: an interrupted
            # write must never leave a truncated file at `fname`, because the
            # exists() check above would accept it on the next run.
            tmp_path = fname.with_suffix(".parquet.part")
            tmp_path.write_bytes(r.content)
            tmp_path.replace(fname)
            print("Descarga completa.")
            break
        except (requests.RequestException, OSError) as e:
            # Only network and filesystem failures are retried; anything else is a
            # programming error and should surface immediately.
            last_error = e
            print("Fallo:", e)
            if attempt < MAX_ATTEMPTS:
                time.sleep(2)  # short pause before the next attempt only
    else:
        # Reached only when no attempt hit `break`; keep the last failure as the cause.
        raise RuntimeError("No se pudo descargar el archivo después de 3 intentos.") from last_error


Archivo ya existe: ../data/green_tripdata_2025-03.parquet


### Base de preparación de datos para entrenar/validar y reusar en pruebas

In [None]:
# --- Carga, limpieza y vectorización base (enero y febrero) ---
def read_dataframe(filename):
    """Load a green-taxi parquet file and return the cleaned trips.

    Adds a ``duration`` column (trip length in minutes, computed from
    ``lpep_dropoff_datetime - lpep_pickup_datetime``), keeps only trips whose
    duration is between 1 and 60 minutes inclusive, and converts the
    ``PULocationID`` / ``DOLocationID`` columns to strings.

    Parameters
    ----------
    filename : str or path-like
        Parquet file passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent frame (not a view of the raw data) that keeps the
        original row index of the surviving trips.
    """
    df = pd.read_parquet(filename)
    # Vectorised timedelta -> minutes; avoids a Python-level call per row.
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60
    # Take an explicit copy of the filtered rows so the column assignment below
    # acts on its own data instead of on a slice of the raw frame.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

# January 2025 is the training month and February 2025 the validation month.
# NOTE(review): unlike the March file, these two files are not downloaded by any
# visible cell of this notebook; they must already exist under ../data.
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_val   = read_dataframe('../data/green_tripdata_2025-02.parquet')

def preprocess(df, dv):
    """Encode a cleaned trips frame with an already-fitted vectorizer.

    Builds the combined pickup/drop-off feature ``PU_DO`` (the two location
    ids joined by ``_``), pairs it with ``trip_distance`` and hands the rows
    to ``dv.transform`` as a list of per-row dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID`` and ``DOLocationID`` as strings, plus
        ``trip_distance``. The frame passed in is left unmodified.
    dv : object
        Anything exposing ``transform(records)``.

    Returns
    -------
    Whatever ``dv.transform`` returns for the records.
    """
    pickup_dropoff = df['PULocationID'] + '_' + df['DOLocationID']
    # assign() returns a new frame, so the caller's frame never gains the column.
    feature_frame = df.assign(PU_DO=pickup_dropoff)[['PU_DO', 'trip_distance']]
    return dv.transform(feature_frame.to_dict(orient='records'))

# Fit the vectorizer on the training month only; every other month is later encoded
# with this same fitted object through preprocess().
dv = DictVectorizer()
# The PU_DO feature is built by hand here (same expression as in preprocess) because
# this is the one place that calls fit_transform instead of transform.
df_train = df_train.copy()
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
X_train = dv.fit_transform(df_train[['PU_DO','trip_distance']].to_dict(orient='records'))

# Validation month, encoded with the vectorizer fitted above.
X_val = preprocess(df_val, dv)

# Target: trip duration in minutes, as computed by read_dataframe.
target = 'duration'
y_train = df_train[target].values
y_val   = df_val[target].values

# Optional MLflow dataset objects describing the two months.
# NOTE(review): X_train / X_val look like SciPy sparse matrices (they are densified
# with .toarray() further down). If so, getattr(X, "data", X) hands MLflow the flat
# array of stored values rather than a rows-by-features matrix — confirm that this
# is what should be recorded.
# NOTE(review): neither object is referenced again in the visible cells.
training_dataset   = mlflow.data.from_numpy(getattr(X_train, "data", X_train), targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(getattr(X_val, "data", X_val), targets=y_val, name="green_tripdata_2025-02")

# Cell output: shapes of the training and validation matrices.
X_train.shape, X_val.shape


((46307, 4159), (44218, 4159))

### Parent #1 — Gradient Boosting con child experiments

In [None]:
# --- Parent #1: GradientBoostingRegressor with child experiments ---

# - 3 trials, small hyperparameter space, training subset to keep it fast.

np.random.seed(42)

# 1) Dense DataFrames with synthetic column names f0..fN, so the models are fitted
#    and evaluated with the same named columns.
#    The sparse matrix is cast to float32 *before* densifying: the values are the
#    same as densifying first, but no full-size float64 array is ever allocated.
feature_names = [f"f{i}" for i in range(X_train.shape[1])]
X_train_df = pd.DataFrame(X_train.astype("float32").toarray(), columns=feature_names)
X_val_df   = pd.DataFrame(X_val.astype("float32").toarray(),   columns=feature_names)

# 2) Random training subset used for the (fast) search
N = X_train_df.shape[0]
subset_size = min(10000, N)   # lower this for more speed
subset_idx = np.random.choice(N, size=subset_size, replace=False)
X_train_sub_df = X_train_df.iloc[subset_idx].reset_index(drop=True)
# y_train is positional, so the same indices select the matching targets.
y_train_sub    = y_train[subset_idx]

def sample_gbr_params():
    """Draw one random GradientBoostingRegressor configuration.

    Five values are sampled from NumPy's global random state; the remaining
    entries are fixed settings shared by every trial. The draws happen in a
    fixed order, so seeding ``np.random`` beforehand makes the sequence of
    sampled configurations repeatable.

    Returns
    -------
    dict
        Keyword arguments ready to be unpacked into
        ``GradientBoostingRegressor(**params)``.
    """
    # Keep this draw order: changing it changes what a given seed produces.
    n_estimators = int(np.random.randint(60, 151))         # 60-150
    learning_rate = float(np.random.uniform(0.05, 0.30))   # 0.05-0.30
    max_depth = int(np.random.randint(2, 7))               # 2-6
    subsample = float(np.random.uniform(0.7, 1.0))         # 0.7-1.0
    min_samples_leaf = int(np.random.randint(10, 41))      # 10-40

    sampled = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        min_samples_leaf=min_samples_leaf,
    )
    fixed = dict(
        n_iter_no_change=10,
        validation_fraction=0.1,
        tol=1e-3,
        random_state=42,
    )
    return {**sampled, **fixed}

# Best-of-search bookkeeping; updated inside the trial loop below and read by the
# registration cell afterwards.
parent_run_name = f"{HW_PREFIX}_GBR_PARENT"
best_gbr = None
best_gbr_rmse = float("inf")
best_gbr_params = None

TRIALS = 3  # few trials, to keep the search fast
# Parent run: carries the assignment tags and search metadata, and contains one
# nested run per trial.
with mlflow.start_run(run_name=parent_run_name):
    mlflow.set_tags({
        "model_family": "GradientBoostingRegressor",
        "assignment": HW_TAG_PROJECT,
        "purpose": HW_TAG_PURPOSE,
        "parent": "true",
        "acronym": f"{HW_PREFIX}_GBR",
        "fast_mode": "simple_params_subset_es"
    })
    mlflow.log_dict({"search": "random_small", "trials": TRIALS, "subset_size": int(subset_size)}, "search_meta.json")

    for t in range(TRIALS):  # one child run per trial
        trial_run_name = f"{HW_PREFIX}_GBR_TRIAL_{t+1}"
        params = sample_gbr_params()

        with mlflow.start_run(run_name=trial_run_name, nested=True):
            mlflow.log_params(params)

            # Fit on the random training subset only.
            model = GradientBoostingRegressor(**params)
            model.fit(X_train_sub_df, y_train_sub)

            # Validate on the full February frame, which has the same column layout.
            y_pred = model.predict(X_val_df)
            rmse = root_mean_squared_error(y_val, y_pred)
            mlflow.log_metric("validation_rmse", rmse)

            # Signature and model logging: five validation rows are the input example.
            input_example = X_val_df.iloc[:5].copy()
            signature = infer_signature(input_example, y_pred[:5])

            mlflow.sklearn.log_model(
                sk_model=model,
                name="model",                 # name under which the model is logged
                input_example=input_example,
                signature=signature
            )

            # Strict "<": on a tie the earlier trial is kept.
            if rmse < best_gbr_rmse:
                best_gbr_rmse = rmse
                best_gbr = model
                best_gbr_params = params

    # Summarise the search on the parent run.
    mlflow.log_metric("best_validation_rmse", best_gbr_rmse)
    mlflow.log_dict({"best_params": best_gbr_params}, "best_params.json")

# Cell output: best validation RMSE and the parameters that produced it.
best_gbr_rmse, best_gbr_params




🏃 View run P2-F-_GBR_TRIAL_1 at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463/runs/cb8319190e084183b3ebf284f9003b34
🧪 View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463




🏃 View run P2-F-_GBR_TRIAL_2 at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463/runs/499c087bdcdb4be48f6643d787e600ef
🧪 View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463




🏃 View run P2-F-_GBR_TRIAL_3 at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463/runs/c6b02fe468ff45c5b57daceb6f0a5c66
🧪 View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463
🏃 View run P2-F-_GBR_PARENT at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463/runs/af0c469c1060459c82f856b5232a297b
🧪 View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463


(5.645491587871815,
 {'n_estimators': 148,
  'learning_rate': 0.26522386218030747,
  'max_depth': 4,
  'subsample': 0.9577183631255183,
  'min_samples_leaf': 16,
  'n_iter_no_change': 10,
  'validation_fraction': 0.1,
  'tol': 0.001,
  'random_state': 42})

### Parent #2 — Random Forest con child experiments

In [None]:
# --- Parent #2: RandomForestRegressor with child experiments ---

# - 2 trials
# - n_jobs = -1 to parallelise

np.random.seed(42)

# Make sure the dense, column-named frames exist; they are rebuilt only when no
# earlier cell has already created them in this kernel.
if "X_train_df" not in globals() or "X_val_df" not in globals():
    feature_names = [f"f{i}" for i in range(X_train.shape[1])]
    # Cast the sparse matrix to float32 before densifying: same values, but no
    # full-size float64 array is allocated along the way.
    X_train_df = pd.DataFrame(X_train.astype("float32").toarray(), columns=feature_names)
    X_val_df   = pd.DataFrame(X_val.astype("float32").toarray(),   columns=feature_names)

# Random training subset to speed up the search
N = X_train_df.shape[0]
subset_size = min(20000, N)
subset_idx = np.random.choice(N, size=subset_size, replace=False)
X_train_sub_df = X_train_df.iloc[subset_idx].reset_index(drop=True)
# y_train is positional, so the same indices select the matching targets.
y_train_sub    = y_train[subset_idx]

def sample_rf_params():
    """Draw one random RandomForestRegressor configuration.

    Four integer values are sampled from NumPy's global random state, always
    in the same order; the other entries are constants shared by every trial.

    Returns
    -------
    dict
        Keyword arguments ready to be unpacked into
        ``RandomForestRegressor(**params)``.
    """
    # Keep this draw order: changing it changes what a given seed produces.
    n_estimators = int(np.random.randint(50, 121))       # 50-120 trees
    max_depth = int(np.random.randint(8, 17))            # 8-16
    min_samples_split = int(np.random.randint(2, 6))     # 2-5
    min_samples_leaf = int(np.random.randint(5, 21))     # 5-20

    return dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features="sqrt",
        bootstrap=False,
        random_state=42,
        n_jobs=-1,
    )

# Best-of-search bookkeeping; updated inside the trial loop below and read by the
# registration cell afterwards.
parent_run_name = f"{HW_PREFIX}_RF_PARENT"  
# Reset best_rf together with the other trackers, so a model left over from a
# previous execution of this cell can never be mistaken for this search's winner.
best_rf = None
best_rf_rmse = float("inf")
best_rf_params = None

TRIALS = 2  
# Parent run: carries the assignment tags and search metadata, and contains one
# nested run per trial.
with mlflow.start_run(run_name=parent_run_name):
    mlflow.set_tags({
        "model_family": "RandomForestRegressor",
        "assignment": HW_TAG_PROJECT,
        "purpose": HW_TAG_PURPOSE,
        "parent": "true",
        "acronym": f"{HW_PREFIX}_RF",
        "fast_mode": "simple_params_subset_parallel"
    })
    mlflow.log_dict({"search": "random_small", "trials": TRIALS, "subset_size": int(subset_size)}, "search_meta.json")

    for t in range(TRIALS):  # one child run per trial
        trial_run_name = f"{HW_PREFIX}_RF_TRIAL_{t+1}"
        params = sample_rf_params()

        with mlflow.start_run(run_name=trial_run_name, nested=True):
            mlflow.log_params(params)

            # Fit on the random training subset only.
            model = RandomForestRegressor(**params)
            model.fit(X_train_sub_df, y_train_sub)

            # Validate on the full February frame, which has the same column layout.
            y_pred = model.predict(X_val_df)
            rmse = root_mean_squared_error(y_val, y_pred)
            mlflow.log_metric("validation_rmse", rmse)

            # Signature and model logging: five validation rows are the input example.
            input_example = X_val_df.iloc[:5].copy()
            signature = infer_signature(input_example, y_pred[:5])

            mlflow.sklearn.log_model(
                sk_model=model,
                name="model",
                input_example=input_example,
                signature=signature
            )

            # Strict "<": on a tie the earlier trial is kept.
            if rmse < best_rf_rmse:
                best_rf_rmse = rmse
                best_rf = model
                best_rf_params = params

    # Summarise the search on the parent run.
    mlflow.log_metric("best_validation_rmse", best_rf_rmse)
    mlflow.log_dict({"best_params": best_rf_params}, "best_params.json")

# Cell output: best validation RMSE and the parameters that produced it.
best_rf_rmse, best_rf_params




🏃 View run P2-F-_RF_TRIAL_1 at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463/runs/7e94d43782354a66b4d1af2ae1c28b4b
🧪 View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463




🏃 View run P2-F-_RF_TRIAL_2 at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463/runs/41c63019b5324852b60effd1138cafb4
🧪 View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463
🏃 View run P2-F-_RF_PARENT at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463/runs/45e2839eb759442c80a83ca695bdefb1
🧪 View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463


(8.416632801578565,
 {'n_estimators': 102,
  'max_depth': 11,
  'min_samples_split': 4,
  'min_samples_leaf': 16,
  'max_features': 'sqrt',
  'bootstrap': False,
  'random_state': 42,
  'n_jobs': -1})

### Registrar el mejor en nyc-taxi-model y asignar alias challenger con acrónimos visibles

In [None]:
# --- Pick the best model and register it as CHALLENGER (assignment P2) ---

# Lower validation RMSE wins; a tie goes to gradient boosting.
if best_gbr_rmse <= best_rf_rmse:
    best_model = best_gbr; family = "GradientBoostingRegressor"
    best_rmse = best_gbr_rmse; best_params = best_gbr_params
    best_acronym = f"{HW_PREFIX}_GBR"
else:
    best_model = best_rf; family = "RandomForestRegressor"
    best_rmse = best_rf_rmse; best_params = best_rf_params
    best_acronym = f"{HW_PREFIX}_RF"

print(f"Mejor familia: {family} | RMSE(val)={best_rmse:.4f}")

# Persist the fitted vectorizer so it can be attached to the run and reloaded for
# the March evaluation.
pathlib.Path("preprocessor").mkdir(exist_ok=True)
with open("preprocessor/dv.pkl", "wb") as f_out:
    pickle.dump(dv, f_out)

feature_names = [f"f{i}" for i in range(X_val.shape[1])]
input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)
# Predict on the named-column frame itself: the search cells fit the models on
# DataFrames with these column names, so a bare ndarray here would trigger
# scikit-learn's feature-name mismatch warning and would not be the input the
# signature is meant to describe.
signature = infer_signature(input_example, best_model.predict(input_example))

model_name = "workspace.default.nyc-taxi-model"

# Dedicated run carrying the assignment's names and tags
run_name = f"{HW_PREFIX}_CHALLENGER_REG_{family}"
with mlflow.start_run(run_name=run_name):
    mlflow.set_tags({
        "assignment": HW_TAG_PROJECT,
        "purpose": HW_TAG_PURPOSE,
        "role": "challenger_candidate",
        "model_family": family,
        "acronym": f"{best_acronym}_CHAL",   
    })
    mlflow.log_metric("validation_rmse", best_rmse)
    mlflow.log_dict({"best_params": best_params}, "best_params.json")
    mlflow.log_artifact("preprocessor/dv.pkl", artifact_path="preprocessor")

    # `name=` matches the keyword already used by the search cells of this
    # notebook; the model is still logged as "model" under this run.
    mlflow.sklearn.log_model(
        sk_model=best_model,
        name="model",
        input_example=input_example,
        signature=signature
    )
    # Captured inside the context: there is no active run once the block exits.
    run_id = mlflow.active_run().info.run_id

# Register the logged model in the Model Registry (creates a new version)
result = mlflow.register_model(model_uri=f"runs:/{run_id}/model", name=model_name)

client = MlflowClient()
model_version = result.version

# Point the "challenger" alias at the version that was just created
client.set_registered_model_alias(
    name=model_name,
    alias="challenger",
    version=model_version
)

# Human-readable note on the version: family, registration time, validation RMSE
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"[{HW_PREFIX}_CHAL] {family} registrado el {datetime.today().isoformat(timespec='seconds')} | RMSE(val)={best_rmse:.4f}"
)

print(f"Registrado como versión {model_version} con alias 'challenger' ({best_acronym}_CHAL).")


Mejor familia: GradientBoostingRegressor | RMSE(val)=5.6455




🏃 View run P2-F-_CHALLENGER_REG_GradientBoostingRegressor at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463/runs/69c27115d86c41109c44e9d7f9bb5e57
🧪 View experiment at: https://dbc-5922e233-b716.cloud.databricks.com/ml/experiments/1544369421830463


Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00,  9.76it/s]
Uploading artifacts: 100%|██████████| 8/8 [00:03<00:00,  2.40it/s]
Created version '11' of model 'workspace.default.nyc-taxi-model'.


Registrado como versión 11 con alias 'challenger' (P2-F-_GBR_CHAL).


### Usar marzo 2025 para probar Champion vs Challenger y obtener métricas

In [None]:
# --- March 2025 evaluation: Champion vs Challenger ---
df_mar = read_dataframe("../data/green_tripdata_2025-03.parquet")

# Reload the vectorizer pickled by the registration cell, so March is encoded with
# the vocabulary learned on the training month.
# pickle.load can execute arbitrary code; it is used here on a file that this
# notebook itself wrote, and should not be pointed at untrusted files.
with open("preprocessor/dv.pkl", "rb") as f_in:
    dv_loaded = pickle.load(f_in)

X_mar = preprocess(df_mar, dv_loaded)
y_mar = df_mar['duration'].values

# Same dense f0..fN column layout that the challenger was logged with.
feature_names = [f"f{i}" for i in range(X_mar.shape[1])]
X_mar_df = pd.DataFrame(X_mar.toarray(), columns=feature_names)

# Both models are resolved through registry aliases. Note that the two aliases
# differ in capitalisation ("Champion" vs "challenger").
champion_uri   = f"models:/{model_name}@Champion"
challenger_uri = f"models:/{model_name}@challenger"

champion   = mlflow.pyfunc.load_model(champion_uri)
challenger = mlflow.pyfunc.load_model(challenger_uri)

# NOTE(review): both models are given the identical frame; this assumes the current
# Champion was trained on the same feature layout — confirm.
y_pred_champion   = champion.predict(X_mar_df)
y_pred_challenger = challenger.predict(X_mar_df)

# RMSE on March for each model; the promotion cell compares these two numbers.
rmse_champion   = root_mean_squared_error(y_mar, y_pred_champion)
rmse_challenger = root_mean_squared_error(y_mar, y_pred_challenger)

print(f"[{HW_PREFIX}_CHAMP] RMSE marzo 2025 - Champion : {rmse_champion:.4f}")
print(f"[{HW_PREFIX}_CHAL ] RMSE marzo 2025 - Challenger: {rmse_challenger:.4f}")


Downloading artifacts: 100%|██████████| 8/8 [00:01<00:00,  4.94it/s]
  model.load_model(xgb_model_path)
Downloading artifacts: 100%|██████████| 8/8 [00:01<00:00,  6.40it/s]


[P2-F-_CHAMP] RMSE marzo 2025 - Champion : 13.2010
[P2-F-_CHAL ] RMSE marzo 2025 - Challenger: 6.1863


### Decida promoción y justifique, tags de la tarea

In [None]:
# --- Promotion decision against a simple threshold (assignment P2) ---
IMPROVEMENT_THRESHOLD = 0.005  # minimum relative improvement required: 0.5%
guardrail_ok = True  # guardrail flag, set by hand in this cell

# Fraction by which the challenger lowers the champion's March RMSE.
improvement = (rmse_champion - rmse_challenger) / rmse_champion
print(f"[{HW_PREFIX}] Mejora relativa Challenger vs Champion en marzo: {improvement*100:.2f}%")
print(f"[{HW_PREFIX}] Guardrails OK: {guardrail_ok}")

# Promote only when the improvement reaches the threshold AND the guardrails hold.
if improvement >= IMPROVEMENT_THRESHOLD and guardrail_ok:
    decision = "PROMOVER"
else:
    decision = "NO PROMOVER"
print(f"[{HW_PREFIX}] DECISIÓN: {decision}")

# Build the written justification that goes with the decision.
promote = decision == "PROMOVER"
if promote:
    justificacion = [
        f"Challenger reduce RMSE marzo en {improvement*100:.2f}% (umbral {IMPROVEMENT_THRESHOLD*100:.2f}%).",
        "La validación previa no sugiere sobreajuste extremo.",
    ]
else:
    # Each entry pairs "does this reason apply?" with its message; only the
    # applicable ones are kept, in this order.
    rejection_reasons = (
        (
            improvement < IMPROVEMENT_THRESHOLD,
            f"Mejora insuficiente: {improvement*100:.2f}% < {IMPROVEMENT_THRESHOLD*100:.2f}%.",
        ),
        (not guardrail_ok, "Algún guardrail operativo/ético falló."),
    )
    justificacion = [reason for applies, reason in rejection_reasons if applies]
print(" | ".join(justificacion))

# --- Automatic promotion to Champion, when applicable ---
if not promote:
    print(f"[{HW_PREFIX}] Se mantiene el Champion actual.")
else:
    client = MlflowClient()
    # Resolve which version the "challenger" alias currently points to.
    mv = client.get_model_version_by_alias(name=model_name, alias="challenger")
    challenger_version = mv.version

    # Move the "Champion" alias onto that version.
    client.set_registered_model_alias(
        name=model_name,
        alias="Champion",
        version=challenger_version
    )
    # Leave a note on the version recording the promotion.
    client.update_model_version(
        name=model_name,
        version=challenger_version,
        description=f"[{HW_PREFIX}_PROMOTED]->Champion | Mejora {improvement*100:.2f}% en RMSE marzo"
    )
    print(f"[{HW_PREFIX}] Alias 'Champion' -> versión {challenger_version} (ex 'challenger').")


[P2-F-] Mejora relativa Challenger vs Champion en marzo: 53.14%
[P2-F-] Guardrails OK: True
[P2-F-] DECISIÓN: PROMOVER
Challenger reduce RMSE marzo en 53.14% (umbral 0.50%). | La validación previa no sugiere sobreajuste extremo.
[P2-F-] Alias 'Champion' -> versión 11 (ex 'challenger').


Todos los modelos finales de esta tarea llevan el prefijo P2-F.