In [1]:

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1330k  100 1330k    0     0  2197k      0 --:--:-- --:--:-- --:--:-- 2209k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1253k  100 1253k    0     0  2202k      0 --:--:-- --:--:-- --:--:-- 2215k


In [2]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [3]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    trips lasting between 1 and 60 minutes inclusive, and casts the pickup and
    drop-off location ids to strings.

    Args:
        filename: Path of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A new DataFrame with the filtered rows and the added ``duration`` column.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_length = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_length.dt.total_seconds() / 60

    # Boolean filtering yields a slice of the loaded frame; copy it so the column
    # assignment below writes to an independent frame and does not raise
    # SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# Build the training (2024-01 file) and validation (2024-02 file) frames.
df_train, df_val = map(
    read_dataframe,
    (
        '../data/green_tripdata_2024-01.parquet',
        '../data/green_tripdata_2024-02.parquet',
    ),
)

### Feature Engineering

In [5]:
# Join the pickup and drop-off location ids into a single route key,
# '<PULocationID>_<DOLocationID>', on both frames.
for _frame in (df_train, df_val):
    _frame['PU_DO'] = _frame['PULocationID'] + '_' + _frame['DOLocationID']

### One-hot encoding

In [6]:
# Columns handed to the vectorizer: the combined route key plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# One {column: value} record per row, for each split.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# The vectorizer is fitted on the training records only; the validation
# records are transformed with that same fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [7]:
# Target column (trip duration in minutes) extracted as arrays for both splits.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

Definir el `tracking URI` y el nombre del experimento

In [8]:
import dagshub
import mlflow


# Initialise the DagsHub integration for this repository with MLflow enabled.
dagshub.init(
    url="https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction",
    mlflow=True,
)

# Keep the tracking URI that is active after dagshub.init so later cells can reuse it.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
print(MLFLOW_TRACKING_URI)

# Re-apply the URI explicitly, then select the experiment that runs are logged under.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow


2024/09/17 22:00:02 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/466268d3d8d74b46943efeddea55f2b9', creation_time=1726632002450, experiment_id='1', last_update_time=1726632002450, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

Definir los `dataset` como objetos de `mlflow` para poderlos trackear

In [9]:
# Wrap the feature values and targets of each split as MLflow dataset objects.
# NOTE(review): X_train / X_val come from DictVectorizer, so `.data` is presumably
# the flat array of stored values of a sparse matrix rather than a
# (rows x features) array — confirm this is what should be recorded.
# NOTE(review): no mlflow.log_input(...) call is visible in this notebook, so these
# objects do not appear to be attached to any run — confirm.
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2024-02")

### Subir los dataset al storage que nos brinda `dagshub`

Ahora vamos a entrenar un modelo `xgboost`


In [10]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib

In [11]:
# Pair each split's features with its labels in an XGBoost DMatrix.
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

Definir la función objetivo

In [12]:
def objective(params):
    """Train one XGBoost candidate in a nested MLflow run and score it.

    The run is tagged with the model family, the given parameters are logged,
    a booster is trained on ``train`` with early stopping monitored on ``valid``,
    the booster is logged under the ``model`` artifact path, and the validation
    RMSE is logged as the ``rmse`` metric.

    Args:
        params: XGBoost training parameters for this trial.

    Returns:
        dict with ``loss`` set to the validation RMSE and ``status`` set to
        ``STATUS_OK``.
    """
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "xgboost")
        mlflow.log_params(params)

        # Up to 100 boosting rounds, with early stopping (10 rounds) on the validation set.
        candidate = xgb.train(
            params=params,
            dtrain=train,
            evals=[(valid, 'validation')],
            num_boost_round=100,
            early_stopping_rounds=10,
        )
        mlflow.xgboost.log_model(candidate, artifact_path="model")

        # Score the candidate on the validation split and record the result.
        validation_rmse = root_mean_squared_error(y_val, candidate.predict(valid))
        mlflow.log_metric("rmse", validation_rmse)

    return {'loss': validation_rmse, 'status': STATUS_OK}

Definir el espacio de búsqueda

In [13]:
# Turn on MLflow's XGBoost autologging for the training calls below.
mlflow.xgboost.autolog()

# Search space explored by hyperopt; 'objective' and 'seed' are fixed values.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    # Run 10 TPE trials; each call to `objective` opens its own nested run.
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials(),
    )

    # Cast max_depth to int and put back the fixed entries before reusing the
    # parameters for the final training call.
    best_params.update(
        {
            "max_depth": int(best_params["max_depth"]),
            "seed": 42,
            "objective": "reg:squarederror",
        }
    )
    mlflow.log_params(best_params)

    # Descriptive tags on the parent run.
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Train once more with the selected parameters inside the parent run.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        evals=[(valid, 'validation')],
        num_boost_round=100,
        early_stopping_rounds=10,
    )

    # Validation score of the final booster.
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Pickle the fitted DictVectorizer locally and attach it to the run.
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:8.09896                           
[1]	validation-rmse:7.32575                           
[2]	validation-rmse:6.76319                           
[3]	validation-rmse:6.32962                           
[4]	validation-rmse:6.04984                           
[5]	validation-rmse:5.83818                           
[6]	validation-rmse:5.69000                           
[7]	validation-rmse:5.56775                           
[8]	validation-rmse:5.49712                           
[9]	validation-rmse:5.44679                           
[10]	validation-rmse:5.40702                          
[11]	validation-rmse:5.38711                          
[12]	validation-rmse:5.35783                          
[13]	validation-rmse:5.34736                          
[14]	validation-rmse:5.33147                          
[15]	validation-rmse:5.32508                          
[16]	validation-rmse:5.32554                          
[17]	validation-rmse:5.32389                          
[18]	valid






2024/09/17 22:01:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run legendary-panda-199 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/3d4ba3a541ab4ae79f14edd920a1d99e.

2024/09/17 22:01:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



[0]	validation-rmse:8.54185                                                    
[1]	validation-rmse:8.04739                                                    
[2]	validation-rmse:7.62088                                                    
[3]	validation-rmse:7.25442                                                    
[4]	validation-rmse:6.93883                                                    
[5]	validation-rmse:6.67021                                                    
[6]	validation-rmse:6.44250                                                    
[7]	validation-rmse:6.24822                                                    
[8]	validation-rmse:6.08539                                                    
[9]	validation-rmse:5.94725                                                    
[10]	validation-rmse:5.83103                                                   
[11]	validation-rmse:5.73377                                                   
[12]	validation-rmse:5.65346            






2024/09/17 22:02:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run marvelous-foal-582 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/57bd2cfe71384f71871e4e807ff28e5c.

2024/09/17 22:02:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



[0]	validation-rmse:8.60414                                                    
[1]	validation-rmse:8.15942                                                    
[2]	validation-rmse:7.77146                                                    
[3]	validation-rmse:7.43437                                                    
[4]	validation-rmse:7.14217                                                    
[5]	validation-rmse:6.89060                                                    
[6]	validation-rmse:6.67421                                                    
[7]	validation-rmse:6.48890                                                    
[8]	validation-rmse:6.33058                                                    
[9]	validation-rmse:6.19579                                                    
[10]	validation-rmse:6.08122                                                   
[11]	validation-rmse:5.98359                                                   
[12]	validation-rmse:5.90169            






2024/09/17 22:02:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run omniscient-sheep-901 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/314ffc80cd2a47fc8002cf8729479a7b.

2024/09/17 22:02:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



[0]	validation-rmse:7.51942                                                    
[1]	validation-rmse:6.57431                                                    
[2]	validation-rmse:6.04064                                                    
[3]	validation-rmse:5.75007                                                    
[4]	validation-rmse:5.59324                                                    
[5]	validation-rmse:5.50603                                                    
[6]	validation-rmse:5.45577                                                    
[7]	validation-rmse:5.42628                                                    
[8]	validation-rmse:5.40389                                                    
[9]	validation-rmse:5.39220                                                    
[10]	validation-rmse:5.38274                                                   
[11]	validation-rmse:5.37663                                                   
[12]	validation-rmse:5.37217            






2024/09/17 22:02:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run chill-kite-282 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/4b4193362c9e493c9228759333029245.

2024/09/17 22:02:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



[0]	validation-rmse:5.51335                                                    
[1]	validation-rmse:5.46128                                                    
[2]	validation-rmse:5.45392                                                    
[3]	validation-rmse:5.46024                                                    
[4]	validation-rmse:5.42366                                                    
[5]	validation-rmse:5.42133                                                    
[6]	validation-rmse:5.41000                                                    
[7]	validation-rmse:5.40656                                                    
[8]	validation-rmse:5.40666                                                    
[9]	validation-rmse:5.40224                                                    
[10]	validation-rmse:5.38388                                                   
[11]	validation-rmse:5.38150                                                   
[12]	validation-rmse:5.38274            






2024/09/17 22:03:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run likeable-ram-206 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/796cc71049d14ef889adc66910b92efc.

2024/09/17 22:03:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



[0]	validation-rmse:5.83908                                                    
[1]	validation-rmse:5.35692                                                    
[2]	validation-rmse:5.28477                                                    
[3]	validation-rmse:5.26693                                                    
[4]	validation-rmse:5.25484                                                    
[5]	validation-rmse:5.24652                                                    
[6]	validation-rmse:5.23643                                                    
[7]	validation-rmse:5.23276                                                    
[8]	validation-rmse:5.23093                                                    
[9]	validation-rmse:5.23243                                                    
[10]	validation-rmse:5.23418                                                   
[11]	validation-rmse:5.23111                                                   
[12]	validation-rmse:5.22038            






2024/09/17 22:04:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run carefree-kite-120 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/cc4552b3937243d18e011639f332d908.

2024/09/17 22:04:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



[0]	validation-rmse:5.66708                                                    
[1]	validation-rmse:5.39337                                                    
[2]	validation-rmse:5.36454                                                    
[3]	validation-rmse:5.36575                                                    
[4]	validation-rmse:5.36137                                                    
[5]	validation-rmse:5.35233                                                    
[6]	validation-rmse:5.34431                                                    
[7]	validation-rmse:5.33861                                                    
[8]	validation-rmse:5.32008                                                    
[9]	validation-rmse:5.31638                                                    
[10]	validation-rmse:5.31182                                                   
[11]	validation-rmse:5.31261                                                   
[12]	validation-rmse:5.30937            






2024/09/17 22:04:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run polite-frog-422 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/0bccd7a9a933400ab88a4f87bdb923b9.

2024/09/17 22:04:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



[0]	validation-rmse:8.47250                                                    
[1]	validation-rmse:7.93128                                                    
[2]	validation-rmse:7.47575                                                    
[3]	validation-rmse:7.09466                                                    
[4]	validation-rmse:6.77783                                                    
[5]	validation-rmse:6.51603                                                    
[6]	validation-rmse:6.30104                                                    
[7]	validation-rmse:6.12566                                                    
[8]	validation-rmse:5.98088                                                    
[9]	validation-rmse:5.86384                                                    
[10]	validation-rmse:5.76883                                                   
[11]	validation-rmse:5.69179                                                   
[12]	validation-rmse:5.62982            






2024/09/17 22:05:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run resilient-moose-258 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/2e891aa9e49b4e179ebcb1fc3be0c7cb.

2024/09/17 22:05:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



[0]	validation-rmse:6.28359                                                    
[1]	validation-rmse:5.49508                                                    
[2]	validation-rmse:5.29298                                                    
[3]	validation-rmse:5.24171                                                    
[4]	validation-rmse:5.21646                                                    
[5]	validation-rmse:5.20702                                                    
[6]	validation-rmse:5.20131                                                    
[7]	validation-rmse:5.20269                                                    
[8]	validation-rmse:5.19963                                                    
[9]	validation-rmse:5.20109                                                    
[10]	validation-rmse:5.19770                                                   
[11]	validation-rmse:5.19544                                                   
[12]	validation-rmse:5.19382            






2024/09/17 22:05:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run bittersweet-gnu-318 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/bffbc4c8fb9b44e0a19d5c8e3fea97cc.

2024/09/17 22:05:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



[0]	validation-rmse:8.74052                                                    
[1]	validation-rmse:8.40229                                                    
[2]	validation-rmse:8.09427                                                    
[3]	validation-rmse:7.81437                                                    
[4]	validation-rmse:7.56043                                                    
[5]	validation-rmse:7.33086                                                    
[6]	validation-rmse:7.12329                                                    
[7]	validation-rmse:6.93634                                                    
[8]	validation-rmse:6.76792                                                    
[9]	validation-rmse:6.61674                                                    
[10]	validation-rmse:6.48130                                                   
[11]	validation-rmse:6.35975                                                   
[12]	validation-rmse:6.25191            






2024/09/17 22:06:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run merciful-fly-160 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/73d7e73c190044019459d61d474f8a7c.

2024/09/17 22:06:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



100%|██████████| 10/10 [06:47<00:00, 40.78s/trial, best loss: 5.183994724726782]
[0]	validation-rmse:8.54185
[1]	validation-rmse:8.04739
[2]	validation-rmse:7.62088
[3]	validation-rmse:7.25442
[4]	validation-rmse:6.93883
[5]	validation-rmse:6.67021
[6]	validation-rmse:6.44250
[7]	validation-rmse:6.24822
[8]	validation-rmse:6.08539
[9]	validation-rmse:5.94725
[10]	validation-rmse:5.83103
[11]	validation-rmse:5.73377
[12]	validation-rmse:5.65346
[13]	validation-rmse:5.58571
[14]	validation-rmse:5.53000
[15]	validation-rmse:5.48277
[16]	validation-rmse:5.44303
[17]	validation-rmse:5.40992
[18]	validation-rmse:5.38199
[19]	validation-rmse:5.35873
[20]	validation-rmse:5.33923
[21]	validation-rmse:5.32277
[22]	validation-rmse:5.30810
[23]	validation-rmse:5.29622
[24]	validation-rmse:5.28599
[25]	validation-rmse:5.27770
[26]	validation-rmse:5.26939
[27]	validation-rmse:5.26258
[28]	validation-rmse:5.25676
[29]	validation-rmse:5.25125
[30]	validation-rmse:5.24648
[31]	validation-rmse:5.24292
[

2024/09/17 22:07:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run Xgboost Hyper-parameter Optimization at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/b45298034fab4dada6e8c0e9acc2243d.
2024/09/17 22:07:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.


In [14]:
# Display the hyper-parameters selected by the search (with the fixed entries added).
best_params

{'learning_rate': np.float64(0.09671672343916111),
 'max_depth': 36,
 'min_child_weight': np.float64(3.8732527372365584),
 'reg_alpha': np.float64(0.23287797717839576),
 'reg_lambda': np.float64(0.01125039925753359),
 'seed': 42,
 'objective': 'reg:squarederror'}

Ahora vamos a registrar el mejor modelo en el `model registry` y usarlo para hacer predicciones

In [15]:
# Select the run to register programmatically instead of prompting for a run id:
# an input() prompt blocks "Restart & Run All" and fails under headless execution.
# With no experiment given, search_runs looks in the active experiment; the filter
# keeps xgboost-tagged runs that logged an rmse, ordered with the lowest rmse first.
best_runs = mlflow.search_runs(
    filter_string="tags.model_family = 'xgboost' and metrics.rmse >= 0",
    order_by=["metrics.rmse ASC"],
    max_results=1,
    output_format="list",
)
if not best_runs:
    raise RuntimeError(
        "No run tagged model_family='xgboost' with an 'rmse' metric was found "
        "in the active experiment; run the hyper-parameter search first."
    )

run_id = best_runs[0].info.run_id
# The model is expected under the run's "model" artifact path.
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Successfully registered model 'nyc-taxi-model'.
2024/09/17 22:07:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 1
Created version '1' of model 'nyc-taxi-model'.


In [16]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "champion"
date = datetime.today()
# Use the version that register_model just created instead of a hard-coded "1",
# so re-running the notebook promotes the newly registered version.
model_version = result.version

# Point the "champion" alias of "nyc-taxi-model" at that version.
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

# Record on the version itself when it received the alias.
client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1726632469987, current_stage='None', description='The model version 1 was transitioned to champion on 2024-09-17 22:07:50.290987', last_updated_timestamp=1726632470584, name='nyc-taxi-model', run_id='57bd2cfe71384f71871e4e807ff28e5c', run_link='', source='mlflow-artifacts:/466268d3d8d74b46943efeddea55f2b9/57bd2cfe71384f71871e4e807ff28e5c/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [17]:
import mlflow.pyfunc

# Address the registered model through its alias rather than a version number.
model_name, alias = "nyc-taxi-model", "champion"
model_uri = f"models:/{model_name}@{alias}"

# Load it as a generic pyfunc model.
champion_version = mlflow.pyfunc.load_model(model_uri=model_uri)

# Predict on the validation features.
champion_version.predict(X_val)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([19.017244 , 28.313835 ,  9.2324295, ..., 32.791355 , 14.018734 ,
       19.644476 ], dtype=float32)