In [6]:
!mkdir -p ../data

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

The syntax of the command is incorrect.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1330k  100 1330k    0     0   983k      0  0:00:01  0:00:01 --:--:--  986k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1253k  100 1253k    0     0  2187k      0 --:--:-- --:--:-- --:--:-- 2195k


In [7]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [8]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [9]:
df_train = read_dataframe('../data/green_tripdata_2024-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2024-02.parquet')

In [10]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [11]:
categorical = ['PU_DO']  #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [12]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [13]:
import dagshub
import mlflow


dagshub.init(url="https://dagshub.com/luislopez3105/DagsTry", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

https://dagshub.com/luislopez3105/DagsTry.mlflow


<Experiment: artifact_location='mlflow-artifacts:/c93521310bc748fcb8a0fdae5cf3921e', creation_time=1726635194614, experiment_id='0', last_update_time=1726635194614, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [14]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2024-02")

In [15]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object
s3 = get_repo_bucket_client("luislopez3105/DagsTry")

# Upload file
s3.upload_file(
    Bucket="DagsTry",  # name of the repo
    Filename="local.csv",  # local path of file to upload
    Key="remote.csv",  # remote path where to upload the file
)
# Download file
s3.download_file(
    Bucket="DagsTry",  # name of the repo
    Key="remote.csv",  #  remote path from where to download the file
    Filename="local.csv",  # local path where to download the file
)

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'local.csv'

In [16]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib

In [17]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [18]:
def objective(params):
    with mlflow.start_run(nested=True):
         
        # Tag model
        mlflow.set_tag("model_family", "xgboost")
        
        # Log parameters
        mlflow.log_params(params)
        
        # Train model
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10
        )
        
        # Log xgboost model with artifact_path
        mlflow.xgboost.log_model(booster, artifact_path="model")
         
        # Predict in the val dataset
        y_pred = booster.predict(valid)
        
        # Calculate metric
        rmse = root_mean_squared_error(y_val, y_pred)
        
        # Log performance metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [19]:
mlflow.xgboost.autolog()

with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:squarederror',
        'seed': 42
    }
    
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"
    
    mlflow.log_params(best_params)
        # Log tags
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )
        
    y_pred = booster.predict(valid)
    
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
        
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:6.63897                           
[1]	validation-rmse:5.71850                           
[2]	validation-rmse:5.40834                           
[3]	validation-rmse:5.30873                           
[4]	validation-rmse:5.27121                           
[5]	validation-rmse:5.25463                           
[6]	validation-rmse:5.24295                           
[7]	validation-rmse:5.24010                           
[8]	validation-rmse:5.22961                           
[9]	validation-rmse:5.22724                           
[10]	validation-rmse:5.22613                          
[11]	validation-rmse:5.21591                          
[12]	validation-rmse:5.21475                          
[13]	validation-rmse:5.21307                          
[14]	validation-rmse:5.21159                          
[15]	validation-rmse:5.21046                          
[16]	validation-rmse:5.20535                          
[17]	validation-rmse:5.20483                          
[18]	valid






2024/09/24 21:23:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run delightful-wolf-547 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/20446b5db6384debab153b6186d191f5.

2024/09/24 21:23:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



[0]	validation-rmse:7.82578                                                    
[1]	validation-rmse:6.96785                                                    
[2]	validation-rmse:6.40925                                                    
[3]	validation-rmse:6.05459                                                    
[4]	validation-rmse:5.83670                                                    
[5]	validation-rmse:5.69721                                                    
[6]	validation-rmse:5.60909                                                    
[7]	validation-rmse:5.55285                                                    
[8]	validation-rmse:5.51631                                                    
[9]	validation-rmse:5.49363                                                    
[10]	validation-rmse:5.47488                                                   
[11]	validation-rmse:5.46512                                                   
[12]	validation-rmse:5.45519            






2024/09/24 21:24:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run capricious-fox-977 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/a5f5af6997854f9c970e831eb5a8cac8.

2024/09/24 21:24:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



[0]	validation-rmse:8.43714                                                    
[1]	validation-rmse:7.87218                                                    
[2]	validation-rmse:7.40458                                                    
[3]	validation-rmse:7.01390                                                    
[4]	validation-rmse:6.70013                                                    
[5]	validation-rmse:6.44091                                                    
[6]	validation-rmse:6.22951                                                    
[7]	validation-rmse:6.05329                                                    
[8]	validation-rmse:5.91798                                                    
[9]	validation-rmse:5.80533                                                    
[10]	validation-rmse:5.70801                                                   
[11]	validation-rmse:5.63706                                                   
[12]	validation-rmse:5.58095            






2024/09/24 21:24:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run casual-perch-782 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/a20e87e41b6f40bebbf1600731ef3a57.

2024/09/24 21:24:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



[0]	validation-rmse:7.39815                                                    
[1]	validation-rmse:6.43561                                                    
[2]	validation-rmse:5.89359                                                    
[3]	validation-rmse:5.63460                                                    
[4]	validation-rmse:5.49231                                                    
[5]	validation-rmse:5.41834                                                    
[6]	validation-rmse:5.36143                                                    
[7]	validation-rmse:5.34148                                                    
[8]	validation-rmse:5.32772                                                    
[9]	validation-rmse:5.32051                                                    
[10]	validation-rmse:5.31623                                                   
[11]	validation-rmse:5.31192                                                   
[12]	validation-rmse:5.30832            






2024/09/24 21:26:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run crawling-crab-40 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/eb417abd27e0499eadcb9892b002be02.

2024/09/24 21:26:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



[0]	validation-rmse:7.51652                                                    
[1]	validation-rmse:6.56291                                                    
[2]	validation-rmse:5.99373                                                    
[3]	validation-rmse:5.68804                                                    
[4]	validation-rmse:5.52271                                                    
[5]	validation-rmse:5.43799                                                    
[6]	validation-rmse:5.37369                                                    
[7]	validation-rmse:5.34637                                                    
[8]	validation-rmse:5.32998                                                    
[9]	validation-rmse:5.30633                                                    
[10]	validation-rmse:5.30106                                                   
[11]	validation-rmse:5.29638                                                   
[12]	validation-rmse:5.29080            






2024/09/24 21:27:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run able-bass-789 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/ed18352865244def8bcbb5fa2e8076dc.

2024/09/24 21:27:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



[0]	validation-rmse:7.16910                                                    
[1]	validation-rmse:6.16670                                                    
[2]	validation-rmse:5.68442                                                    
[3]	validation-rmse:5.45945                                                    
[4]	validation-rmse:5.35682                                                    
[5]	validation-rmse:5.30592                                                    
[6]	validation-rmse:5.28074                                                    
[7]	validation-rmse:5.26436                                                    
[8]	validation-rmse:5.25947                                                    
[9]	validation-rmse:5.25106                                                    
[10]	validation-rmse:5.24628                                                   
[11]	validation-rmse:5.24223                                                   
[12]	validation-rmse:5.23917            






2024/09/24 21:27:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run funny-gnat-765 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/a260b4bac6f443a890c2052c50ddda97.

2024/09/24 21:27:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



[0]	validation-rmse:6.46859                                                    
[1]	validation-rmse:5.60370                                                    
[2]	validation-rmse:5.35355                                                    
[3]	validation-rmse:5.26473                                                    
[4]	validation-rmse:5.22750                                                    
[5]	validation-rmse:5.21178                                                    
[6]	validation-rmse:5.20477                                                    
[7]	validation-rmse:5.20687                                                    
[8]	validation-rmse:5.20485                                                    
[9]	validation-rmse:5.20287                                                    
[10]	validation-rmse:5.19974                                                   
[11]	validation-rmse:5.19986                                                   
[12]	validation-rmse:5.19798            






2024/09/24 21:28:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run bouncy-deer-5 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/56cd5d6513b747fa98deff2772a96c74.

2024/09/24 21:28:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



[0]	validation-rmse:8.62484                                                    
[1]	validation-rmse:8.19698                                                    
[2]	validation-rmse:7.82340                                                    
[3]	validation-rmse:7.49674                                                    
[4]	validation-rmse:7.21270                                                    
[5]	validation-rmse:6.96667                                                    
[6]	validation-rmse:6.75373                                                    
[7]	validation-rmse:6.57142                                                    
[8]	validation-rmse:6.41522                                                    
[9]	validation-rmse:6.28138                                                    
[10]	validation-rmse:6.16435                                                   
[11]	validation-rmse:6.06586                                                   
[12]	validation-rmse:5.98226            






2024/09/24 21:28:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run painted-doe-351 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/1798bf0acc97424599d190f3411847c2.

2024/09/24 21:28:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



[0]	validation-rmse:6.34553                                                    
[1]	validation-rmse:5.61855                                                    
[2]	validation-rmse:5.43939                                                    
[3]	validation-rmse:5.38973                                                    
[4]	validation-rmse:5.37020                                                    
[5]	validation-rmse:5.35522                                                    
[6]	validation-rmse:5.34169                                                    
[7]	validation-rmse:5.33784                                                    
[8]	validation-rmse:5.33510                                                    
[9]	validation-rmse:5.32998                                                    
[10]	validation-rmse:5.32258                                                   
[11]	validation-rmse:5.32138                                                   
[12]	validation-rmse:5.31021            






2024/09/24 21:29:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run brawny-crow-133 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/c695f13bded54541aae74b44d8d2b209.

2024/09/24 21:29:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



[0]	validation-rmse:5.35364                                                    
[1]	validation-rmse:5.28116                                                    
[2]	validation-rmse:5.28337                                                    
[3]	validation-rmse:5.28116                                                    
[4]	validation-rmse:5.28553                                                    
[5]	validation-rmse:5.28979                                                    
[6]	validation-rmse:5.29328                                                    
[7]	validation-rmse:5.29576                                                    
[8]	validation-rmse:5.29899                                                    
[9]	validation-rmse:5.29706                                                    
[10]	validation-rmse:5.29834                                                   
[11]	validation-rmse:5.29465                                                   
 90%|█████████ | 9/10 [05:56<00:30, 30.8






2024/09/24 21:29:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run sassy-ox-347 at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/f33e92b5931149d0b7b1f290d341cbd9.

2024/09/24 21:29:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.



100%|██████████| 10/10 [06:19<00:00, 37.96s/trial, best loss: 5.175687056621277]
[0]	validation-rmse:7.16910
[1]	validation-rmse:6.16670
[2]	validation-rmse:5.68442
[3]	validation-rmse:5.45945
[4]	validation-rmse:5.35682
[5]	validation-rmse:5.30592
[6]	validation-rmse:5.28074
[7]	validation-rmse:5.26436
[8]	validation-rmse:5.25947
[9]	validation-rmse:5.25106
[10]	validation-rmse:5.24628
[11]	validation-rmse:5.24223
[12]	validation-rmse:5.23917
[13]	validation-rmse:5.23773
[14]	validation-rmse:5.23591
[15]	validation-rmse:5.23308
[16]	validation-rmse:5.22389
[17]	validation-rmse:5.21697
[18]	validation-rmse:5.21447
[19]	validation-rmse:5.21418
[20]	validation-rmse:5.21218
[21]	validation-rmse:5.21185
[22]	validation-rmse:5.20970
[23]	validation-rmse:5.20660
[24]	validation-rmse:5.20627
[25]	validation-rmse:5.20317
[26]	validation-rmse:5.20205
[27]	validation-rmse:5.19594
[28]	validation-rmse:5.19480
[29]	validation-rmse:5.19293
[30]	validation-rmse:5.19223
[31]	validation-rmse:5.19290
[

2024/09/24 21:29:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run Xgboost Hyper-parameter Optimization at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0/runs/f873d91f99034b62acd7652a2c619c86.
2024/09/24 21:29:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/luislopez3105/DagsTry.mlflow/#/experiments/0.


In [20]:

best_params

{'learning_rate': np.float64(0.35206489986051664),
 'max_depth': 45,
 'min_child_weight': np.float64(4.038555548396214),
 'reg_alpha': np.float64(0.0707815473304588),
 'reg_lambda': np.float64(0.28043485285458397),
 'seed': 42,
 'objective': 'reg:squarederror'}

In [21]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Successfully registered model 'nyc-taxi-model'.


MlflowException: API request to https://dagshub.com/luislopez3105/DagsTry.mlflow/api/2.0/mlflow/runs/get failed with exception HTTPSConnectionPool(host='dagshub.com', port=443): Max retries exceeded with url: /luislopez3105/DagsTry.mlflow/api/2.0/mlflow/runs/get?run_uuid=Primero&run_id=Primero (Caused by ResponseError('too many 500 error responses'))

In [None]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "champion"
date = datetime.today()
model_version = "1"

# create "champion" alias for version 1 of model "nyc-taxi-model"
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

In [None]:
import mlflow.pyfunc

model_name = "nyc-taxi-model"
alias = "champion"

model_uri = f"models:/{model_name}@{alias}"

champion_version = mlflow.pyfunc.load_model(
    model_uri=model_uri
)

champion_version.predict(X_val)

OSError: xelatex not found on PATH, if you have not installed xelatex you may need to do so. Find further instructions at https://nbconvert.readthedocs.io/en/latest/install.html#installing-tex.