In [1]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [2]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Steps performed:
      * compute ``duration`` in minutes from the ``lpep_dropoff_datetime`` and
        ``lpep_pickup_datetime`` columns,
      * keep only trips whose duration is between 1 and 60 minutes (inclusive),
      * cast ``PULocationID`` and ``DOLocationID`` to strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips with the extra ``duration`` column.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so the column
    # assignment below writes to an independent frame instead of a slice
    # (avoids pandas' SettingWithCopyWarning).
    df = df[df.duration.between(1, 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# January 2024 feeds the training frame, February 2024 the validation frame.
train_path = '../data/green_tripdata_2024-01.parquet'
val_path = '../data/green_tripdata_2024-02.parquet'
df_train, df_val = read_dataframe(train_path), read_dataframe(val_path)

In [4]:
# Feature: join pickup and drop-off location ids into a single "PU_DO" key,
# applied identically to the training and validation frames.
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [5]:
# Encode the feature records with a DictVectorizer (one hot).
# The vectorizer is fitted on the training records only and then reused,
# unchanged, to transform the validation records.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

dv = DictVectorizer()

train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [6]:
# Target column, extracted as arrays for both splits.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [7]:
import dagshub
import mlflow


# Connect this session to the DagsHub repository with MLflow enabled.
DAGSHUB_REPO_URL = "https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction"
EXPERIMENT_NAME = "nyc-taxi-experiment"

dagshub.init(url=DAGSHUB_REPO_URL, mlflow=True)

# Read back the tracking URI that is active after init and echo it for the reader.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
print(MLFLOW_TRACKING_URI)

# Set the URI explicitly, then select the experiment all runs are recorded under.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow


<Experiment: artifact_location='mlflow-artifacts:/466268d3d8d74b46943efeddea55f2b9', creation_time=1726632002450, experiment_id='1', last_update_time=1726632002450, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [8]:
# Describe the train/validation data for MLflow dataset tracking.
# X_train / X_val come from DictVectorizer with its default sparse output, so
# `X_train.data` is only the flat array of stored non-zero values: it is not
# the feature matrix and does not line up row-for-row with the targets.
# The datasets are therefore built from the DataFrame columns the features are
# derived from, with the target column identified by name.
dataset_columns = categorical + numerical + [target]
training_dataset = mlflow.data.from_pandas(
    df_train[dataset_columns], targets=target, name="green_tripdata_2024-01"
)
validation_dataset = mlflow.data.from_pandas(
    df_val[dataset_columns], targets=target, name="green_tripdata_2024-02"
)

In [9]:
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np
 

# Random Forest

In [12]:
# Turn on MLflow's scikit-learn autologging before any model in this cell is fitted.
mlflow.sklearn.autolog()
 
def objective_rf(params):
    """Hyperopt objective: fit a RandomForestRegressor and score it on validation data.

    Each call opens its own nested MLflow run, tags it with the model family,
    logs the hyperparameters and the validation RMSE.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters with keys ``n_estimators``, ``max_depth``,
        ``min_samples_split`` and ``min_samples_leaf``. Values may be floats;
        they are cast to ``int`` before use.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    # Cast once so the values logged to MLflow are exactly the ones the model
    # is built with (the search space yields floats such as 57.0).
    model_params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_split': int(params['min_samples_split']),
        'min_samples_leaf': int(params['min_samples_leaf']),
    }

    with mlflow.start_run(nested=True):
        # Set model tag
        mlflow.set_tag("model_family", "random_forest")

        # Log parameters (any extra keys in `params` are still logged as given)
        mlflow.log_params({**params, **model_params})

        # Train RandomForest model
        rf_model = RandomForestRegressor(**model_params, random_state=42)
        rf_model.fit(X_train, y_train)

        # Predict on validation dataset and compute RMSE
        y_pred = rf_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Log RMSE metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}
 
# Search space for RandomForest: every hyperparameter is drawn with
# hp.quniform(label, low, high, 1) over the (low, high) bounds listed here.
_rf_bounds = {
    'n_estimators': (50, 100),
    'max_depth': (5, 10),
    'min_samples_split': (2, 8),
    'min_samples_leaf': (1, 2),
}
search_space_rf = {
    label: hp.quniform(label, low, high, 1)
    for label, (low, high) in _rf_bounds.items()
}
 
# Run hyperparameter optimization
# Keep a handle on the Trials object so the search history (and the best
# loss) is still available after fmin returns.
trials_rf = Trials()

with mlflow.start_run(run_name="Parent Random Forest", nested=True):
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=trials_rf,
        # Seed the sampler so a fresh Restart & Run All explores the same trials.
        rstate=np.random.default_rng(42),
    )

    # Log best parameters, plus the score they achieved, on the parent run
    mlflow.log_params(best_params_rf)
    mlflow.log_metric("best_rmse", trials_rf.best_trial['result']['loss'])
 



  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 15:44:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run efficient-auk-173 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/d1618bd4eb2d43b4962bce528517b9e7.

2024/09/20 15:44:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 10%|█         | 1/10 [00:14<02:14, 14.99s/trial, best loss: 5.542766364570989]



2024/09/20 15:45:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run skillful-chimp-991 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/5a23da67b77141b589f3f6001844de6d.

2024/09/20 15:45:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 20%|██        | 2/10 [00:31<02:08, 16.11s/trial, best loss: 5.471769447414504]



2024/09/20 15:45:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run masked-stoat-259 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/68a6ff4aa3fb45ae91db3968d4300019.

2024/09/20 15:45:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 30%|███       | 3/10 [00:43<01:38, 14.12s/trial, best loss: 5.471769447414504]



2024/09/20 15:45:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run salty-wasp-538 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/9205e8fd98b34b9ab4139c651ef3aed9.

2024/09/20 15:45:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 40%|████      | 4/10 [00:57<01:25, 14.19s/trial, best loss: 5.471769447414504]



2024/09/20 15:45:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run intrigued-eel-78 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/1219abe931a448748b047db9c145ce8b.

2024/09/20 15:45:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 50%|█████     | 5/10 [01:12<01:11, 14.24s/trial, best loss: 5.471769447414504]



2024/09/20 15:46:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run angry-koi-97 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/03e4c88a6ebf49c78b82fef3654d8143.

2024/09/20 15:46:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 60%|██████    | 6/10 [01:27<00:57, 14.49s/trial, best loss: 5.450624636267601]



2024/09/20 15:46:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run peaceful-trout-512 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/602909b1cc364fb3a19d1d1a9b92abbc.

2024/09/20 15:46:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 70%|███████   | 7/10 [01:38<00:40, 13.45s/trial, best loss: 5.450624636267601]



2024/09/20 15:46:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run bittersweet-frog-498 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/c3f0223f81364829b3eb495693622b4a.

2024/09/20 15:46:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 80%|████████  | 8/10 [01:51<00:26, 13.32s/trial, best loss: 5.450624636267601]



2024/09/20 15:46:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run clean-kit-414 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/14c35197f365406da457e674cc9f6da5.

2024/09/20 15:46:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 90%|█████████ | 9/10 [02:03<00:12, 12.92s/trial, best loss: 5.450624636267601]



2024/09/20 15:46:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run brawny-lark-188 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/aba45471604546b19be6ba466cf74ed4.

2024/09/20 15:46:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



100%|██████████| 10/10 [02:19<00:00, 13.93s/trial, best loss: 5.435353486700445]


2024/09/20 15:46:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Parent Random Forest at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/fa654f9fa42c4738b93d9d76a90253a4.
2024/09/20 15:46:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.


In [13]:
# Ask for the id of the run whose logged model should be registered.
# Strip surrounding whitespace (easy to pick up when pasting) and refuse an
# empty answer, which would otherwise produce a malformed "runs://model" URI.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")
run_uri = f"runs:/{run_id}/model"

# Register that run's model under the shared registry name.
result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 15:49:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 2
Created version '2' of model 'nyc-taxi-model'.


# Gradient Boosting

In [14]:
# Turn on MLflow's scikit-learn autologging before any model in this cell is fitted.
mlflow.sklearn.autolog()
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
 
def objective_gb(params):
    """Hyperopt objective: fit a GradientBoostingRegressor and score it on validation data.

    Each call opens its own nested MLflow run, tags it with the model family,
    logs the hyperparameters and the validation RMSE.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters with keys ``n_estimators``, ``max_depth``,
        ``min_samples_split``, ``min_samples_leaf`` (cast to ``int``) and
        ``learning_rate`` (cast to ``float``).

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    # Cast once so the values logged to MLflow are exactly the ones the model
    # is built with (the quantised search dimensions yield floats such as 57.0).
    model_params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_split': int(params['min_samples_split']),
        'min_samples_leaf': int(params['min_samples_leaf']),
        'learning_rate': float(params['learning_rate']),
    }

    with mlflow.start_run(nested=True):
        # Set model tag
        mlflow.set_tag("model_family", "gradient_boosting")

        # Log parameters (any extra keys in `params` are still logged as given)
        mlflow.log_params({**params, **model_params})

        # Train GradientBoosting model
        gb_model = GradientBoostingRegressor(**model_params, random_state=42)
        gb_model.fit(X_train, y_train)

        # Predict on validation dataset and compute RMSE
        y_pred = gb_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Log RMSE metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}
 
# Search space for GradientBoosting: the first four hyperparameters are drawn
# with hp.quniform(label, low, high, 1); the learning rate is drawn from a
# continuous uniform range and is added last.
_gb_quantised_bounds = {
    'n_estimators': (50, 120),
    'max_depth': (3, 8),
    'min_samples_split': (2, 6),
    'min_samples_leaf': (1, 2),
}
search_space_gb = {
    label: hp.quniform(label, low, high, 1)
    for label, (low, high) in _gb_quantised_bounds.items()
}
search_space_gb['learning_rate'] = hp.uniform('learning_rate', 0.01, 0.1)
 
# Run hyperparameter optimization for GradientBoosting
# Keep a handle on the Trials object so the search history (and the best
# loss) is still available after fmin returns.
trials_gb = Trials()

with mlflow.start_run(run_name="Parent Gradient Boosting", nested=True):
    best_params_gb = fmin(
        fn=objective_gb,
        space=search_space_gb,
        algo=tpe.suggest,
        max_evals=10,
        trials=trials_gb,
        # Seed the sampler so a fresh Restart & Run All explores the same trials.
        rstate=np.random.default_rng(42),
    )

    # Log best parameters, plus the score they achieved, on the parent run
    mlflow.log_params(best_params_gb)
    mlflow.log_metric("best_rmse", trials_gb.best_trial['result']['loss'])



  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 15:57:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run crawling-goose-821 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/779bc7d35b844649bc07023180aabeca.

2024/09/20 15:57:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 10%|█         | 1/10 [00:12<01:53, 12.62s/trial, best loss: 5.441902692404991]



2024/09/20 15:57:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run bittersweet-grub-240 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/87fc083cec8349b6a2704de6deb34b76.

2024/09/20 15:57:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 20%|██        | 2/10 [00:23<01:34, 11.87s/trial, best loss: 5.418700803120237]



2024/09/20 15:57:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run delightful-yak-423 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/eb342df58a834b719569054168a116c3.

2024/09/20 15:57:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 30%|███       | 3/10 [00:39<01:34, 13.48s/trial, best loss: 5.390252493073788]



2024/09/20 15:57:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run salty-conch-586 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/d4015ef4c6de4fa8b81977312fa7ba3e.

2024/09/20 15:57:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 40%|████      | 4/10 [00:52<01:20, 13.48s/trial, best loss: 5.377202836696288]



2024/09/20 15:58:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run beautiful-kit-296 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/a0c26d2bc3364d60a5660be3d01b9877.

2024/09/20 15:58:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 50%|█████     | 5/10 [01:09<01:14, 14.81s/trial, best loss: 5.364124503514281]



2024/09/20 15:58:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run painted-mouse-913 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/202d9b9fed6e45cabb006e8418b3fe3c.

2024/09/20 15:58:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 60%|██████    | 6/10 [01:22<00:55, 13.88s/trial, best loss: 5.364124503514281]



2024/09/20 15:58:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run thoughtful-fowl-529 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/03933df389ef42b7b9a5567b6bc07515.

2024/09/20 15:58:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 70%|███████   | 7/10 [01:37<00:43, 14.38s/trial, best loss: 5.364124503514281]



2024/09/20 15:58:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run amazing-dog-955 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/eaf16c57593d419b90f686bd53d4095a.

2024/09/20 15:58:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 80%|████████  | 8/10 [01:53<00:29, 14.93s/trial, best loss: 5.364124503514281]



2024/09/20 15:59:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run popular-ant-920 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/3737b54664504a889ae891494d6c7598.

2024/09/20 15:59:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



 90%|█████████ | 9/10 [02:06<00:14, 14.25s/trial, best loss: 5.364124503514281]



2024/09/20 15:59:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run overjoyed-worm-149 at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/78c323e4fdad4f78a20690de33fef555.

2024/09/20 15:59:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.



100%|██████████| 10/10 [02:16<00:00, 13.70s/trial, best loss: 5.364124503514281]


2024/09/20 15:59:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run Parent Gradient Boosting at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1/runs/e9737cea7bf64e928d68ba6c363b58c1.
2024/09/20 15:59:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Maria-Paula-PR/nyc-taxi-time-prediction.mlflow/#/experiments/1.


In [17]:
# Ask for the id of the run whose logged model should be registered.
# Strip surrounding whitespace (easy to pick up when pasting) and refuse an
# empty answer, which would otherwise produce a malformed "runs://model" URI.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")
run_uri = f"runs:/{run_id}/model"

# Register that run's model under the shared registry name.
result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 16:02:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 3
Created version '3' of model 'nyc-taxi-model'.


El mejor modelo es el challenger anterior, ya que tiene mejor métrica; por lo tanto, nos quedamos con ese.