In [7]:
# Create the directory if it doesn't exist
!mkdir -p ../data

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1330k  100 1330k    0     0  1440k      0 --:--:-- --:--:-- --:--:-- 1439k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1253k  100 1253k    0     0  2345k      0 --:--:-- --:--:-- --:--:-- 2347k


In [8]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [9]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [10]:
df_train = read_dataframe('../data/green_tripdata_2024-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2024-02.parquet')

In [11]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [12]:
categorical = ['PU_DO']  #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [13]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [23]:
import dagshub
import mlflow


dagshub.init(url="https://github.com/JuanPab2009/nyc-taxi-time-prediction.git", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

https://github.com/JuanPab2009/nys-taxi-time-prediction.mlflow


MlflowException: API request to endpoint /api/2.0/mlflow/experiments/get-by-name failed with error code 404 != 200. Response body: 'Not Found'

In [14]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2024-02")

In [15]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object
s3 = get_repo_bucket_client("JuanPab2009/nys-taxi-time-prediction")

# Upload file
s3.upload_file(
    Bucket="nys-taxi-time-prediction",  # name of the repo
    Filename="../data/green_tripdata_2024-01.parquet",  # local path of file to upload
    Key="train_data.parquet",  # remote path where to upload the file
)
# Upload file
s3.upload_file(
    Bucket="nys-taxi-time-prediction",  # name of the repo
    Filename="../data/green_tripdata_2024-02.parquet",  # local path of file to upload
    Key="eval_data.parquet",  # remote path where to upload the file
)

In [4]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib
import mlflow


In [5]:
mlflow.sklearn.autolog()

def objective(params):
    with mlflow.start_run(nested=True):
         
        # Tag model
        mlflow.set_tag("model_family", "ridge_regression")
        
        # Log parameters
        mlflow.log_params(params)
        
        # Train model
        model = Ridge(**params)
        model.fit(X_train, y_train)
        
        # Log model
        mlflow.sklearn.log_model(model, artifact_path="model")
         
        # Predict in the val dataset
        y_pred = model.predict(X_val)
        
        # Calculate metric
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        
        # Log performance metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}



In [6]:
# Definir el espacio de búsqueda de hiperparámetros para Ridge Regression
search_space = {
    'alpha': hp.loguniform('alpha', -5, 1)
}

# Ejecutar la optimización de hiperparámetros
with mlflow.start_run(run_name="Ridge Hyper-parameter Optimization", nested=True):
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )

    # Convertir parámetros si es necesario
    best_params["alpha"] = float(best_params["alpha"])
    
    # Registrar los mejores parámetros encontrados
    mlflow.log_params(best_params)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "ridge_regression",
            "feature_set_version": 1,
        }
    )

    # Entrenar el modelo con los mejores hiperparámetros
    model = Ridge(**best_params)
    model.fit(X_train, y_train)
    
    # Predecir en el conjunto de validación
    y_pred = model.predict(X_val)
    
    # Calcular RMSE
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    # Guardar el preprocesador (DictVectorizer)
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    
    # Subir el preprocesador como artefacto
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

job exception: name 'X_train' is not defined



  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]


NameError: name 'X_train' is not defined