# Load data and packages

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
import pickle
import joblib

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [22]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

In [59]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

# For open the mlflow ui: mlflow ui --backend-store-uri sqlite:///mlflow.db

<Experiment: artifact_location=('file:///c:/Users/jhorea/OneDrive - Ecopetrol '
 'S.A/Documentos/mlops_zoomcamp_2024/2w_exp_tracking/mlruns/1'), creation_time=1716771880869, experiment_id='1', last_update_time=1716771880869, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [76]:
mlflow.get_artifact_uri()

'file:///c:/Users/jhorea/OneDrive - Ecopetrol S.A/Documentos/mlops_zoomcamp_2024/2w_exp_tracking/mlruns/1/6a05b9a3007a4b6a834b068da2b58aa8/artifacts'

# Relevant functions

In [6]:
def read_clean_data(data_link):
    data = pd.read_parquet(data_link)
    data['duration'] = data.tpep_dropoff_datetime - data.tpep_pickup_datetime
    data['duration'] = data['duration'].apply(lambda x: round(x.total_seconds()/60,4))
    data = data[(data["duration"]>=1) & ((data["duration"]<=60))]
    data['PULocationID']=data['PULocationID'].astype(str)
    data['DOLocationID']=data['DOLocationID'].astype(str)

    return data[['PULocationID','DOLocationID','duration']]    

In [7]:
def create_train_and_val_data(encoder,data_train,data_val,input_columns,target):
    
    encoder.fit(data_train[input_columns])
    column_names=encoder.get_feature_names_out()

    # print(data_train[input_columns])
    # print("--")
    # print(column_names)

    X_train = pd.DataFrame(encoder.transform(data_train[input_columns]),columns=column_names)
    X_val = pd.DataFrame(encoder.transform(data_val[input_columns]),columns=column_names)

    y_train = data_train[target].values
    y_val = data_val[target].values

    return X_train,X_val,y_train,y_val

In [8]:
def val_data_v2(X_val,y_val):
    y_val = pd.Series(y_val)
    row_sum = X_val.sum(axis=1)
    id_obs = X_val[row_sum == 1].index

    X_val_2 = X_val[row_sum == 2]

    #  To gather the y_val taking in consideration only the valid observations
    mask=np.where(y_val.index.isin(id_obs),False,True)
    y_val_2 = y_val[mask]

    return X_val_2,y_val_2
    

# Pre-homework

## 0.1 - Prepare the data

In [40]:
df_january = read_clean_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
df_february = read_clean_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [41]:
encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
X_train,X_val,y_train,y_val = create_train_and_val_data(encoder,df_january,df_february,['PULocationID','DOLocationID'],'duration')

In [42]:
X_val_2,y_val_2 = val_data_v2(X_val, y_val)

In [43]:
del df_january
del df_february
del encoder

In [18]:
model_lasso=Lasso(alpha=0.01)
model_lasso.fit(X_train,y_train)

# Save the model
joblib.dump(model_lasso, 'models/lasso_model.joblib')

del model_lasso

## 0.2 - Example of using mlflow

In [None]:
with mlflow.start_run():

    mlflow.set_tag("developer", "Jhonattan")

    mlflow.log_param("train-data-path", 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
    mlflow.log_param("valid-data-path", 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    model = Lasso(alpha)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    # mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")



## 0.3 - Example of using hyperopt and autolog

In [13]:
import xgboost
from hyperopt import fmin, tpe, hp, Trials

In [45]:
def func_objetivo(params):
    with mlflow.start_run():
        mlflow.set_tag("developer", "Jhonattan")
        mlflow.set_tag("model", "xgboost_2wave")
        mlflow.log_params(params) #Not necessary when autolog enabled

        model = xgboost.XGBRegressor(**params)
        print("train start")
        model.fit(X_train,y_train, eval_set=[(X_train, y_train)])
        print("train finish")

        y_pred = model.predict(X_val_2)
        print("predict finish")
        error = root_mean_squared_error(y_pred,y_val_2)
        mlflow.log_metric("rmse",error)
        
        return error    

In [46]:
mlflow.xgboost.autolog(disable=True) #Conflicted with hyperopt or similar and dont allow the training process to finish

espacio = {
    'n_estimators': hp.choice('n_estimators', range(150, 300)),
    'max_depth': hp.choice('max_depth', range(6, 15)),
    'learning_rate': hp.uniform('learning_rate', 0.1, 0.8),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'random_state': 40,
    'n_jobs': 12,
    'eval_metric' : root_mean_squared_error,
    'early_stopping_rounds' : 30,
}

trials_ = Trials()

best_estimator = fmin(fn=func_objetivo, space=espacio, algo=tpe.suggest, max_evals=10, trials=trials_)

train start                                           
[0]	validation_0-rmse:8.86982	validation_0-root_mean_squared_error:8.86982
[1]	validation_0-rmse:8.54673	validation_0-root_mean_squared_error:8.54673
[2]	validation_0-rmse:8.11128	validation_0-root_mean_squared_error:8.11128
[3]	validation_0-rmse:7.84284	validation_0-root_mean_squared_error:7.84284
[4]	validation_0-rmse:7.77729	validation_0-root_mean_squared_error:7.77729
[5]	validation_0-rmse:7.70128	validation_0-root_mean_squared_error:7.70127
[6]	validation_0-rmse:7.64288	validation_0-root_mean_squared_error:7.64288
[7]	validation_0-rmse:7.58196	validation_0-root_mean_squared_error:7.58196
[8]	validation_0-rmse:7.52681	validation_0-root_mean_squared_error:7.52681
[9]	validation_0-rmse:7.48385	validation_0-root_mean_squared_error:7.48385
[10]	validation_0-rmse:7.43998	validation_0-root_mean_squared_error:7.43998
[11]	validation_0-rmse:7.41535	validation_0-root_mean_squared_error:7.41535
[12]	validation_0-rmse:7.35659	validation_0

In [48]:
# Now apply the best parameters in a model and save it, using autolog
print(best_estimator)

espacio = {
    'n_estimators': 150,
    'max_depth': 14,
    'learning_rate': 0.6841148295731673,
    'subsample': 0.6792524708491178,
    'colsample_bytree': 0.89215565512306,
    'random_state': 40,
    'n_jobs': 12,
    'eval_metric' : root_mean_squared_error,
    'early_stopping_rounds' : 5,
}

mlflow.xgboost.autolog(disable=False) 

with mlflow.start_run():
    mlflow.set_tag("developer", "Jhonattan")
    mlflow.set_tag("model", "xgboost_autolog")
    # mlflow.log_params(espacio) #Not necessary when autolog enabled
    model = xgboost.XGBRegressor(**espacio)
    print("train start")
    model.fit(X_train,y_train, eval_set=[(X_val_2, y_val_2)])
    print("train finish")
    y_pred = model.predict(X_val_2)
    print("predict finish")
    error = root_mean_squared_error(y_pred,y_val_2)
    mlflow.log_metric("rmse",error)
        

{'colsample_bytree': 0.89215565512306, 'learning_rate': 0.6841148295731673, 'max_depth': 8, 'n_estimators': 140, 'subsample': 0.6792524708491178}
train start
[0]	validation_0-rmse:8.24286	validation_0-root_mean_squared_error:8.24286
[1]	validation_0-rmse:7.91777	validation_0-root_mean_squared_error:7.91777
[2]	validation_0-rmse:7.67222	validation_0-root_mean_squared_error:7.67222
[3]	validation_0-rmse:7.54827	validation_0-root_mean_squared_error:7.54827
[4]	validation_0-rmse:7.37122	validation_0-root_mean_squared_error:7.37122
[5]	validation_0-rmse:7.30401	validation_0-root_mean_squared_error:7.30402
[6]	validation_0-rmse:7.24105	validation_0-root_mean_squared_error:7.24105
[7]	validation_0-rmse:7.14162	validation_0-root_mean_squared_error:7.14162
[8]	validation_0-rmse:7.09791	validation_0-root_mean_squared_error:7.09791
[9]	validation_0-rmse:7.00303	validation_0-root_mean_squared_error:7.00303
[10]	validation_0-rmse:6.91394	validation_0-root_mean_squared_error:6.91394
[11]	validation_



train finish
predict finish


## 0.4 Load a model from artifacts

In [None]:
model_path='runs:/d3ebe367cab24a66aed95d2d39eae3fb/model'
loaded_model = mlflow.xgboost.load_model(model_path)

## 0.5 - Delete everything

In [50]:
del model
del alpha
del rmse
del X_train
del X_val
del X_val_2
del y_train
del y_val
del y_val_2
del y_pred
del best_estimator
del espacio
del trials_
del loaded_model


## 0.6 - Interacting with MLflow client

In [63]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [66]:
# List all available experiments
client.search_experiments()

[<Experiment: artifact_location=('file:///c:/Users/jhorea/OneDrive - Ecopetrol '
  'S.A/Documentos/mlops_zoomcamp_2024/2w_exp_tracking/mlruns/1'), creation_time=1716771880869, experiment_id='1', last_update_time=1716771880869, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location=('file:///c:/Users/jhorea/OneDrive - Ecopetrol '
  'S.A/Documentos/mlops_zoomcamp_2024/2w_exp_tracking/mlruns/0'), creation_time=1716771880862, experiment_id='0', last_update_time=1716771880862, lifecycle_stage='active', name='Default', tags={}>]

In [68]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.rmse < 5.5",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=10,
    order_by=["metrics.rmse ASC"]
)

for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: 2d2a44df7541452cb9aea0580c717020, rmse: 5.2793
run id: dcb6fffc66c942cb9303fe96ab38f0b1, rmse: 5.2928
run id: 36f3eb46ccaf4b0593e672918d99234e, rmse: 5.3532
run id: 712d2444f4a94238b9645bc0fa2ba20c, rmse: 5.3679
run id: d3ebe367cab24a66aed95d2d39eae3fb, rmse: 5.3980
run id: 71732e7e52a642c0b98ae21bdef06c3d, rmse: 5.4312
run id: a4e5d6fe9025451d891ec9f788cf01eb, rmse: 5.4448


In [69]:
# Promoting a model to the registry

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

run_id = "2d2a44df7541452cb9aea0580c717020"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="nyc-regressor")

Registered model 'nyc-regressor' already exists. Creating a new version of this model...
Created version '2' of model 'nyc-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1717459722327, current_stage='None', description=None, last_updated_timestamp=1717459722327, name='nyc-regressor', run_id='2d2a44df7541452cb9aea0580c717020', run_link=None, source=('file:///c:/Users/jhorea/OneDrive - Ecopetrol '
 'S.A/Documentos/mlops_zoomcamp_2024/2w_exp_tracking/mlruns/1/2d2a44df7541452cb9aea0580c717020/artifacts/model'), status='READY', status_message=None, tags={}, user_id=None, version=2>

In [73]:
model_name = "nyc-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 2, stage: None


  latest_versions = client.get_latest_versions(name=model_name)


In [75]:
model_version = 2
new_stage = "Staging"

from datetime import datetime
date = datetime.today().date()

client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1717459722327, current_stage='Staging', description='The model version 2 was transitioned to Staging on 2024-06-03', last_updated_timestamp=1717460094861, name='nyc-regressor', run_id='2d2a44df7541452cb9aea0580c717020', run_link=None, source=('file:///c:/Users/jhorea/OneDrive - Ecopetrol '
 'S.A/Documentos/mlops_zoomcamp_2024/2w_exp_tracking/mlruns/1/2d2a44df7541452cb9aea0580c717020/artifacts/model'), status='READY', status_message=None, tags={}, user_id=None, version=2>

# Homework development

## Q1 - Install mlflow - whats the version i have?

In [27]:
# !pip install mlflow

In [25]:
!mlflow --version

mlflow, version 2.13.0


Response: My mlflow version is 2.13.0

## Q2 -- Download and preprocess the data -- How many files were saved to output folder?

In [26]:
!python scripts/preprocess_data.py --raw_data_path ./taxi_data_folder --dest_path ./output_folder

In [27]:
os.listdir('./output_folder')

['dv.pkl', 'test.pkl', 'train.pkl', 'val.pkl']

Response = The number of files are 4

## Q3. Train a model with autolog - What is the value of the `min_samples_split` parameter:

In [52]:
!python scripts/train.py --data_path ./output_folder

2024/06/02 22:25:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Response: Min_samples_split has a value of 2

## Q4 - Launch the tracking server locally - 

Response: default-artifact-root

## Q5. Tune model hyperparameters - What's the best validation RMSE that you got?

In [None]:
!python scripts/hpo.py --data_path ./output_folder

Response: Aprox 5.335

## Q6. Promote the best model to the model registry

In [80]:
!python scripts/register_model.py --data_path ./output_folder

Successfully registered model 'best-regressor'.
Created version '1' of model 'best-regressor'.


response: The best test rmse is aprox 5.5674