# **Homework:** 02-Experiment Tracking

In [1]:
# Loading utils libraries
import os

# Loading MLFlow libraries
import mlflow

In [2]:
# Q1. What is the version of the MLflow library?
installed_version = mlflow.__version__
print("MLflow version:", installed_version)

MLflow version: 2.13.0


For Q2, I need to run the following command:

```python preprocess_data.py --raw_data_path green_taxi_data --dest_path ./output```

In [4]:
# Q2. Total output files in the directory
files = os.listdir('output')
# Collect the distinct extensions (text after the last dot, dot included).
extensions = {os.path.splitext(name)[1] for name in files}
print(
    "The total number of files in the output directory is:",
    len(files),
    ", whit this extensions:",
    extensions,
)

The total number of files in the output directory is: 4 , whit this extensions: {'.pkl'}


In [5]:
# Q3. Train a model with autolog
import os
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [7]:
# mlflow configuration: track runs in a local SQLite database file
SQLITE_TRACKING_URI = "sqlite:///mlflow.db"
AUTOLOG_EXPERIMENT = "02-experiment-tracking-autolog"

mlflow.set_tracking_uri(SQLITE_TRACKING_URI)
mlflow.set_experiment(AUTOLOG_EXPERIMENT)

INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2024/05/29 18:34:35 INFO mlflow.tracking.fluent: Experiment with name '02-experiment-tracking-autolog' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-traking/mlruns/1', creation_time=1717007675166, experiment_id='1', last_update_time=1717007675166, lifecycle_stage='active', name='02-experiment-tracking-autolog', tags={}>

In [8]:
def load_pickle(filename: str):
    """Read the pickle file at ``filename`` and return the object it contains.

    The file is opened in binary mode and closed again before returning.
    """
    with open(filename, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

In [9]:
data_path = "./output"

# Load the pickled (X, y) pairs for the training and validation splits.
train_file = os.path.join(data_path, "train.pkl")
val_file = os.path.join(data_path, "val.pkl")
X_train, y_train = load_pickle(train_file)
X_val, y_val = load_pickle(val_file)

# Autologging is switched on before the fit so the run below is captured.
mlflow.autolog()

with mlflow.start_run():
    rf = RandomForestRegressor(max_depth=10, random_state=0)
    rf.fit(X_train, y_train)

    # Score the fitted forest on the validation split.
    y_pred = rf.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    print("Random Forest RMSE:", rmse)

2024/05/29 18:34:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Random Forest RMSE: 5.431162180141208


![Q3. result](https://raw.githubusercontent.com/MateoBarrera/mlops-zoomcamp/main/02-experiment-traking/images/Q3-screenshot.png)

Command to run the tracking server:

```mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts```

In [16]:
# Q4. Improve the model with hyperopt
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope

In [17]:
# Runs in this experiment are logged explicitly, so sklearn autologging is turned off.
mlflow.sklearn.autolog(disable=True)

# Send runs to the MLflow tracking server expected at this local address.
SERVER_TRACKING_URI = "http://127.0.0.1:5000"
HYPEROPT_EXPERIMENT = "02-experiment-tracking-hyperopt"

mlflow.set_tracking_uri(SERVER_TRACKING_URI)
mlflow.set_experiment(HYPEROPT_EXPERIMENT)

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-traking/mlruns/2', creation_time=1717011335664, experiment_id='2', last_update_time=1717011335664, lifecycle_stage='active', name='02-experiment-tracking-hyperopt', tags={}>

In [18]:
# Directory holding the pickled train/validation splits loaded in the next cell.
data_path = "./output"
# Number of hyperparameter evaluations; passed to fmin as max_evals.
num_trials = 15

In [19]:

# Reload the pickled (X, y) pairs used by the objective function.
train_path = os.path.join(data_path, "train.pkl")
val_path = os.path.join(data_path, "val.pkl")

X_train, y_train = load_pickle(train_path)
X_val, y_val = load_pickle(val_path)

def objective(params):
    """Fit a random forest with ``params`` and report its validation RMSE.

    Every call opens its own MLflow run and logs the hyperparameters plus
    the resulting ``rmse`` metric to it.

    Returns:
        A dict with the RMSE under ``'loss'`` and ``STATUS_OK`` under
        ``'status'``, as consumed by ``fmin``.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)

        validation_rmse = root_mean_squared_error(y_val, model.predict(X_val))
        mlflow.log_metric("rmse", validation_rmse)

    return {'loss': validation_rmse, 'status': STATUS_OK}

def _quantized_int(label, low, high):
    """Build a quantized-uniform hyperopt dimension (step 1) cast to int."""
    return scope.int(hp.quniform(label, low, high, 1))


search_space = {
    'max_depth': _quantized_int('max_depth', 1, 20),
    'n_estimators': _quantized_int('n_estimators', 10, 50),
    'min_samples_split': _quantized_int('min_samples_split', 2, 10),
    'min_samples_leaf': _quantized_int('min_samples_leaf', 1, 4),
    # Fixed rather than searched, so every trial uses the same forest seed.
    'random_state': 42
}

rstate = np.random.default_rng(42)  # for reproducible results

# Left as the cell's final expression so the best parameters are displayed.
fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=num_trials,
    trials=Trials(),
    rstate=rstate,
)

100%|██████████| 15/15 [01:04<00:00,  4.29s/trial, best loss: 5.335419588556921]


{'max_depth': 19.0,
 'min_samples_leaf': 2.0,
 'min_samples_split': 2.0,
 'n_estimators': 11.0}

# Q6. Promote the best model to the model registry

This step was done in the `preprocess_data.py` file, and the result is shown in the terminal output below.

![Output Image 6](https://raw.githubusercontent.com/MateoBarrera/mlops-zoomcamp/main/02-experiment-traking/images/Q6-screenshot.png)