Steps:

1) You need to access the mlops-zoomcamp virtual env from the terminal

2) go to the "mlops-zoomcamp/cohorts/2023/02-experiment-tracking/homework" directory

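3) Create a "data" directory and download the data into it (run the commands below from inside "data", since the preprocessing step reads the raw files from there):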

* wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-03.parquet
* wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-02.parquet
* wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet

4) install the required packages:

- mlflow
- jupyter
- scikit-learn
- pandas
- seaborn
- hyperopt
- optuna (imported by the Q4 hyperparameter-tuning cell)
- fastparquet
- boto3

### Q1. Install the package

In [1]:
import mlflow
!pip show mlflow

Name: mlflow
Version: 2.3.2
Summary: MLflow: A Platform for ML Development and Productionization
Home-page: https://mlflow.org/
Author: Databricks
Author-email: 
License: Apache License 2.0
Location: /home/ubuntu/anaconda3/lib/python3.10/site-packages
Requires: alembic, click, cloudpickle, databricks-cli, docker, entrypoints, Flask, gitpython, gunicorn, importlib-metadata, Jinja2, markdown, matplotlib, numpy, packaging, pandas, protobuf, pyarrow, pytz, pyyaml, querystring-parser, requests, scikit-learn, scipy, sqlalchemy, sqlparse
Required-by: 


### Q2. Download and preprocess the data

run the preprocessing script:

* python preprocess_data.py --raw_data_path data --dest_path ./output

In [2]:
# the size in bytes
! ls -la output/dv.pkl

-rw-rw-r-- 1 ubuntu ubuntu 153660 Jun  1 14:58 output/dv.pkl


### Q3. Train a model with autolog

In [3]:
import os
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

data_path = './output'

def load_pickle(filename: str):
    """Return the object deserialized from the pickle file at *filename*."""
    with open(filename, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# Load the training and validation splits from `data_path`; each pickle is
# unpacked into two objects (presumably features and target -- confirm against
# the script that wrote the files).
X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))


# Point MLflow at the SQLite tracking store and select the experiment that the
# following runs are recorded under.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("mlops-zoomcamp-week2-homework")

<Experiment: artifact_location='/home/ubuntu/mlops-zoomcamp/cohorts/2023/02-experiment-tracking/homework/mlruns/2', creation_time=1685640459999, experiment_id='2', last_update_time=1685640459999, lifecycle_stage='active', name='mlops-zoomcamp-week2-homework', tags={}>

In [6]:
# Q3 training run: fit a random forest, score it on the validation split and
# record the parameters, the metric and the pickled model in MLflow.
with mlflow.start_run():

    mlflow.set_tag("developer", "niiaz")

    mlflow.log_param("train-data-path", os.path.join(data_path, "train.pkl"))
    mlflow.log_param("valid-data-path", os.path.join(data_path, "val.pkl"))

    max_depth = 10
    mlflow.log_param("max_depth", max_depth)

    random_state = 0
    mlflow.log_param("random_state", random_state)

    rf = RandomForestRegressor(max_depth=max_depth, random_state=random_state)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    # Take the square root of the MSE explicitly instead of passing
    # `squared=False`, which newer scikit-learn releases no longer accept.
    # NOTE(review): equal to the old result for a single-output target;
    # confirm y_val is 1-D.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Write the model fitted above to disk before logging it, so the artifact
    # exists and is the model trained in this run rather than a stale file.
    model_path = "models/rfr.bin"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, "wb") as f_out:
        pickle.dump(rf, f_out)
    mlflow.log_artifact(local_path=model_path, artifact_path="models_pickle")
    

### Q4. Tune model hyperparameters

In [7]:
import os
import pickle
import mlflow
import optuna

from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random-forest-hyperopt1")


def load_pickle(filename):
    """Open *filename* in binary mode and return the unpickled object."""
    with open(filename, mode="rb") as stream:
        contents = pickle.load(stream)
    return contents


In [8]:
def run_optimization(data_path: str, num_trials: int):
    """Tune a RandomForestRegressor with Optuna, logging each trial to MLflow.

    Args:
        data_path: Directory containing ``train.pkl`` and ``val.pkl``.
        num_trials: Number of Optuna trials to run.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(trial):
        """Train one candidate forest in its own MLflow run; return its validation RMSE."""
        with mlflow.start_run():
            # `step` is left at its default of 1: passing it positionally is
            # deprecated in Optuna, and the search space is unchanged.
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 10, 50),
                'max_depth': trial.suggest_int('max_depth', 1, 20),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
                'random_state': 42,
                'n_jobs': -1,
            }
            mlflow.log_params(params)
            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)
            # Take the square root explicitly instead of `squared=False`, which
            # newer scikit-learn releases no longer accept.
            # NOTE(review): identical for a single-output target; confirm y_val is 1-D.
            rmse = mean_squared_error(y_val, y_pred) ** 0.5
            mlflow.log_metric("rmse", rmse)

        return rmse

    # A fixed sampler seed keeps the sequence of suggested parameters reproducible.
    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=num_trials)

# Directory holding the pickled splits, and the number of Optuna trials to run.
data_path = './output'
num_trials = 10

# Start the hyperparameter search when this code is executed as the main module.
if __name__ == '__main__':
    run_optimization(data_path=data_path, num_trials=num_trials)

[I 2023-06-03 04:39:55,727] A new study created in memory with name: no-name-66f0eba2-64b1-475a-9835-cfe463f74a8b
[I 2023-06-03 04:39:57,950] Trial 0 finished with value: 2.451379690825458 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.451379690825458.
[I 2023-06-03 04:39:58,201] Trial 1 finished with value: 2.4667366020368333 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.451379690825458.
[I 2023-06-03 04:40:00,287] Trial 2 finished with value: 2.449827329704216 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.449827329704216.
[I 2023-06-03 04:40:00,900] Trial 3 finished with value: 2.460983516558473 and parameters: {'n_estimators': 44, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 2 with value: 2.44982732

### Q5. Promote the best model to the model registry

In [9]:
import os
import pickle
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Directory holding the pickled splits, and how many of the best tuning runs
# are retrained.
data_path = './output'
top_n = 5

# Experiment the tuning runs are read from, and experiment the retrained
# models are logged to.
HPO_EXPERIMENT_NAME = "random-forest-hyperopt1"
EXPERIMENT_NAME = "random-forest-best-models"
# Hyperparameter names that train_and_log_model casts to int before training.
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state', 'n_jobs']

# Send runs to the tracking server at this address, select the target
# experiment and enable scikit-learn autologging.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()


def load_pickle(filename):
    """Read the pickle file *filename* and return the object it contains."""
    with open(filename, mode="rb") as handle:
        restored = pickle.load(handle)
    return restored


def train_and_log_model(data_path, params):
    """Retrain a random forest with the given hyperparameters and log its RMSE.

    The model is trained inside a new MLflow run; validation and test RMSE are
    logged as ``val_rmse`` and ``test_rmse``.

    Args:
        data_path: Directory containing ``train.pkl``, ``val.pkl`` and
            ``test.pkl``.
        params: Mapping from hyperparameter name to value. Every name in
            ``RF_PARAMS`` must be present with a value ``int()`` accepts
            (e.g. the string values stored on an MLflow run). The mapping is
            not modified, and keys outside ``RF_PARAMS`` are ignored.

    Raises:
        KeyError: If a name in ``RF_PARAMS`` is missing from ``params``.
        ValueError: If a value cannot be converted to ``int``.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        # Build a fresh dict instead of casting in place, so the caller's
        # mapping is left untouched and only known estimator arguments are
        # forwarded.
        rf_params = {name: int(params[name]) for name in RF_PARAMS}

        rf = RandomForestRegressor(**rf_params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets. The square root is
        # taken explicitly instead of `squared=False`, which newer scikit-learn
        # releases no longer accept.
        # NOTE(review): identical for a single-output target; confirm y is 1-D.
        val_rmse = mean_squared_error(y_val, rf.predict(X_val)) ** 0.5
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test)) ** 0.5
        mlflow.log_metric("test_rmse", test_rmse)


def run_register_model(data_path: str, top_n: int):
    """Retrain the best tuning runs and register the one with the lowest test RMSE.

    Args:
        data_path: Directory containing the pickled splits, passed through to
            ``train_and_log_model``.
        top_n: Number of tuning runs (lowest ``rmse`` first) to retrain.

    Raises:
        IndexError: If the best-models experiment contains no active runs.
    """
    # NOTE(review): this client reads the SQLite store directly, while the
    # global tracking URI set at module level is the HTTP server; confirm both
    # point at the same backend.
    client = MlflowClient(tracking_uri="sqlite:///mlflow.db")

    # Retrieve the top_n model runs and log the models
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"],
    )
    for run in runs:
        train_and_log_model(data_path=data_path, params=run.data.params)

    # Select the model with the lowest test RMSE. The retrained runs log
    # `test_rmse` (not `rmse`), so that is the metric to order by; only the
    # first result is needed.
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.test_rmse ASC"],
    )[0]

    # Register the best model.
    # NOTE(review): assumes the run stored its model under the "model"
    # artifact path (presumably via autologging) -- confirm.
    model_uri = f"runs:/{best_run.info.run_id}/model"
    mlflow.register_model(model_uri=model_uri, name="homework-week2-rfregressor")


# Retrain and register the best model when this code is executed as the main module.
if __name__ == '__main__':
    run_register_model(data_path, top_n)


Registered model 'homework-week2-rfregressor' already exists. Creating a new version of this model...
2023/06/03 04:40:46 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: homework-week2-rfregressor, version 3
Created version '3' of model 'homework-week2-rfregressor'.
