### Prerequisites to run this notebook

In [1]:
# Ensure you have the dependencies for this notebook
#%pip install -r logging_model_with_mlflow.txt

In [2]:
import sys, os
import mlflow
import mlflow.azureml

import azureml.core
from azureml.core import Workspace
from mlflow.tracking import MlflowClient

import pandas as pd

from azureml.core import Dataset
#from azureml.data.dataset_factory import DataType
import tempfile

# Report the library versions this notebook was executed with.
for label, version in (
    ("SDK version:", azureml.core.VERSION),
    ("MLflow version:", mlflow.version.VERSION),
):
    print(label, version)

SDK version: 1.44.0
MLflow version: 1.28.0


In [4]:
# Select the experiment by name; `set_experiment` returns the experiment
# object, whose id is reused for the tagging and lookup calls below.
experiment_name = "TrackDatasets"
experiment = mlflow.set_experiment(experiment_name)
experiment_id = experiment.experiment_id

# Attach a version tag to the experiment through the tracking client.
client = MlflowClient()
client.set_experiment_tag(experiment_id, "exper ver", "1")

# Fetch the experiment again so the tag set above is included, then print
# its metadata.
experiment = client.get_experiment(experiment_id)
print("Name: {}".format(experiment.name))
print("Experiment_id: {}".format(experiment.experiment_id))
print("Artifact Location: {}".format(experiment.artifact_location))
print("Tags: {}".format(experiment.tags))
print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

Name: TrackDatasets
Experiment_id: 1ae4018f-adf2-4b54-ae2c-ad04fe6ddca0
Artifact Location: 
Tags: {'exper ver': '1'}
Lifecycle_stage: active


In [3]:
# Location of the source CSV; the same URL is logged later as the dataset pointer.
file_url = "http://storage.googleapis.com/download.tensorflow.org/data/heart.csv"

# Read the CSV and replace the `thal` column with its integer category codes.
df = pd.read_csv(file_url).assign(
    thal=lambda frame: frame["thal"].astype("category").cat.codes
)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. The seed is fixed so that the
# split (and therefore the training data logged with the run and the
# reported accuracy) is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("target", axis=1), df["target"], test_size=0.3, random_state=42
)

There are two main approaches to tracking datasets. The first is to save a copy of your input data as a run artifact:

```
    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, 'train_dataset.csv')  # write the training data inside the temporary directory
        X_train.to_csv(path)
        mlflow.log_artifacts(tmp, artifact_path='saveDatasetVer')  # upload the directory's contents to the run
```

If the dataset is too large to copy, the second approach is to log the path to the dataset as a parameter:

```
    mlflow.log_param('dsPath', file_url)   # track the pointer to the data

```


In [6]:
import mlflow
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from mlflow.models import infer_signature

with mlflow.start_run():
    # Autologging logs the trained model together with an input example and
    # a model signature, and registers each trained model as a new version
    # of the registered model named below.
    mlflow.xgboost.autolog(
        log_models=True,
        log_input_examples=True,
        log_model_signatures=True,
        registered_model_name='DatastetsModel',
    )

    model = XGBClassifier(use_label_encoder=False, eval_metric="auc")
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Dataset tracking, approach 2: log a pointer to the data as a parameter.
    # The key is plain ASCII; it previously ended with a Cyrillic look-alike
    # character, so runs logged before this change carry the old key.
    mlflow.log_param('dsPath', file_url)

    # Dataset tracking, approach 1: log a copy of the training data.
    # The CSV must be written *inside* the temporary directory, because
    # `log_artifacts` uploads that directory's contents; `artifact_path`
    # places the file under `saveDatasetVer/` in the run's artifacts.
    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, 'train_dataset.csv')
        X_train.to_csv(path)
        mlflow.log_artifacts(tmp, artifact_path='saveDatasetVer')

    # Infer the signature from the model's predictions rather than from the
    # ground-truth labels, so the output schema describes what the model
    # actually returns.
    signature = infer_signature(X_test, y_pred)
    mlflow.xgboost.log_model(model, "classifier", signature=signature)

Registered model 'DatastetsModel' already exists. Creating a new version of this model...
2022/10/04 11:03:19 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: DatastetsModel, version 2
Created version '2' of model 'DatastetsModel'.
  outputs = _infer_schema(model_output) if model_output is not None else None


In [10]:
# End the currently active MLflow run, if any.
# NOTE(review): the training cell opens its run with `with mlflow.start_run():`,
# which presumably closes the run when the block exits, so this call looks
# like a safety net for interrupted executions — confirm before removing.
mlflow.end_run()

**log_models** – If True, trained models are logged as MLflow model artifacts. If False, trained models are not logged. Input examples and model signatures, which are attributes of MLflow models, are also omitted when log_models is False.  


**registered_model_name** – If given, each time a model is trained, it is registered as a new model version of the registered model with this name. The registered model is created if it does not already exist.    

**log_input_examples** – If True, input examples from training datasets are collected and logged along with XGBoost model artifacts during training. If False, input examples are not logged. Note: Input examples are MLflow model attributes and are only collected if log_models is also True.

**log_model_signatures** – If True, ModelSignatures describing model inputs and outputs are collected and logged along with XGBoost model artifacts during training. If False, signatures are not logged. Note: Model signatures are MLflow model attributes and are only collected if log_models is also True.
