# Training and tracking an XGBoost classifier with MLflow in AML

This notebook is similar to the classification training we have in ADB (Azure Databricks).

In [1]:
# Ensure you have the dependencies for this notebook
!pip install -r xgboost_classification_mlflow.txt

Collecting azure-core<1.22
  Downloading azure_core-1.21.1-py2.py3-none-any.whl (178 kB)
[K     |████████████████████████████████| 178 kB 18.8 MB/s eta 0:00:01
[31mERROR: azure-storage-file-share 12.8.0 has requirement azure-core<2.0.0,>=1.23.1, but you'll have azure-core 1.21.1 which is incompatible.[0m
[31mERROR: azure-storage-file-datalake 12.6.0 has requirement azure-storage-blob<13.0.0,>=12.10.0, but you'll have azure-storage-blob 12.9.0 which is incompatible.[0m
[31mERROR: azure-cli 2.35.0 has requirement antlr4-python3-runtime~=4.7.2, but you'll have antlr4-python3-runtime 4.9.3 which is incompatible.[0m
[31mERROR: azure-cli 2.35.0 has requirement azure-graphrbac~=0.60.0, but you'll have azure-graphrbac 0.61.1 which is incompatible.[0m
[31mERROR: azure-cli 2.35.0 has requirement azure-mgmt-resource==20.0.0, but you'll have azure-mgmt-resource 20.1.0 which is incompatible.[0m
[31mERROR: azure-cli 2.35.0 has requirement azure-synapse-spark~=0.2.0, but you'll have azure

# Set the experiment name

In [2]:
import mlflow
# Name of the MLflow experiment used by this notebook.
experiment_name = "heart-condition-classifier"

# Make it the active experiment; as the cell's last expression, the returned
# Experiment object is shown as the cell output.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='', experiment_id='25040467-10f0-42f5-b376-521ef11d746c', lifecycle_stage='active', name='heart-condition-classifier', tags={}>

# Get the workspace configuration

In [1]:
from azureml.core import Workspace, Dataset,Datastore
import azureml.core

# Build the workspace handle from its saved configuration.
ws = Workspace.from_config()

# Print the identifying workspace attributes, one per line.
for workspace_attr in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_attr)

ImportError: cannot import name 'SerializationError' from 'azure.core.exceptions' (/anaconda/envs/azureml_py38/lib/python3.8/site-packages/azure/core/exceptions.py)

# Read from dataset that contains data from deltalake

In [4]:
# Load the latest version of the registered 'deltalake' dataset into pandas.
dataset = Dataset.get_by_name(name='deltalake', workspace=ws, version='latest')
df = dataset.to_pandas_dataframe()
# Show a preview explicitly: a bare `df` in the middle of a cell is never rendered,
# because only a cell's last expression is displayed.
display(df.head())

# Read the Delta Lake table stored under "heart" in the "preview" datastore,
# pinned to table version 0.
# NOTE(review): presumably this is the table backing the dataset above — confirm.
ds2 = Datastore.get(ws, "preview")
path_datastore = "heart"
version_as_of = 0
delta_lake_by_version_new = Dataset.Tabular.from_delta_lake(
    path=(ds2, path_datastore), version_as_of=version_as_of
)

# Use rich display instead of print() so the frame renders as a table.
display(delta_lake_by_version_new.to_pandas_dataframe())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,52,1,1,118,186,0,2,190,0,0.0,2,0,fixed,0
299,43,0,4,132,341,1,2,136,1,3.0,2,0,reversible,1
300,65,1,4,135,254,0,2,127,0,2.8,2,1,reversible,1
301,48,1,4,130,256,1,2,150,1,0.0,1,2,reversible,1


# Data quality

In [5]:
# Replace the categorical "thal" column with its integer category codes so it
# can be used as a numeric feature.
thal_as_category = df["thal"].astype("category")
df["thal"] = thal_as_category.cat.codes

In [6]:
# Inspect which integer codes the encoding produced.
# NOTE(review): the recorded output lists five distinct codes although the preview
# rows only show three labels (fixed, normal, reversible) — check the column for
# unexpected values.
thal_codes = df["thal"]
thal_codes.unique()

array([2, 3, 4, 0, 1], dtype=int8)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. A fixed random_state makes the split,
# and therefore the metrics reported further down, reproducible across kernel
# restarts.
# NOTE(review): the preview output shows an "Unnamed: 0" index column; if it is also
# present in `df` it is being used as a feature — confirm and drop it if so.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("target", axis=1), df["target"], test_size=0.3, random_state=42
)

# Use MLflow in AML the same way you're used to, similar to ADB

In [8]:
# Turn on MLflow autologging for XGBoost; this must run before the model is fitted.
mlflow.xgboost.autolog()

In [9]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score

# Classifier settings, kept in one place and unpacked into the estimator.
xgb_params = {"use_label_encoder": False, "eval_metric": "logloss"}
model = XGBClassifier(**xgb_params)

In [10]:
# Fit and evaluate the classifier inside an MLflow run; `run` is kept so later
# cells can reference the run ID.
with mlflow.start_run() as run:
    # The held-out split doubles as the evaluation set passed to fit().
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Record the hold-out metrics on the run explicitly; printing alone would
    # leave them out of the tracked run.
    mlflow.log_metrics(
        {"test_accuracy": float(accuracy), "test_recall": float(recall)}
    )

    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Recall: %.2f%%" % (recall * 100.0))

	Message: UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.636e7e0a-0ac8-4dff-8210-78d2ed335ed6/metric_info.json already exists.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.636e7e0a-0ac8-4dff-8210-78d2ed335ed6/metric_info.json already exists."
    }
}


Accuracy: 89.01%
Recall: 66.67%


# Register the MLflow model in the AML model registry

In [11]:
# ID of the MLflow run created by the training cell, shown as the cell output.
run_id = run.info.run_id
run_id

'636e7e0a-0ac8-4dff-8210-78d2ed335ed6'

In [16]:
# Name under which the model is registered.
model_name = "databricks-heart-classifier"
# URI pointing at the "model" artifact path of the training run.
model_uri = f"runs:/{run.info.run_id}/model"
mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'databricks-heart-classifier' already exists. Creating a new version of this model...
2022/08/24 07:27:49 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: databricks-heart-classifier, version 3
Created version '3' of model 'databricks-heart-classifier'.


<ModelVersion: creation_timestamp=1661326068486, current_stage='None', description='', last_updated_timestamp=1661326068486, name='databricks-heart-classifier', run_id='cf5a7156-4148-4af5-95d3-ae2e366c79eb', run_link='', source='azureml://experiments/heart-condition-classifier/runs/cf5a7156-4148-4af5-95d3-ae2e366c79eb/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='3'>

# Associate the training dataset with the registered model

In [18]:
from azureml.core.model import Model
# Look up the registered model in the workspace by name.
# NOTE(review): this rebinds `model`, which earlier held the trained XGBClassifier.
model = Model(workspace=ws, name=model_name)
print(f"Name: {model.name}")
print(f"Version: {model.version}")

Name: databricks-heart-classifier
Version: 3


In [None]:
# Link the dataset the model was trained on to the registered model.
# add_dataset_references expects a list of (purpose, dataset) tuples rather than
# a bare dataset object.
model.add_dataset_references([("training", dataset)])