# Deploy a training pipeline

## Before you start

Run the cell below to verify that all the required packages are installed.

In [None]:
pip install -r "../requirements.txt"

### Connect to your workspace

To connect to a workspace, we need identifier parameters - a subscription ID, resource group name, and workspace name. Since you're working with a compute instance, managed by Azure Machine Learning, you can use the default values to connect to the workspace.

In [14]:
from dotenv import load_dotenv
import os
load_dotenv() # take environment variables from .env.


subscription = os.environ["SUBSCRIPTION"]
resource_group = os.environ["RESOURCE_GROUP"]
ws_name = os.environ["WORKSPACE_NAME"]
compute_cluster = os.environ["COMPUTE_CLUSTER"]

In [2]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# authenticate
credential = DefaultAzureCredential()

# Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription,
    resource_group_name=resource_group,
    workspace_name=ws_name,
)

Create Data store and upload training data

In [3]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

data_path = "../data"
dataset_name = "diabetes-data-train"

patient_dataset = Data(
    path=data_path,
    type=AssetTypes.URI_FOLDER,
    description="Training data for diabetes prediction",
    name=dataset_name,
)
ml_client.data.create_or_update(patient_dataset)

Data({'path': 'azureml://subscriptions/5bab2ebd-b16a-469d-8aa4-abdaea7f9e17/resourcegroups/damen-pipeline-deployment-rg/workspaces/damen-pipeline-deployment-ws/datastores/workspaceblobstore/paths/LocalUpload/177b0cdcf5bcd30ff675ca95a93cbccd/data/', 'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_folder', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'diabetes-data-train', 'description': 'Training data for diabetes prediction', 'tags': {}, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/5bab2ebd-b16a-469d-8aa4-abdaea7f9e17/resourceGroups/damen-pipeline-deployment-rg/providers/Microsoft.MachineLearningServices/workspaces/damen-pipeline-deployment-ws/data/diabetes-data-train/versions/3', 'Resource__source_path': '', 'base_path': 'c:\\Users\\shenglinxu\\OneDrive - Microsoft\\projects\\personal_projects\\mlops-pipeline-creation\\notebooks', 'creation_context': <azure.ai.ml.entities._sy

In [4]:
patient_dataset = ml_client.data.get(
    name="diabetes-data-train", label="latest"
)

print(patient_dataset)

creation_context:
  created_at: '2024-12-09T18:51:50.483978+00:00'
  created_by: System Administrator
  created_by_type: User
  last_modified_at: '2024-12-09T18:51:50.504717+00:00'
description: Training data for diabetes prediction
id: /subscriptions/5bab2ebd-b16a-469d-8aa4-abdaea7f9e17/resourceGroups/damen-pipeline-deployment-rg/providers/Microsoft.MachineLearningServices/workspaces/damen-pipeline-deployment-ws/data/diabetes-data-train/versions/3
name: diabetes-data-train
path: azureml://subscriptions/5bab2ebd-b16a-469d-8aa4-abdaea7f9e17/resourcegroups/damen-pipeline-deployment-rg/workspaces/damen-pipeline-deployment-ws/datastores/workspaceblobstore/paths/LocalUpload/177b0cdcf5bcd30ff675ca95a93cbccd/data/
properties: {}
tags: {}
type: uri_folder
version: '3'



Load components

In [10]:
from azure.ai.ml import load_component
parent_dir = "../components/"

fix_missing_data_component = load_component(source=parent_dir + "fix-missing-data.yml")
normalize_data_component = load_component(source=parent_dir + "normalize-data.yml")
train_decision_tree_component = load_component(source=parent_dir + "train-decision-tree.yml")
train_logistic_regression_component = load_component(source=parent_dir + "train-logistic-regression.yml")
compare_models_component = load_component(source=parent_dir + "compare-models.yml")





Build pipeline

In [11]:
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

@pipeline()
def diabetes_classification(pipeline_job_input):
    clean_data = fix_missing_data_component(input_data=pipeline_job_input)
    normalized_data = normalize_data_component(input_data=clean_data.outputs.output_data)
    train_model_decision_tree = train_decision_tree_component(training_data=normalized_data.outputs.output_data)
    train_model_logistic_regression = train_logistic_regression_component(training_data=normalized_data.outputs.output_data)
    better_model = compare_models_component(model1=train_model_decision_tree.outputs.model_output_decision_tree, 
                                            model1_metrics=train_model_decision_tree.outputs.metrics_output,
                                            model2=train_model_logistic_regression.outputs.model_output_logistic_reg,
                                            model2_metrics=train_model_logistic_regression.outputs.metrics_output)


    return {
        "pipeline_job_transformed_data": normalized_data.outputs.output_data,
        "pipeline_job_trained_model_decision_tree": train_model_decision_tree.outputs.model_output_decision_tree,
        "pipeline_job_trained_model_logistic_regression": train_model_logistic_regression.outputs.model_output_logistic_reg,
        "pipeline_job_better_model": better_model.outputs.better_model,	
    }

pipeline_job = diabetes_classification(Input(type=AssetTypes.URI_FILE, path=patient_dataset.path))
print(pipeline_job)

display_name: diabetes_classification
type: pipeline
inputs:
  pipeline_job_input:
    type: uri_file
    path: azureml://subscriptions/5bab2ebd-b16a-469d-8aa4-abdaea7f9e17/resourcegroups/damen-pipeline-deployment-rg/workspaces/damen-pipeline-deployment-ws/datastores/workspaceblobstore/paths/LocalUpload/177b0cdcf5bcd30ff675ca95a93cbccd/data/
outputs:
  pipeline_job_transformed_data:
    type: uri_folder
  pipeline_job_trained_model_decision_tree:
    type: uri_folder
  pipeline_job_trained_model_logistic_regression:
    type: uri_folder
  pipeline_job_better_model:
    type: uri_folder
jobs:
  clean_data:
    type: command
    inputs:
      input_data:
        path: ${{parent.inputs.pipeline_job_input}}
    component:
      $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
      name: remove_empty_rows
      version: '1'
      display_name: Remove Empty Rows
      type: command
      inputs:
        input_data:
          type: uri_folder
      outputs:
 

In [None]:

# set pipeline level compute
pipeline_job.settings.default_compute = compute_cluster
# set pipeline level datastore
pipeline_job.settings.default_datastore = "workspaceblobstore"

# print the pipeline job again to review the changes
print(pipeline_job)

display_name: diabetes_classification
type: pipeline
inputs:
  pipeline_job_input:
    type: uri_file
    path: azureml://subscriptions/5bab2ebd-b16a-469d-8aa4-abdaea7f9e17/resourcegroups/damen-pipeline-deployment-rg/workspaces/damen-pipeline-deployment-ws/datastores/workspaceblobstore/paths/LocalUpload/177b0cdcf5bcd30ff675ca95a93cbccd/data/
outputs:
  pipeline_job_transformed_data:
    type: uri_folder
  pipeline_job_trained_model_decision_tree:
    type: uri_folder
  pipeline_job_trained_model_logistic_regression:
    type: uri_folder
  pipeline_job_better_model:
    type: uri_folder
jobs:
  clean_data:
    type: command
    inputs:
      input_data:
        path: ${{parent.inputs.pipeline_job_input}}
    component:
      $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
      name: remove_empty_rows
      version: '1'
      display_name: Remove Empty Rows
      type: command
      inputs:
        input_data:
          type: uri_folder
      outputs:
 

In [13]:
# submit job to workspace
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name="pipeline_diabetes_training"
)
pipeline_job

Uploading src (0.01 MBs): 100%|##########| 11241/11241 [00:00<00:00, 51816.53it/s]


pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


Experiment,Name,Type,Status,Details Page
pipeline_diabetes_training,coral_roof_zbhlxsnn0m,pipeline,NotStarted,Link to Azure Machine Learning studio


Go inside AML Studio and check which model performs better and replace `superior_model_node`

In [None]:
# List all child jobs in the job
child_jobs = ml_client.jobs.list(parent_job_name=pipeline_job.name)
superior_model_node = "train_model_decision_tree"

# Traverse and download all the outputs of child job
for child_job in child_jobs:
    # print(child_job.properties)
    if child_job.display_name == superior_model_node:
        props = child_job.properties
        reused_run_id = props.get('azureml.reusedrunid')

        if reused_run_id:
            job_id = reused_run_id
        else:
            job_id = child_job.name
        break

print(job_id)

In [None]:
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes
from azure.core.exceptions import HttpResponseError

run_model = Model(
    path=f"azureml://datastores/workspaceartifactstore/paths/ExperimentRun/dcid.{job_id}/outputs/models/",
    name="decision_tree_diabetes_classifier",
    description="Model trained using decision tree classifier",
    type=AssetTypes.MLFLOW_MODEL,
)

# Attempt to create or update the model
try:
    ml_client.models.create_or_update(run_model)
except HttpResponseError as e:
    print(f"Error: {e}")
    print(f"Response: {e.response}")