# Deploy a training pipeline

## Before you start

Run the cell below to install the packages listed in `../requirements.txt`.

In [None]:
pip install -r "../requirements.txt"

### Connect to your workspace

To connect to a workspace, we need its identifying parameters: a subscription ID, a resource group name, and a workspace name. The cell below reads them, together with the name of the compute cluster, from the `SUBSCRIPTION`, `RESOURCE_GROUP`, `WORKSPACE_NAME`, and `COMPUTE_CLUSTER` environment variables, which you can define in a `.env` file.

In [1]:
from dotenv import load_dotenv
import os
# Pull the variables defined in a local .env file into the process environment.
load_dotenv()

# Workspace identifiers and the compute target, read in one pass.
# A variable that is not set raises KeyError.
subscription, resource_group, ws_name, compute_cluster = (
    os.environ[variable]
    for variable in (
        "SUBSCRIPTION",
        "RESOURCE_GROUP",
        "WORKSPACE_NAME",
        "COMPUTE_CLUSTER",
    )
)

In [2]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Authenticate with the default Azure credential.
credential = DefaultAzureCredential()

# Identifiers that select the target workspace.
workspace_scope = dict(
    subscription_id=subscription,
    resource_group_name=resource_group,
    workspace_name=ws_name,
)

# Client handle used by the rest of the notebook to talk to the workspace.
ml_client = MLClient(credential=credential, **workspace_scope)

### Create a data asset and upload the training data

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Name to register the asset under, and the folder (relative path) it points at.
dataset_name = "diabetes-data-train"
data_path = "../data"

# Describe that folder as a URI_FOLDER data asset.
patient_dataset = Data(
    name=dataset_name,
    description="Training data for diabetes prediction",
    type=AssetTypes.URI_FOLDER,
    path=data_path,
)

# Register the asset with the workspace; left as the final bare expression so
# the returned asset is displayed as the cell output.
ml_client.data.create_or_update(patient_dataset)

In [3]:
# Fetch the version of the training data asset that carries the "latest" label.
patient_dataset = ml_client.data.get(label="latest", name="diabetes-data-train")

# Print the asset so its id, path, type and version can be checked.
print(patient_dataset)

creation_context:
  created_at: '2024-12-11T00:23:56.378642+00:00'
  created_by: System Administrator
  created_by_type: User
  last_modified_at: '2024-12-11T00:23:56.396134+00:00'
description: Training data for diabetes prediction
id: /subscriptions/5bab2ebd-b16a-469d-8aa4-abdaea7f9e17/resourceGroups/damen-pipeline-test/providers/Microsoft.MachineLearningServices/workspaces/damen-pipeline-deployment-ws/data/diabetes-data-train/versions/4
name: diabetes-data-train
path: azureml://subscriptions/5bab2ebd-b16a-469d-8aa4-abdaea7f9e17/resourcegroups/damen-pipeline-test/workspaces/damen-pipeline-deployment-ws/datastores/workspaceblobstore/paths/LocalUpload/177b0cdcf5bcd30ff675ca95a93cbccd/data/
properties: {}
tags: {}
type: uri_folder
version: '4'



### Load components

In [4]:
import os
from pathlib import Path

def find_project_root(start: Path, marker: str = "py_func_based_pipeline") -> Path:
    """Return the project root for a notebook whose working directory is ``start``.

    The root is identified by the presence of the ``marker`` directory (the
    package the notebook imports its components from).

    Args:
        start: Directory to resolve the project root from.
        marker: Name of a directory expected to sit directly in the project root.

    Returns:
        ``start`` itself when it already contains ``marker``; otherwise
        ``start.parent``, i.e. one level up.
    """
    if (start / marker).is_dir():
        # Already at the root (e.g. this cell was run before): do not climb again.
        return start
    return start.parent


# Determine the project root relative to the current working directory.
notebook_dir = Path.cwd()
project_root = find_project_root(notebook_dir)

# Change the working directory to the project root. Because of the guard in
# find_project_root, re-running this cell no longer moves up one more level
# on every execution.
os.chdir(project_root)

print(project_root)

c:\Users\shenglinxu\OneDrive - Microsoft\projects\personal_projects\mlops-pipeline-creation


In [None]:

from py_func_based_pipeline.components.compare_models import compare_two_models
from py_func_based_pipeline.components.fix_missing_data import remove_empty_rows
from py_func_based_pipeline.components.normalize_data import normalize_data
from py_func_based_pipeline.components.train_decision_tree import train_decision_tree_classifier_model
from py_func_based_pipeline.components.train_logistic_regression import train_logistic_regression_classifier_model




In [None]:
# Register the components with the workspace, in the same order as before.
for component in (
    remove_empty_rows,
    normalize_data,
    train_logistic_regression_classifier_model,
    train_decision_tree_classifier_model,
):
    ml_client.components.create_or_update(component)

# Kept as the final bare expression so its result remains the cell's displayed output.
ml_client.components.create_or_update(compare_two_models)


AttributeError: module 'py_func_based_pipeline.components.fix_missing_data' has no attribute '_is_anonymous'

In [None]:
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

# Training pipeline: wires the imported components into a single graph.
#   pipeline_job_input -> remove_empty_rows -> normalize_data -> both trainers -> compare_two_models
# Returns a dict that maps pipeline-level output names to the step outputs that back them.
# NOTE(review): the local variable names below are presumably picked up by the dsl as
# node names - confirm before renaming any of them.
@pipeline()
def diabetes_classification_py_func(pipeline_job_input):
    # Step 1: pass the raw pipeline input through remove_empty_rows.
    clean_data = remove_empty_rows(input_data=pipeline_job_input)
    # Step 2: feed the cleaned output into normalize_data.
    normalized_data = normalize_data(input_data=clean_data.outputs.output_data)
    # Step 3: both trainer components consume the same normalized output.
    train_model_decision_tree = train_decision_tree_classifier_model(training_data=normalized_data.outputs.output_data)
    train_model_logistic_regression = train_logistic_regression_classifier_model(training_data=normalized_data.outputs.output_data)
    # Step 4: hand both models and their metrics outputs to compare_two_models.
    better_model = compare_two_models(model1=train_model_decision_tree.outputs.model_output_decision_tree, 
                                            model1_metrics=train_model_decision_tree.outputs.metrics_output,
                                            model2=train_model_logistic_regression.outputs.model_output_logistic_reg,
                                            model2_metrics=train_model_logistic_regression.outputs.metrics_output)                             
    
    # Expose the transformed data, both trained models and the comparison result
    # as outputs of the pipeline itself.
    return {
        "pipeline_job_transformed_data": normalized_data.outputs.output_data,
        "pipeline_job_trained_model_decision_tree": train_model_decision_tree.outputs.model_output_decision_tree,
        "pipeline_job_trained_model_logistic_regression": train_model_logistic_regression.outputs.model_output_logistic_reg,
        "pipeline_job_better_model": better_model.outputs.better_model,	
    }

# Wrap the registered dataset's path as the pipeline's single input.
# NOTE(review): the asset is created with AssetTypes.URI_FOLDER earlier in this
# notebook but is passed here as URI_FILE - confirm which type the
# remove_empty_rows component expects.
training_data_input = Input(path=patient_dataset.path, type=AssetTypes.URI_FILE)

# Calling the decorated function builds the pipeline job object.
pipeline_job = diabetes_classification_py_func(training_data_input)

print(pipeline_job)

In [None]:
# Pipeline-level defaults: the compute target and the datastore.
job_settings = pipeline_job.settings
job_settings.default_compute = compute_cluster
job_settings.default_datastore = "workspaceblobstore"

# Submit the job to the workspace under a fixed experiment name.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job,
    experiment_name="pipeline_diabetes_training_py_func",
)

# Bare expression: displays the submitted job as the cell output.
pipeline_job