Kernel: Python 3.10 - SDK v2

### Imports

In [None]:
# import required libraries
from azure.ai.ml import MLClient, Input, command, Output
from azure.identity import DefaultAzureCredential
from azure.ai.ml import command, Input
from azure.ai.ml.entities import (
    AzureBlobDatastore,
    AzureFileDatastore,
    AzureDataLakeGen1Datastore,
    AzureDataLakeGen2Datastore,
    AccountKeyConfiguration,
    Environment,
    BatchEndpoint,
    PipelineComponentBatchDeployment,
    ModelBatchDeployment,
    ModelBatchDeploymentSettings,
    BatchRetrySettings
)

from azure.ai.ml.entities import Environment

In [None]:
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.automl import classification, regression
from azure.ai.ml.entities._job.automl.tabular import TabularFeaturizationSettings

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes, BatchDeploymentOutputAction

In [None]:
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

### Config

In [None]:
# Azure ML workspace coordinates (the values shown are masked; substitute real identifiers)
subscription_id = "XXXXX-XXXXX-XXXXX-XXXXX-XXXXX"
resource_group = "rg-XXXXX-XXXXX"
workspace = "aml-XXXXX-XXXXX"

In [None]:
# Datastore settings: the registered datastore name plus the storage account
# and container names kept alongside it (values are masked).
datastore_name = "XXXXX_demo17"
storage_account_name = "XXXXX"
storage_container_name = "demo17"

In [None]:
# Data asset
# posixpath is imported locally: the original cell called os.path.dirname without
# `os` ever being imported, which raised NameError. posixpath also fits better,
# because the value being split is a forward-slash URI rather than a local path.
import posixpath

data_asset_name = 'Australian_Vehicle_Prices'
data_asset_version = '2'
file_path = 'training/Australian Vehicle Prices.csv'

# Datastore URI of the CSV file. An equivalent wasbs:// form is kept for reference:
# path = f'wasbs://{storage_container_name}@{storage_account_name}.blob.core.windows.net/{file_path}'
path = f'azureml://datastores/{datastore_name}/paths/{file_path}'

# URI of the folder that contains the file (everything before the last '/').
path_dir = posixpath.dirname(path)

print(f'path: {path}')
print(f'path_dir: {path_dir}')

In [None]:
# Conda specification file used when building the custom environment
conda_file = "./environment/preprocessing_env.yaml"

In [None]:
# Name of an already-provisioned Azure ML compute cluster (not created by this notebook)
cluster_name = "cpu-cluster-04"

### Get workspace handle

In [None]:
# Build the client used for every workspace operation in this notebook
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)
print('Workspace handle retrieved')

In [None]:
# Fetch the workspace entity by name
aml_workspace = ml_client.workspaces.get(name=workspace)
# Attributes such as aml_workspace.key_vault can be inspected from here if needed.

### Create environment

In [None]:
# Custom environment definition: a base Docker image with the Conda
# specification from `conda_file` layered on top.
env_docker_conda = Environment(
    name="automl-train-pipeline-custom-environment",
    description="Environment created from a Docker image plus Conda environment.",
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
    conda_file=conda_file,
)

In [None]:
%%time

# Register (or update) the environment defined in env_docker_conda in the workspace.
# The %%time cell magic must remain the first line of this cell.
# The returned environment object is not kept; the pipeline refers to the
# environment by name with the "@latest" label.
print('Creating environment')
ml_client.environments.create_or_update(env_docker_conda)
print('Created environment')

### Pipeline

In [None]:
# Base name passed to the register step of the pipeline and used later in
# this notebook to look up the registered model.
model_base_name = "automl_regression_06082024"

In [None]:
# Define pipeline
@pipeline(description="AutoML Regression Pipeline")
def automl_regression(regression_train_data: Input(type=AssetTypes.MLTABLE)):
    # Three-step pipeline: preprocess -> AutoML regression -> register best model.
    #
    # Only training data is accepted as a pipeline input. Validation and test
    # data are produced by the preprocessing step (see preprocess.py), so the
    # earlier regression_validation_data / regression_test_data inputs are gone.
    #
    # NOTE(review): comments are used instead of a docstring, and the node
    # variables (preprocess_node, regression_node, register_model) keep their
    # names, because the DSL may use the docstring and the local variable names
    # as component/node metadata -- confirm before changing either.

    # ------------------------------------------------------------------ #
    # Step 1: pre-process

    # Command line for preprocess.py: one input folder, three output folders.
    prep_cli = " ".join(
        [
            "python preprocess.py",
            "--train_data ${{inputs.train_data}}",
            "--preprocessed_train_data ${{outputs.preprocessed_train_data}}",
            "--preprocessed_validation_data ${{outputs.preprocessed_validation_data}}",
            "--preprocessed_test_data ${{outputs.preprocessed_test_data}}",
        ]
    )

    prep_component = command(
        name="Prep Data",
        inputs={"train_data": Input(type="mltable")},
        outputs={
            "preprocessed_train_data": Output(type="mltable"),
            "preprocessed_validation_data": Output(type="mltable"),
            "preprocessed_test_data": Output(type="mltable"),
        },
        code="./scripts/preprocess.py",
        command=prep_cli,
        environment="automl-train-pipeline-custom-environment@latest",
        is_deterministic=False,  # do not reuse the output of a previous run
    )

    preprocess_node = prep_component(train_data=regression_train_data)

    # ------------------------------------------------------------------ #
    # Step 2: AutoML regression on the preprocessed splits

    regression_node = regression(
        primary_metric="r2_score",
        target_column_name="Price",
        training_data=preprocess_node.outputs.preprocessed_train_data,
        validation_data=preprocess_node.outputs.preprocessed_validation_data,
        test_data=preprocess_node.outputs.preprocessed_test_data,
        featurization=TabularFeaturizationSettings(mode="AUTO"),
        # "best_model" is declared explicitly so the next step can reference it
        outputs={"best_model": Output(type="mlflow_model")},
    )

    # Trial and time budget for the sweep
    regression_node.set_limits(
        timeout_minutes=30,
        trial_timeout_minutes=25,
        max_trials=5,
        max_concurrent_trials=2,
        enable_early_termination=True,
    )

    # No algorithms blocked; both ensemble types switched off
    regression_node.set_training(
        enable_stack_ensemble=False,
        enable_vote_ensemble=False,
        blocked_training_algorithms=None,
    )

    # ------------------------------------------------------------------ #
    # Step 3: register the best model

    # Command line for register.py: model folder plus the base name to register under.
    register_cli = " ".join(
        [
            "python register.py",
            "--model_input_path ${{inputs.model_input_path}}",
            "--model_base_name ${{inputs.model_base_name}}",
        ]
    )

    register_component = command(
        name="Register Model",
        inputs={
            "model_input_path": Input(type="mlflow_model"),
            # value comes from the notebook-level model_base_name variable
            "model_base_name": model_base_name,
        },
        code="./scripts/register.py",
        command=register_cli,
        environment="azureml://registries/azureml/environments/sklearn-1.5/labels/latest",
        is_deterministic=False,  # do not reuse the output of a previous run
    )

    register_model = register_component(
        model_input_path=regression_node.outputs.best_model
    )

    # ------------------------------------------------------------------ #

In [None]:
# Instantiate the pipeline with the datastore folder URI (path_dir) as the
# training input; a local folder such as "./training-mltable-folder/" is not used.
# Validation and test inputs are not passed: they are created by splitting the
# training data, see preprocess.py.
pipeline_regression = automl_regression(
    regression_train_data=Input(type="mltable", path=path_dir),
)

In [None]:
# set pipeline level compute
# Steps that do not specify their own compute fall back to this cluster.
pipeline_regression.settings.default_compute = cluster_name

### Submit pipeline

In [None]:
# Experiment under which the pipeline job is submitted
experiment_name = "automl_regression_train_pipeline_06082024"

In [None]:
# Submit the pipeline job under the configured experiment
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_regression,
    experiment_name=experiment_name,
)

print('Experiment submitted')

In [None]:
# Display the submitted job object as the cell output
pipeline_job

In [None]:
# Stream the job's logs; this call returns once the job has finished
ml_client.jobs.stream(name=pipeline_job.name)

### Create Batch Endpoint - Training

In [None]:
# Name of the batch endpoint that hosts the training pipeline.
# Names may be at most 32 characters long; check with len(endpoint_name).
endpoint_name = "automl-reg-train-batch-endpoint"
print(f"endpoint_name: {endpoint_name}")

In [None]:
# Local definition of the training batch endpoint (nothing is created yet)
endpoint = BatchEndpoint(
    description='Batch endpoint for regression training on Australian_Vehicle_Prices dataset',
    name=endpoint_name,
)

In [None]:
# Create (or update) the endpoint in the workspace and block until the operation finishes
ml_client.batch_endpoints.begin_create_or_update(endpoint=endpoint).result()

In [None]:
# See endpoint details
# Re-fetch the endpoint by name so `endpoint` holds the object returned by the
# service rather than the local definition, then print it.
endpoint = ml_client.batch_endpoints.get(name=endpoint_name)
print(endpoint)

##### Add deployment - Training

In [None]:
# Create component
# Register the pipeline as a component so it can be deployed behind the batch endpoint.
# automl_regression is called with no arguments here, so regression_train_data
# is not bound to a concrete input on this component.
# NOTE(review): presumably the input is meant to be supplied when the endpoint
# is invoked -- confirm.
pipeline_component = ml_client.components.create_or_update(
    automl_regression().component
)

In [None]:
# Deployment that runs the registered pipeline component on the training endpoint
deployment = PipelineComponentBatchDeployment(
    name="automl-train",  # 32 chars max
    endpoint_name=endpoint.name,
    component=pipeline_component,
    description="A deployment for regression training using AutoML",
    settings={
        "continue_on_step_failure": False,
        "default_compute": cluster_name,
    },
)

In [None]:
# Create (or update) the training deployment and block until the operation finishes
ml_client.batch_deployments.begin_create_or_update(deployment=deployment).result()

In [None]:
# Make the new deployment the endpoint's default: fetch the current endpoint,
# set the default deployment name, then push the change and wait for it.
endpoint = ml_client.batch_endpoints.get(name=endpoint_name)
endpoint.defaults.deployment_name = deployment.name
ml_client.batch_endpoints.begin_create_or_update(endpoint=endpoint).result()

In [None]:
# Report the default deployment name held on the local `endpoint` object
# (the endpoint is not re-fetched from the service in this cell).
print(f"The default deployment is {endpoint.defaults.deployment_name}")

### Create Batch Endpoint - Inference

In [None]:
# Name of the batch endpoint that serves the trained model for inference.
# Names may be at most 32 characters long; check with len(endpoint_inf_name).
endpoint_inf_name = "automl-reg-inf-batch-endpoint"
print(f"endpoint_inf_name: {endpoint_inf_name}")

In [None]:
# Local definition of the inference batch endpoint (nothing is created yet)
endpoint_inf = BatchEndpoint(
    description='Batch endpoint for regression inference on Australian_Vehicle_Prices dataset',
    name=endpoint_inf_name,
)

In [None]:
# Create (or update) the inference endpoint and block until the operation finishes
ml_client.batch_endpoints.begin_create_or_update(endpoint=endpoint_inf).result()

In [None]:
# See endpoint details
# Re-fetch the inference endpoint by name so `endpoint_inf` holds the object
# returned by the service rather than the local definition, then print it.
endpoint_inf = ml_client.batch_endpoints.get(name=endpoint_inf_name)
print(endpoint_inf)

##### Add deployment - Inference

In [None]:
# Look up the model registered under model_base_name using the "latest" label
# and print its version.
# NOTE(review): this assumes register.py registers the model under exactly
# model_base_name (no suffix) -- confirm against the script.
model_latest = ml_client.models.get(name=model_base_name, label="latest")
print(f'model_latest.version: {model_latest.version}')

In [None]:
# Model deployment for the inference endpoint, running on the shared cluster
deployment_inf = ModelBatchDeployment(
    name="automl-inf",
    endpoint_name=endpoint_inf.name,
    description="Automl regression model trained previously",
    model=model_latest,
    compute=cluster_name,
    settings=ModelBatchDeploymentSettings(
        # Parallelism and batching
        instance_count=2,
        max_concurrency_per_instance=2,
        mini_batch_size=10,
        # Output handling: results are appended into one predictions.csv
        output_action=BatchDeploymentOutputAction.APPEND_ROW,
        output_file_name="predictions.csv",
        # Retry policy and log verbosity
        retry_settings=BatchRetrySettings(max_retries=3, timeout=300),
        logging_level="info",
    ),
)

In [None]:
# Create (or update) the inference deployment and block until the operation finishes
ml_client.batch_deployments.begin_create_or_update(deployment=deployment_inf).result()

In [None]:
# Make the inference deployment the endpoint's default: fetch the current
# endpoint, set the default deployment name, then push the change and wait for it.
endpoint_inf = ml_client.batch_endpoints.get(name=endpoint_inf.name)
endpoint_inf.defaults.deployment_name = deployment_inf.name
ml_client.batch_endpoints.begin_create_or_update(endpoint=endpoint_inf).result()

In [None]:
# Report the default deployment name held on the local `endpoint_inf` object
# (the endpoint is not re-fetched from the service in this cell).
print(f"The default deployment is {endpoint_inf.defaults.deployment_name}")