In [1]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

In [2]:
import os
workspace_name = os.environ.get('WORKSPACE', 'mlops-project-ml')
subscription_id = os.environ.get('SUBSCRIPTION_ID', '88f330cf-5648-423b-aaa3-bc9705075a61')
resource_group = os.environ.get('RESOURCE_GROUP', 'mlops-project')

In [None]:
import yaml

# Load config from YAML file
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Extract individual variables
subscription_id = config["subscription_id"]
resource_group = config["resource_group"]
workspace_name = config["workspace_name"]

In [3]:
credential = DefaultAzureCredential()

In [4]:
ml_client = MLClient(
    credential, subscription_id, resource_group, workspace_name
)

In [5]:
# Compute Instances need to have a unique name across the region.
from azure.ai.ml.entities import ComputeInstance, AmlCompute

ci_basic_name = "project-ml-compute" # I add the suffix Auto, because we are automatically creating this instance.
ci_basic = ComputeInstance(name=ci_basic_name, size="STANDARD_DS3_v2")
ml_client.begin_create_or_update(ci_basic).result()

ComputeInstance({'state': 'Running', 'last_operation': {'operation_name': 'Start', 'operation_time': '2025-06-08T15:04:17.820Z', 'operation_status': 'Succeeded', 'operation_trigger': 'User'}, 'os_image_metadata': <azure.ai.ml.entities._compute._image_metadata.ImageMetadata object at 0x7b139f5a38e0>, 'services': [{'display_name': 'Jupyter', 'endpoint_uri': 'https://project-ml-compute.westeurope.instances.azureml.ms/tree/'}, {'display_name': 'Jupyter Lab', 'endpoint_uri': 'https://project-ml-compute.westeurope.instances.azureml.ms/lab'}], 'type': 'computeinstance', 'created_on': '2025-06-06T12:05:35.957256+0000', 'provisioning_state': 'Succeeded', 'provisioning_errors': None, 'name': 'project-ml-compute', 'description': None, 'tags': None, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/88f330cf-5648-423b-aaa3-bc9705075a61/resourceGroups/mlops-project/providers/Microsoft.MachineLearningServices/workspaces/mlops-project-ml/computes/project-ml-compute', 'Resource__source_pa

In [6]:
import os

os.makedirs("components/dataprep", exist_ok=True)

conda_content = """
name: aml-dataprep
channels:
  - conda-forge
dependencies:
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  - pip:
    - joblib
"""

with open("components/dataprep/conda.yaml", "w") as f:
    f.write(conda_content.strip())


In [7]:
from azure.ai.ml.entities import Environment
import os

custom_env_name = "aml-dataprep"

pipeline_job_env = Environment(
    name=custom_env_name,
    description="",
    tags={},
    conda_file=os.path.join("components", "dataprep", "conda.yaml"),
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
)
pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
)

Environment with name aml-dataprep is registered to workspace, the environment version is 1


In [8]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output
import os

# Define the component
data_prep_component = command(
    name="dataprep",
    display_name="Data preparation, handle data",
    description="Reads a data asset that is a csv file and handles data preprocessing",
    inputs={
        "data": Input(type="uri_file"),
    },
    outputs={
        "output_data": Output(type="uri_folder", mode="rw_mount"),
    },
    code=os.path.join("components", "dataprep", "code"),  # Path to dataprep.py
    command="""python dataprep.py \
            --data ${{inputs.data}} \
            --output_data ${{outputs.output_data}} \
            """,
    environment="aml-dataprep@latest",  # Your custom environment with PIL
)

# Register the component with your workspace
data_prep_component = ml_client.create_or_update(data_prep_component.component)

# Confirmation
print(f"Component {data_prep_component.name} with Version {data_prep_component.version} is registered")


[32mUploading code (0.0 MBs):   0%|          | 0/1687 [00:00<?, ?it/s][32mUploading code (0.0 MBs): 100%|██████████| 1687/1687 [00:00<00:00, 67312.20it/s]
[39m



Component dataprep with Version 2025-06-08-15-43-40-1909673 is registered


In [9]:
from azure.ai.ml import dsl, Input, Output

@dsl.pipeline(
    compute="project-ml-compute",
    description="Custom data_prep pipeline",
)
def boston_preprocessing_pipeline(
    input_version: str = "1",
    output_version: str = "1",
):
    # Single data prep job for a CSV file
    data_prep_job = data_prep_component(
        data=Input(
            type="uri_file",
            path="azureml:boston_housing_prices_csv:1"
        ),
    )

    # Define the output path in workspace blobstore
    subscription_id = "88f330cf-5648-423b-aaa3-bc9705075a61"
    resource_group = "mlops-project"
    workspace_name = "mlops-project-ml"

    output_path = (
    f"azureml://subscriptions/{subscription_id}/"
    f"resourcegroups/{resource_group}/"
    f"workspaces/{workspace_name}/"
    f"datastores/workspaceblobstore/paths/cleaned_data/"
    )

    data_prep_job.outputs.output_data = Output(
        type="uri_folder",
        path=output_path,
        name = "data_cleaned",
        mode="rw_mount"
    )
    return {
        "cleaned_train_data": data_prep_job.outputs.output_data
    }


In [10]:
pipeline = boston_preprocessing_pipeline()

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
import webbrowser

# Connect to ML workspace
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name
)

# Submit the pipeline
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    experiment_name="data_preprocessing_pipeline"
)

# Open Azure ML Studio view of this job
webbrowser.open(pipeline_job.studio_url)


Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see h

False