In [1]:
from azure.ai.ml import load_component
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import MLClient, Input
from azure.identity import DefaultAzureCredential, EnvironmentCredential
from azure.ai.ml.entities import AmlCompute
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd

In [2]:
print("Cargar username/pass desde archivo .env")
load_dotenv("env.txt")

Cargar username/pass desde archivo .env


True

In [3]:
def get_comput_target(ml_client, name="cpu-cluster", family='Standard_DS2_v2'):
    cpu_compute_target = name
    
    try:
        # let's see if the compute target already exists
        cpu_cluster = ml_client.compute.get(cpu_compute_target)
    except Exception:
        cpu_cluster = AmlCompute(
            name=cpu_compute_target,
            type="amlcompute",
            size=family,
            min_instances=0,
            max_instances=4,
            idle_time_before_scale_down=180,
            tier="Dedicated",
        )
    
        cpu_cluster = ml_client.compute.begin_create_or_update(cpu_cluster).result()

In [4]:
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

Found the config file in: .\config.json


In [5]:
credential.get_token("https://management.azure.com/.default")

AccessToken(token='eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6IjlHbW55RlBraGMzaE91UjIybXZTdmduTG83WSIsImtpZCI6IjlHbW55RlBraGMzaE91UjIybXZTdmduTG83WSJ9.eyJhdWQiOiJodHRwczovL21hbmFnZW1lbnQuYXp1cmUuY29tIiwiaXNzIjoiaHR0cHM6Ly9zdHMud2luZG93cy5uZXQvODRjYmMyMWItZmRmMy00MjcyLWFkMzQtZjRjMGZjZjcwZmExLyIsImlhdCI6MTY5OTQxNzAzMCwibmJmIjoxNjk5NDE3MDMwLCJleHAiOjE2OTk0MjA5MzAsImFpbyI6IkUyRmdZR0Q2S21UM2FqRnYyUG9Qelo4ZUtWNWVBQUE9IiwiYXBwaWQiOiIwODg0MjVjNS01NDkxLTQyMWUtYmUyMy0wMmEyZjAwODc5MjciLCJhcHBpZGFjciI6IjEiLCJpZHAiOiJodHRwczovL3N0cy53aW5kb3dzLm5ldC84NGNiYzIxYi1mZGYzLTQyNzItYWQzNC1mNGMwZmNmNzBmYTEvIiwiaWR0eXAiOiJhcHAiLCJvaWQiOiJiODc2NDFkMy0yOTUwLTQwNzQtYTI2OS01YjI2ODhjNjczOTMiLCJyaCI6IjAuQWIwQUc4TExoUFA5Y2tLdE5QVEFfUGNQb1VaSWYza0F1dGRQdWtQYXdmajJNQlBMQUFBLiIsInN1YiI6ImI4NzY0MWQzLTI5NTAtNDA3NC1hMjY5LTViMjY4OGM2NzM5MyIsInRpZCI6Ijg0Y2JjMjFiLWZkZjMtNDI3Mi1hZDM0LWY0YzBmY2Y3MGZhMSIsInV0aSI6InVIcTl0RHhQbVUtUUFKSkZxWnl6QUEiLCJ2ZXIiOiIxLjAiLCJ4bXNfY2FlIjoiMSIsInhtc190Y2R0IjoxNjk1MjU5MzQxfQ.f8D2-sIIkaWAydhDB

In [6]:
compute_target = get_comput_target(ml_client)

In [7]:
train_component = load_component(source="./train-component/dt_train.yml")
split_data_component = load_component(source="./split-data-component/split.yml")
score_component = load_component(source="./score-component/score.yml")
eval_model_component = load_component(source="./eval-model-component/eval.yml")
clean_data_component = load_component(source="./clean-data-component/clean_data.yml")

In [12]:
# define a pipeline containing 3 nodes: Prepare data node, train node, and score node
@pipeline(
    default_compute='cpu-cluster',
)
def water_potability_decision_tree_dummy(pipeline_input_data):

    clean_data_node = clean_data_component(
        data_set = pipeline_input_data
    )

    split_data_node = split_data_component(
        clean_data = clean_data_node.outputs.data_clean_output, # Directory
        split_ratio_train = 0.8
    )

    train_node = train_component(
        training_data=split_data_node.outputs.data_train, # File
        criterion = 'entropy',
        min_samples_split=2,
        max_depth=None
    )
    
    score_node = score_component(
        test_data=split_data_node.outputs.data_test, # File
        model_input=train_node.outputs.model_output # Folder
    )

    eval_node = eval_model_component(
        scoring_result=score_node.outputs.score_output, # Folder
        target_column = 'Potability'
    )

    return {
        "correlation_graph" : clean_data_node.outputs.corr_matrix_output,
        "model_pkl" : train_node.outputs.model_output,
        "model_metrics": eval_node.outputs.eval_output
    }


# create a pipeline
water_potability_ds =  Input(
            type="uri_file",
            path="azureml://subscriptions/df5b1289-646f-4999-a2a8-7eec46d13e15/resourcegroups/Azure-ML/workspaces/Azureml/datastores/workspaceblobstore/paths/UI/2023-11-08_041356_UTC/water_potability_ds.csv",
        )
pipeline_job = water_potability_decision_tree_dummy(pipeline_input_data=water_potability_ds)



In [13]:
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name="pipeline_water_potability_dummy"
)
pipeline_job

Uploading eval_src (0.0 MBs): 100%|##########| 3120/3120 [00:01<00:00, 2485.50it/s] 




Experiment,Name,Type,Status,Details Page
pipeline_water_potability_dummy,dreamy_ice_33mphs0lwn,pipeline,Preparing,Link to Azure Machine Learning studio
