#### Importando librerías

In [2]:
import sys
# Record which interpreter and Python version the kernel is running on,
# one value per output line.
for runtime_detail in (sys.executable, sys.version, sys.version_info):
    print(runtime_detail)

/anaconda/envs/azureml_py310_sdkv2/bin/python
3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]
sys.version_info(major=3, minor=10, micro=11, releaselevel='final', serial=0)


In [3]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import AmlCompute
import os
from azure.ai.ml.entities import Environment
from azure.ai.ml import command, Input
from azure.ai.ml.sweep import Choice, Uniform, MedianStoppingPolicy
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import AssetTypes

#### Iniciando sesión

In [4]:
# Build the credential object used to authenticate against Azure.
credential = DefaultAzureCredential()
# Create the workspace client from a config file (the cell output reports the
# one it found); every later cell talks to the workspace through `ml_client`.
ml_client = MLClient.from_config(credential=credential)

Found the config file in: /config.json


#### Creación del cluster
El clúster con el que trabajaremos será el mismo que el del orquestador anterior.

In [5]:
from azure.ai.ml.entities import AmlCompute
from azure.core.exceptions import ResourceNotFoundError

# Name of the compute cluster that the jobs in this notebook run on.
compute_name = "DS11-v2-cpu-cluster"

# Get-or-create: look the cluster up first and only provision it when the
# workspace reports that it does not exist.
try:
    _ = ml_client.compute.get(compute_name)
except ResourceNotFoundError:
    print("Creating a new compute target...")
    # Cluster of STANDARD_DS11_V2 nodes sized between 0 and 5 instances.
    compute_config = AmlCompute(
        name=compute_name,
        type="amlcompute",
        size="STANDARD_DS11_V2",
        min_instances=0,
        max_instances=5,
        idle_time_before_scale_down=120,
    )
    # `.result()` waits for the provisioning operation to finish.
    ml_client.begin_create_or_update(compute_config).result()
else:
    print("Found existing compute target.")

Found existing compute target.


#### Definición del environment
Para el environment usaremos Ubuntu como sistema operativo; las dependencias estarán definidas en un archivo YAML ubicado en `./env/conda.yaml`. Este archivo fue obtenido del ejemplo de hiperparámetros del docente.

In [6]:
# Name under which the training environment is registered in the workspace.
env_name = 'sklearn-env'

# Describe the environment (base Docker image + conda dependencies file) and
# register it in one step; `job_env` holds the registered environment, whose
# name and version are referenced when the job is defined.
job_env = ml_client.environments.create_or_update(
    Environment(
        name=env_name,
        description="sklearn 0.24.2",
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
        conda_file='./env/conda.yaml',
    )
)

#### Subir el data set al data blob storage de Azure
En el otro orquestador trabajábamos con un CSV dentro de nuestro directorio, pero ahora debemos subirlo al data blob storage de Azure para poder acceder a él desde cualquier instancia.

In [13]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml import Input

# Local folder that contains the data set to upload.
dataset_dir = './data'

# Describe the whole folder as a `uri_folder` data asset.
my_data = Data(
    name="creditcard_non_atypical",
    description="Fraudulent transaction data set",
    type=AssetTypes.URI_FOLDER,
    path=dataset_dir,
)

# Register the asset in the workspace; the returned object exposes the
# resulting datastore path through `.path`.
uri_folder_data_asset = ml_client.data.create_or_update(my_data)

In [15]:
# Display the datastore URI of the registered data folder.
uri_folder_data_asset.path

'azureml://subscriptions/9d6f3686-6c64-4aea-8d5a-3cd7cb82619b/resourcegroups/ml-proyecto/workspaces/creditcard-ml/datastores/workspaceblobstore/paths/LocalUpload/9597b7c6a70d05cdc62ccdd532b8a614/data/'

#### Definición del Job
Ahora definimos el job, indicando el archivo .py que se ejecutará en cada instancia además de los argumentos necesarios para que pueda correr. Además, debemos añadir el enlace al archivo CSV subido al data blob storage de Azure junto a los hiperparámetros definidos en `inputs`. Esto último es importante, ya que los mismos serán enviados a nuestro archivo .py mediante `command`, donde cada hiperparámetro será ingresado como argumento del script.

In [28]:
# Resolve the CSV location from the data asset registered earlier in this
# notebook instead of pasting its datastore URI by hand: that URI embeds the
# subscription, resource group, workspace and an upload hash, so a literal copy
# goes stale as soon as the data is re-uploaded or the notebook is run against
# a different workspace.
dataset_uri = uri_folder_data_asset.path.rstrip("/") + "/creditcard_non_atypical.csv"

# Command line executed on the compute target. Every `${{inputs.<name>}}`
# placeholder refers to the entry with the same key in `inputs` below.
# NOTE(review): the experiment is named for k-nearest neighbors, yet
# `min_samples_split` is forwarded as well -- confirm train.py really uses it.
train_command = " ".join(
    [
        "python train.py",
        "--dataset_path ${{inputs.dataset_path}}",
        "--min_samples_split ${{inputs.min_samples_split}}",
        "--n_neighbors ${{inputs.n_neighbors}}",
        "--algorithm ${{inputs.algorithm}}",
        "--p ${{inputs.p}}",
        "--weights ${{inputs.weights}}",
    ]
)

# Command job: runs ./src/train.py in the registered environment on the shared
# cluster, with one baseline value per hyperparameter.
job = command(
    code="./src",
    command=train_command,
    # Reference the registered environment as "<name>:<version>".
    environment=f"{job_env.name}:{job_env.version}",
    experiment_name='cc-fraud-detection-exp-k-nearest--neighbors-hyp',
    display_name="cc-fraud-detection-exp-k-nearest--neighbors-hyp",
    inputs={
        "dataset_path": Input(
            type=AssetTypes.URI_FILE,
            path=dataset_uri,
        ),
        "min_samples_split": 3,
        "n_neighbors": 50,
        "algorithm": "auto",
        "p": 1,
        "weights": "uniform",
    },
    compute=compute_name,
)

In [29]:
# Submit the command job to the workspace (the cell output shows the ./src
# folder being uploaded); `returned_job` is the job object that comes back.
returned_job = ml_client.jobs.create_or_update(job)

Uploading src (0.01 MBs):   0%|          | 0/5772 [00:00<?, ?it/s]Uploading src (0.01 MBs): 100%|██████████| 5772/5772 [00:00<00:00, 87292.48it/s]




In [30]:
# Stream the output of the submitted run, identified by its generated name,
# into the notebook.
ml_client.jobs.stream(returned_job.name)

RunId: quiet_dream_xpfs1fcxqg
Web View: https://ml.azure.com/runs/quiet_dream_xpfs1fcxqg?wsid=/subscriptions/9d6f3686-6c64-4aea-8d5a-3cd7cb82619b/resourcegroups/ml-proyecto/workspaces/creditcard-ml

Execution Summary
RunId: quiet_dream_xpfs1fcxqg
Web View: https://ml.azure.com/runs/quiet_dream_xpfs1fcxqg?wsid=/subscriptions/9d6f3686-6c64-4aea-8d5a-3cd7cb82619b/resourcegroups/ml-proyecto/workspaces/creditcard-ml



In [32]:
# Hyperparameter search space: each input of the base job is replaced by a
# discrete set of candidate values.
search_space = {
    "min_samples_split": Choice(values=[3, 5, 7, 9]),
    "n_neighbors": Choice(values=[45, 50, 55, 60]),
    "algorithm": Choice(values=['ball_tree', 'auto']),
    "p": Choice(values=[1, 2]),
    "weights": Choice(values=['uniform', 'distance']),
}

# Calling the command job with new input values yields a copy of it whose
# inputs are the search expressions above.
job_for_sweep = job(**search_space)

# Random sampling over the space, maximizing the "F1 Score" metric, with at
# most 12 trials in total and 5 running at the same time.
sweep_job = job_for_sweep.sweep(
    primary_metric="F1 Score",
    goal="Maximize",
    sampling_algorithm="random",
    compute=compute_name,
    max_total_trials=12,
    max_concurrent_trials=5,
)

# Submit the sweep and stream its log into the notebook output.
returned_sweep_job = ml_client.create_or_update(sweep_job)
ml_client.jobs.stream(returned_sweep_job.name)

RunId: mighty_turtle_4md175v1k9
Web View: https://ml.azure.com/runs/mighty_turtle_4md175v1k9?wsid=/subscriptions/9d6f3686-6c64-4aea-8d5a-3cd7cb82619b/resourcegroups/ml-proyecto/workspaces/creditcard-ml

Streaming azureml-logs/hyperdrive.txt

[2023-10-30T01:47:03.162145][GENERATOR][INFO]Trying to sample '5' jobs from the hyperparameter space
[2023-10-30T01:47:03.7466941Z][SCHEDULER][INFO]Scheduling job, id='mighty_turtle_4md175v1k9_0' 
[2023-10-30T01:47:03.8497875Z][SCHEDULER][INFO]Scheduling job, id='mighty_turtle_4md175v1k9_1' 
[2023-10-30T01:47:03.9264586Z][SCHEDULER][INFO]Scheduling job, id='mighty_turtle_4md175v1k9_2' 
[2023-10-30T01:47:04.1602074Z][SCHEDULER][INFO]Scheduling job, id='mighty_turtle_4md175v1k9_4' 
[2023-10-30T01:47:04.0707722Z][SCHEDULER][INFO]Scheduling job, id='mighty_turtle_4md175v1k9_3' 
[2023-10-30T01:47:04.106026][GENERATOR][INFO]Successfully sampled '5' jobs, they will soon be submitted to the execution target.
[2023-10-30T01:47:04.4471409Z][SCHEDULER][INFO]S