# Train and register a diabetes classification model with an Azure Machine Learning pipeline

### Connect to your workspace

To connect to a workspace, we need three identifier parameters: a subscription ID, a resource group name, and a workspace name. The next cell reads them from the `SUBSCRIPTION`, `RESOURCE_GROUP`, and `WS_NAME` environment variables, which `load_dotenv()` loads from a `.env` file.

In [None]:
from dotenv import load_dotenv
import os
# Take environment variables from the .env file before reading them below.
load_dotenv()

# Identifiers of the target workspace. Each lookup raises KeyError when the
# corresponding variable is not set.
environment = os.environ
subscription = environment["SUBSCRIPTION"]
resource_group = environment["RESOURCE_GROUP"]
ws_name = environment["WS_NAME"]

In [None]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Credential object handed to the client for authentication.
credential = DefaultAzureCredential()

# Identifiers that select the workspace the client is bound to.
workspace_scope = {
    "subscription_id": subscription,
    "resource_group_name": resource_group,
    "workspace_name": ws_name,
}

# Handle used by the rest of the notebook for all workspace operations.
ml_client = MLClient(credential=credential, **workspace_scope)

### Create a data asset and upload the training data

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

dataset_name = "diabetes-data-train"
data_path = "../data"

# Describe the local folder as a URI_FOLDER data asset.
patient_dataset = Data(
    name=dataset_name,
    description="Training data for diabetes prediction",
    type=AssetTypes.URI_FOLDER,
    path=data_path,
)

# Create (or update) the asset in the workspace; kept as the cell's final
# expression so the returned object is the cell's result.
ml_client.data.create_or_update(patient_dataset)

In [None]:
# Rebind patient_dataset to the asset as stored in the workspace, requesting
# the version carrying the "latest" label.
patient_dataset = ml_client.data.get(label="latest", name="diabetes-data-train")

# Show the retrieved asset for a quick visual check.
print(patient_dataset)

### Load components

In [None]:
from azure.ai.ml import load_component
# Directory (with trailing slash) that holds the component YAML definitions.
parent_dir = "../components/"

# One component per YAML file; these names are called as pipeline steps later.
fix_missing_data = load_component(source=f"{parent_dir}fix-missing-data.yml")
normalize_data = load_component(source=f"{parent_dir}normalize-data.yml")
train_decision_tree = load_component(
    source=f"{parent_dir}train-decision-tree.yml"
)
train_logistic_regression = load_component(
    source=f"{parent_dir}train-logistic-regression.yml"
)




### Build pipeline

In [None]:
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

# Pipeline definition: pass the input through the fix-missing-data and
# normalize-data components, then feed the normalized output to both training
# components, and expose three outputs at pipeline level.
# NOTE(review): documented with `#` comments instead of a docstring, and the
# local variable names are left untouched, because the `@pipeline()` decorator
# may inspect both (docstring for descriptions, variable names for step names)
# — confirm against the azure-ai-ml documentation before changing either.
@pipeline()
def diabetes_classification(pipeline_job_input):
    # Step 1: the raw pipeline input goes to the fix-missing-data component.
    clean_data = fix_missing_data(input_data=pipeline_job_input)
    # Step 2: the cleaned output is passed on to the normalize-data component.
    normalized_data = normalize_data(input_data=clean_data.outputs.output_data)
    # Steps 3a/3b: both training components consume the same normalized output.
    train_model_decision_tree = train_decision_tree(training_data=normalized_data.outputs.output_data)
    train_model_logistic_regression = train_logistic_regression(training_data=normalized_data.outputs.output_data)


    # Pipeline-level outputs, keyed by the name they get on the pipeline job.
    # NOTE(review): `model_output_decision_tree` and `model_output_logistic_reg`
    # must match the output names declared in the component YAML files, which
    # are not visible here — verify against those files.
    return {
        "pipeline_job_transformed_data": normalized_data.outputs.output_data,
        "pipeline_job_trained_model_decision_tree": train_model_decision_tree.outputs.model_output_decision_tree,
        "pipeline_job_trained_model_logistic_regression": train_model_logistic_regression.outputs.model_output_logistic_reg,
    }

# Wrap the registered asset's path as the pipeline input.
# NOTE(review): the input is declared as URI_FILE while the data asset was
# registered with AssetTypes.URI_FOLDER — confirm which type the
# fix-missing-data component expects and align the two.
training_input = Input(path=patient_dataset.path, type=AssetTypes.URI_FILE)

# Instantiate the pipeline job from the definition and show it for review.
pipeline_job = diabetes_classification(training_input)
print(pipeline_job)

In [None]:

# Pipeline-wide defaults are set on the job's settings object.
job_settings = pipeline_job.settings

# Default compute target for the pipeline.
job_settings.default_compute = "aml-cluster"
# Default datastore for the pipeline.
job_settings.default_datastore = "workspaceblobstore"

# Print the job once more so the updated settings can be reviewed.
print(pipeline_job)

In [None]:
# Experiment under which the submitted run is recorded.
experiment = "pipeline_diabetes_training"

# Submit to the workspace; pipeline_job is rebound to the object returned by
# the service so later cells can use its name.
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name=experiment)

# Final bare expression: the submitted job is the cell's result.
pipeline_job

In [None]:
# Wait for the submitted pipeline to finish before touching its outputs:
# downloads are only possible once a job has reached a terminal state, so
# running this cell right after submission would otherwise fail.
ml_client.jobs.stream(pipeline_job.name)

# List all child jobs (one per pipeline step) of the pipeline job.
child_jobs = ml_client.jobs.list(parent_job_name=pipeline_job.name)

# Traverse the child jobs and download the decision-tree model output under
# ../outputs, printing each child job's name as it is processed.
# NOTE(review): only the decision-tree training step is expected to declare
# the "model_output_decision_tree" output — confirm what download() does for
# the other child jobs, or restrict the loop to that step.
for child_job in child_jobs:
    ml_client.jobs.download(
        name=child_job.name,
        output_name="model_output_decision_tree",
        download_path="../outputs",
    )
    print(child_job.name)

In [None]:
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes

model_name = 'diabetes-decision-tree-model'

# Describe the locally downloaded decision-tree model file as a custom model.
decision_tree_model = Model(
    name=model_name,
    path='../outputs/named-outputs/model_output_decision_tree/model_decision_tree.sav',
    type=AssetTypes.CUSTOM_MODEL,
)

# Register it in the workspace and keep the returned model object.
model = ml_client.models.create_or_update(decision_tree_model)