In [None]:
from azure.ai.ml import MLClient, Input, Output, command
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import Environment
from azure.identity import DefaultAzureCredential


In [None]:
# Build the workspace client from the local Azure ML config, authenticating
# with the default Azure credential chain.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Sanity check: show which workspace the client resolved to, one value per line.
print(
    ml_client.workspace_name,
    ml_client.subscription_id,
    ml_client.resource_group_name,
    sep="\n",
)


In [None]:
# `Input` is already imported in the notebook's imports cell, so it is not
# re-imported here (keeps every import in one place for Restart & Run All).

# Pipeline inputs that reference registered data assets by
# `azureml:<name>:<version>`.
# NOTE(review): both inputs are declared as uri_folder, while the `data` input
# of preprocess_step and the `test_data` input of evaluate_step are declared
# as uri_file. Confirm which type matches the registered assets and align the
# declarations.
train_data = Input(
    type="uri_folder",
    path="azureml:train-data:1"
)

test_data = Input(
    type="uri_folder",
    path="azureml:test-data:1"
)


In [None]:
# Environment shared by every pipeline step: a base Docker image plus an
# inline conda specification (scikit-learn stack from conda-forge, mlflow
# installed through pip).
# NOTE(review): the base image carries no tag and only Python is
# version-constrained, so rebuilds may resolve different package versions.
sk_env = Environment(
    name="sklearn-pipeline-env",
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
    description="Environment for end-to-end ML pipeline",
    conda_file=dict(
        name="sklearn-env",
        channels=["conda-forge"],
        dependencies=[
            "python=3.8",
            "scikit-learn",
            "pandas",
            "numpy",
            "pip",
            dict(pip=["mlflow"]),
        ],
    ),
)

# Create or update the environment in the workspace; as the cell's last
# expression, the returned object is displayed.
ml_client.environments.create_or_update(sk_env)


In [None]:
# Step 1: run preprocess.py from ./Project_pipeline on the `data` input and
# write its results to the `preprocessed` output folder.
# NOTE(review): `data` is declared as uri_file, whereas the notebook's
# `train_data` input is declared as uri_folder. Confirm the intended type.
preprocess_step = command(
    name="preprocess_step",
    environment=sk_env,
    compute="cpu-cluster-mlflow",
    code="./Project_pipeline",
    command="python preprocess.py --data ${{inputs.data}} --output ${{outputs.preprocessed}}",
    inputs=dict(data=Input(type="uri_file")),
    outputs=dict(preprocessed=Output(type="uri_folder")),
)


In [None]:
# Step 2: run train.py from ./Project_pipeline on the `train_data` folder and
# write to the `model` output folder.
train_step = command(
    name="train_step",
    environment=sk_env,
    compute="cpu-cluster-mlflow",
    code="./Project_pipeline",
    command="python train.py --data ${{inputs.train_data}} --model ${{outputs.model}}",
    inputs=dict(train_data=Input(type="uri_folder")),
    outputs=dict(model=Output(type="uri_folder")),
)


In [None]:
# Step 3: run evaluate.py from ./Project_pipeline with the `model` folder and
# the `test_data` input; this step declares no outputs.
# NOTE(review): `test_data` is declared as uri_file, whereas the notebook's
# `test_data` pipeline input is declared as uri_folder. Confirm the intended type.
evaluate_step = command(
    name="evaluate_step",
    environment=sk_env,
    compute="cpu-cluster-mlflow",
    code="./Project_pipeline",
    command="python evaluate.py --model ${{inputs.model}} --test_data ${{inputs.test_data}}",
    inputs=dict(
        model=Input(type="uri_folder"),
        test_data=Input(type="uri_file"),
    ),
)


In [None]:
# End-to-end pipeline: preprocess -> train -> evaluate.
#
# Parameters:
#   train_input: passed to preprocess_step as its `data` input.
#   test_input:  passed to evaluate_step as its `test_data` input.
# Returns a dict exposing the training step's `model` output under the key
# "model".
#
# NOTE(review): the azure-ai-ml DSL appears to derive pipeline node names from
# the local variable names below (preprocess / train / evaluate) — confirm
# before renaming them or dropping the seemingly unused `evaluate` assignment.
@pipeline(compute="cpu-cluster-mlflow")
def end_to_end_pipeline(train_input, test_input):

    # Preprocessing node, fed by the pipeline's training input.
    preprocess = preprocess_step(
        data=train_input
    )

    # Training node consumes the preprocessing node's `preprocessed` output.
    train = train_step(
        train_data=preprocess.outputs.preprocessed
    )

    # Evaluation node receives the trained model and the pipeline's test input.
    # Its result is not referenced again in this function.
    evaluate = evaluate_step(
        model=train.outputs.model,
        test_data=test_input
    )

    # Only the trained model is exposed as a pipeline-level output.
    return {
        "model": train.outputs.model
    }


In [None]:
# List the data assets the client returns, printing each name with its latest
# version (useful for checking the azureml:<name>:<version> references).
for data_asset in ml_client.data.list():
    print(f"{data_asset.name!s} {data_asset.latest_version!s}")


In [None]:
# Instantiate the pipeline with the train/test inputs defined earlier in the
# notebook.
pipeline_job = end_to_end_pipeline(train_input=train_data, test_input=test_data)

# Submit the job through the workspace client; as the cell's last expression,
# the returned job object is displayed.
ml_client.jobs.create_or_update(pipeline_job)


In [None]:
 import pandas as pd

df = pd.read_csv("raw/train.csv")
print(df.columns)
df.head()
