In [1]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Datastore
from azureml.widgets import RunDetails
 
from azureml.core import Dataset
 
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.core import PipelineRun, StepRun, PortDataReference
from azureml.pipeline.steps import PythonScriptStep
 
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
 
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
 
from azureml.core.model import Model
 
# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.35.0


In [2]:
ws = Workspace.from_config()

In [3]:
ws

Workspace.create(name='mdl_ml_ops', subscription_id='d674bf87-325c-409a-a63f-001d9725b99e', resource_group='mdl_ml_pipeline')

In [4]:
def_blob_store = ws.get_default_datastore() 

In [5]:
################################################
#### MUST BE RUN THE FIRST TIME YOU DO THIS ####
#### but only needs to be run once          ####
################################################

#def_blob_store.upload_files(["./data/titanic.csv"], target_path="data", overwrite=True)

In [6]:
compute_name = 'cpu-cluster'
if not compute_name in ws.compute_targets :
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                                min_nodes=0,
                                                                max_nodes=1)
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # Show the result
    print(compute_target.get_status().serialize())

compute_target = ws.compute_targets[compute_name]

In [7]:
aml_run_config = RunConfiguration()
 
aml_run_config.target = compute_target
aml_run_config.environment.docker.enabled = True
aml_run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base:latest"
 
aml_run_config.environment.python.user_managed_dependencies = False
 
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn','numpy'], 
    pip_packages=['joblib','azureml-sdk','fusepy'], 
    pin_sdk_version=False)

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [8]:
titanic_data = Dataset.Tabular.from_delimited_files(def_blob_store.path('./data/titanic.csv'))
titanic_data = titanic_data.register(ws, 'titanic_data')

In [9]:
raw_data = titanic_data.as_named_input('raw_data')
train_data = PipelineData("train_data", datastore=def_blob_store).as_dataset()
test_data = PipelineData("test_data", datastore=def_blob_store).as_dataset()
scaler_file = PipelineData("scaler_file", datastore=def_blob_store)
model_file = PipelineData("model_file", datastore=def_blob_store)

In [10]:
source_directory="./prep"
step1 = PythonScriptStep(name="prep_step",
                         script_name="./prep.py", 
                         arguments=["--train", train_data,"--test", test_data,"--scaler",scaler_file],
                         inputs=[raw_data],
                         outputs=[train_data,test_data,scaler_file],                         
                         compute_target=compute_target, 
                         runconfig=aml_run_config,
                         source_directory=source_directory,
                         allow_reuse=True)

In [11]:
source_directory="./train"
step2 = PythonScriptStep(name="train_step",
                         script_name="./train.py", 
                         arguments=["--train", train_data,"--test", test_data,"--model",model_file],
                         inputs=[train_data,test_data],
                         outputs=[model_file],                         
                         compute_target=compute_target, 
                         runconfig=aml_run_config,
                         source_directory=source_directory,
                         allow_reuse=True)

In [12]:
steps = [step1,step2]

In [13]:
pipeline1 = Pipeline(workspace=ws, steps=steps)

In [14]:
pipeline_run1 = Experiment(ws, 'titanic_no_auto').submit(pipeline1, regenerate_outputs=False)

Created step prep_step [fc82b569][130cadb3-d374-4441-a125-235a66d39462], (This step is eligible to reuse a previous run's output)
Created step train_step [88626030][6301bac3-2afa-4d6a-9755-2e44efed19cc], (This step will run and generate new outputs)
Submitted PipelineRun 2995f5bb-0d98-4128-8311-2497448215c6
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/2995f5bb-0d98-4128-8311-2497448215c6?wsid=/subscriptions/d674bf87-325c-409a-a63f-001d9725b99e/resourcegroups/mdl_ml_pipeline/workspaces/mdl_ml_ops&tid=83b02c92-5f26-48ed-9e5b-6c2fca46a8e6
