### Deploy batch inference pipelines with Azure Machine Learning

Long-running tasks that operate on large volumes of data are performed as _batch operations_. In machine learning, _batch inferencing_ is used to apply a predictive model to multiple cases asynchronously, usually writing the results to a file or database.

To create a batch inference pipeline, perform the following tasks:
1. Register/Retrieve a model
2. Create a scoring script that loads the model and predicts on a new batch of data
3. Create a pipeline with a ParallelRunStep, which performs parallel batch inferencing: it reads batches of files from the input dataset and writes the results to an OutputFileDatasetConfig.
4. Run the pipeline and retrieve the output.

Then, once the pipeline has been run, it can be published as a REST API service. It can also be scheduled.

In [1]:
import os

from azureml.core import Workspace

# Connect to the Azure ML workspace; Workspace.from_config() takes the connection
# details from a saved workspace configuration rather than from arguments here.
ws = Workspace.from_config()

In [3]:
# 1. Retrieve the registered model named 'diabetes_model' from the workspace.
# NOTE(review): `model` is not referenced by any later cell shown here; presumably
# the entry script loads the model itself — confirm in entry_pipeline.py.
model = ws.models['diabetes_model']

In [11]:
# Build the batch input data: 100 randomly sampled rows, written as one CSV file each.

import pandas as pd
import os

# Make 'workspaceblobstore' the workspace default datastore and keep a handle to it.
ws.set_default_datastore('workspaceblobstore')
default_ds = ws.get_default_datastore()

# Columns written to each batch file, in this order.
feature_columns = [
    'Pregnancies',
    'PlasmaGlucose',
    'DiastolicBloodPressure',
    'TricepsThickness',
    'SerumInsulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
]

# .values turns the sampled frame into a NumPy array with one row per sampled case.
diabetes = pd.read_csv('./Script/data/diabetes.csv')
sample = diabetes[feature_columns].sample(n=100).values

batch_folder = './Script/batch_data'
os.makedirs(batch_folder, exist_ok=True)

# Files are named 1.csv, 2.csv, ...; each holds a single comma-separated row.
for file_number, row in enumerate(sample, start=1):
    row.tofile(os.path.join(batch_folder, f'{file_number}.csv'), sep=",")
print("files saved!")

files saved!


In [15]:
# Upload the contents of ./Script/batch_data/ to the "batch-data" path of the
# default datastore, overwriting existing files and printing progress.
# NOTE(review): recent azureml-core releases flag Datastore.upload as deprecated in
# favour of Dataset.File.upload_directory — confirm against the installed SDK version.
default_ds.upload(src_dir="./Script/batch_data/",
                  target_path="batch-data",
                  overwrite=True,
                  show_progress=True)


Uploading an estimated of 100 files
Uploading ./Script/batch_data\36.csv
Uploaded ./Script/batch_data\36.csv, 1 files out of an estimated total of 100
Uploading ./Script/batch_data\38.csv
Uploaded ./Script/batch_data\38.csv, 2 files out of an estimated total of 100
Uploading ./Script/batch_data\40.csv
Uploaded ./Script/batch_data\40.csv, 3 files out of an estimated total of 100
Uploading ./Script/batch_data\10.csv
Uploaded ./Script/batch_data\10.csv, 4 files out of an estimated total of 100
Uploading ./Script/batch_data\12.csv
Uploaded ./Script/batch_data\12.csv, 5 files out of an estimated total of 100
Uploading ./Script/batch_data\14.csv
Uploaded ./Script/batch_data\14.csv, 6 files out of an estimated total of 100
Uploading ./Script/batch_data\15.csv
Uploaded ./Script/batch_data\15.csv, 7 files out of an estimated total of 100
Uploading ./Script/batch_data\18.csv
Uploaded ./Script/batch_data\18.csv, 8 files out of an estimated total of 100
Uploading ./Script/batch_data\2.csv
Uploaded

$AZUREML_DATAREFERENCE_74e2cd248dba44958712ee326b871340

In [16]:
from azureml.core import Datastore, Dataset

# Build a file dataset over the 'batch-data/' folder of the default datastore.
batch_data_path = (default_ds, 'batch-data/')
batch_data_set = Dataset.File.from_files(path=batch_data_path, validate=False)

# Registration is best-effort: on failure the error is printed and the
# unregistered dataset created above is kept.
try:
    registered_data_set = batch_data_set.register(
        workspace=ws,
        name='batch-data',
        description='batch data',
        create_new_version=True,
    )
except Exception as ex:
    print(ex)
else:
    batch_data_set = registered_data_set

In [17]:
# Get the compute cluster used for batch inferencing, creating it if it does not exist.

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = 'aml-cluster'

try:
    # Reuse the cluster when one with this name already exists in the workspace.
    inference_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Only a failed lookup should trigger provisioning. A bare `except:` would
    # also swallow KeyboardInterrupt and unrelated errors (for example bad
    # credentials) and then try to create a cluster anyway.
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        inference_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        # Block until provisioning has finished so a provisioning failure is
        # reported in this cell rather than when the pipeline is submitted.
        inference_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Best-effort, as in the original: report the problem and carry on.
        # `inference_cluster` may be undefined after this branch.
        print(ex)

In [18]:
# Create the local folder that holds the pipeline's scoring code.
# This folder is passed to ParallelRunConfig as `source_directory` in a later cell,
# together with the entry script name 'entry_pipeline.py'.

import os

experiment_folder = './Script/batch_pipeline'
# exist_ok=True makes re-running this cell harmless.
os.makedirs(experiment_folder, exist_ok=True)

In [23]:
# Retrieve the environment named 'experiment_env' from the workspace and set its
# Docker base image to the SDK's DEFAULT_CPU_IMAGE constant.
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core import Environment

env = Environment.get(workspace = ws, name = 'experiment_env')
env.docker.base_image = DEFAULT_CPU_IMAGE

In [26]:
# Set the steps of pipeline: a single ParallelRunStep that scores the batch dataset.
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig

# Named output of the step; a later cell fetches it with get_output_data('inferences').
output_dir = OutputFileDatasetConfig(name = 'inferences')

# Parameter notes below follow the ParallelRunConfig reference — confirm against
# the installed SDK version.
parallel_run_config = ParallelRunConfig(
    source_directory=experiment_folder,  # folder containing the scoring code
    entry_script='entry_pipeline.py',  # script inside source_directory that is run for each mini-batch
    mini_batch_size='5',  # for a file dataset: number of files per mini-batch
    error_threshold=10,  # number of failed items tolerated before the step is aborted
    output_action='append_row',  # collect all results in one file; a later cell looks for 'parallel_run_step.txt'
    environment=env,
    compute_target=inference_cluster,
    node_count=2)

parallel_run_step = ParallelRunStep(
    name = 'batch-score-diabetes',
    parallel_run_config=parallel_run_config,
    inputs = [batch_data_set.as_named_input('diabetes_batch')],  # the batch dataset, exposed under the name 'diabetes_batch'
    output=output_dir,
    arguments=[],
    allow_reuse=True
)

In [28]:
# Define the pipeline (one step), submit it as a run of the 'batch-inferencing'
# experiment and wait for it to finish.

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline = Pipeline(workspace=ws, steps=[parallel_run_step])

exp = Experiment(workspace=ws, name = 'batch-inferencing')
run = exp.submit(pipeline)
# Blocks while streaming the run log; the returned status is shown as the cell output.
run.wait_for_completion(show_output=True)

Created step batch-score-diabetes [6f0143be][bd148f6c-e1a7-4c4c-b37a-2be682e946db], (This step will run and generate new outputs)
Submitted PipelineRun a1f4c596-3c36-498a-b282-429969b8f698
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/a1f4c596-3c36-498a-b282-429969b8f698?wsid=/subscriptions/d12c1b85-0a70-4232-b483-12d1ffcfc148/resourcegroups/ResourceGroupRavazzi/workspaces/ravazzil-workspace&tid=b00367e2-193a-4f48-94de-7245d45c0947
PipelineRunId: a1f4c596-3c36-498a-b282-429969b8f698
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/a1f4c596-3c36-498a-b282-429969b8f698?wsid=/subscriptions/d12c1b85-0a70-4232-b483-12d1ffcfc148/resourcegroups/ResourceGroupRavazzi/workspaces/ravazzil-workspace&tid=b00367e2-193a-4f48-94de-7245d45c0947
PipelineRun Status: Running


StepRunId: 2748a8f6-840b-41bb-8a00-192f5da40aec
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/2748a8f6-840b-41bb-8a00-192f5da40aec?wsid=/subscriptions/d12c1b85-0a70-4232-b483-12d

'Finished'

In [29]:
import os

import pandas as pd

# Get the run for the first step and download its output
prediction_run = next(run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='diabetes-results')

# Traverse the folder hierarchy and find the results file.
# As in the original, the last match in walk order wins.
# NOTE(review): files left over from earlier downloads into 'diabetes-results'
# would also match — clear the folder first if stale results are a concern.
result_file = None
for root, _dirs, files in os.walk('diabetes-results'):
    for file_name in files:
        if file_name.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file_name)

# Fail with a clear message instead of a NameError when nothing was downloaded.
if result_file is None:
    raise FileNotFoundError(
        "No 'parallel_run_step.txt' file found under 'diabetes-results'; "
        "check that the pipeline step completed and produced output."
    )

# cleanup output format: each line is split on ':' into two columns
# (presumably "<file name>:<prediction>" as written by the entry script — confirm).
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

Unnamed: 0,File,Prediction
0,1.csv,0
1,10.csv,1
2,100.csv,0
3,11.csv,0
4,12.csv,1
5,13.csv,1
6,14.csv,0
7,15.csv,0
8,16.csv,1
9,17.csv,0


In [30]:
# Publish the pipeline that produced `run`, so it can be started again through
# its REST endpoint (see the Endpoint column of the cell output).

published_pipeline = run.publish_pipeline(
    name = 'diabetes-batch-pipeline',
    description = 'Batch scoring of diabetes data',
    version='1.0'
)
# Bare expression: displays the published pipeline's summary as the cell output.
published_pipeline

Name,Id,Status,Endpoint
diabetes-batch-pipeline,c96c5241-f85a-4ecc-9392-8a08b06006d4,Active,REST Endpoint


In [31]:
# Show the REST endpoint URL of the published pipeline.
published_pipeline.endpoint

'https://francecentral.api.azureml.ms/pipelines/v1.0/subscriptions/d12c1b85-0a70-4232-b483-12d1ffcfc148/resourceGroups/ResourceGroupRavazzi/providers/Microsoft.MachineLearningServices/workspaces/ravazzil-workspace/PipelineRuns/PipelineSubmit/c96c5241-f85a-4ecc-9392-8a08b06006d4'

In [32]:
# Build the authentication header that a later cell sends with the REST request
# to the published pipeline endpoint.
from azureml.core.authentication import InteractiveLoginAuthentication

# NOTE(review): interactive login presumably requires a signed-in user; an
# unattended job would likely need a different authentication class — confirm.
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print('Authentication header ready.')

Authentication header ready.


In [33]:
import requests

# Start a run of the published pipeline through its REST endpoint. The JSON body
# names the experiment the run is recorded under.
rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint,
                         headers=auth_header,
                         json={"ExperimentName": "mslearn-diabetes-batch"})
# Surface HTTP errors (expired token, wrong endpoint, ...) as requests.HTTPError
# here, instead of an opaque KeyError or JSON decoding error on the next line.
response.raise_for_status()
run_id = response.json()["Id"]
# Bare expression: displays the id of the submitted run as the cell output.
run_id

'8721aa95-ddea-4cfe-a2be-a12122b192a5'

In [34]:
from azureml.pipeline.core.run import PipelineRun
# NOTE(review): RunDetails is imported but not used in this cell.
from azureml.widgets import RunDetails

# Attach to the run started through the REST endpoint, using the id returned by
# the POST request and the experiment name sent in its body.
published_pipeline_run = PipelineRun(ws.experiments['mslearn-diabetes-batch'], run_id)

# Block until the run completes
published_pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: 8721aa95-ddea-4cfe-a2be-a12122b192a5
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8721aa95-ddea-4cfe-a2be-a12122b192a5?wsid=/subscriptions/d12c1b85-0a70-4232-b483-12d1ffcfc148/resourcegroups/ResourceGroupRavazzi/workspaces/ravazzil-workspace&tid=b00367e2-193a-4f48-94de-7245d45c0947

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '8721aa95-ddea-4cfe-a2be-a12122b192a5', 'status': 'Completed', 'startTimeUtc': '2023-01-03T18:21:23.839135Z', 'endTimeUtc': '2023-01-03T18:21:25.062107Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'Unavailable', 'runType': 'HTTP', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineid': 'c96c5241-f85a-4ecc-9392-8a08b06006d4', 'azureml.pipelineComponent': 'pipelinerun', 'azureml.pipelines.stages': '{"Initialization":null,"Execution":{"StartTime":"2023-01-03T18:21:24.23248

'Finished'

To view results, you can use the code above.