## Create batch pipeline

In [1]:
# Connect to your workspace using the saved workspace configuration.
# `ws` is the handle every later cell uses to reach models, datasets, compute and schedules.
from azureml.core import Workspace
ws = Workspace.from_config()

## Provision inference compute

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "mlopsbootcamp"

try:
    # Reuse the compute target if one with this name already exists in the workspace
    inference_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # The lookup failed, so provision a new cluster under that name
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS2_v2', max_nodes=2)
        inference_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        inference_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Show the reason, then re-raise: the pipeline configuration cell needs
        # `inference_cluster`, so continuing silently would only postpone the
        # failure to a confusing NameError further down the notebook.
        print(ex)
        raise

InProgress.....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Create a pipeline for batch inferencing

In [2]:
import os
# Create a folder for the experiment files.
# WORKDIR is the parent of the current working directory; the score script and
# the conda file are written into WORKDIR/batch_pipeline by later cells.
WORKDIR = os.path.dirname(os.getcwd())
experiment_folder = os.path.join(WORKDIR, 'batch_pipeline')
# exist_ok=True keeps the cell re-runnable when the folder is already there
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch_pipeline


## Get path to the model

In [3]:
from azureml.core import Model
# List the models registered in the workspace (printed below for inspection)
model_list = Model.list(ws)
# Resolve the path of the 'fourier_regression' model files.
# NOTE(review): `_workspace` is an underscore-prefixed argument — confirm it is
# still accepted by the installed azureml-core version.
model_path = Model.get_model_path('fourier_regression', _workspace=ws)
print(model_list, model_path)

[Model(workspace=Workspace.create(name='mlworkspace', subscription_id='e7d71274-b7c4-47ed-9751-2505b563b918', resource_group='mlgroup'), name=fourier_regression, id=fourier_regression:1, version=1, tags={}, properties={})] azureml-models\fourier_regression\1\fourier.pkl


## Load the model

In [7]:
import joblib
# Deserialize the model stored at `model_path`; the notebook-level `model` is what
# the local run() function below calls predict() on.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
model = joblib.load(model_path)
# Bare last expression so the notebook displays the loaded estimator
model

LinearRegression()

## Check the batch data

In [8]:
# Gather the full path of every file found under WORKDIR/batch-data (sub-folders included)
batch_data_dir = os.path.join(WORKDIR, "batch-data")
mini_batch = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(batch_data_dir)
    for filename in filenames
]

# Show one path per line
for batch_file in mini_batch:
    print(batch_file)

d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-1.csv
d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-2.csv
d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-3.csv
d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-4.csv
d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-5.csv
d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-6.csv
d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-7.csv


## Make prediction per batch

In [22]:
import numpy as np
def run(mini_batch):
    """Score every file in a mini-batch and summarise each one as a string.

    Parameters
    ----------
    mini_batch : iterable of str
        Paths to comma-delimited files whose first line is a header.

    Returns
    -------
    list of str
        One "<file name>: <mean prediction>" entry per input file, in input order.

    Notes
    -----
    Uses the notebook-level `model` loaded in an earlier cell.
    """
    resultList = []

    # process each file in the batch
    for f in mini_batch:
        # Read comma-delimited data into an array, skipping the header line
        data = np.genfromtxt(f, delimiter=',', skip_header=1)
        # genfromtxt collapses a file holding a single data row into a 1-D array;
        # restore the row dimension so the column slice below does not raise.
        # (Assumes every file has at least two columns: one to drop plus features.)
        data = np.atleast_2d(data)
        # Drop the first column; the remaining columns are the model input
        data = data[:, 1:]
        prediction = model.predict(data)
        # Append the mean prediction for this file to the results
        resultList.append("{}: {}".format(os.path.basename(f), np.mean(prediction)))
    return resultList

# Try the scoring function locally on the files collected above; the bare last
# expression displays the per-file summaries.
result = run(mini_batch)
result

['batch-1.csv: 99.93783767827078',
 'batch-2.csv: 99.89835726364004',
 'batch-3.csv: 89.52226883425273',
 'batch-4.csv: 89.36492929530372',
 'batch-5.csv: 100.80108846176175',
 'batch-6.csv: 100.33507942766154',
 'batch-7.csv: 100.09765850751842']

## Summarize into a Python script

In [26]:
%%writefile $experiment_folder\score.py

import os
import numpy as np
from azureml.core import Model
import joblib

def init():
    """Load the 'fourier_regression' model when the pipeline step is initialized.

    The estimator is stored in the module-level `model` so that run() can use it.
    """
    global model
    model = joblib.load(Model.get_model_path('fourier_regression'))

def run(mini_batch):
    """Score every file in a mini-batch and summarise each one as a string.

    Parameters
    ----------
    mini_batch : iterable of str
        Paths to comma-delimited files whose first line is a header.

    Returns
    -------
    list of str
        One "<file name>: <mean prediction>" entry per input file, in input order.

    Notes
    -----
    Uses the module-level `model` set by init(). The input data and the raw
    predictions of every file are printed as a log line.
    """
    resultList = []

    # process each file in the batch
    for f in mini_batch:
        # Read comma-delimited data into an array, skipping the header line
        data = np.genfromtxt(f, delimiter=',', skip_header=1)
        # genfromtxt collapses a file holding a single data row into a 1-D array;
        # restore the row dimension so the column slice below does not raise.
        # (Assumes every file has at least two columns: one to drop plus features.)
        data = np.atleast_2d(data)
        # Drop the first column; the remaining columns are the model input
        data = data[:, 1:]
        prediction = model.predict(data)
        # log results (for application insights)
        log = 'Data:' + str(data) + ' - Prediction:' + str(prediction)
        print(log)
        # Append the mean prediction for this file to the results
        resultList.append("{}: {}".format(os.path.basename(f), np.mean(prediction)))
    return resultList

Overwriting d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch_pipeline\score.py


## Create conda environment for the pipeline

In [27]:
%%writefile $experiment_folder/batch_environment.yml
# Conda environment for the batch scoring step (read by Environment.from_conda_specification).
name: batch_environment
dependencies:
- python=3.8
- numpy
- pandas
- scikit-learn
# score.py imports joblib directly, so declare it instead of relying on it
# arriving as a dependency of another package
- joblib
# listed explicitly because the pip section below needs pip inside the environment
- pip
- pip:
    - azureml-core
    - azureml-dataset-runtime[fuse]
    - azureml-pipeline-core
    - azureml-pipeline-steps

Overwriting d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch_pipeline\batch_environment.yml


## Define run using environment

In [28]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Build the experiment environment from the conda specification file and
# run it on the default CPU base image
conda_spec_path = experiment_folder + "/batch_environment.yml"
batch_env = Environment.from_conda_specification(name="experiment_env", file_path=conda_spec_path)
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuration ready.')

Configuration ready.


## Configure batch pipeline steps

In [29]:
from datetime import datetime

from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig
from azureml.core.runconfig import DockerConfiguration

# Get the batch dataset for input (looked up in the workspace by the name 'batch-data')
batch_data_set = ws.datasets['batch-data']

# Set the output location
# NOTE(review): `default_ds` is not referenced again in this cell — confirm whether
# the output was meant to be given an explicit destination on this datastore.
default_ds = ws.get_default_datastore()
output_dir = OutputFileDatasetConfig(name='inferences')

# Define the parallel run step configuration:
# - source_directory / entry_script point at the folder and score.py written earlier
# - environment and compute_target are the objects built in the previous cells
# - mini_batch_size="5": presumably the number of files handed to each run() call — TODO confirm
# - output_action="append_row": presumably gathers the values returned by run() into one
#   results file (the retrieval cell looks for 'parallel_run_step.txt') — TODO confirm
parallel_run_config = ParallelRunConfig(
    source_directory=experiment_folder,
    entry_script="score.py",
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
    environment=batch_env,
    compute_target=inference_cluster,
    node_count=2)

# Step name carries a timestamp with minute resolution
parallel_step_name = "batchscoring-" + datetime.now().strftime("%Y%m%d%H%M")

# Create the parallel run step; its output is registered under the name 'inferences',
# which is the name used later to fetch the results
parallelrun_step = ParallelRunStep(
    name=parallel_step_name,
    parallel_run_config=parallel_run_config,
    inputs=[batch_data_set.as_named_input('batch_data')],
    output=output_dir,
    arguments=[],
    allow_reuse=True
)

print('Steps defined')

Steps defined


## Run the pipeline

In [30]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Wrap the single scoring step in a pipeline
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

# Submit it under the 'pytown-energy-demand-batch' experiment and block until the run ends
experiment = Experiment(ws, 'pytown-energy-demand-batch')
pipeline_run = experiment.submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)

Created step batchscoring-202108070039 [9a27a8f2][b7d9a36e-a870-49be-beda-dee6e9226323], (This step will run and generate new outputs)
Submitted PipelineRun ce08e119-55df-42e0-b088-9521100eac90
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ce08e119-55df-42e0-b088-9521100eac90?wsid=/subscriptions/e7d71274-b7c4-47ed-9751-2505b563b918/resourcegroups/mlgroup/workspaces/mlworkspace&tid=a0f1cacd-618c-4403-b945-76fb3d6874e5
PipelineRunId: ce08e119-55df-42e0-b088-9521100eac90
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ce08e119-55df-42e0-b088-9521100eac90?wsid=/subscriptions/e7d71274-b7c4-47ed-9751-2505b563b918/resourcegroups/mlgroup/workspaces/mlworkspace&tid=a0f1cacd-618c-4403-b945-76fb3d6874e5
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 2c56b7fc-1633-4648-9806-a5562cadaa04
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/2c56b7fc-1633-4648-9806-a5562cadaa04?wsid=/subscriptions/e7d71274-b7c4-47ed-9751-2505b56

'Finished'

## Retrieve predictions

In [31]:
import pandas as pd
import shutil

# Remove the local results folder if left over from a previous run
BATCH_RESULTS = os.path.join(WORKDIR, 'batch-results')
shutil.rmtree(BATCH_RESULTS, ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path=BATCH_RESULTS)

# Traverse the folder hierarchy and find the results file.
# Start from None so that a path left in the kernel by an earlier execution of
# this cell can never be picked up by mistake when the download yields nothing.
result_file = None
for root, dirs, files in os.walk(BATCH_RESULTS):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)

if result_file is None:
    raise FileNotFoundError(
        "No 'parallel_run_step.txt' found under {}".format(BATCH_RESULTS))

# Each line has the form "<file>: <prediction>" (as produced by run() in score.py),
# so splitting on ':' yields the two columns named below
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

Unnamed: 0,File,Prediction
0,batch-1.csv,99.937838
1,batch-2.csv,99.898357
2,batch-3.csv,89.522269
3,batch-4.csv,89.364929
4,batch-5.csv,100.801088
5,batch-6.csv,100.335079
6,batch-7.csv,100.097659


## Load predictions into data storage

In [32]:
# Upload the downloaded results folder to the workspace's default datastore under
# "batch-results"; overwrite=True is passed so existing files at the target are replaced
datastore = ws.get_default_datastore()
datastore.upload(src_dir=BATCH_RESULTS, target_path="batch-results", overwrite=True, show_progress=True)

Uploading an estimated of 1 files
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-results\dataset\2c56b7fc-1633-4648-9806-a5562cadaa04\inferences\parallel_run_step.txt
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-results\dataset\2c56b7fc-1633-4648-9806-a5562cadaa04\inferences\parallel_run_step.txt, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_7a28a2977434476f9c136e1907ec1359

## Publish the pipeline

In [33]:
# Publish the pipeline behind the completed run under a name, description and version;
# the published object provides the id and REST endpoint used in the following cells
published_pipeline = pipeline_run.publish_pipeline(name='Fourier_regression_batch_prediction_pipeline',
                                                   description='Batch scoring using linear regression model with Fourier ML features',
                                                   version='1.0')

# Bare last expression so the notebook displays the published pipeline
published_pipeline

Name,Id,Status,Endpoint
Fourier_regression_batch_prediction_pipeline,bb3d5fd1-92f0-4141-a6c7-e64ad9cb4547,Active,REST Endpoint


## Get REST endpoint

In [34]:
# Keep the published pipeline's REST endpoint URL and print it
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

https://westeurope.api.azureml.ms/pipelines/v1.0/subscriptions/e7d71274-b7c4-47ed-9751-2505b563b918/resourceGroups/mlgroup/providers/Microsoft.MachineLearningServices/workspaces/mlworkspace/PipelineRuns/PipelineSubmit/bb3d5fd1-92f0-4141-a6c7-e64ad9cb4547


## Schedule the pipeline to run every Monday at 04:00 in the morning (02:00 UTC)

In [35]:
from azureml.pipeline.core import ScheduleRecurrence, Schedule

# Recur every week on Monday at 02:00. No time_zone is passed; the section title
# treats this time as UTC.
weekly = ScheduleRecurrence(frequency='Week', interval=1, week_days=["Monday"], time_of_day="02:00")
# Attach the recurrence to the published pipeline; 'Batch_Prediction' is the
# experiment name given for the scheduled runs.
# NOTE(review): re-running this cell presumably creates another schedule with the
# same name — confirm, and disable duplicates if so.
pipeline_schedule = Schedule.create(ws, name='Weekly Predictions',
                                        description='batch inferencing',
                                        pipeline_id=published_pipeline.id,
                                        experiment_name='Batch_Prediction',
                                        recurrence=weekly)

## Disable the active pipeline schedule

In [36]:
# List the schedules returned for the workspace and print each one
ss = Schedule.list(ws)
for s in ss:
    print(s)

Pipeline(Name: Weekly Predictions,
Id: cda9ba75-7630-44e5-aae0-d20f6eb423e3,
Status: Active,
Pipeline Id: bb3d5fd1-92f0-4141-a6c7-e64ad9cb4547,
Pipeline Endpoint Id: None,
Recurrence Details: Runs at 2:00 on Monday every Week)


In [37]:
def stop_by_schedule_id(ws, schedule_id):
    """Disable the schedule with the given id and return it.

    Parameters
    ----------
    ws : Workspace
        Workspace whose schedules are searched via Schedule.list(ws).
    schedule_id : str
        Id of the schedule to disable.

    Returns
    -------
    The matching schedule object, after disable() has been called on it.

    Raises
    ------
    ValueError
        If none of the listed schedules has this id.
    """
    # A default of None avoids the opaque StopIteration a bare next() would raise
    schedule = next((s for s in Schedule.list(ws) if s.id == schedule_id), None)
    if schedule is None:
        raise ValueError("No schedule with id {!r} found in the workspace".format(schedule_id))
    schedule.disable()
    return schedule

# Disable the schedule created by this notebook. Use pipeline_schedule.id rather
# than the loop variable `s` left over from the listing cell: that variable is just
# whichever schedule happened to be printed last, and is undefined if none were listed.
stop_by_schedule_id(ws, pipeline_schedule.id)

Name,Id,Status,Pipeline Id,Pipeline Endpoint Id,Recurrence Details
Weekly Predictions,cda9ba75-7630-44e5-aae0-d20f6eb423e3,Disabled,bb3d5fd1-92f0-4141-a6c7-e64ad9cb4547,,Runs at 2:00 on Monday every Week
