## Create batch pipeline

In [2]:
# Connect to your workspace
from azureml.core import Workspace
ws = Workspace.from_config()

## Provision inference compute

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "mlopsbootcamp"

try:
    # Check for existing compute target
    inference_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS2_v2', max_nodes=2)
        inference_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        inference_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Creating......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Create a pipeline for batch inferencing

In [4]:
import os
# Create a folder for the experiment files
experiment_folder = 'batch_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

batch_pipeline


## Get path to the model

In [5]:
from azureml.core import Model
model_list = Model.list(ws)
model_path = Model.get_model_path('fourier_regression')
print(model_list, model_path)

[Model(workspace=Workspace.create(name='mlworkspace', subscription_id='e7d71274-b7c4-47ed-9751-2505b563b918', resource_group='mlgroup'), name=fourier_regression, id=fourier_regression:1, version=1, tags={}, properties={})] azureml-models\fourier_regression\2\fourier.pkl


## Load the model

In [6]:
import joblib
model = joblib.load(model_path)
model

LinearRegression()

## Check the batch data

In [7]:
mini_batch = list()
for (dirpath, dirnames, filenames) in os.walk("batch-data"):
    mini_batch += [os.path.join(dirpath, file) for file in filenames]
for elem in mini_batch:
    print(elem)

batch-data\1.csv
batch-data\2.csv
batch-data\3.csv
batch-data\4.csv
batch-data\5.csv
batch-data\6.csv
batch-data\7.csv


## Make prediction per batch

In [8]:
import numpy as np
def run(mini_batch):
    # This runs for each batch
    resultList = []

    # process each file in the batch
    for f in mini_batch:
        # Read comma-delimited data into an array
        data = np.genfromtxt(f, delimiter=',')
        print(data)
        # Reshape into a 2-dimensional array for model input
        prediction = model.predict(data.reshape(1, -1))
        # Append prediction to results
        resultList.append("{}: {}".format(os.path.basename(f), prediction[0]))
    return resultList

result = run(mini_batch)
result

[2.88624084e+02 0.00000000e+00 2.58819045e-01 9.65925826e-01
 5.00000000e-01 8.66025404e-01 7.07106781e-01 7.07106781e-01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
[2.88446014e+02 0.00000000e+00 2.58819045e-01 9.65925826e-01
 5.00000000e-01 8.66025404e-01 7.07106781e-01 7.07106781e-01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
[2.88254852e+02 0.00000000e+00 2.58819045e-01 9.65925826e-01
 5.00000000e-01 8.66025404e-01 7.07106781e-01 7.07106781e-01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
[2.88050537e+02 0.00000000e+00 2.58819045e-01 9.65925826e-01
 5.00000000e-01 8.66025404e-01 7.07106781e-01 7.07106781e-01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
[2.87846252e+02 0.00

['1.csv: 95.15347618619145',
 '2.csv: 95.2173844857991',
 '3.csv: 95.28599144188169',
 '4.csv: 95.35931895959774',
 '5.csv: 95.43263552473465',
 '6.csv: 95.5059630424507',
 '7.csv: 95.40248307420845']

## Summarize into python script

In [9]:
%%writefile $experiment_folder\score.py

import os
import numpy as np
from azureml.core import Model
import joblib

def init():
    # Runs when the pipeline step is initialized
    global model

    # load the model
    model_path = Model.get_model_path('fourier_regression')
    model = joblib.load(model_path)

def run(mini_batch):
    # This runs for each batch
    resultList = []

    # process each file in the batch
    for f in mini_batch:
        # Read comma-delimited data into an array
        data = np.genfromtxt(f, delimiter=',')
        # Reshape into a 2-dimensional array for model input
        prediction = model.predict(data.reshape(1, -1))
        # Append prediction to results
        resultList.append("{}: {}".format(os.path.basename(f), prediction[0]))
    return resultList

Overwriting batch_pipeline\score.py


## Create conda environment for the pipeline

In [10]:
%%writefile $experiment_folder\batch_environment.yml
name: batch_environment
dependencies:
- python=3.8
- numpy
- pandas
- scikit-learn
- pip:
    - azureml-core
    - azureml-dataset-runtime[fuse]
    - azureml-pipeline-core
    - azureml-pipeline-steps

Overwriting batch_pipeline\batch_environment.yml


## Define run using environment

In [11]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Create an Environment for the experiment
batch_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/batch_environment.yml")
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuration ready.')

Configuration ready.


## Configure batch pipeline steps

In [12]:
from datetime import datetime

from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig

# Get the batch dataset for input
batch_data_set = ws.datasets['batch-data']

# Set the output location
default_ds = ws.get_default_datastore()
output_dir = OutputFileDatasetConfig(name='inferences')

# Define the parallel run step step configuration
parallel_run_config = ParallelRunConfig(
    source_directory=experiment_folder,
    entry_script="score.py",
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
    environment=batch_env,
    compute_target=inference_cluster,
    node_count=2)

parallel_step_name = "batchscoring-" + datetime.now().strftime("%Y%m%d%H%M")

# Create the parallel run step
parallelrun_step = ParallelRunStep(
    name=parallel_step_name,
    parallel_run_config=parallel_run_config,
    inputs=[batch_data_set.as_named_input('batch_data')],
    output=output_dir,
    arguments=[],
    allow_reuse=True
)

print('Steps defined')

Steps defined


## Run the pipeline

In [13]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Create the pipeline
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

# Run the pipeline as an experiment
pipeline_run = Experiment(ws, 'pytown-energy-demand-batch').submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)

Created step batchscoring-202108011650 [4b275680][e9771bb7-d04b-4e22-815f-f957ee852bc1], (This step will run and generate new outputs)
Submitted PipelineRun 89a52746-e0f9-4f08-8448-dca1ad15f76a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/89a52746-e0f9-4f08-8448-dca1ad15f76a?wsid=/subscriptions/e7d71274-b7c4-47ed-9751-2505b563b918/resourcegroups/mlgroup/workspaces/mlworkspace&tid=a0f1cacd-618c-4403-b945-76fb3d6874e5
PipelineRunId: 89a52746-e0f9-4f08-8448-dca1ad15f76a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/89a52746-e0f9-4f08-8448-dca1ad15f76a?wsid=/subscriptions/e7d71274-b7c4-47ed-9751-2505b563b918/resourcegroups/mlgroup/workspaces/mlworkspace&tid=a0f1cacd-618c-4403-b945-76fb3d6874e5
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 32b13e31-6945-4306-ac84-a964baab43c4
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/32b13e31-6945-4306-ac84-a964baab43c4?wsid=/subscriptions/e7d71274-b7c4-47ed-9751-2505b56

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "Response status code does not indicate success: 400 (You don't have permissions to do this operation on storage. Make sure you didn't change the permissi).\nMicrosoft.RelInfra.Common.Exceptions.ErrorResponseException: You don't have permissions to do this operation on storage. Make sure you didn't change the permissions to storage",
        "messageParameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"Response status code does not indicate success: 400 (You don't have permissions to do this operation on storage. Make sure you didn't change the permissi).\\nMicrosoft.RelInfra.Common.Exceptions.ErrorResponseException: You don't have permissions to do this operation on storage. Make sure you didn't change the permissions to storage\",\n        \"messageParameters\": {},\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

## Retrieve predictions

In [None]:
import pandas as pd
import shutil

# Remove the local results folder if left over from a previous run
shutil.rmtree('batch-results', ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='batch-results')

# Traverse the folder hierarchy and find the results file
for root, dirs, files in os.walk('batch-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root,file)

# cleanup output format
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

## Publish the pipeline

In [None]:
# published_pipeline = pipeline_run.publish_pipeline(name='Fourier_regression_batch_prediction_pipeline',
#                                                    description='Batch scoring using linear regression model with Fourier ML features',
#                                                    version='1.0')

# published_pipeline

## Get REST endpoint

In [None]:
# rest_endpoint = published_pipeline.endpoint
# print(rest_endpoint)

## Schedule the pipeline to run every Monday at 04:00 in the morning (02:00 UTC)

In [None]:
# from azureml.pipeline.core import ScheduleRecurrence, Schedule

# weekly = ScheduleRecurrence(frequency='Week', interval=1, week_days=["Monday"], time_of_day="02:00")
# pipeline_schedule = Schedule.create(ws, name='Weekly Predictions',
#                                         description='batch inferencing',
#                                         pipeline_id=published_pipeline.id,
#                                         experiment_name='Batch_Prediction',
#                                         recurrence=weekly)

## Disable pipeline with active schedule

In [None]:
# ss = Schedule.list(ws)
# for s in ss:
#     print(s)

In [None]:
# def stop_by_schedule_id(ws, schedule_id):
#     s = next(s for s in Schedule.list(ws) if s.id == schedule_id)
#     s.disable()
#     return s

# schedule_id = 'e14c5ce5-db2c-462b-97f4-8c1c21faa# 31b'
# stop_by_schedule_id(ws, schedule_id)