In [1]:
# Connect to the Azure ML workspace; `ws` is the handle the later cells use
from azureml.core import Workspace
ws = Workspace.from_config()

In [8]:
# Re-create the workspace handle (repeats the earlier Workspace.from_config() lookup)
from azureml.core import Workspace
ws = Workspace.from_config()

In [11]:
# Provision compute context for the pipeline
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "mlopsbootcamp"

try:
    # Reuse the compute target when the workspace already has one with this name
    inference_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Lookup failed: provision a new cluster instead
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS2_v2', max_nodes=2)
    try:
        inference_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        inference_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Show the reason, then stop here: swallowing the error would leave
        # `inference_cluster` undefined (or unusable) and the notebook would
        # only fail later with a far less helpful error.
        print(ex)
        raise

Found existing cluster, use it.


In [3]:
import os

# Create a folder for the files (sibling of the current working directory)
capstone_folder = os.path.join(os.path.dirname(os.getcwd()),'pytown_energymonitor')
# Actually create it, so cells that write files into it work on a fresh checkout;
# exist_ok keeps the cell safe to re-run.
os.makedirs(capstone_folder, exist_ok=True)
print(capstone_folder)

c:\Users\meira\Projects\PyLadiesMLOpsCapstone\pytown_energymonitor


In [14]:
%%writefile $capstone_folder\score.py

import os
import pandas as pd

def init():
    """Initialization hook of the scoring script; it performs no work.

    The naive forecast computed in run() needs no trained model, so the only
    statement is the `global model` declaration - nothing is assigned to it.
    """
    # Declaration only: `model` is never bound in this script.
    global model

def run(mini_batch):
    """Compute a naive load forecast for every CSV file in the mini-batch.

    For each file the prediction of a row is the mean of 'load_actuals_mw'
    over the three preceding rows that share its 'dayofweek' value (the
    current row is excluded via closed='left'). The last 7 predictions,
    ordered by 'data_index_', are reported.

    Args:
        mini_batch: iterable of CSV file paths. Each file must contain the
            columns 'dayofweek' and 'load_actuals_mw'. 'data_index_' is used
            as the ordering key when present; otherwise the file's row order
            is kept.

    Returns:
        list of str: one single-line entry per file, formatted as
        "<file name>: v1, v2, ..., v7". Only the values are emitted (no index
        labels) so every entry stays on one line and contains exactly one ':'.
    """
    # NOTE(review): iterating paths assumes a file-based dataset; a tabular
    # dataset would presumably hand a DataFrame to run() instead - confirm.
    resultList = []

    # process each file in the batch
    for f in mini_batch:
        # Read csv file into a dataframe
        input_df = pd.read_csv(f)
        # Keep the ordering key out of the rolling computation: as a plain
        # column it would be averaged too (or break the mean if non-numeric).
        if 'data_index_' in input_df.columns:
            input_df = input_df.set_index('data_index_')
        # calculate naive forecast on the load column only
        load_pred_mw = (
            input_df.groupby('dayofweek')['load_actuals_mw']
            .rolling(3, closed='left')
            .mean()
            .reset_index(level=0, drop=True)  # drop the group level, keep the row key
            .sort_index()
            .rename('load_pred_mw')
        )
        # Serialize the values explicitly; formatting the Series itself would
        # embed a multi-line repr (index header, dtype footer) in the row.
        last_week = ", ".join(str(float(value)) for value in load_pred_mw.tail(7))
        # Append prediction to results
        resultList.append("{}: {}".format(os.path.basename(f), last_week))
    return resultList

Overwriting c:\Users\meira\Projects\PyLadiesMLOpsCapstone\pytown_energymonitor\score.py


In [15]:
%%writefile $capstone_folder\capstone_environment.yml
name: capstone_environment
dependencies:
- python=3.8
- numpy
- pandas
- scikit-learn
# pip is listed explicitly so it is installed before the pip section below is processed
- pip
- pip:
  - azureml-core
  # ParallelRunStep asks for this package when the step input is a tabular dataset
  - azureml-dataset-runtime[fuse,pandas]

Overwriting c:\Users\meira\Projects\PyLadiesMLOpsCapstone\pytown_energymonitor\capstone_environment.yml


In [16]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Build the run environment for the pipeline from the conda spec in the capstone folder
conda_spec_path = f"{capstone_folder}/capstone_environment.yml"
capstone_env = Environment.from_conda_specification(
    name="capstone_env",
    file_path=conda_spec_path,
)

# Base the container on the default CPU image
capstone_env.docker.base_image = DEFAULT_CPU_IMAGE

print('Configuration ready.')

Configuration ready.


In [17]:
# Batch pipeline definition: the step executes the scoring script over the
# input dataset and appends every returned row to a text file in the output location
from datetime import datetime

from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig
from azureml.core.runconfig import DockerConfiguration

# Input: the registered batch dataset
input_data_set = ws.datasets['daily_load_data']

# Output: a named output location for the inferences
default_ds = ws.get_default_datastore()
output_dir = OutputFileDatasetConfig(name='capstone_inferences')

# Parallel run step configuration
# NOTE(review): the meaning of mini_batch_size depends on the dataset type
# (file count vs. data size) - confirm "5" is intended for this input.
parallel_run_config = ParallelRunConfig(
    source_directory=capstone_folder,
    entry_script="score.py",
    environment=capstone_env,
    compute_target=inference_cluster,
    node_count=2,
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
)

# Timestamped (minute resolution) step name
parallel_step_name = f"batchscoring-{datetime.now():%Y%m%d%H%M}"

# The parallel run step itself
parallelrun_step = ParallelRunStep(
    name=parallel_step_name,
    parallel_run_config=parallel_run_config,
    inputs=[input_data_set.as_named_input('daily_load_data')],
    output=output_dir,
    arguments=[],
    allow_reuse=True,
)

print('Steps defined')


Steps defined


ParallelRunStep requires azureml-dataset-runtime[fuse,pandas] for tabular dataset.
Please add relevant package in CondaDependencies.


In [18]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Wrap the parallel run step in a single-step pipeline
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

# Submit the pipeline under its experiment, then block until it completes
# while streaming the run output
experiment = Experiment(workspace=ws, name='capstone-naive-forecast-batch')
pipeline_run = experiment.submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)

Created step batchscoring-202108061705 [b14fe54d][2e60f393-2634-4fab-bec6-5d1e15e65ee3], (This step will run and generate new outputs)
Submitted PipelineRun 4b4e9726-cd74-4e9d-8e31-35a9541bef2d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4b4e9726-cd74-4e9d-8e31-35a9541bef2d?wsid=/subscriptions/3f708c1c-dec8-4886-90e0-6657d6b42467/resourcegroups/mlops_bootcamp/workspaces/mlops&tid=964d978b-aade-4afb-b035-7c91124668cf
PipelineRunId: 4b4e9726-cd74-4e9d-8e31-35a9541bef2d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4b4e9726-cd74-4e9d-8e31-35a9541bef2d?wsid=/subscriptions/3f708c1c-dec8-4886-90e0-6657d6b42467/resourcegroups/mlops_bootcamp/workspaces/mlops&tid=964d978b-aade-4afb-b035-7c91124668cf
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 77445767-8428-4d32-8141-4082802d063a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/77445767-8428-4d32-8141-4082802d063a?wsid=/subscriptions/3f708c1c-dec8-4886-90e0-6657d

In [None]:
import os
import pandas as pd
import shutil

# Remove the local results folder if left over from a previous run
shutil.rmtree('capstone-batch-results', ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('capstone_inferences')
prediction_output.download(local_path='capstone-batch-results')

# Traverse the folder hierarchy and find the results file
# (when several files match, the last one visited is used)
result_file = None
for root, dirs, files in os.walk('capstone-batch-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root,file)

# Fail with a clear message instead of a NameError when nothing was downloaded
if result_file is None:
    raise FileNotFoundError(
        "No 'parallel_run_step.txt' file found under 'capstone-batch-results'"
    )

# cleanup output format: each row is "<file>:<prediction>"
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)