## Imports

In [None]:
%matplotlib inline
import numpy as np
import datetime
import uuid
import os

import azureml.core
from azureml.core import Workspace, Dataset, Datastore, Environment, ScriptRunConfig, Experiment
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.model import Model, InferenceConfig
from azure.storage.blob import BlobServiceClient
from azureml.widgets import RunDetails

# check core SDK version number
print("Azure ML SDK Version: ", azureml.core.VERSION)

## AzureML Workspace Configuration

In [None]:
# Replace these placeholders with the values that match your Azure resources.
# They identify the subscription / resource group / workspace and the
# storage, container registry and key vault resources used by the cells below.
subscription_id = "YOUR_SUBSCRIPTION_ID"
resource_group = "YOUR_RESOURCE_GROUP_NAME"
workspace_name = "YOUR_WORKSPACE_NAME"
workspace_region = "YOUR_WORKSPACE_REGION"
container_registry_name = "YOUR_CONTAINTER_REGISTRY_NAME"
storage_name = "YOUR_STORAGE_ACCOUNT_NAME"
blob_datastore_name = "YOUR_DATASTORE_NAME"
key_vault_name = "YOUR_KEY_VAULT_NAME"
container_name = "YOUR_STORAGE_CONTAINER_NAME"
# NOTE(review): storage_key is a secret (it is embedded in the storage
# connection string and passed as the datastore account key further down);
# avoid committing a real value to source control.
storage_key = "YOUR_STORAGE_KEY"

In [None]:
# Repository locations used in this notebook: the directory that holds the
# generated scripts, and the file name of each script inside that directory.
scripts = "../scripts"
train_script = "train_script.py"
score_script = "score_script.py"
test_script = "test_script.py"

In [None]:
# Connect to the workspace described by the settings above; if that fails,
# create it, attached to the named storage account, container registry and
# key vault.
try:
    ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)
    print("Workspace configuration succeeded.")
except Exception as workspace_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit propagate: interrupting the cell must not fall through
    # into creating a workspace. The reason is printed rather than swallowed.
    print(f"Could not load workspace: {workspace_error}")

    # Full Azure resource IDs of the resources the new workspace is attached to
    container_registry = f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.ContainerRegistry/registries/{container_registry_name}"
    storage_account = f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.Storage/storageAccounts/{storage_name}"
    key_vault = f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.KeyVault/vaults/{key_vault_name}"

    # Create a new workspace using the specified parameters
    ws = Workspace.create(name=workspace_name,
                          subscription_id=subscription_id,
                          resource_group=resource_group,
                          location=workspace_region,
                          create_resource_group=True,
                          storage_account=storage_account,
                          container_registry=container_registry,
                          key_vault=key_vault,
                          sku='basic',
                          exist_ok=True)
    ws.get_details()
    print("Workspace not accessible. Created a new workspace")

# TODO: Work with environments
# Build the run environment from the pip requirements file and register it
# in the workspace.
myenv = Environment.from_pip_requirements(name="myenv",
                                          file_path="../../pipeline/environment/docker-contexts/python-and-pip/requirements.txt")
myenv.register(workspace=ws)

# Write the details of the workspace to a configuration file so that later
# cells can reconnect with Workspace.from_config(path=".azureml").
ws.write_config(path=".azureml", file_name="config.json")


In [None]:
# Reconnect to the workspace from the saved configuration and list its
# name, resource group, location and subscription, one per line.
ws = Workspace.from_config(path=".azureml")
workspace_summary = (ws.name, ws.resource_group, ws.location, ws.subscription_id)
print(*workspace_summary, sep='\n')

## Create Compute

In [None]:
# Cluster settings: name and autoscale bounds
compute_name = "demo-cpu-cluster"
compute_min_nodes = 0
compute_max_nodes = 4

# This example uses CPU VM.
vm_size = "STANDARD_D2_V2"

# Read the workspace's compute targets once instead of once per lookup.
available_targets = ws.compute_targets

if compute_name in available_targets:
    # Reuse the existing target rather than provisioning a new one.
    compute_target = available_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print("Found compute target: " + compute_name)
    else:
        # A target with this name exists but is some other kind of compute.
        # It is still used, but say so instead of continuing silently.
        print(f"Warning: compute target '{compute_name}' exists but is "
              f"{type(compute_target).__name__}, not AmlCompute.")
else:
    print("Creating new compute target...")
    # NOTE(review): the admin credentials below are hard-coded in the
    # notebook; consider supplying them from a secret store instead.
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes,
                                                                idle_seconds_before_scaledown=600,
                                                                admin_username='compute-cluster-admin',
                                                                admin_user_password='compute-cluster-password')

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

## Prepare Data for Training

In [None]:
# Connection string for the storage account that receives this run's data
storage_connection = f"DefaultEndpointsProtocol=https;AccountName={storage_name};AccountKey={storage_key};EndpointSuffix=core.windows.net"
blob_service_client = BlobServiceClient.from_connection_string(storage_connection)

# Placeholder source blobs (train / test CSV files)
train_src_loc = f"https://mlopsdemodatalake.blob.core.windows.net/{container_name}/mnist_train.csv"
test_src_loc = f"https://mlopsdemodatalake.blob.core.windows.net/{container_name}/mnist_test.csv"

# Every run copies the placeholder blobs into its own folder, keyed by
# year / ISO week / weekday plus a random UUID
now = datetime.datetime.now()
folder = f'{now.year}/week{now.isocalendar()[1]}/day{now.weekday()}/{uuid.uuid4()}'
print(f"Copy blobs for this run into: {folder}")
train_dest_loc = f'{folder}/mnist_train.csv'
test_dest_loc = f'{folder}/mnist_test.csv'

# Training set: destination blob client, then a synchronous copy from the source URL
train_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=train_dest_loc,
)
train_copy = train_blob_client.start_copy_from_url(train_src_loc, requires_sync=True)

# Test set: same steps
test_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=test_dest_loc,
)
test_copy = test_blob_client.start_copy_from_url(test_src_loc, requires_sync=True)

In [None]:
# Use the datastore registered under blob_datastore_name; if it cannot be
# retrieved, register the blob container as a new datastore with that name.
try:
    ds = Datastore.get(ws, blob_datastore_name)
    print(f"Found Blob Datastore with name: {blob_datastore_name}")
except Exception as datastore_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not mistaken for "datastore missing". The reason is
    # printed so an unrelated failure (e.g. authentication) is visible.
    print(f"Could not get datastore: {datastore_error}")

    # Create a new datastore using the specified parameters
    ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name=blob_datastore_name,
        account_name=storage_name,
        container_name=container_name,
        account_key=storage_key)
    print(f"Registered blob datastore with name: {blob_datastore_name}")

In [None]:
# Build one tabular dataset per copied CSV (train first, then test), each
# addressed as a (datastore, relative path) pair
train, test = (
    Dataset.Tabular.from_delimited_files(path=(ds, blob_path))
    for blob_path in (train_dest_loc, test_dest_loc)
)

In [None]:

# Register both datasets in the workspace; create_new_version=True asks for
# a new version under the same name on each registration
print("Register new datasets for versioning")
train.register(
    workspace=ws,
    name='demo_train_set_mnist',
    description='training dataset',
    create_new_version=True,
)
test.register(
    workspace=ws,
    name='demo_test_set_mnist',
    description='testing dataset',
    create_new_version=True,
)


## Create training script 

In [None]:
# Path of the training script: target of the %%writefile cell that follows
train_script_loc = os.path.join(scripts, train_script)

In [None]:
%%writefile $train_script_loc

import os
import numpy as np
import joblib

from sklearn.linear_model import LogisticRegression
from azureml.core import Dataset, Run

# Workspace and Run configuration: the workspace is taken from the run's experiment
run = Run.get_context()
ws = run.experiment.workspace

# Load the latest registered versions of the training and testing datasets
train = Dataset.get_by_name(name='demo_train_set_mnist', workspace=ws, version='latest')
test = Dataset.get_by_name(name='demo_test_set_mnist', workspace=ws, version='latest')

# Data preparation: separate labels from features, and scale features by 1/255 on the fly.
# NOTE(review): features are "every column after the first", which assumes
# "label" is the first column of the data - confirm against the CSV files.
train_df = train.to_pandas_dataframe()
test_df = test.to_pandas_dataframe()
x_train = train_df.iloc[:, 1:] / 255
y_train = train_df.loc[:, "label"]
x_test = test_df.iloc[:, 1:] / 255
y_test = test_df.loc[:, "label"]

print(f'Training set dimension: {x_train.shape, y_train.shape}, Test set dimension: {x_test.shape, y_test.shape}')

# C is the inverse of the regularization rate
reg = 0.5
print('Train a logistic regression model with regularization rate of', reg)
clf = LogisticRegression(C=1.0/reg, solver="liblinear", multi_class="auto", random_state=42)
clf.fit(x_train, y_train)

print('Predict the test set')
y_hat = clf.predict(x_test)

# calculate accuracy on the prediction: fraction of predictions equal to the label
acc = np.average(y_hat == y_test)
print('Accuracy is', acc)

# Builtin float() replaces np.float, which was only an alias for it and was
# removed in NumPy 1.24 (AttributeError on current NumPy releases).
run.log('regularization rate', float(reg))
run.log('accuracy', float(acc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl')

## Configure and submit the training job

In [None]:
# Experiment that the training run is submitted to
experiment_name = 'demo-sklearn-mnist-exp'
exp = Experiment(workspace=ws, name=experiment_name)

# Run configuration: which script to run, from which directory, on which
# compute target and in which environment. The two datasets are passed as
# named inputs on the script's argument list.
src = ScriptRunConfig(
    source_directory=scripts,
    script=train_script,
    arguments=[
        '--input-data',
        train.as_named_input('training_dataset'),
        test.as_named_input('testing_dataset'),
    ],
    compute_target=compute_target,
    environment=myenv,
)

# Submit the job; the trailing expression makes the notebook display the run
run = exp.submit(config=src)
run


## Display Run details

In [None]:
# Show the run widget, wait for the run to complete while streaming its
# output, then print the metrics it logged
run_widget = RunDetails(run)
run_widget.show()
run.wait_for_completion(show_output=True)
run_metrics = run.get_metrics()
print(run_metrics)

## Register the model

In [None]:
# List the files recorded for the run, then register the pickled model from
# its outputs folder together with the datasets it was trained/tested on
print(run.get_file_names())
model = run.register_model(
    model_name='demo-sklearn-mnist',
    model_path='outputs/sklearn_mnist_model.pkl',
    datasets=[
        ('training_data', train),
        ('testing_data', test),
    ],
)
print(model.name, model.id, model.version, sep='\t')

## Create a scoring script to make predictions using the Container

In [None]:
# Path of the scoring script: target of the %%writefile cell that follows,
# and the entry script handed to InferenceConfig when packaging the model
score_script_loc = os.path.join(scripts, score_script)

In [None]:
%%writefile $score_script_loc

import os
import logging
import json
import numpy
import joblib

def init():
    """
    Load the trained model into the module-level ``model`` variable.

    Called when the container is initialized/started, typically after
    create/update of the deployment, so that the model is cached in memory
    for later calls to ``run``.
    """
    global model
    # AZUREML_MODEL_DIR is an environment variable created during deployment.
    # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)
    model_dir = os.getenv("AZUREML_MODEL_DIR")
    pickle_path = os.path.join(model_dir, 'sklearn_mnist_model.pkl')
    # deserialize the model file back into a sklearn model
    model = joblib.load(pickle_path)
    logging.info("Init complete")


def run(raw_data):
    """
    Score one request and return the predictions as a plain list.

    Called for every invocation of the endpoint. ``raw_data`` is a JSON
    string whose "data" entry holds the samples; they are converted to a
    numpy array and passed to the loaded model's ``predict()``.
    """
    logging.info("model 1: request received")
    payload = json.loads(raw_data)
    samples = numpy.array(payload["data"])
    predictions = model.predict(samples)
    logging.info("Request processed")
    return predictions.tolist()

## Containerize model and publish to ACR

In [None]:
# Package the registered model with the scoring script and environment
# under the given image name/label, and wait for the package to be created
inference_config = InferenceConfig(entry_script=score_script_loc, environment=myenv)
package = Model.package(
    ws,
    [model],
    inference_config=inference_config,
    image_name='demo-image-repo',
    image_label='demo-image',
)
package.wait_for_creation(show_output=True)

# Download the package.
cont = package.pull()

# Get the Azure container registry that the model/Dockerfile uses.
acr = package.get_container_registry()
# NOTE(review): the registry password is printed in clear text and stays in
# the saved notebook output; clear the output before sharing the notebook.
registry_details = (
    ("Address:", acr.address),
    ("Username:", acr.username),
    ("Password:", acr.password),
)
for label, value in registry_details:
    print(label, value)

## Create a test script for the exported container

In [None]:
# Path of the container test script: target of the %%writefile cell that follows
test_script_loc = os.path.join(scripts, test_script)

In [None]:
%%writefile $test_script_loc

import requests
import json
import numpy as np

# URL for the web service.
scoring_uri = 'http://localhost:6789/score'

# Upper bound, in seconds, on waiting for the service. Without a timeout
# requests waits indefinitely, so the script would hang if the container
# accepts the connection but never answers.
request_timeout = 60

# Two sets of data to score, so we get two results back:
# one all-zeros sample and one all-ones sample of 784 values each.
data = {"data":
        [
            np.zeros(784).tolist(),
            np.ones(784).tolist(),
        ]
        }
# Convert to JSON string.
input_data = json.dumps(data)

# Set the content type.
headers = {'Content-Type': 'application/json'}

# Make the request and display the response.
resp = requests.post(scoring_uri, data=input_data, headers=headers, timeout=request_timeout)
print(resp.text)