# Automated ML


In [1]:
#workspace management
from azureml.core import Workspace, Experiment, Dataset
#compute resource management
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
import os

#autoML
from azureml.train.automl import AutoMLConfig

#model saving,deployment and exchange
import joblib

#data management
import pandas as pd


## Dataset

### Overview
The dataset for this project is the same one used to explore the capabilities of Azure HyperDrive: the heart attack and disease dataset from Kaggle. The original database contains 76 attributes, but all published experiments use a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to date.

The task is a binary classification task: predicting the presence of heart disease in a patient (0: no presence, 1: presence). I started with the simplest approach (Logistic Regression), using Azure HyperDrive for hyperparameter tuning. This gave a first model able to answer the problem (Occam's razor).

Using Azure AutoML we will be able to benchmark different approaches alongside the one carried out with HyperDrive.

The data is *balanced* and the features are numerical, so little feature engineering is required. For this reason, the primary metric to be optimized is accuracy; other metrics will nevertheless be analyzed and taken into account: MCC, precision, recall and AUC-ROC curves.


In [2]:
# Connect to the Azure ML workspace described by the local configuration.
ws = Workspace.from_config()

# Experiment under which the AutoML run will be tracked.
experiment_name = 'automl-experiment'
experiment = Experiment(workspace=ws, name=experiment_name)

# Look the dataset up by name in the workspace and also keep a pandas copy of it.
dataset = Dataset.get_by_name(workspace=ws, name='heart_disease')
data = dataset.to_pandas_dataframe()


In [3]:
# Cluster configuration. Every value can be overridden through an environment
# variable; the literal is the fallback used when the variable is not set.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpu-cluster")

# Environment variables are always strings, so the node counts are cast to int
# before being handed to the provisioning configuration.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

# Using CPU VM.
# NOTE(review): the variable name mentions NC6 although the default is a
# D2_V2 CPU size; the name is kept unchanged so existing overrides still work.
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_STANDARD_NC6", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    # Reuse the target that is already attached to the workspace.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # The target is still used (as before), but the mismatch is no longer silent.
        print('warning: compute target ' + compute_name
              + ' exists but is not an AmlCompute cluster: '
              + type(compute_target).__name__)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())


creating a new compute target...
Creating....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-04-19T21:54:19.979000+00:00', 'errors': None, 'creationTime': '2021-04-19T21:54:17.236375+00:00', 'modifiedTime': '2021-04-19T21:54:32.686523+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [4]:
#setting computing resources

# Using vm_size = "Standard_D2_V2" in provisioning configuration.
# max_nodes should be no greater than 4.

#name of the compute cluster
#cpu_cluster_name = "automlCompute"

# Verify that cluster does not exist already (good practice extracted by Microsoft official documentation: https://docs.microsoft.com/)
#try:
#    compute_target_aml= ComputeTarget(workspace=ws, name=cpu_cluster_name)
#    print('Found existing cluster, use it.')
#except ComputeTargetException:
#creating compute (cpu) cluster
#    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
#                                                           max_nodes=4)
#    compute_target_aml = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

#compute_target_aml.wait_for_completion(show_output=True)

## AutoML Configuration

In [5]:
# Set parameters for AutoMLConfig

from azureml.train.automl import AutoMLConfig
import time
import logging

automl_settings = {
    # Timestamp suffix gives every configuration a distinct name.
    "name": "AutoML_Demo_Experiment_{0}".format(time.time()),
    "experiment_timeout_minutes": 30,
    "enable_early_stopping": True,
    "iteration_timeout_minutes": 10,
    "n_cross_validations": 5,
    "primary_metric": 'accuracy',
    # Keep concurrency within the cluster's node limit: iterations beyond the
    # number of available nodes would only sit in the queue.
    "max_concurrent_iterations": int(compute_max_nodes),
}

# `training_data` is the Azure ML dataset (features and label together);
# the label column of the heart disease data is 'target'.
automl_config = AutoMLConfig(task='classification',
                             debug_log='automl_errors.log',
                             compute_target=compute_target,
                             training_data=dataset,
                             label_column_name='target',
                             **automl_settings,
                             )

In [6]:
# Check the type of `dataset`, the object passed to AutoML as training data
# (the pandas copy of it is held separately in `data`).
print(type(dataset))

<class 'azureml.data.tabular_dataset.TabularDataset'>


In [7]:
# Submit the AutoML configuration to the experiment, streaming progress to the cell output.
remote_run = experiment.submit(
    config=automl_config,
    show_output=True,
)

Submitting remote run.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
automl-experiment,AutoML_eaf562aa-79f3-44b6-9592-aa6dec49470e,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

******************************************************************

## Run Details


In [8]:
from azureml.widgets import RunDetails

# Render the widget that visualizes the progress of the runs.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model



In [28]:
# Retrieve the best AutoML run together with its fitted model, then save the model.
import joblib

best_run, fitted_model = remote_run.get_output()
for label, value in (('Best run:', best_run), ('Best fitted model:', fitted_model)):
    print(label, value)

# Persist the fitted model. The dump call stays as the last expression so the
# list of written file names is displayed as the cell output.
joblib.dump(fitted_model, filename='best_automodel_lr_automl.joblib')

Best run: Run(Experiment: automl-experiment,
Id: AutoML_eaf562aa-79f3-44b6-9592-aa6dec49470e_49,
Type: azureml.scriptrun,
Status: Completed)
Best fitted model: Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                    min_weight_fraction_leaf=0.0,
                                                                                                    n_

['best_automodel_lr_automl.joblib']

In [29]:
# Show the properties recorded on the best run.
run_properties = best_run.properties
print('Run properties: ', run_properties)

Run properties:  {'runTemplate': 'automl_child', 'pipeline_id': '__AutoML_Ensemble__', 'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'AutoML_Demo_Experiment_1618869290.1873217\',\'compute_target\':\'cpu-cluster\',\'subscription_id\':\'1b944a9b-fdae-4f97-aeb1-b7eea0beac53\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_eaf562aa-79f3-44b6-9592-aa6dec49470e_49","experiment_name":"automl-experiment","workspace_name":"quick-starts-ws-143190","subscription_id":"1b944a9b-fdae-4f97-aeb1-b7eea0beac53","resource_group_name":"aml-quickstarts-143190"}}]}', 'training_percent': '100', 'predicted_cost': None, 'iteration': '49', '_aml_system_scenario_identification': '

## Model Deployment


In [32]:
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice
# Only this `constants` module is imported: the earlier
# `from azureml.train.automl import constants` was immediately shadowed by it.
from azureml.automl.core.shared import constants


# Deploy the best model: register it, build an inference configuration and
# expose it as an ACI web service.

# Environment taken from the best run.
env = best_run.get_environment()

# Register the model under the name stored in the best run's properties.
model = remote_run.register_model(model_name=best_run.properties['model_name'])

# Download the scoring script from the best run's outputs; it becomes the entry script.
best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score.py')
# The conda file is downloaded as well, but the InferenceConfig below uses `env`, not env.yml.
best_run.download_file(constants.CONDA_ENV_FILE_PATH, 'env.yml')
inference_config = InferenceConfig(entry_script='score.py', environment=env)

# ACI deployment with key authentication, Application Insights and model data collection enabled.
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    description='predicting heart diseases',
    auth_enabled=True,
    enable_app_insights=True,
    collect_model_data=True,
)

# overwrite=True replaces an existing service that has the same name.
service = Model.deploy(
    workspace=ws,
    name='heart-disease-service',
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)





Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-04-19 23:47:59+00:00 Creating Container Registry if not exists.
2021-04-19 23:47:59+00:00 Registering the environment.
2021-04-19 23:47:59+00:00 Use the existing image.
2021-04-19 23:47:59+00:00 Generating deployment configuration.
2021-04-19 23:48:00+00:00 Submitting deployment to compute.
2021-04-19 23:48:04+00:00 Checking the status of deployment heart-disease-service..
2021-04-19 23:52:00+00:00 Checking the status of inference endpoint heart-disease-service.
Succeeded
ACI service creation operation finished, operation "Succeeded"


AttributeError: 'AciWebservice' object has no attribute 'getlogs'

The cell below sends a request to the deployed web service to test it.

In [50]:
import json
import os
import ssl
import urllib.error
import urllib.request

import requests

print('Score URI:', service.scoring_uri)
print('Swagger URL:', service.swagger_uri)

# This could also be done with an endpoint.py script as in project 2; for
# simplicity the request is built inline with a payload and an auth header.
# Testing with dummy values. The dictionary is named `payload` so that it does
# not overwrite the `data` DataFrame loaded at the top of the notebook.
payload = {
    "data":
    [
        {
            'age': "0",
            'sex': "0",
            'cp': "0",
            'trestbps': "0",
            'chol': "0",
            'fbs': "0",
            'restecg': "0",
            'thalach': "0",
            'exang': "0",
            'oldpeak': "0",
            'slope': "0",
            'ca': "0",
            'thal': "0",
        },
    ],
}

body = json.dumps(payload).encode('utf-8')

# The service is deployed with key authentication; the first key is sent as a bearer token.
headers = {'Content-Type': 'application/json',
           'Authorization': 'Bearer ' + service.get_keys()[0]}

req = urllib.request.Request(service.scoring_uri, body, headers)

try:
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(req) as response:
        result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    error_body = error.read().decode("utf8", 'ignore')
    try:
        print(json.loads(error_body))
    except json.JSONDecodeError:
        # The error body is not guaranteed to be JSON; fall back to the raw text.
        print(error_body)

Score URI: http://10ba1fe4-cb4f-479d-882f-b0f35d8ee221.southcentralus.azurecontainer.io/score
Swagger URL: http://10ba1fe4-cb4f-479d-882f-b0f35d8ee221.southcentralus.azurecontainer.io/swagger.json
b'"{\\"result\\": [1]}"'


In [51]:
# Print the logs returned by the deployed web service.
print(service.get_logs())

2021-04-19T23:51:52,963393900+00:00 - gunicorn/run 
2021-04-19T23:51:52,964883100+00:00 - rsyslog/run 
2021-04-19T23:51:52,989448900+00:00 - nginx/run 
2021-04-19T23:51:52,994555500+00:00 - iot-server/run 
rsyslogd: /azureml-envs/azureml_1c91e9a5baadb44a18d64bdd7f599d89/lib/libuuid.so.1: no version information available (required by rsyslogd)
/usr/sbin/nginx: /azureml-envs/azureml_1c91e9a5baadb44a18d64bdd7f599d89/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_1c91e9a5baadb44a18d64bdd7f599d89/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_1c91e9a5baadb44a18d64bdd7f599d89/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_1c91e9a5baadb44a18d64bdd7f599d89/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml

In [52]:
# Clean-up cell.
# Deleting the compute cluster is left disabled.
# NOTE(review): no `cpu_cluster` variable is defined in this notebook; the
# cluster handle created earlier is named `compute_target`.
#cpu_cluster.delete()

# Delete the deployed web service.
service.delete()

In [54]:
# Display the state of the service after deletion was requested.
service.state

'Deleting'