# Automated ML

Import all dependencies

In [1]:
import json
import azureml.core
from azureml.core import Workspace, Experiment, Model
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
from azureml.train.automl import AutoMLConfig
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.dataset import Dataset


print('SDK version:', azureml.core.VERSION)



SDK version: 1.42.0


## Initialize workspace
Initialize a workspace from configuration

In [2]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

# choose a name for experiment
experiment_name = 'automl-heart-experiment'

experiment=Experiment(ws, experiment_name)

run = experiment.start_logging()

quick-starts-ws-202911
aml-quickstarts-202911
southcentralus
1b944a9b-fdae-4f97-aeb1-b7eea0beac53


## Create Compute Cluster

In [3]:
amlcompute_cluster_name = "auto-ml"

# Verify if the cluster does not exist
try:
    compute_target = ComputeTarget(
        workspace = ws,
        name = amlcompute_cluster_name
    )
    print('Found Existing cluster, use it')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size = 'STANDARD_D2_V2',
        max_nodes = 4
    )
    compute_target = ComputeTarget.create(
        ws, 
        amlcompute_cluster_name,
        compute_config
    )
compute_target.wait_for_completion(
    show_output = True,
    min_node_count = 1,
    timeout_in_minutes = 3
)

Found Existing cluster, use it
Succeeded...................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


## Dataset
I used the Heart Failure Dataset from kaggle.
Cardiovascular diseases (CVDs) are the number 1 cause of death globally, taking an estimated 17.9 million lives each year, which accounts for 31% of all deaths worlwide.
Heart failure is a common event caused by CVDs and this dataset contains 12 features that can be used to predict mortality by heart failure.

Most cardiovascular diseases can be prevented by addressing behavioural risk factors such as tobacco use, unhealthy diet and obesity, physical inactivity and harmful use of alcohol using population-wide strategies.

People with cardiovascular disease or who are at high cardiovascular risk (due to the presence of one or more risk factors such as hypertension, diabetes, hyperlipidaemia or already established disease) need early detection and management wherein a machine learning model can be of great help.

I want to create a model to a model to predict mortality by heart failure.

In [4]:
key = "heart-failure"

dataset = ws.datasets[key]

df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [5]:
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## AutoML Configuration

Here are the automl configurations:
* **max_concurrent_iterations:** Represents the maximum number of iterations that would be executed in parallel. The default value is 1.
* **experiment_timeout_minutes:** The minutes before experiment terminates
* **n_cross_validations:** To avoid overfitting, we need to use cross validation
* **primary_metric:** Accuracy
* **task:** Classification

In [6]:
# automl settings
automl_settings = {
    "experiment_timeout_minutes": 15,
    "iterations": 40,
    "max_concurrent_iterations": 4,
    "n_cross_validations": 3,
    "primary_metric": "accuracy"
}

# automl config
automl_config = AutoMLConfig(
    compute_target = compute_target,
    task = "classification",
    training_data = dataset,
    label_column_name = 'DEATH_EVENT',
    enable_early_stopping = True,
    debug_log = 'automl_errors.log',
    **automl_settings
)

In [7]:
# Submit your experiment
remote_run = experiment.submit(automl_config, show_output = True)

Submitting remote run.
No run_configuration provided, running on auto-ml with default configuration
Running on remote compute: auto-ml


Experiment,Id,Type,Status,Details Page,Docs Page
automl-heart-experiment,AutoML_aef658ec-40b2-4036-b15c-0d4f4eedb798,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

********************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

**********************************************************************************

## Run Details

In [8]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model

In [10]:
best_run, fitted_model = remote_run.get_output()

Package:azureml-automl-runtime, training version:1.43.0, current version:1.42.0
Package:azureml-core, training version:1.43.0, current version:1.42.0
Package:azureml-dataset-runtime, training version:1.43.0, current version:1.42.0
Package:azureml-defaults, training version:1.43.0, current version:1.42.0
Package:azureml-interpret, training version:1.43.0, current version:1.42.0
Package:azureml-mlflow, training version:1.43.0.post1, current version:1.42.0
Package:azureml-pipeline-core, training version:1.43.0, current version:1.42.0
Package:azureml-responsibleai, training version:1.43.0, current version:1.42.0
Package:azureml-telemetry, training version:1.43.0, current version:1.42.0
Package:azureml-train-automl-client, training version:1.43.0, current version:1.42.0
Package:azureml-train-automl-runtime, training version:1.43.0.post1, current version:1.42.0
Package:azureml-train-core, training version:1.43.0, current version:1.42.0
Package:azureml-train-restclients-hyperdrive, training v

In [11]:
best_metrics = best_run.get_metrics()
print(f"Best Run Id: {best_run.id}", 
      f"Accuracy: {best_metrics['accuracy']}",
      f"Best Metrics: {best_metrics}",
      f"Best Model: {fitted_model}")


Best Run Id: AutoML_aef658ec-40b2-4036-b15c-0d4f4eedb798_37 Accuracy: 0.8595286195286196 Best Metrics: {'f1_score_macro': 0.8297084365390162, 'balanced_accuracy': 0.8213546058373645, 'precision_score_micro': 0.8595286195286196, 'average_precision_score_micro': 0.9090721226893348, 'f1_score_weighted': 0.8537289281396695, 'recall_score_weighted': 0.8595286195286196, 'accuracy': 0.8595286195286196, 'AUC_macro': 0.8999929979240324, 'f1_score_micro': 0.8595286195286196, 'precision_score_macro': 0.856624582583786, 'recall_score_macro': 0.8213546058373645, 'weighted_accuracy': 0.8897873448195064, 'AUC_weighted': 0.8999929979240324, 'matthews_correlation': 0.6757879301394585, 'average_precision_score_macro': 0.8852512127689834, 'recall_score_micro': 0.8595286195286196, 'AUC_micro': 0.9087137230894807, 'average_precision_score_weighted': 0.9081776662917459, 'log_loss': 0.41552258436965656, 'precision_score_weighted': 0.8633436421194771, 'norm_macro_recall': 0.6427092116747289, 'accuracy_table':

In [12]:
print(fitted_model._final_estimator)

PreFittedSoftVotingClassifier(classification_labels=numpy.array([0, 1]), estimators=[('21', Pipeline(memory=None, steps=[('standardscalerwrapper', StandardScalerWrapper(copy=True, with_mean=False, with_std=False)), ('xgboostclassifier', XGBoostClassifier(booster='gbtree', colsample_bytree=1, eta=0.3, gamma=0, max_depth=10, max_leaves=511, n_estimators=10, n_jobs=1, objective='reg:logistic', problem_info=ProblemInfo(gpu_training_param_dict={'processing_unit_type': 'cpu'}), random_state=0, reg_alpha=2.1875, reg_lambda=0.4166666666666667, subsample=0.5, tree_method='auto'))], verbose=False)), ('32', Pipeline(memory=None, steps=[('standardscalerwrapper', StandardScalerWrapper(copy=True, with_mean=False, with_std=False)), ('xgboostclassifier', XGBoostClassifier(booster='gbtree', colsample_bytree=0.9, eta=0.3, gamma=10, max_depth=8, max_leaves=127, n_estimators=100, n_jobs=1, objective='reg:logistic', problem_info=ProblemInfo(gpu_training_param_dict={'processing_unit_type': 'cpu'}), random_s

In [13]:
# Save the best model
model = best_run.register_model(
    model_name = 'heart-failure-best-model-automl',
    model_path = './outputs/model.pkl'
)
print(model)

Model(workspace=Workspace.create(name='quick-starts-ws-202911', subscription_id='1b944a9b-fdae-4f97-aeb1-b7eea0beac53', resource_group='aml-quickstarts-202911'), name=heart-failure-best-model-automl, id=heart-failure-best-model-automl:5, version=5, tags={}, properties={})


In [14]:
for model in Model.list(ws):
    print(model.name, 'Version:', model.version)

heart-failure-best-model-automl Version: 5
heart-failure-best-model-automl Version: 4


## Model Deployment

Deploy the model using azure container instance

In [15]:
import os
os.makedirs('./aml-model', exist_ok=True)
best_run.download_file(
    '/outputs/model.pkl',
    os.path.join('./aml-model','heart-failure-best-model-automl.pkl'))

for f in best_run.get_file_names():
    if f.startswith('outputs'):
        output_file_path = os.path.join('./aml-model', f.split('/')[-1])
        print(f"Downloading the {f} to {output_file_path} ")
        best_run.download_file(
            name = f,
            output_file_path = output_file_path
        )

Downloading the outputs/conda_env_v_1_0_0.yml to ./aml-model/conda_env_v_1_0_0.yml 
Downloading the outputs/engineered_feature_names.json to ./aml-model/engineered_feature_names.json 
Downloading the outputs/env_dependencies.json to ./aml-model/env_dependencies.json 
Downloading the outputs/featurization_summary.json to ./aml-model/featurization_summary.json 
Downloading the outputs/generated_code/conda_environment.yaml to ./aml-model/conda_environment.yaml 
Downloading the outputs/generated_code/script.py to ./aml-model/script.py 
Downloading the outputs/generated_code/script_run_notebook.ipynb to ./aml-model/script_run_notebook.ipynb 
Downloading the outputs/internal_cross_validated_models.pkl to ./aml-model/internal_cross_validated_models.pkl 
Downloading the outputs/model.pkl to ./aml-model/model.pkl 
Downloading the outputs/pipeline_graph.json to ./aml-model/pipeline_graph.json 
Downloading the outputs/run_id.txt to ./aml-model/run_id.txt 
Downloading the outputs/scoring_file_pbi_

In [16]:
# Define the environment
best_run.download_file('outputs/conda_env_v_1_0_0.yml', 'conda_env.yml')
environment = Environment.from_conda_specification(
    name = 'heart-failure-env',
    file_path = 'conda_env.yml'
)

In [17]:
# Download the scoring file

model1 = best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score.py')

In [18]:
# Inference configuration
inference_config = InferenceConfig(
    entry_script = 'score.py',
    environment = environment
)

In [19]:
# ACI configuration
aci_config = AciWebservice.deploy_configuration(
    cpu_cores = 1,
    memory_gb = 1,
    # auth_enabled = True,
    enable_app_insights = True
)

In [20]:
# Deploy
webservice = Model.deploy(
    workspace = ws,
    name = 'heart-failure-ws',
    models = [model],
    inference_config = inference_config,
    deployment_config = aci_config,
    overwrite=True
)

print('****'*20)

********************************************************************************


In [21]:
webservice.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2022-08-01 18:19:44+00:00 Creating Container Registry if not exists.
2022-08-01 18:19:44+00:00 Registering the environment.
2022-08-01 18:19:45+00:00 Use the existing image.
2022-08-01 18:19:46+00:00 Generating deployment configuration.
2022-08-01 18:19:46+00:00 Submitting deployment to compute.
2022-08-01 18:19:49+00:00 Checking the status of deployment heart-failure-ws..
2022-08-01 18:22:25+00:00 Checking the status of inference endpoint heart-failure-ws.
Succeeded
ACI service creation operation finished, operation "Succeeded"


## Consume

In [23]:
data = ('{"data": [{"age": 70.0, "anaemia": 0, "creatinine_phosphokinase": 582, '
 '"diabetes": 0, "ejection_fraction": 40, "high_blood_pressure": 0, '
 '"platelets": 51000.0, "serum_creatinine": 2.7, "serum_sodium": 136, "sex": '
 '1, "smoking": 1, "time": 250}, {"age": 62.0, "anaemia": 1, '
 '"creatinine_phosphokinase": 655, "diabetes": 0, "ejection_fraction": 40, '
 '"high_blood_pressure": 0, "platelets": 283000.0, "serum_creatinine": 0.7, '
 '"serum_sodium": 133, "sex": 0, "smoking": 0, "time": 233}, {"age": 65.0, '
 '"anaemia": 0, "creatinine_phosphokinase": 582, "diabetes": 1, '
 '"ejection_fraction": 40, "high_blood_pressure": 0, "platelets": 270000.0, '
 '"serum_creatinine": 1.0, "serum_sodium": 138, "sex": 0, "smoking": 0, '
 '"time": 140}, {"age": 42.0, "anaemia": 0, "creatinine_phosphokinase": 5209, '
 '"diabetes": 0, "ejection_fraction": 30, "high_blood_pressure": 0, '
 '"platelets": 226000.0, "serum_creatinine": 1.0, "serum_sodium": 140, "sex": '
 '1, "smoking": 1, "time": 87}, {"age": 85.0, "anaemia": 1, '
 '"creatinine_phosphokinase": 910, "diabetes": 0, "ejection_fraction": 50, '
 '"high_blood_pressure": 0, "platelets": 235000.0, "serum_creatinine": 1.3, '
 '"serum_sodium": 134, "sex": 1, "smoking": 0, "time": 121}]}')


In [24]:
# Predictions
output = webservice.run(data)
print(output)

{"result": [0, 0, 0, 0, 0]}


In [25]:
webservice.get_logs()



In [None]:
# webservice.delete()
# compute_target.delete()

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
