# Automated ML

The dependencies required for the project are imported below.

In [1]:
from azureml.core import Workspace
from azureml.core.experiment import Experiment
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import  Dataset
import shutil
import os
import zipfile
import pandas as pd
from azureml.train.automl import AutoMLConfig
import json
from azureml.widgets import RunDetails
import joblib

## Workspace

Gather the Workspace details from the config file and create an Experiment to run AutoML.

In [2]:
# Load the workspace from the local config file and open the experiment
# under which the AutoML runs are recorded.
ws = Workspace.from_config()
experiment_name = 'automl_capstone_exp'
experiment = Experiment(workspace=ws, name=experiment_name)

## Dataset
Reference:https://www.kaggle.com/andrewmvd/heart-failure-clinical-data?select=heart_failure_clinical_records_dataset.csv

### Overview
This dataset contains 12 features that can be used to predict mortality by heart failure denoted by the dependent variable DEATH_EVENT

Goal: To predict the DEATH_EVENT (0 or 1) of a record.




### Loading the Data from Kaggle

In [3]:
#Load Data for the AutoML model
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 769 kB/s  eta 0:00:01
Collecting python-slugify
  Downloading python_slugify-5.0.2-py2.py3-none-any.whl (6.7 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.1 MB/s  eta 0:00:01
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l- \ done
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73053 sha256=00d97d5b013baadc884746b68c45019c34777bb8bce443c2df8f77472ec93c7f
  Stored in directory: /home/azureuser/.cache/pip/wheels/77/47/e4/44a4ba1b7dfd53faaa35f59f1175e123b213ff401a8a56876b
Successfully built kaggle
Installing collected packages: text-unidecode, python-slugify, kaggle
Successfully installed kaggle-1.5.12 python-slugify-5.0.2 text-unidecode-1.3


In [4]:
# Create the working folders used by the Kaggle download
# (Ref: https://inclusive-ai.medium.com/how-to-use-kaggle-api-with-azure-machine-learning-service-da056708fc5a)
#   data_folder       - where the dataset archive is extracted
#   kaggle_folder     - where kaggle.json is uploaded through the notebook UI
#   kaggle_key_folder - where the Kaggle client reads its credentials from
data_folder = os.path.join(os.getcwd(), 'data')
os.makedirs(data_folder, exist_ok=True)
kaggle_folder = os.path.join(os.getcwd(), '.kaggle')
os.makedirs(kaggle_folder, exist_ok=True)
# Resolve the credentials folder from the current user's home directory instead
# of hard-coding /home/azureuser, so the notebook also runs under other users.
kaggle_key_folder = os.path.join(os.path.expanduser('~'), '.kaggle')
os.makedirs(kaggle_key_folder, exist_ok=True)

In [5]:
#Upload the kaggle.json(Generated from Kaggle account Page) generated from kaggle in .kaggle folder

kaggle_file = kaggle_folder + '/kaggle.json'
shutil.copy(kaggle_file, kaggle_key_folder)
os.remove(kaggle_file)
!chmod 600 /home/azureuser/.kaggle/kaggle.json

#Data Download
import kaggle
!kaggle --version
!kaggle datasets download -d andrewmvd/heart-failure-clinical-data


Kaggle API 1.5.12
Downloading heart-failure-clinical-data.zip to /mnt/batch/tasks/shared/LS_root/mounts/clusters/heartcompute/code/Users/mashrajiv
100%|██████████████████████████████████████| 3.97k/3.97k [00:00<00:00, 21.7kB/s]
100%|██████████████████████████████████████| 3.97k/3.97k [00:00<00:00, 20.5kB/s]


In [6]:
# Extract the downloaded Kaggle archive into the data folder.
with zipfile.ZipFile("heart-failure-clinical-data.zip", "r") as archive:
    archive.extractall(data_folder)

# Print the full path of every file found under the data folder.
for dirpath, _dirnames, filenames in os.walk(data_folder, topdown=True):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

/mnt/batch/tasks/shared/LS_root/mounts/clusters/heartcompute/code/Users/mashrajiv/data/heart_failure_clinical_records_dataset.csv


In [7]:
# Load the extracted heart-failure CSV into a pandas DataFrame for local inspection.
train = pd.read_csv('./data/heart_failure_clinical_records_dataset.csv')


In [8]:
# Upload the local ./data folder to the workspace's default datastore and
# expose the uploaded CSV as a tabular dataset.
datastore = ws.get_default_datastore()
datastore.upload(src_dir='./data', target_path='./data_ds')
heart_csv_location = (datastore, 'data_ds/heart_failure_clinical_records_dataset.csv')
datastore_paths = [heart_csv_location]
heart_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)


Uploading an estimated of 1 files
Uploading ./data/heart_failure_clinical_records_dataset.csv
Uploaded ./data/heart_failure_clinical_records_dataset.csv, 1 files out of an estimated total of 1
Uploaded 1 files


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1


In [10]:
# Materialize the tabular dataset as a pandas DataFrame.
heart_df = heart_ds.to_pandas_dataframe()

### Compute Creation

In [12]:
# Remote compute for the experiment (needed when the local instance is not used).
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse or create.
cpu_cluster_name = "cpucluster"

# Look the cluster up by name; if the lookup raises, provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=6,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cpu_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing cluster, use it.')

compute_target.wait_for_completion(show_output=True)

Creating......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration

Choice of AutoML Settings:
 
#### 1. n_cross_validations 
Indicates how many cross-validation folds to perform. With 5 folds on this dataset, each fold trains on ~240 records and validates on ~60.
 
#### 2. Primary Metric
The primary metric chosen here is accuracy, which measures how much of the sample has been correctly classified. We could also use AUC as the metric, which summarizes how well the model separates the two DEATH_EVENT classes across all decision thresholds.

#### 3. enable early stopping
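Early stopping is enabled so that the experiment terminates early when the score is no longer improving in the short term, which saves compute time.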

#### 4. Experiment Stop time 
`experiment_timeout_minutes` is set to 20 to keep cost and run time under control.

#### 5.Created compute
Going for Remote Compute to avoid dependency issues.

In [19]:

# AutoML settings: 5-fold cross validation, accuracy as the primary metric,
# early stopping enabled, and a 20 minute cap on the whole experiment.
automl_settings = {
    "n_cross_validations": 5,
    "primary_metric": 'accuracy',
    "enable_early_stopping": True,
    "experiment_timeout_minutes": 20,
}

# Classification on the tabular dataset with DEATH_EVENT as the label.
# The run targets the remote cluster object from the Compute Creation cell
# rather than repeating its name as a string literal. (Local compute accepts
# DataFrames but ran into dependency issues.)
automl_config = AutoMLConfig(
    task='classification',
    compute_target=compute_target,
    training_data=heart_ds,
    label_column_name='DEATH_EVENT',
    **automl_settings)

In [20]:
# Submit the AutoML configuration to the experiment without streaming the output.
automl_run = experiment.submit(automl_config, show_output=False)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
automl_capstone_exp,AutoML_94ff2bf8-d1ab-42ef-bdb5-d5219de4ff4e,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details

There are about 11 models which have run as a part of this experiment.

Voting Ensemble is the top performing model w.r.t the Primary metric.

The Ensemble models perform better than the individual models since they combine bagging, boosting and stacking to provide the results.
They also combine the results and minimise the variance component of the error.

We can explore the results of automatic training with a Jupyter widget. 
Additionally, we can use the dropdown selector to filter on accuracy metrics other than the primary metric (accuracy).

In [22]:

# Show the Jupyter widget for the submitted AutoML run.
RunDetails(automl_run).show()


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [23]:
# Wait for the AutoML run to complete (output streaming off); as the last
# expression of the cell, the returned run details are displayed.
automl_run.wait_for_completion(show_output=False)

{'runId': 'AutoML_94ff2bf8-d1ab-42ef-bdb5-d5219de4ff4e',
 'target': 'cpucluster',
 'status': 'Completed',
 'startTimeUtc': '2021-08-02T05:47:18.156659Z',
 'endTimeUtc': '2021-08-02T06:14:59.313177Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'cpucluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"9d6a1df3-b413-4338-8f7e-315492df17ef\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.32.0", "azureml-train": "1.32.0", "azureml-train-restclients-hyperdrive": "1.32.0", "azureml-train-core": "1.32.0", "azureml-train-automl": "1.32.0", "azureml-train-automl-runtime": "1.32.0", "azureml-train-automl-client": "1.32.0", "azure

## Best Model

Getting  the best model from the automl experiments and display all the properties of the model.



In [24]:
# Fetch the best child run of the AutoML parent run.
best_automl_run = automl_run.get_best_child()


In [25]:
# Display the summary of the best child run.
best_automl_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl_capstone_exp,AutoML_94ff2bf8-d1ab-42ef-bdb5-d5219de4ff4e_9,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [26]:
# Print every metric logged by the best run (any other run works the same way).
best_run_metrics = best_automl_run.get_metrics()
for metric_name, metric in best_run_metrics.items():
    print(metric_name, metric)

recall_score_weighted 0.8763276836158193
average_precision_score_micro 0.914001363343919
average_precision_score_weighted 0.9216747825657275
precision_score_micro 0.8763276836158193
recall_score_macro 0.8445833333333332
f1_score_macro 0.8487715597828899
f1_score_micro 0.8763276836158193
precision_score_macro 0.8859081804947857
matthews_correlation 0.7265774471999167
AUC_weighted 0.9123256506090808
balanced_accuracy 0.8445833333333332
recall_score_micro 0.8763276836158193
AUC_micro 0.9144829550895336
weighted_accuracy 0.8977905378029968
precision_score_weighted 0.894403880597974
norm_macro_recall 0.6891666666666667
average_precision_score_macro 0.8998958306439382
accuracy 0.8763276836158193
f1_score_weighted 0.8712373429020379
AUC_macro 0.9123256506090808
log_loss 0.3849086300466243
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_94ff2bf8-d1ab-42ef-bdb5-d5219de4ff4e_9/accuracy_table
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_94ff2bf8-d1ab-42ef-bdb5-d5219de4ff4

In [27]:
# List the files stored with the best run (logs, explanations, outputs).
best_automl_run.get_file_names()

['accuracy_table',
 'automl_driver.py',
 'azureml-logs/55_azureml-execution-tvmps_a3e0788c9ba62d73bc2e41d62582fcc410ae0f4c2c1d248dcbba1accb8c96dc4_d.txt',
 'azureml-logs/65_job_prep-tvmps_a3e0788c9ba62d73bc2e41d62582fcc410ae0f4c2c1d248dcbba1accb8c96dc4_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_a3e0788c9ba62d73bc2e41d62582fcc410ae0f4c2c1d248dcbba1accb8c96dc4_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'confusion_matrix',
 'explanation/79a65aa7/classes.interpret.json',
 'explanation/79a65aa7/eval_data_viz.interpret.json',
 'explanation/79a65aa7/expected_values.interpret.json',
 'explanation/79a65aa7/features.interpret.json',
 'explanation/79a65aa7/global_names/0.interpret.json',
 'explanation/79a65aa7/global_rank/0.interpret.json',
 'explanation/79a65aa7/global_values/0.interpret.json',
 'explanation/79a65aa7/local_importance_values.interpret.json',
 'explanation/79a65aa7/per_class_names/0.interpret.json',
 'explanati

In [28]:
# Save the best model and its deployment assets in the local outputs folder.
outputs_folder = os.path.join(os.getcwd(), 'outputs')
os.makedirs(outputs_folder, exist_ok=True)

# (run artifact, local destination): model, scoring script, conda environment.
artifact_targets = (
    ('outputs/model.pkl', './outputs/'),
    ('outputs/scoring_file_v_1_0_0.py', './outputs/score1.py'),
    ('outputs/conda_env_v_1_0_0.yml', './outputs/env.yaml'),
)
for remote_name, local_path in artifact_targets:
    best_automl_run.download_file(remote_name, output_file_path=local_path)


## Model Deployment

Remember you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

Registering the model, creating an inference config and deploying the model as a web service.

In [30]:
# Register the best run's model file in the workspace.
model_auto = best_automl_run.register_model(
    model_name='AUTOML_ATTEMPT',
    model_path='outputs/model.pkl',
    description='Heart Failure Prediction using AutoML',
)


In [31]:
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice, webservice

# Inference environment built from the conda file downloaded with the model.
infenv = Environment.from_conda_specification(name="infenv", file_path="outputs/env.yaml")

# Inference configuration: downloaded scoring script plus that environment.
inference_config = InferenceConfig(entry_script='outputs/score1.py', environment=infenv)

# ACI deployment configuration: 1 CPU core and 1 GB of memory.
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    tags={'type': 'automl-classification'},
    description='Sample Web Service for AutoML Classification',
)

# Deploy the registered model, wait for the deployment, then report its state.
aci_service_name = "automl-classification"
print(aci_service_name)
aci_service = Model.deploy(
    workspace=ws,
    name=aci_service_name,
    models=[model_auto],
    inference_config=inference_config,
    deployment_config=deployment_config,
)
aci_service.wait_for_deployment(show_output=True)
print(aci_service.state)



automl-classification
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-08-02 07:09:49+00:00 Creating Container Registry if not exists..
2021-08-02 07:19:49+00:00 Registering the environment..
2021-08-02 07:19:56+00:00 Building image..
2021-08-02 07:34:45+00:00 Generating deployment configuration..
2021-08-02 07:34:46+00:00 Submitting deployment to compute..
2021-08-02 07:34:51+00:00 Checking the status of deployment automl-classification..
2021-08-02 07:39:20+00:00 Checking the status of inference endpoint automl-classification.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


Sending a request to the web service  deployed to test it.

In [32]:
# Build a scoring sample from the same CSV that was used for training.
# NOTE: skiprows=[1, 250] skips only file lines 1 and 250 (two records), not the
# range 1..250, so 297 of the 299 records are sent to the service. AutoML was
# trained on the full dataset, so this is not an unseen holdout set and the
# accuracy computed from it is measured on training data.
validation_full = pd.read_csv('./data/heart_failure_clinical_records_dataset.csv', skiprows=[1, 250])
validation_labels = validation_full['DEATH_EVENT']
# Drop the label without mutating the frame that was read.
validation = validation_full.drop(columns=['DEATH_EVENT'])

# The service is sent a JSON document of the form {"data": [record, ...]}.
test_sample = json.dumps({"data": validation.to_dict(orient='records')})
response = aci_service.run(test_sample)
response

'{"result": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}'

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Parse the JSON string returned by the web service and compare its
# predictions with the known labels.
res_dict = json.loads(response)
Predicted_label = pd.Series(res_dict['result'])

print(accuracy_score(validation_labels, Predicted_label))
print(confusion_matrix(validation_labels, Predicted_label))

0.9562289562289562
[[198   4]
 [  9  86]]


TODO: In the cell below, print the logs of the web service and delete the service

In [36]:
# Print the web service logs (up to 5000 lines). Printing renders the log's
# line breaks, where a bare expression would show the string's repr.
print(aci_service.get_logs(num_lines=5000, init=False))



In [53]:
# Delete the deployed web service once testing is finished.
aci_service.delete()