# Automated ML

In [1]:
import logging

from matplotlib import pyplot as plt
import pandas as pd
import os

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.automl.core.featurization import FeaturizationConfig
from azureml.core.dataset import Dataset
from azureml.train.automl import AutoMLConfig


from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails

## Get Workspace and create Experiment

In [2]:
ws = Workspace.from_config()
experiment_name = 'Custom_Health_Insurance_CS'

exp = Experiment(ws, experiment_name)


## Dataset

### Overview
This is a dataset containing info on current customers at an insurance agency. We want to predict which of them are interested in a vehicle insurance. So it's a binary True/False prediction. The dataset is from Kaggle:
https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction

I will train the dataset using AutoML as well as create a pipeline using Hyperdrive and an XGBoost estimator. AutoML receives the raw dataset without any preprocessing like scaling, target balancing etc., while the pipeline includes a preprocessing step.

### Task
We want to predict which of the existing healthcare customers are interested in a vehicle insurance. So it's a binary True/False prediction.

In [3]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
found = False
key = "Health insurance cross sell"

if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        dl_data = 'https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction/download'
        dataset = Dataset.Tabular.from_delimited_files(dl_data)        
        #Register Dataset in Workspace
        dataset = dataset.register(workspace=ws,
                                   name=key,
                                   description="https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction")

data = dataset.to_pandas_dataframe()

In [4]:
amlcompute_cluster_name = "MS-CPUcompute"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           vm_priority = 'lowpriority',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 2)

Found existing cluster, use it.
Succeeded........................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


## AutoML Configuration

- **Task:** Classification
- **Primary metric:** Accuracy. When the dataset is balanced as we have made it we can use the metric "accuracy" without any problems. Also this is isn't a critical disease we need to predict so "false negatives" isn't a big issue. 
- **N cross validation:** 5 - Usual values are 3, 5 and 10. But 5 gives a reliable result. 10 takes longer to compute.
- **Enable early stopping:** True - saves us some computational power by stopping runs that shows early signs of giving a bad result (compared to previous runs)
- **Featurization:** Auto - to let AutoML handle preprocessing of dataset as this is somewhat of a simple dataset and doesn't require much feature engineering.

In [5]:
automl_settings = {
    "featurization" : 'auto',
    "n_cross_validations": 5,
    "experiment_timeout_minutes": 90,
    "enable_early_stopping": True,
    "verbosity": logging.INFO
}

automl_config = AutoMLConfig(
    compute_target = compute_target,
    training_data = dataset,
    task='classification',
    primary_metric='accuracy',
    max_concurrent_iterations = 4,
    max_cores_per_iteration = -1,
    label_column_name='Response',
    **automl_settings
)


## Submit experiment

In [6]:
remote_run = exp.submit(automl_config)
RunDetails(remote_run).show()

Running on remote.


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [7]:
remote_run.wait_for_completion()

{'runId': 'AutoML_05628d2a-0a03-48d5-afd2-fe0401f4d126',
 'target': 'MS-CPUcompute',
 'status': 'Completed',
 'startTimeUtc': '2021-02-13T12:54:15.450122Z',
 'endTimeUtc': '2021-02-13T13:46:44.674239Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'MS-CPUcompute',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"6a204659-7bba-4d58-8fc5-7db0962b0e65\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/01-22-2021_023826_UTC/Health_insurance_cross_sell.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"MS_Test\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"336509a3-005c-4650-b1e5-e1c99e

## Best Model


In [8]:
# Wait for the best model explanation run to complete
from azureml.core.run import Run
model_explainability_run_id = remote_run.id + "_" + "ModelExplain"
print(model_explainability_run_id)
model_explainability_run = Run(experiment=exp, run_id=model_explainability_run_id)
model_explainability_run.wait_for_completion()

AutoML_05628d2a-0a03-48d5-afd2-fe0401f4d126_ModelExplain


{'runId': 'AutoML_05628d2a-0a03-48d5-afd2-fe0401f4d126_ModelExplain',
 'target': 'MS-CPUcompute',
 'status': 'Completed',
 'startTimeUtc': '2021-02-13T13:46:55.379112Z',
 'endTimeUtc': '2021-02-13T13:53:01.624854Z',
 'properties': {'azureml.runsource': 'automl',
  'parentRunId': 'AutoML_05628d2a-0a03-48d5-afd2-fe0401f4d126_36',
  '_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '78886477-c265-4f7a-9ba5-0ddff3f2cf54',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json',
  'dependencies_versions': '{"azureml-train-automl-runtime": "1.21.0", "azureml-train-automl-client": "1.21.0", "azureml-telemetry": "1.21.0", "azureml-pipeline-core": "1.21.0", "azureml-model-management-sdk": "1.0.1b6.post1", "azureml-interpret": "1.21.0", "azureml-defaults": "1.21.0", "azureml-dataset-runtime": "1.21.0", "azureml-dataprep": "2.8.2", "azureml-dataprep-rslex": "1.6.0", "azureml-dataprep-native": "28.0.0", "azureml-core": "1.21

In [9]:
best_automl_run, fitted_model = remote_run.get_output()
best_automl_run

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


Experiment,Id,Type,Status,Details Page,Docs Page
Custom_Health_Insurance_CS,AutoML_05628d2a-0a03-48d5-afd2-fe0401f4d126_36,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [10]:
best_automl_run.get_tags()

{'_aml_system_azureml.automlComponent': 'AutoML',
 '_aml_system_ComputeTargetStatus': '{"AllocationState":"steady","PreparingNodeCount":0,"RunningNodeCount":4,"CurrentNodeCount":4}',
 'ensembled_iterations': '[17, 16, 14, 6, 34, 4, 0]',
 'ensembled_algorithms': "['XGBoostClassifier', 'XGBoostClassifier', 'XGBoostClassifier', 'XGBoostClassifier', 'XGBoostClassifier', 'RandomForest', 'LightGBM']",
 'ensemble_weights': '[0.2727272727272727, 0.09090909090909091, 0.2727272727272727, 0.09090909090909091, 0.09090909090909091, 0.09090909090909091, 0.09090909090909091]',
 'best_individual_pipeline_score': '0.8774969891640481',
 'best_individual_iteration': '17',
 '_aml_system_automl_is_child_run_end_telemetry_event_logged': 'True',
 'model_explain_run_id': 'AutoML_05628d2a-0a03-48d5-afd2-fe0401f4d126_ModelExplain',
 'model_explanation': 'True'}

### Conclusion on model
AutoML tried various different model including, XGBoost, RandomForest, ExtremeRanddomForest, Logistic Regression, stack and ensemble models. The best performing model had an accuracy of 87.76% and was an ensemble model consisting of 5 XGBoost Classifiers, 1 RandomForest and 1 LightGBM.

## Register model

In [12]:
best_automl_run.register_model(model_name = "Insurance_AuotML_Model", model_path = './outputs/')
print(fitted_model._final_estimator)

PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('17',
                                           Pipeline(memory=None,
                                                    steps=[('sparsenormalizer',
                                                            <azureml.automl.runtime.shared.model_wrappers.SparseNormalizer object at 0x7f2abd492f28>),
                                                           ('xgboostclassifier',
                                                            XGBoostClassifier(base_score=0.5,
                                                                              booster='gbtree',
                                                                              colsample_bylevel=1,
                                                                              colsample_bynode=1,
                                                                              colsample_bytree=1,
                               

## Delete compute-target

In [13]:
compute_target.delete()

Current provisioning state of AmlCompute is "Deleting"

