# Automated ML

Importing all needed dependencies to complete the project.

In [1]:
import logging
import os
import json
import csv
import numpy as np
import pandas as pd
import pkg_resources
import joblib

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.steps import AutoMLStep
from azureml.widgets import RunDetails
from azureml.core import Model, Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


## Dataset

### Overview
TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.


TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

https://www.kaggle.com/mathchi/diabetes-data-set

In [2]:
ws = Workspace.from_config()

# choose a name for experiment
experiment_name = 'automl-exp'

experiment=Experiment(ws, experiment_name)

In [3]:
# Load the registered dataset from workspace
dataset = Dataset.get_by_name(ws, name='diabetes_data_set')

# Convert the dataset to dataframe
df = dataset.to_pandas_dataframe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
#Display the first five records of the dataset
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Create CPU cluster
amlcompute_cluster_name = "project-cluster"

# Verify if cluster does not exist otherwise use the existing one
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS12_V2',
                                                           vm_priority = 'lowpriority', 
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)


Creating
Succeeded.................................................................................................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


## AutoML Configuration

TODO: Explain why you chose the automl settings and cofiguration you used below.

In [7]:
# Automl settings 
automl_settings = {
    "experiment_timeout_minutes": 40,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 5,
    "iterations": 35
    
}

# Automl config 
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=dataset,
                             label_column_name="Outcome",
                             enable_early_stopping= True,
                             featurization = 'auto',
                             debug_log = "automl_errors.log",
                             preprocess = True,
                             **automl_settings
                            )





In [14]:
# Submit experiment
remote_run = experiment.submit(automl_config, show_output=True)

Running on remote.
Running on remote compute: project-cluster
Parent Run ID: AutoML_695b34cd-c5ff-41eb-a951-ada26ec0b959

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing valu

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [15]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [16]:
remote_run.wait_for_completion(show_output=True)



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more abo

{'runId': 'AutoML_695b34cd-c5ff-41eb-a951-ada26ec0b959',
 'target': 'project-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-30T18:55:53.989746Z',
 'endTimeUtc': '2021-01-30T19:18:08.967003Z',
 'properties': {'num_iterations': '35',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'project-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"94ebd732-58a5-4651-8463-986e0c12eb38\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/01-30-2021_063708_UTC/diabetes.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-136635\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"6971f5ac-8af1-446e-8034-05acea24

In [17]:
remote_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-exp,AutoML_695b34cd-c5ff-41eb-a951-ada26ec0b959,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [18]:
# Retrieve and save best automl model.
best_model = remote_run.get_output()
best_model

(Run(Experiment: automl-exp,
 Id: AutoML_695b34cd-c5ff-41eb-a951-ada26ec0b959_33,
 Type: azureml.scriptrun,
 Status: Completed),
 Pipeline(memory=None,
          steps=[('datatransformer',
                  DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                  feature_sweeping_config=None,
                                  feature_sweeping_timeout=None,
                                  featurization_config=None, force_text_dnn=None,
                                  is_cross_validation=None,
                                  is_onnx_compatible=None, logger=None,
                                  observer=None, task=None, working_dir=None)),
                 ('prefittedsoftvotingclassifier',...
                                                                                                 subsample=0.09947368421052633,
                                                                                                 subsample_for_bin=200000,
  

In [None]:
remote_run.get_metrics()

## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [20]:
# Registring the best model
model = remote_run.register_model(model_name='automl-best-model')

In [21]:
from azureml.core.conda_dependencies import CondaDependencies

# create environment
environment = Environment(name="azure-env")
conda_dep = CondaDependencies()

# Nedded packages and scripts
conda_dep.add_conda_package("pandas")
conda_dep.add_conda_package("numpy")
conda_dep.add_conda_package("scikit-learn")
conda_dep.add_pip_package("azureml-defaults")

# Adding dependencies to the created environment
environment.python.conda_dependencies=conda_dep

#remote_run.download_file('outputs/scoring_file_v_1_0_0.py', script_file)

In [24]:
inference_config = InferenceConfig(entry_script='score.py',
                                   environment=environment)
service_name = 'automl-deploy-1'
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config,
                       overwrite=True
                      )
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running....
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [25]:
# Enable app insights
service.update(enable_app_insights=True)

TODO: In the cell below, send a request to the web service you deployed to test it.

In [40]:

data = {"data":
        [
          {
           "Pregnancies": 6, 
 "Glucose": 148, 
 "BloodPressure": 72, 
 "SkinThickness": 35, 
 "Insulin": 0, 
 "BMI": 33.5, 
 "DiabetesPedigreeFunction": 0.627, 
 "Age": 50
          },
          {
            "Pregnancies": 1, 
 "Glucose": 85, 
 "BloodPressure": 66, 
 "SkinThickness": 29, 
 "Insulin": 20, 
 "BMI": 26.5, 
 "DiabetesPedigreeFunction": 0.351, 
 "Age": 31
          },
      ]
    }
print(data)

{'data': [{'Pregnancies': 6, 'Glucose': 148, 'BloodPressure': 72, 'SkinThickness': 35, 'Insulin': 0, 'BMI': 33.5, 'DiabetesPedigreeFunction': 0.627, 'Age': 50}, {'Pregnancies': 1, 'Glucose': 85, 'BloodPressure': 66, 'SkinThickness': 29, 'Insulin': 20, 'BMI': 26.5, 'DiabetesPedigreeFunction': 0.351, 'Age': 31}]}


In [41]:
#input_data = json.dumps(data)
input_data = json.dumps({
    'data': data
})

output = service.run(input_data)
output

"name 'model' is not defined"

TODO: In the cell below, print the logs of the web service and delete the service

In [29]:
logs = service.get_logs()
logs

'2021-01-30T19:46:59,284869390+00:00 - rsyslog/run \n2021-01-30T19:46:59,285985442+00:00 - gunicorn/run \n2021-01-30T19:46:59,287908931+00:00 - iot-server/run \n2021-01-30T19:46:59,295289172+00:00 - nginx/run \n/usr/sbin/nginx: /azureml-envs/azureml_6f548a71a771959c69a66d86feeed23b/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_6f548a71a771959c69a66d86feeed23b/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_6f548a71a771959c69a66d86feeed23b/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_6f548a71a771959c69a66d86feeed23b/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_6f548a71a771959c69a66d86feeed23b/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)

In [None]:
service.delete()