Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Tutorial: Azure Automated Machine Learning

## Import packages
Begin by importing the SDK.

In [1]:
import azureml.dataprep as dprep
import pandas as pd

## Load data
Load the diabetes and doctors data set

In [2]:
diabetes = pd.read_csv("./demodata/diabetes.csv", encoding='cp1252')
doctors = pd.read_csv("./demodata/doctors.csv", encoding='cp1252')
#b = b.dropna(axis=1)
merged = diabetes.merge(doctors, on='PatientID')
merged.to_csv("./demodata/output.csv", index=False)

dataset=dprep.auto_read_file("./demodata/output.csv")
dataset.head(5)


Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic,Physician
0,1354778.0,0.0,171.0,80.0,34.0,23.0,43.509726,1.213191,21.0,0.0,Dan Drayton
1,1147438.0,8.0,92.0,93.0,47.0,36.0,21.240576,0.158365,23.0,0.0,Anthony Frizzell
2,1640031.0,7.0,115.0,47.0,52.0,35.0,41.511523,0.079019,23.0,0.0,Gordon Fredrickson
3,1883350.0,9.0,103.0,78.0,25.0,304.0,29.582192,1.28287,43.0,1.0,Chad Corbitt
4,1424119.0,1.0,85.0,59.0,27.0,35.0,42.604536,0.549542,22.0,0.0,Zachary Fellows


In [3]:
import os
file_path = os.path.join(os.getcwd(), "dflowsdemo.dprep")

dflow_prepared = dataset
package = dprep.Package([dflow_prepared])
package.save(file_path)

Package
  name: None
  path: C:\Users\lighahre\OneDrive - Microsoft\Trainings\AutoML_Practice\dflowsdemo.dprep
  dataflows: [
    Dataflow {
      name: output
      steps: 4
    },
  ]

## Import packages
Import Python packages you need in this tutorial.

In [4]:
import azureml.core
import pandas as pd
from azureml.core.workspace import Workspace
import logging

### Configure workspace

Create a workspace object from the existing workspace. A `Workspace` is a class that accepts your Azure subscription and resource information, and creates a cloud resource to monitor and track your model runs. `Workspace.from_config()` reads the file **aml_config/config.json** and loads the details into an object named `ws`.  `ws` is used throughout the rest of the code in this tutorial.

Once you have a workspace object, specify a name for the experiment and create and register a local directory with the workspace. The history of all runs is recorded under the specified experiment.

In [7]:
ws = Workspace.from_config()
# choose a name for the run history container in the workspace
experiment_name = 'automated-ml-regression'
# project folder
project_folder = './automated-ml-regression'

import os

output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
pd.set_option('display.max_colwidth', -1)
outputDf = pd.DataFrame(data = output, index = [''])
outputDf.T

Found the config file in: C:\Users\lighahre\OneDrive - Microsoft\Trainings\AutoML_Practice\config.json


Unnamed: 0,Unnamed: 1
SDK version,0.1.0.1179783
Subscription ID,20c286f5-1d7a-49dd-b06f-a0a852d9173c
Workspace,aml_autoML
Resource Group,AutoML
Location,eastus2
Project Directory,./automated-ml-regression


## Explore data

Utilize the data flow object you have previously created. Open and execute the data flow and review the results.

In [6]:
import azureml.dataprep as dprep

file_path = os.path.join(os.getcwd(), "dflowsdemo.dprep")

package_saved = dprep.Package.open(file_path)
dflow_prepared = package_saved.dataflows[0]
dflow_prepared.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Not Missing Count,Percent missing,Error Count,Empty count,0.1% Quantile,1% Quantile,5% Quantile,25% Quantile,50% Quantile,75% Quantile,95% Quantile,99% Quantile,99.9% Quantile,Mean,Standard Deviation,Variance,Skewness,Kurtosis
PatientID,FieldType.DECIMAL,1.00004e+06,2e+06,15000.0,0.0,15000.0,0.0,0.0,0.0,1001440.0,1101700.0,1098480.0,1252380.0,1505620.0,1754580.0,1952410.0,1990950.0,1999090.0,1502920.0,289253.0,83667600000.0,-0.0122755,-1.20476
Pregnancies,FieldType.DECIMAL,0,14,15000.0,0.0,15000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,6.0,9.3967,11.7078,13.9803,3.22453,3.39102,11.499,0.82339,-0.507502
PlasmaGlucose,FieldType.DECIMAL,44,192,15000.0,0.0,15000.0,0.0,0.0,0.0,44.0,69.4799,69.0,83.9662,104.514,129.267,168.406,178.878,189.379,107.857,31.982,1022.85,0.324805,-0.544078
DiastolicBloodPressure,FieldType.DECIMAL,24,117,15000.0,0.0,15000.0,0.0,0.0,0.0,24.375,49.0,49.0,57.7383,72.0715,84.9568,95.8885,104.541,115.338,71.2207,16.7587,280.855,-0.109286,-0.809598
TricepsThickness,FieldType.DECIMAL,7,93,15000.0,0.0,15000.0,0.0,0.0,0.0,7.0,9.0,9.0,14.9924,30.4977,41.2066,51.9558,56.0781,88.4979,28.814,14.5557,211.869,0.197757,-0.596866
SerumInsulin,FieldType.DECIMAL,14,799,15000.0,0.0,15000.0,0.0,0.0,0.0,14.0,23.7817,23.0,38.3792,83.9646,194.814,407.153,649.557,758.191,137.852,133.068,17707.2,1.76732,3.67903
BMI,FieldType.DECIMAL,18.2005,56.0346,15000.0,0.0,15000.0,0.0,0.0,0.0,18.2147,19.6954,19.6564,21.2604,31.7857,39.2775,46.9957,51.5814,55.4843,31.5096,9.759,95.2381,0.193253,-1.2063
DiabetesPedigree,FieldType.DECIMAL,0.0780438,2.30159,15000.0,0.0,15000.0,0.0,0.0,0.0,0.0783001,0.10233,0.10222,0.137853,0.200657,0.616932,1.1389,1.73306,2.22232,0.398968,0.377944,0.142841,1.67503,2.94313
Age,FieldType.DECIMAL,21,77,15000.0,0.0,15000.0,0.0,0.0,0.0,21.0,21.0,21.0,22.0,24.0,35.0871,56.7703,68.1521,74.5125,30.1377,12.0897,146.161,1.48294,1.22477
Diabetic,FieldType.DECIMAL,0,1,15000.0,0.0,15000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.333333,0.47142,0.222237,0.707036,-1.5002


You prepare the data for the experiment by adding columns to `dflow_X` to be features for our model creation. You define `dflow_y` to be our prediction value; cost.

In [8]:
dflow_X = dflow_prepared.keep_columns(['PatientID','Pregnancies', 'PlasmaGlucose','DiastolicBloodPressure', 'TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age','Physician'])
dflow_y = dflow_prepared.keep_columns('Diabetic')

### Split data into train and test sets

Now you split the data into training and test sets using the `train_test_split` function in the `sklearn` library. This function segregates the data into the x (features) data set for model training and the y (values to predict) data set for testing. The `test_size` parameter determines the percentage of data to allocate to testing. The `random_state` parameter sets a seed to the random generator, so that your train-test splits are always deterministic.

In [9]:
from sklearn.model_selection import train_test_split


x_df = dflow_X.to_pandas_dataframe()
y_df = dflow_y.to_pandas_dataframe()

x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223)
# flatten y_train to 1d array
y_train.values.flatten()

array([1., 0., 1., ..., 1., 1., 0.])

## Automatically train a model

To automatically train a model:
1. Define settings for the experiment run
1. Submit the experiment for model tuning


### Define settings for autogeneration and tuning

Define the experiment parameters and models settings for autogeneration and tuning. View the full list of [settings](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train).


|Property| Value in this tutorial |Description|
|----|----|---|
|**iteration_timeout_minutes**|10|Time limit in minutes for each iteration|
|**iterations**|30|Number of iterations. In each iteration, the model trains with the data with a specific pipeline|
|**primary_metric**|spearman_correlation | Metric that you want to optimize.|
|**preprocess**| True | True enables experiment to perform preprocessing on the input.|
|**verbosity**| logging.INFO | Controls the level of logging.|
|**n_cross_validationss**|5|Number of cross validation splits


In [10]:
automl_settings = {
    "iteration_timeout_minutes" : 1,
    "iterations" : 20,
    "primary_metric" : 'spearman_correlation',
    "preprocess" : True,
    "verbosity" : logging.INFO,
    "n_cross_validations": 5
}

In [11]:
from azureml.train.automl import AutoMLConfig

# local compute 
automated_ml_config = AutoMLConfig(task = 'regression',
                             debug_log = 'automated_ml_errors.log',
                             path = project_folder,
                             X = x_train.values,
                             y = y_train.values.flatten(),
                             **automl_settings)

ImportError: cannot import name 'Experiment'

### Train the automatic regression model

Start the experiment to run locally. Pass the defined `automated_ml_config` object to the experiment, and set the output to `true` to view progress during the experiment.

In [18]:
from azureml.core.experiment import Experiment
experiment=Experiment(ws, experiment_name)
local_run = experiment.submit(automated_ml_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_f87ee04d-fecc-478e-80ee-37061fc04317
********************************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
SAMPLING %: Percent of the training data to sample.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
********************************************************************************************************************

 ITERATION   PIPELINE                                       SAMPLING %  DURATION      METRIC      BEST
         0   MaxAbsScaler RandomForest                      100.0000    0:00:32       0.7536    0.7536
         1   StandardScalerWrapper DecisionTree             100.0000    0:00:21       0.7639    0.7639
         2   StandardScalerWrapper LightGBM                 100

###  Add a Jupyter widget to see results

Use the Jupyter notebook widget to see a graph and a table of all results.

In [19]:
from azureml.widgets import RunDetails
RunDetails(local_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Retrieve the best model

Select the best pipeline from our iterations. The `get_output` method on `automl_classifier` returns the best run and the fitted model for the last fit invocation. There are overloads on `get_output` that allow you to retrieve the best run and fitted model for any logged metric or a particular iteration.

In [20]:
best_run, fitted_model = local_run.get_output()
print(best_run)
print(fitted_model)

Run(Experiment: automated-ml-regression,
Id: AutoML_f87ee04d-fecc-478e-80ee-37061fc04317_12,
Type: None,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(logger=None, task=None)), ('maxabsscaler', MaxAbsScaler(copy=True)), ('decisiontreeregressor', DecisionTreeRegressor(criterion='friedman_mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impur...       min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'))])


## Register the model

Register the model in your Azure Machine Learning Workspace.

In [21]:
description = 'Automated Machine Learning Model - Demo 21th March 19'
tags = None
local_run.register_model(description=description, tags=tags)
print(local_run.model_id) # Use this id to deploy the model as a web service in Azure

Registering model AutoMLf87ee04dfbest
AutoMLf87ee04dfbest


### Download the model

In [25]:
from azureml.core.model import Model
import os

model = Model(workspace=ws, name="AutoML19f877d79best")
model.download(target_dir="/home/nbuser/library/tutorials/output", exist_ok='true')

'/home/nbuser/library/tutorials/output/model.pkl'

### Deploy the model

Use the python SDK to operationalise the model in a docker container

In [None]:
from azureml.train.automl import automlexplainer

from azureml.train.automl.automlexplainer import explain_model
import numpy as np
from sklearn.externals import joblib


#model


train_row = pd.DataFrame(x_df,columns=['PatientID','Pregnancies', 'PlasmaGlucose','DiastolicBloodPressure', 'TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age','Physician'])
test_row = pd.DataFrame(y_df,columns =['Diabetes'])

#print(train_row)
shap_values, expected_values, overall_summary, overall_imp, per_class_summary, per_class_imp = \
    explain_model(fitted_model, train_row, test_row)

print(overall_summary)
print(overall_imp)