In [1]:
from azureml.core import Workspace, Experiment

# Obtain the workspace handle and the experiment that every run in this
# notebook is submitted under.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="capstone_project")

# Print one workspace attribute per line so the reader can confirm the target.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive logging run on the experiment.
# NOTE(review): no run.complete() is visible in this part of the notebook -
# confirm the run is closed somewhere later.
run = exp.start_logging()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code AQFLXGQN4 to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-122641
Azure region: southcentralus
Subscription id: 0c5a644d-c5ce-4e3b-bf42-4cb265317817
Resource group: aml-quickstarts-122641


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Look the cluster up by name first; only provision a new one when the lookup
# raises ComputeTargetException.
amlcompute_cluster_name = "mycluster"
try:
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, using it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(
        workspace=ws,
        name=amlcompute_cluster_name,
        provisioning_configuration=compute_config,
    )

# Runs for both the reused and the freshly created cluster.
aml_compute.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Dataset

# The file is read with header=False, so the column names are supplied here.
dataset_link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'iris_class']

# Read the delimited file as a tabular dataset, then convert it to pandas.
iris_tabular = Dataset.Tabular.from_delimited_files(path=dataset_link, header=False)
dataset = iris_tabular.to_pandas_dataframe()
dataset.columns = iris_columns

print(dataset.head())


   sepal_length  sepal_width  petal_length  petal_width   iris_class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [4]:
# Number of rows per label value; displayed as the cell output.
label_counts = dataset['iris_class'].value_counts()
label_counts

Iris-versicolor    50
Iris-virginica     50
Iris-setosa        50
Name: iris_class, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Split the full frame (features and label column together) 80/20 with a
# fixed random_state so the split is repeatable.
x_train, x_test = train_test_split(
    dataset,
    test_size=0.2,
    random_state=223,
)

In [6]:
from azureml.train.automl import AutoMLConfig


# AutoML settings collected in one place; the reasoning behind each value is
# given in the markdown that follows this cell.
automl_settings = {
    'experiment_timeout_minutes': 15,
    'task': 'classification',
    'primary_metric': 'accuracy',
    'training_data': x_train,
    'label_column_name': 'iris_class',
    'n_cross_validations': 5,
}

automl_config = AutoMLConfig(**automl_settings)

This configuration is chosen for the following reasons:
1. experiment_timeout_minutes is 15 because the data is clear enough for the algorithms to understand, so AutoML does not need to run many models.
2. task is classification as it is a classification problem.
3. primary_metric is accuracy because the classes are balanced (there is no class imbalance problem), so accuracy gives a reliable picture of model quality here.
4. training_data is x_train, which is 80% of the complete dataset; it is a general recommendation to hold out the remaining 20% as a test set to evaluate the model on.
5. label_column_name is the iris class as the iris_class is to be predicted.
6. n_cross_validations is 5, for a low bias and moderate variance.

In [7]:
from azureml.widgets import RunDetails

# Submit the AutoML configuration and stream its progress.
automl_run = exp.submit(automl_config, show_output=True)

# Show the run-details widget for the submitted run.
automl_widget = RunDetails(automl_run)
automl_widget.show()

# Last expression of the cell: its return value is displayed as the cell output.
automl_run.wait_for_completion(show_output=True)

Running on local machine
Parent Run ID: AutoML_5c6b95b6-6baf-4bcb-a78c-b4c0b345b4bf

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing f

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more abo

{'runId': 'AutoML_5c6b95b6-6baf-4bcb-a78c-b4c0b345b4bf',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-10-24T14:48:48.226137Z',
 'endTimeUtc': '2020-10-24T15:05:16.532194Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.16.0", "azureml-train": "1.16.0", "azureml-train-restclients-hyperdrive": "1.16.0", "azureml-train-core": "1.16.0", "azureml-train-automl": "1.16.0", "azureml-train-automl-runtime": "1.16.0", "azureml-train-automl-client": "1.16.0", "azureml-tensorboard": "1.16.0", "azureml-telemetry": "1.16.0", "azureml-sdk": "1.16.0", "azureml-samples": "0+unknow

In the above AutoML run, the `VotingEnsemble` model performed best, with an accuracy of `0.9750`. The `StandardScalerWrapper RandomForest` model came in just behind it, with an accuracy of `0.9667`.


In [8]:
# Fetch the best child run together with its fitted model.
best_automl_run, best_model = automl_run.get_output()

# Register the contents of the run's ./outputs/ folder as a model.
best_automl_run.register_model(
    model_name="best_run_automl.pkl",
    model_path='./outputs/',
)

# _final_estimator is a private attribute of the fitted model object; it is
# printed here to show the last step of the fitted pipeline.
print(best_model._final_estimator)
print(best_automl_run)

PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('36',
                                           Pipeline(memory=None,
                                                    steps=[('standardscalerwrapper',
                                                            <azureml.automl.runtime.shared.model_wrappers.StandardScalerWrapper object at 0x7f2a31355d68>),
                                                           ('randomforestclassifier',
                                                            RandomForestClassifier(bootstrap=True,
                                                                                   ccp_alpha=0.0,
                                                                                   class_weight=None,
                                                                                   criterion='gini',
                                                                                   max_depth=None,
  

In [9]:
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Search space for random sampling. The keys are written as CLI-style flags,
# presumably consumed by train.py - verify against that script.
hyperparameter_space = {
    "--C": uniform(0.1, 0.9),
    "--max_iter": choice(10, 50, 100),
}
ps = RandomParameterSampling(parameter_space=hyperparameter_space)

# Early-termination policy used by the sweep.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=3)

# Estimator that runs train.py from the current directory on the cluster.
est = SKLearn(
    source_directory="./",
    compute_target=aml_compute,
    entry_script="train.py",
)

# Combine estimator, sampler and policy into the sweep definition; the sweep
# maximizes the metric named "Accuracy".
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=48,
    max_concurrent_runs=8,
)

HyperDrive optimizes the following two hyperparameters:
1. C : This is the regularization strength. Regularization is the process in which non-significant features are penalised so that their effect is reduced according to the regularization strength, which is optimised here. It is therefore a very important hyperparameter to consider, as it decides how strongly the effect of non-significant features is suppressed.
2. max_iter : This is the maximum number of iterations the training algorithm is allowed to run in each trial (it is passed to `train.py` as `--max_iter`; it is not the number of HyperDrive runs). It is a very important hyperparameter because it is uncertain at what point the model's performance converges. If the model converges very early, for example by the 10th iteration, there is no need to keep training beyond that point. Therefore this hyperparameter is tuned.

In [10]:
# Submit the HyperDrive sweep to the experiment; the returned run handle is
# used by the following cells.
hyperdrive_run = exp.submit(config=hyperdrive_config)




In [11]:
from azureml.widgets import RunDetails

# Show the run-details widget for the sweep.
hyperdrive_widget = RunDetails(hyperdrive_run)
hyperdrive_widget.show()

# Stream the sweep's output until it finishes.
hyperdrive_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_b6e56e5c-3f9c-4b41-be0b-37e5a6d0bcb3
Web View: https://ml.azure.com/experiments/capstone_project/runs/HD_b6e56e5c-3f9c-4b41-be0b-37e5a6d0bcb3?wsid=/subscriptions/0c5a644d-c5ce-4e3b-bf42-4cb265317817/resourcegroups/aml-quickstarts-122641/workspaces/quick-starts-ws-122641

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-10-24T15:29:48.492675][API][INFO]Experiment created<END>\n""<START>[2020-10-24T15:29:49.622808][GENERATOR][INFO]Successfully sampled '8' jobs, they will soon be submitted to the execution target.<END>\n""<START>[2020-10-24T15:29:49.300645][GENERATOR][INFO]Trying to sample '8' jobs from the hyperparameter space<END>\n"<START>[2020-10-24T15:29:50.1391569Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>


The best run achieved an Accuracy of 1.0 (100%) with the parameter C at `0.486374154783502` and max_iter at `50`.

In [12]:
# Child run that scored best on the sweep's primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run)

# Metrics of the sweep's child runs, keyed by run id.
all_run_metrics = hyperdrive_run.get_metrics()
print(all_run_metrics)

Run(Experiment: capstone_project,
Id: HD_b6e56e5c-3f9c-4b41-be0b-37e5a6d0bcb3_2,
Type: azureml.scriptrun,
Status: Completed)
{'HD_b6e56e5c-3f9c-4b41-be0b-37e5a6d0bcb3_0': {'Regularization Strength:': 0.36618990846536037, 'Max iterations:': 50, 'Accuracy': 0.9666666666666667}, 'HD_b6e56e5c-3f9c-4b41-be0b-37e5a6d0bcb3_1': {'Regularization Strength:': 0.19418911362047614, 'Max iterations:': 10, 'Accuracy': 0.9}, 'HD_b6e56e5c-3f9c-4b41-be0b-37e5a6d0bcb3_10': {'Regularization Strength:': 0.3343297558772621, 'Max iterations:': 50, 'Accuracy': 0.9666666666666667}, 'HD_b6e56e5c-3f9c-4b41-be0b-37e5a6d0bcb3_11': {'Regularization Strength:': 0.36184700895259303, 'Max iterations:': 10, 'Accuracy': 0.9666666666666667}, 'HD_b6e56e5c-3f9c-4b41-be0b-37e5a6d0bcb3_12': {'Regularization Strength:': 0.7264053235865947, 'Max iterations:': 10, 'Accuracy': 1.0}, 'HD_b6e56e5c-3f9c-4b41-be0b-37e5a6d0bcb3_13': {'Regularization Strength:': 0.8561559017336751, 'Max iterations:': 10, 'Accuracy': 1.0}, 'HD_b6e56e5c

In [13]:
from azureml.core import Model
from azureml.core.webservice import AciWebservice, Webservice


# Register the best HyperDrive run's pickled model, tagged with its framework.
# NOTE(review): model_framework_version is hard-coded - confirm that it matches
# the scikit-learn version train.py actually ran with.
model = best_run.register_model(
    model_name='hyperdrive_model',
    model_path='./outputs/model.pkl',
    model_framework=Model.Framework.SCIKITLEARN,
    model_framework_version='0.19.1',
)

# ACI deployment settings: 1 CPU core, 2 GB memory, Application Insights and
# authentication enabled.
aci_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=2,
    enable_app_insights=True,
    auth_enabled=True,
)

service_name = 'my-sklearn-service'

# overwrite=True makes this cell re-runnable: without it a second execution
# (e.g. Restart & Run All) fails because a service with this name already exists.
service = Model.deploy(
    ws,
    service_name,
    [model],
    deployment_config=aci_config,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)

# Print the service logs so start-up problems are visible in the notebook.
print(service.get_logs())


Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.........................................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
2020-10-24T16:13:08,970914984+00:00 - rsyslog/run 
2020-10-24T16:13:08,969694977+00:00 - gunicorn/run 
2020-10-24T16:13:08,972225991+00:00 - iot-server/run 
2020-10-24T16:13:08,998815036+00:00 - nginx/run 
EdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...
2020-10-24T16:13:09,299709478+00:00 - iot-server/finish 1 0
2020-10-24T16:13:09,305183408+00:00 - Exit code 1 is normal. Not restarting iot-server.
Starting gunicorn 19.9.0
Listening at: http://127.0.0.1:31311 (14)
Using worker: sync
worker timeout is set to 300
Booting worker with pid: 40
SPARK_HOME not set. Skipping PySpark Initialization.
Initializing logger
2020-10-24 16:13:10,5