## Hyperdrive

#### Get workspace and enviroment

In [2]:
from azureml.core import Workspace, Experiment, Dataset, Environment

ws = Workspace.from_config()
myenv = Environment.get(workspace=ws, name="AzureML-Minimal")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: Michaels_test1
Azure region: northeurope
Subscription id: 336509a3-005c-4650-b1e5-e1c99e57a5e1
Resource group: MS_Test


#### Create compute target

In [4]:
from azureml.core.compute import ComputeTarget

compute_target = "Compute-target3"
try:
    compute = ComputeTarget(ws, compute_target)
    print("found existing compute target.")
except ComputeTargetException:
    print("creating new compute target")
    
    provisioning_config = ComputeTarget.provisioning_configuration(vm_size = "STANDARD_D2_V2",
                                                                min_nodes = 0, 
                                                                max_nodes = 4)    
    compute = ComputeTarget.create(ws, compute_target, provisioning_config)
    
compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)   
print("Azure Machine Learning Compute attached")

found existing compute target.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
Azure Machine Learning Compute attached


#### Create scriptConfig for train.py script

In [5]:
from azureml.core import ScriptRunConfig

src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                      compute_target=compute_target,
                      environment=myenv)

#### import libraries, specify hyperparameters for Hyperdrive, create estimator and earlystopping policy

In [6]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Specify parameter sampler
ps = RandomParameterSampling({
    "--solver": choice('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    "--C": uniform(0.25,2),
    "--max_iter": choice(range(50,150))
})

# Specify a Policy
policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

#if "training" not in os.listdir():
#    os.mkdir("./training")
if "models" not in os.listdir():
    os.mkdir("./models")

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory='.', entry_script='train.py', compute_target=compute)
# 

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
                                         hyperparameter_sampling=ps, 
                                         primary_metric_name='Accuracy',
                                         primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                         max_total_runs=30,
                                         max_concurrent_runs=4,
                                        policy=policy,
                                        estimator=est
                                        )

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


#### Create experiment and and submit run

In [7]:
experiment_name = 'Uda3'
experiment = Experiment(workspace=ws, name=experiment_name)
hyperdrive_run = experiment.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_0a1fc67d-37bc-4ce1-b504-1ebccd7ada7d
Web View: https://ml.azure.com/experiments/Uda3/runs/HD_0a1fc67d-37bc-4ce1-b504-1ebccd7ada7d?wsid=/subscriptions/336509a3-005c-4650-b1e5-e1c99e57a5e1/resourcegroups/MS_Test/workspaces/Michaels_test1

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-12-28T10:14:43.997005][API][INFO]Experiment created<END>\n""<START>[2020-12-28T10:14:44.587745][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"<START>[2020-12-28T10:14:45.1152793Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2020-12-28T10:14:44.964122][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_0a1fc67d-37bc-4ce1-b504-1ebccd7ada7d
Web View: https://ml.azure.com/experiments/Uda3/runs/HD_0a1fc67d-37bc-4ce1-b504-1ebccd7ada7d?wsid=/subscriptions/336509a3-005c-4650-b1e5-e1c99e57a5e1/resourcegroup

{'runId': 'HD_0a1fc67d-37bc-4ce1-b504-1ebccd7ada7d',
 'target': 'Compute-target3',
 'status': 'Completed',
 'startTimeUtc': '2020-12-28T10:14:43.738111Z',
 'endTimeUtc': '2020-12-28T10:51:21.148488Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'dbb472c9-fb88-4869-b82d-1149fab2cb2e',
  'score': '0.9150227617602428',
  'best_child_run_id': 'HD_0a1fc67d-37bc-4ce1-b504-1ebccd7ada7d_10',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://michaelstest16096096377.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_0a1fc67d-37bc-4ce1-b504-1ebccd7ada7d/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=gz2vDBa1uWcNMZYyDffUEDN%2B9MLtHI74i1qS5LuiHIs%3D&st=2020-12-28T10%3A41%3A37Z&se=2020-12-28T18%3A51%3A37Z&sp=r'}}

#### Get the best model from all the Hyperdrive runs

In [8]:
best_hyper_run = hyperdrive_run.get_best_run_by_primary_metric()
best_hyper_run_metrics = best_hyper_run.get_metrics()

print('Best Run Id: ', best_hyper_run.id)
print('\n Accuracy:', best_hyper_run_metrics['Accuracy'])

best_hyper_run

Best Run Id:  HD_0a1fc67d-37bc-4ce1-b504-1ebccd7ada7d_10

 Accuracy: 0.9150227617602428


Experiment,Id,Type,Status,Details Page,Docs Page
Uda3,HD_0a1fc67d-37bc-4ce1-b504-1ebccd7ada7d_10,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [10]:
best_hyper_run.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_62c063c2e468dfb90549b445b7d6ab2dd217cb0e536df53ab9bcdddd8afecdec_d.txt',
 'azureml-logs/65_job_prep-tvmps_62c063c2e468dfb90549b445b7d6ab2dd217cb0e536df53ab9bcdddd8afecdec_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_62c063c2e468dfb90549b445b7d6ab2dd217cb0e536df53ab9bcdddd8afecdec_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/102_azureml.log',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log',
 'outputs/model.joblib']

In [11]:
#Save model and register it
model = best_hyper_run.register_model(model_name='Bank_marketing_Hyperdrive', model_path='outputs/model.joblib')
print(model.name, model.id, model.version, sep='\t')

Bank_marketing_Hyperdrive	Bank_marketing_Hyperdrive:2	2


## Automated Machine Learning (AML)

#### Retreive data

In [12]:
from azureml.data.dataset_factory import TabularDatasetFactory

filepath = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=filepath)

#### Import clean script, preprocess data and upload/load to/from Datastore to get the right format for AML

In [13]:
from train import clean_data
import pandas as pd

train, test = clean_data(ds)
data = pd.concat([train, test])

if "data" not in os.listdir():
    os.mkdir("./data")
local_path = 'data/prepared.csv'
data.to_csv(local_path)
datastore = ws.get_default_datastore()
datastore.upload(src_dir='data', target_path='data')
dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, ('data/prepared.csv'))])

Uploading an estimated of 1 files
Target already exists. Skipping upload for data/prepared.csv
Uploaded 0 files


#### Make automl config

In [14]:
from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(
    experiment_timeout_minutes=90,
    compute_target = compute,
    task='classification',
    primary_metric='accuracy',
    training_data=dataset,
    label_column_name='y',
    n_cross_validations=5)

exp = Experiment(workspace=ws, name="uda3")

#### Submit run to AML

In [15]:
automl_run = exp.submit(automl_config)
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

Running on remote.


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3692                             |1    

{'runId': 'AutoML_28c64c20-d67c-4845-99f5-456f28feb857',
 'target': 'Compute-target3',
 'status': 'Completed',
 'startTimeUtc': '2020-12-28T11:02:18.931977Z',
 'endTimeUtc': '2020-12-28T12:46:55.661474Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'Compute-target3',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"af5d2813-855c-4165-b6e5-3123ea524914\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/prepared.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"MS_Test\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"336509a3-005c-4650-b1e5-e1c99e57a5e1\\\\\\", \\\\\\"workspaceName

#### Get best run from AML

In [16]:
best_automl_run, fitted_model = automl_run.get_output()
best_automl_run

Experiment,Id,Type,Status,Details Page,Docs Page
uda3,AutoML_28c64c20-d67c-4845-99f5-456f28feb857_53,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [17]:
best_automl_run.get_tags()

{'_aml_system_azureml.automlComponent': 'AutoML',
 '_aml_system_ComputeTargetStatus': '{"AllocationState":"steady","PreparingNodeCount":0,"RunningNodeCount":1,"CurrentNodeCount":1}',
 'ensembled_iterations': '[0, 32, 1, 46, 47, 38]',
 'ensembled_algorithms': "['LightGBM', 'XGBoostClassifier', 'XGBoostClassifier', 'XGBoostClassifier', 'XGBoostClassifier', 'XGBoostClassifier']",
 'ensemble_weights': '[0.26666666666666666, 0.26666666666666666, 0.13333333333333333, 0.13333333333333333, 0.06666666666666667, 0.13333333333333333]',
 'best_individual_pipeline_score': '0.9166616084977239',
 'best_individual_iteration': '0',
 '_aml_system_automl_is_child_run_end_telemetry_event_logged': 'True',
 'model_explain_run_id': 'AutoML_28c64c20-d67c-4845-99f5-456f28feb857_ModelExplain',
 'model_explanation': 'True'}

In [18]:
best_automl_run.register_model(model_name = "automl.pkl", model_path = './outputs/')
print(fitted_model._final_estimator)

StackEnsembleClassifier(base_learners=[('0',
                                        Pipeline(memory=None,
                                                 steps=[('maxabsscaler',
                                                         MaxAbsScaler(copy=True)),
                                                        ('lightgbmclassifier',
                                                         LightGBMClassifier(boosting_type='gbdt',
                                                                            class_weight=None,
                                                                            colsample_bytree=1.0,
                                                                            importance_type='split',
                                                                            learning_rate=0.1,
                                                                            max_depth=-1,
                                                                            min_c

#### Clean up compute by deleting compute target

In [19]:
compute.delete()