In [9]:
from azureml.core import Workspace, Experiment

# Load the workspace from the local config file and open the experiment
# that every run in this notebook is submitted to.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

# Collect the workspace details first, then print one entry per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive run on the experiment.
# NOTE(review): no visible cell uses or completes `run` - confirm whether it is still needed.
run = exp.start_logging()

Workspace name: dp100test
Azure region: westeurope
Subscription id: 3dafc970-6015-4fba-b00d-39dc6a1b7521
Resource group: dp100


In [7]:
# Quick local test run of train.py with fixed hyperparameters (C=0.6, max_iter=11).
# Temporary cell: to be removed once the script is confirmed to work.
!python train.py --C=0.6 --max_iter=11

Attempted to log scalar metric Regularization Strength::
0.6
Attempted to log scalar metric Max iterations::
11
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Attempted to log scalar metric Accuracy:
0.9064741585433144


In [10]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###

# Imported here because it is only needed for the look-up below.
from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = "natcluster001"

# Provisioning settings: Standard_D2_V2 VMs, scale down to 0 nodes when idle,
# at most 2 nodes (within the limit of 4 stated in the instructions above).
compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2", min_nodes=0, max_nodes=2)

try:
    # Reuse the cluster if it already exists in the workspace, so that
    # re-running this cell does not try to provision it a second time.
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print("Found existing compute target '" + cpu_cluster_name + "', reusing it.")
except ComputeTargetException:
    # No cluster with that name yet: provision a new one.
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# Block until the cluster reaches a terminal provisioning state.
cpu_cluster.wait_for_completion(show_output=True)

Creating.........
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [4]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Hyperparameter search space, sampled at random. The keys are the
# command-line flags that train.py is invoked with.
ps = RandomParameterSampling({
    "--C": uniform(0.1, 1),
    "--max_iter": choice([50, 75, 100, 125]),
})

# Early-termination policy for the sweep.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Make sure the local "training" folder exists; exist_ok avoids the
# separate check-then-create step and is safe to re-run.
os.makedirs("./training", exist_ok=True)

# SKLearn estimator that runs train.py on the compute cluster.
# NOTE: the SDK reports this estimator as deprecated in favour of ScriptRunConfig.
est = SKLearn(
    source_directory='./',
    compute_target=cpu_cluster,
    entry_script='train.py',
)

# HyperDrive configuration: maximize the 'Accuracy' metric over at most
# 12 runs in total, with up to 4 requested concurrently.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=12,
    max_concurrent_runs=4,
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.
'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [8]:
# Launch the HyperDrive sweep on the experiment.
hdr_run = exp.submit(hyperdrive_config)

# Show the run-details widget for the sweep.
hdr_widget = RunDetails(hdr_run)
hdr_widget.show()

# Wait for the sweep to finish while streaming its output; kept as the last
# expression so the returned run details are displayed by the cell.
hdr_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_58220b74-42c7-4733-b7cf-a24b1a0592c7
Web View: https://ml.azure.com/runs/HD_58220b74-42c7-4733-b7cf-a24b1a0592c7?wsid=/subscriptions/3dafc970-6015-4fba-b00d-39dc6a1b7521/resourcegroups/dp100/workspaces/dp100test&tid=dd951004-cd14-4cb2-8718-3c0bcc7fed4e

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-06-08T12:36:14.229318][API][INFO]Experiment created<END>\n""<START>[2021-06-08T12:36:16.162566][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n""<START>[2021-06-08T12:36:15.979200][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"

Execution Summary
RunId: HD_58220b74-42c7-4733-b7cf-a24b1a0592c7
Web View: https://ml.azure.com/runs/HD_58220b74-42c7-4733-b7cf-a24b1a0592c7?wsid=/subscriptions/3dafc970-6015-4fba-b00d-39dc6a1b7521/resourcegroups/dp100/workspaces/dp100test&tid=dd951004-cd14-4cb2-8718-3c0bcc7fed4e



{'runId': 'HD_58220b74-42c7-4733-b7cf-a24b1a0592c7',
 'target': 'natcluster001',
 'status': 'Completed',
 'startTimeUtc': '2021-06-08T12:36:13.963009Z',
 'endTimeUtc': '2021-06-08T12:50:23.027974Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'f10b7548-47da-423b-ab7b-809e667bc567',
  'score': '0.9123597572190546',
  'best_child_run_id': 'HD_58220b74-42c7-4733-b7cf-a24b1a0592c7_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://dp100test4308685000.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_58220b74-42c7-4733-b7cf-a24b1a0592c7/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=NDamNV6qhGaVYEM%2FfVCf%2BKqxP%2FCxV26DJDhlrbtnnBI%3D&st=2021-06-08T12%3A40%3A24Z&se=2021-06-08T20%3A50%3A24Z&sp=r'},
 'submittedBy': 'Natalia

In [11]:
import joblib
# Get the best HyperDrive child run (by the primary metric) and register
# the model file produced by that run.

best_hdr_run = hdr_run.get_best_run_by_primary_metric()

# get_best_run_by_primary_metric() returns None when no child run reported
# the primary metric; fail with a clear message instead of an AttributeError.
if best_hdr_run is None:
    raise RuntimeError(
        "HyperDrive run has no child run with a logged primary metric; "
        "cannot register a model."
    )

# NOTE(review): assumes train.py saves the model as outputs/model_nat001.pkl - confirm.
hdr_model = best_hdr_run.register_model(
    model_name='nat-best-hdr-model',
    model_path='./outputs/model_nat001.pkl',
)

In [3]:
from azureml.data.dataset_factory import TabularDatasetFactory as tdf

# Build a TabularDataset from the bank-marketing training CSV, which is
# hosted at the public URL below.
url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# TabularDatasetFactory (imported as tdf) reads the delimited file at `url`.
ds = tdf.from_delimited_files(url)

In [4]:
from train import clean_data

# Run train.py's clean_data on the dataset and unpack its two results.
cleaned = clean_data(ds)
x, y = cleaned

In [12]:
from azureml.train.automl import AutoMLConfig

# AutoML settings, collected in one dict and expanded into AutoMLConfig.
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# To run the experiment for longer, run this notebook in your own Azure
# tenant, which will incur personal costs.
automl_settings = {
    "compute_target": cpu_cluster_name,
    "experiment_timeout_minutes": 30,
    "task": 'classification',
    "primary_metric": 'accuracy',
    "training_data": ds,
    "label_column_name": 'y',
    "n_cross_validations": 2,
}

automl_config = AutoMLConfig(**automl_settings)

In [13]:
# Launch the AutoML run on the experiment.
nat_automl_run = exp.submit(automl_config)

# Show the run-details widget for the AutoML run.
automl_widget = RunDetails(nat_automl_run)
automl_widget.show()

# Wait for the run to finish while streaming its output; kept as the last
# expression so the returned run details are displayed by the cell.
nat_automl_run.wait_for_completion(show_output=True)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_ae616f4a-2617-4204-85fe-f26cad0b61d5,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_ae616f4a-2617-4204-85fe-f26cad0b61d5,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+-------------------------

{'runId': 'AutoML_ae616f4a-2617-4204-85fe-f26cad0b61d5',
 'target': 'natcluster001',
 'status': 'Completed',
 'startTimeUtc': '2021-06-09T08:19:41.989118Z',
 'endTimeUtc': '2021-06-09T09:07:28.722569Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'natcluster001',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"a053c50f-1781-4417-a2fd-321f2de8d977\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.28.0", "azureml-train": "1.28.0", "azureml-train-restclients-hyperdrive": "1.28.0", "azureml-train-core": "1.28.0", "azureml-train-automl": "1.28.0", "azureml-train-automl-runtime": "1.28.0", "azureml-train-automl-client": "1.28.0", 

Current provisioning state of AmlCompute is "Deleting"



In [14]:
# Fetch the best AutoML child run together with its fitted model,
# then print both, one per line.
automl_output = nat_automl_run.get_output()
best_run, fitted_model = automl_output
print(best_run, fitted_model, sep='\n')

Package:azureml-automl-runtime, training version:1.29.0, current version:1.28.0.post2
Package:azureml-core, training version:1.29.0, current version:1.28.0
Package:azureml-dataset-runtime, training version:1.29.0, current version:1.28.0
Package:azureml-defaults, training version:1.29.0, current version:1.28.0
Package:azureml-interpret, training version:1.29.0, current version:1.28.0
Package:azureml-mlflow, training version:1.29.0, current version:1.28.0
Package:azureml-pipeline-core, training version:1.29.0, current version:1.28.0
Package:azureml-telemetry, training version:1.29.0, current version:1.28.0
Package:azureml-train-automl-client, training version:1.29.0, current version:1.28.0
Package:azureml-train-automl-runtime, training version:1.29.0, current version:1.28.0


Run(Experiment: udacity-project,
Id: AutoML_ae616f4a-2617-4204-85fe-f26cad0b61d5_27,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/mount...
), random_state=None, reg_alpha=0.5789473684210527, reg_lambda=0.42105263157894735, subsample=0.05))], verbose=False)), ('9', Pipeline(memory=None, steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('logisticregression', LogisticRegression(C=2.559547922699533, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='saga', tol=0.0001, verbose=0, warm_start=False)

In [15]:
# Print the metrics recorded on the AutoML run.
automl_metrics = nat_automl_run.get_metrics()
print(automl_metrics)

{'experiment_status': ['DatasetEvaluation', 'FeaturesGeneration', 'DatasetFeaturization', 'DatasetFeaturizationCompleted', 'DatasetBalancing', 'DatasetCrossValidationSplit', 'ModelSelection'], 'experiment_status_description': ['Gathering dataset statistics.', 'Generating features for the dataset.', 'Beginning to fit featurizers and featurize the dataset.', 'Completed fit featurizers and featurizing the dataset.', 'Performing class balancing sweeping', 'Generating individually featurized CV splits.', 'Beginning model selection.'], 'f1_score_weighted': 0.9112784222114043, 'norm_macro_recall': 0.4853295312954593, 'log_loss': 0.2148951904216666, 'AUC_macro': 0.9465844067585989, 'average_precision_score_macro': 0.8240619238481101, 'recall_score_micro': 0.9156297420333839, 'weighted_accuracy': 0.9585946991606416, 'recall_score_macro': 0.7426647656477297, 'f1_score_micro': 0.9156297420333839, 'recall_score_weighted': 0.9156297420333839, 'average_precision_score_weighted': 0.9550762255059411, 

In [17]:
# Register the AutoML run's model under a fixed name.
model = nat_automl_run.register_model(model_name='nat-best-automl-model')

# Print name, id and version of the registered model, separated by tabs.
model_fields = (model.name, model.id, model.version)
print('\t'.join(str(field) for field in model_fields))

nat-best-automl-model	nat-best-automl-model:1	1
