In [1]:
from azureml.core import Workspace, Experiment

# Connect to the Azure ML workspace; `ws` is reused by the compute, HyperDrive
# and AutoML cells below. from_config() is expected to pick up the workspace
# details from a local configuration file (see the azureml-core docs).
ws = Workspace.from_config()

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
# Provision (or reuse) the CPU compute cluster used by the experiments below.
# Project constraints: vm_size must be "Standard_D2_V2" and max_nodes must be
# no greater than 4.
cpu_cluster_name = 'cpu-cluster'
try:
    # Reuse the cluster when one with this name already exists in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # No such cluster yet: build a provisioning config and create it.
    # The create() call must receive the name and config defined in THIS cell
    # (the previous version referenced undefined names and raised NameError).
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_D2_V2', max_nodes=4)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# Block until the cluster reaches a terminal provisioning state, streaming progress.
compute_target.wait_for_completion(show_output=True)


Running


In [7]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
from azureml.train.hyperdrive import MedianStoppingPolicy
import os

# Random search space: every child run draws one value for "--C" and one for
# "--max_iter" from the fixed choice lists below.
ps = RandomParameterSampling(
    parameter_space={
        "--C": choice(0.01, 0.1, 0.25, 0.5, 1, 2, 5, 10),
        "--max_iter": choice(8, 16, 32, 64, 128),
    }
)

# Early-termination policy, evaluated at every interval with a 0.1 slack factor.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1)

# Make sure a "training" folder exists in the current working directory.
if "training" not in os.listdir("."):
    os.mkdir("./training")

# SKLearn estimator that runs train.py from the notebook directory on the cluster.
est = SKLearn(
    source_directory="./",
    entry_script="train.py",
    compute_target=compute_target,
)

# HyperDrive configuration tying together the estimator, sampler and policy:
# maximize "Accuracy" over at most 8 runs, 4 of them at a time.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=8,
    max_concurrent_runs=4,
)



In [8]:
# Launch the HyperDrive sweep under the "hyperdrive" experiment.
exp = Experiment(ws, "hyperdrive")
hyperdrive_submit = exp.submit(hyperdrive_config)

# Show the run-details widget, then block until the sweep has finished while
# streaming its output into this cell.
RunDetails(hyperdrive_submit).show()
hyperdrive_submit.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_e147dc9b-369a-4da5-9ce2-410d08056580
Web View: https://ml.azure.com/runs/HD_e147dc9b-369a-4da5-9ce2-410d08056580?wsid=/subscriptions/aa7cf8e8-d23f-4bce-a7b9-1f0b4e0ac8ee/resourcegroups/aml-quickstarts-144961/workspaces/quick-starts-ws-144961&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-05-16T14:42:36.503827][API][INFO]Experiment created<END>\n""<START>[2021-05-16T14:42:37.262725][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-05-16T14:42:37.426478][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_e147dc9b-369a-4da5-9ce2-410d08056580
Web View: https://ml.azure.com/runs/HD_e147dc9b-369a-4da5-9ce2-410d08056580?wsid=/subscriptions/aa7cf8e8-d23f-4bce-a7b9-1f0b4e0ac8ee/resourcegroups/aml-quickstarts-144961/workspaces/quick-starts-ws-144961&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254



{'runId': 'HD_e147dc9b-369a-4da5-9ce2-410d08056580',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-05-16T14:42:36.185701Z',
 'endTimeUtc': '2021-05-16T14:45:39.619645Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'e3104c83-89bb-4c10-8daa-396010adf372',
  'score': '0.9121396054628225',
  'best_child_run_id': 'HD_e147dc9b-369a-4da5-9ce2-410d08056580_2',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg144961.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_e147dc9b-369a-4da5-9ce2-410d08056580/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=s2M07rv2BxY2%2Fv%2B3mBN4UjXhtea6F6MQcl%2B6URBAjok%3D&st=2021-05-16T14%3A35%3A50Z&se=2021-05-16T22%3A45%3A50Z&sp=r'},
 'submittedBy': 'ODL_User 144961'

In [10]:
# Pick the child run with the best primary metric and fetch its logged metrics.
best_run = hyperdrive_submit.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

# Report which run won and the accuracy it logged.
print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['Accuracy'])

# Register the model file written by that run under the name "hyperdrive_best".
model = best_run.register_model(
    model_path='outputs/model_logreg.joblib',
    model_name='hyperdrive_best',
)

Best Run Id:  HD_e147dc9b-369a-4da5-9ce2-410d08056580_2

 Accuracy: 0.9121396054628225


In [17]:
from azureml.data.dataset_factory import TabularDatasetFactory
import pandas as pd
# Build a TabularDataset with TabularDatasetFactory from the bank-marketing
# training CSV hosted at the URL below.
link = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(link)


In [20]:
def clean_data(data):
    """Turn the raw dataset into a numeric feature frame and a 0/1 label series.

    Args:
        data: object exposing ``to_pandas_dataframe()``; the resulting frame
            must contain the columns job, marital, default, housing, loan,
            contact, education, month, day_of_week, poutcome and y.

    Returns:
        ``(x_df, y_df)`` where ``x_df`` is the cleaned feature DataFrame
        (without the ``y`` column) and ``y_df`` is the label Series, 1 where
        ``y == "yes"`` and 0 otherwise.

    Processing steps:
        * rows containing any missing value are dropped;
        * job, contact and education are one-hot encoded, the indicator
          columns being appended (in that order) with the column name as prefix;
        * marital, default, housing, loan and poutcome become 0/1 flags;
        * month and day_of_week are mapped to their ordinal numbers.
    """
    month_numbers = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
                     "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}
    weekday_numbers = {"mon": 1, "tue": 2, "wed": 3,
                       "thu": 4, "fri": 5, "sat": 6, "sun": 7}

    def as_flag(column, positive):
        # 1 where the value equals `positive`, 0 for anything else.
        return column.apply(lambda value: 1 if value == positive else 0)

    features = data.to_pandas_dataframe().dropna()

    # One-hot encode the multi-valued categoricals; each source column is
    # replaced by its indicator columns, appended at the end of the frame.
    for name in ("job", "contact", "education"):
        indicators = pd.get_dummies(features[name], prefix=name)
        features = features.drop(columns=name).join(indicators)

    # Binary columns: a single "positive" value maps to 1, everything else to 0.
    binary_columns = (
        ("marital", "married"),
        ("default", "yes"),
        ("housing", "yes"),
        ("loan", "yes"),
        ("poutcome", "success"),
    )
    for name, positive in binary_columns:
        features[name] = as_flag(features[name], positive)

    # Calendar abbreviations become ordinal numbers (unknown values map to NaN).
    features["month"] = features["month"].map(month_numbers)
    features["day_of_week"] = features["day_of_week"].map(weekday_numbers)

    # Split the target off the feature frame and encode it as 0/1.
    labels = as_flag(features.pop("y"), "yes")

    return features, labels


In [24]:
from sklearn.model_selection import train_test_split

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

# Hold out 25% of the rows for testing. The seed is fixed so that the split
# (and everything derived from it) is identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

# Training features with the label column re-attached as the last column.
training_data = pd.concat([x_train, y_train], axis=1)
# Name of the label column, taken from the label Series; used later as
# label_column_name in the AutoML configuration.
y_name = y_train.name

In [30]:
# `logging` is needed for the "verbosity" setting below; it is imported here
# because no earlier cell imports it (a fresh kernel would raise NameError).
import logging

from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.


# Settings shared by the AutoML run; unpacked into AutoMLConfig as keyword arguments.
automl_settings = {
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    "primary_metric": 'accuracy',
    "featurization": 'auto',
    "verbosity": logging.INFO,
    "n_cross_validations":5
}

# NOTE(review): the raw dataset `ds` is passed as training_data, not the
# cleaned `training_data` frame built earlier; the label column is looked up
# by `y_name`. Presumably intentional because the run targets remote compute
# and featurization is set to 'auto' -- confirm.
automl_config = AutoMLConfig(experiment_timeout_minutes=60,
                             task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=compute_target,
                             experiment_exit_score = 0.99,
                             blocked_models = ['KNN','LinearSVM'],
                             enable_onnx_compatible_models=True,
                             training_data = ds,
                             label_column_name = y_name,
                             **automl_settings
                            )




In [31]:
# Submit the AutoML configuration under the "automl" experiment and stream
# its progress into the cell output.
experiment = Experiment(workspace=ws, name="automl")
run = experiment.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
automl,AutoML_4dd772c2-37f2-44b9-9476-182859091ba9,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+-------------------------

In [36]:
# Retrieve and save your best automl model.

# get_output() yields the best child run together with its fitted pipeline.
best_run, fitted_model = run.get_output()
print(fitted_model.steps)

# Register the model under the name recorded in the best run's properties.
model_name = best_run.properties['model_name']
# The experiment is a classification task on the bank-marketing data, so the
# registered description says so (it previously claimed to be a forecast example).
description = 'AutoML classification model (bank marketing dataset)'
tags = None

model = run.register_model(model_name=model_name,
                           description=description,
                           tags=tags)

[('datatransformer', DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                feature_sweeping_config=None, feature_sweeping_timeout=None,
                featurization_config=None, force_text_dnn=None,
                is_cross_validation=None, is_onnx_compatible=None, logger=None,
                observer=None, task=None, working_dir=None)), ('prefittedsoftvotingclassifier', PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('32',
                                           Pipeline(memory=None,
                                                    steps=[('sparsenormalizer',
                                                            <azureml.automl.runtime.shared.model_wrappers.SparseNormalizer object at 0x7f0cbcaeca90>),
                                                           ('xgboostclassifier',
                                                            XGBoostClassifier(base_score=0.5,
                     

In [None]:
# Tear down the compute cluster at the end of the notebook.
# Best-effort: a failure (cluster never created in this kernel session,
# already deleted, service error, ...) is reported instead of aborting, but
# the real error is shown and KeyboardInterrupt/SystemExit are no longer
# swallowed as they were by the bare `except:`.
try:
    compute_target.delete()
except Exception as err:
    print(f"Compute target could not be deleted (it may not exist): {err!r}")