In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="quick-starts-ws-133322")
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-133322
Azure region: southcentralus
Subscription id: 5a4ab2ba-6c51-4805-8155-58759ad589d8
Resource group: aml-quickstarts-133322


In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
cpu_cluster_name = "cpu-cluster"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
cpu_cluster.wait_for_completion(show_output=True)

Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [14]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
from azureml.core import ScriptRunConfig
import os

# Specify parameter sampler
# ps = ### YOUR CODE HERE ###
ps = RandomParameterSampling(
    {
        '--C': uniform(0.01, 1.00),
        '--max_iter': choice(100, 200, 300, 400, 500)
    }
)

# Specify a Policy
# policy = ### YOUR CODE HERE ###
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
# est = SKLearn(source_directory='.', entry_script='train.py', compute_target=cpu_cluster)
# est = ScriptRunConfig(source_directory='.',
#                      script='train.py',
#                      compute_target=cpu_cluster
#                     )

est = SKLearn(source_directory=os.path.join('./'), entry_script='train.py', compute_target=cpu_cluster)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.

hyperdrive_config = HyperDriveConfig(estimator=est,
                                    hyperparameter_sampling=ps,
                                    policy=policy,
                                    primary_metric_name='Accuracy',
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=20,
                                    max_concurrent_runs=4)



In [15]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_cde90b4b-93d5-4734-97a0-81a9190ded97
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_cde90b4b-93d5-4734-97a0-81a9190ded97?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-133322/workspaces/quick-starts-ws-133322

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-04T15:10:04.156957][API][INFO]Experiment created<END>\n""<START>[2021-01-04T15:10:04.742591][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-01-04T15:10:05.086072][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-01-04T15:10:07.0212263Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_cde90b4b-93d5-4734-97a0-81a9190ded97
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_cde90b4b-93d5-4734-97a0-81a9190ded97?wsid=/subscriptions/5a4ab

{'runId': 'HD_cde90b4b-93d5-4734-97a0-81a9190ded97',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-04T15:10:03.95228Z',
 'endTimeUtc': '2021-01-04T15:23:39.896935Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '507158cb-a58d-4403-9ae0-f388ab1d8ef1',
  'score': '0.9099645928174',
  'best_child_run_id': 'HD_cde90b4b-93d5-4734-97a0-81a9190ded97_19',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg133322.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_cde90b4b-93d5-4734-97a0-81a9190ded97/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=MpnaUq88nqPwWRvxJHMNrBS%2FQuz0q8rB%2FF%2BsGP97BLo%3D&st=2021-01-04T15%3A14%3A04Z&se=2021-01-04T23%3A24%3A04Z&sp=r'}}

In [16]:
import joblib
from azureml.core.model import Model
# Get your best run and save the model from that run.
### YOUR CODE HERE ###

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
# parameter_values = best_run.get_details()['runDefinition']['Arguments']
# Run 1 - C = 0.02, max-iter = 50

model = best_run.register_model(model_name='hyperdrive_best_model', model_path='./outputs/model.pkl', model_framework=Model.Framework.SCIKITLEARN, model_framework_version='0.19.1')

In [17]:
print(best_run.get_details()['runDefinition']['arguments'])

['--C', '0.023699514107491826', '--max_iter', '100']


AutoML

In [5]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
web_path = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
data_set = TabularDatasetFactory.from_delimited_files(path=web_path)


In [6]:
from train import clean_data
from sklearn.model_selection import train_test_split
import os

# Use the clean_data function to clean your data.
x, y = clean_data(data_set)

try:
    os.makedirs('./data', exist_ok=True)
except OSError as error:
    print('New directory cannot be created')

data_df = x
data_df['y'] = y
local_path = 'data/clean-data.csv'
data_df.to_csv(local_path)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


In [7]:
import pandas as pd

train_df = x_train
train_df['y'] = y_train
print(train_df.head())

train_path = 'data/train-data.csv'
train_df.to_csv(train_path)

test_df = x_test
test_df['y'] = y_test
print(test_df.head())

test_path = 'data/test-data.csv'
test_df.to_csv(test_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


       age  marital  default  housing  loan  month  day_of_week  duration  \
24186   28        0        0        0     1      7            1       101   
18087   56        0        0        1     0      7            3       173   
6950    30        1        0        0     0      8            1        65   
8200    35        0        0        0     0      6            1        41   
16051   50        1        0        1     1      6            5       111   

       campaign  pdays  ...  contact_telephone  education_basic.4y  \
24186         2    999  ...                  0                   0   
18087         4    999  ...                  0                   0   
6950          2    999  ...                  0                   0   
8200          3    999  ...                  1                   0   
16051         1    999  ...                  1                   0   

       education_basic.6y  education_basic.9y  education_high.school  \
24186                   0                   

In [8]:
datastore = ws.get_default_datastore()
datastore.upload(src_dir='data', target_path='data')

Uploading an estimated of 3 files
Uploading data/clean-data.csv
Uploaded data/clean-data.csv, 1 files out of an estimated total of 3
Uploading data/test-data.csv
Uploaded data/test-data.csv, 2 files out of an estimated total of 3
Uploading data/train-data.csv
Uploaded data/train-data.csv, 3 files out of an estimated total of 3
Uploaded 3 files


$AZUREML_DATAREFERENCE_067a4c5ed0b84eada5ff51da63b82752

In [9]:
clean_data = TabularDatasetFactory.from_delimited_files(path=[(datastore, ('data/clean-data.csv'))])
train_data = TabularDatasetFactory.from_delimited_files(path=[(datastore, ('data/train-data.csv'))])
test_data = TabularDatasetFactory.from_delimited_files(path=[(datastore, ('data/test-data.csv'))])


In [10]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=train_data,
    label_column_name='y',
    n_cross_validations=5, 
    compute_target=cpu_cluster)

In [11]:
# Submit your automl run
from azureml.widgets import RunDetails

### YOUR CODE HERE ##
run = exp.submit(config=automl_config, show_output=True)
RunDetails(run).show()
run.wait_for_completion(show_output=True)



Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_e5e99d19-0f66-46bd-8ef1-57d7d23fee8a

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+-------------------------

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|2582                             |1                                |23065                                 |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_e5e99d19-0f66-46bd-8ef1-57d7d23fee8a',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-04T14:16:50.972619Z',
 'endTimeUtc': '2021-01-04T15:00:16.138316Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"f75eaead-5a14-460c-b33f-7ac8d7ab1eb5\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/train-data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-133322\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"5a4ab2ba-6c51-4805-8155-58759ad589d8\\\\\\", \\\\\\"work

In [13]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
best_run, fitted_model = run.get_output()
automl_model = best_run.register_model(model_path='./outputs/', model_name='bank_marketing_automl.pkl')

In [18]:
print(best_run)

Run(Experiment: udacity-project,
Id: HD_cde90b4b-93d5-4734-97a0-81a9190ded97_19,
Type: azureml.scriptrun,
Status: Completed)


In [19]:
print(fitted_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                    min_samples_leaf=0.035789473684210524,
                                                                                                    min_samples_split=0.01,
                                                                                                    min_weight_fraction_leaf=0.0,