In [None]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
cluster_name = "compute-train"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

Found existing compute target
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
%%writefile conda_dependencies.yml

dependencies:
- python=3.6.2
- scikit-learn
- pip:
  - azureml-defaults

Overwriting conda_dependencies.yml


In [4]:
from azureml.core import Environment

sklearn_env = Environment.from_conda_specification(name = 'sklearn-env', file_path = './conda_dependencies.yml')

In [5]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Specify parameter sampler
### YOUR CODE HERE ###
ps = RandomParameterSampling( {
    "--C": choice(0.01, 0.05, 0.1, 0.5, 1, 50, 100, 1000),
    "--max_iter": choice(100, 200, 400, 800, 2000)
    }
)

# Specify a Policy
### YOUR CODE HERE ###
policy = BanditPolicy(slack_amount=0.1)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
# Using ScriptRunConfig since SKLearn is deprecated
from azureml.core import ScriptRunConfig

src = ScriptRunConfig(source_directory=".",
                      script='train.py',
                      arguments=['--C', 1.0 , '--max_iter', 100],
                      compute_target=compute_target,
                      environment=sklearn_env)


# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=50,
                                     max_concurrent_runs=4)

In [6]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_3d51eb86-982d-4c26-8f36-0f46d4cb6bdb
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_3d51eb86-982d-4c26-8f36-0f46d4cb6bdb?wsid=/subscriptions/f5878af0-ca26-411c-9906-acf91f5420e2/resourcegroups/GroupRisk/workspaces/MSC-MLS-LRN-DEV

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-04T14:45:51.421932][API][INFO]Experiment created<END>\n""<START>[2021-01-04T14:45:52.130829][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n""<START>[2021-01-04T14:45:51.966382][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"<START>[2021-01-04T14:45:52.2105108Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_3d51eb86-982d-4c26-8f36-0f46d4cb6bdb
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_3d51eb86-982d-4c26-8f36-0f46d4cb6bdb?wsid=/subscriptions/f5878af0-ca26-411c-9906-a

{'runId': 'HD_3d51eb86-982d-4c26-8f36-0f46d4cb6bdb',
 'target': 'compute-train',
 'status': 'Completed',
 'startTimeUtc': '2021-01-04T14:45:51.220544Z',
 'endTimeUtc': '2021-01-04T15:10:39.711936Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '9a563ba4-3344-447c-bf54-b194ebeb100f',
  'score': '0.9129643117261471',
  'best_child_run_id': 'HD_3d51eb86-982d-4c26-8f36-0f46d4cb6bdb_28',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mscmlslrndev1788919189.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_3d51eb86-982d-4c26-8f36-0f46d4cb6bdb/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=hqJchDFZtHRp60jP4zufLZwbB%2FpBC1GrmUjQZ6u3wAs%3D&st=2021-01-04T15%3A01%3A01Z&se=2021-01-04T23%3A11%3A01Z&sp=r'}}

In [7]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_details()['runDefinition']['arguments'])

['--C', '1', '--max_iter', '100', '--C', '50', '--max_iter', '200']


In [9]:
model = best_run.register_model(model_name='train-optimized', model_path='outputs/model.joblib')

In [25]:
model.download("./hyperdrive_optimized_model")

'hyperdrive_optimized_model/model.joblib'

In [29]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=path)

In [50]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)
x["label"] = y

In [53]:
if "data" not in os.listdir():
    os.mkdir("./data")
x.to_csv("./data/dataset_clean.csv", index=False)

In [56]:
ds = ws.get_default_datastore()
ds.upload(src_dir='./data', target_path='dataset', overwrite=True, show_progress=True)

Uploading an estimated of 1 files
Uploading ./data/dataset_clean.csv
Uploaded ./data/dataset_clean.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_cfcdd940cfe043299d8fdd0a32fda7e6

In [58]:
from azureml.core.dataset import Dataset
dataset = Dataset.Tabular.from_delimited_files(path=ds.path('dataset/dataset_clean.csv'))

In [59]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=50,
    compute_target=compute_target,
    task='classification',
    primary_metric='accuracy',
    training_data= dataset,
    label_column_name="label",
    n_cross_validations=4)

In [60]:
# Submit your automl run

### YOUR CODE HERE ###
auto_run = exp.submit(automl_config)

Running on remote.


In [63]:
from azureml.widgets import RunDetails
RunDetails(auto_run).show()
auto_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3692                             |1    

{'runId': 'AutoML_0ca672e4-a7eb-4ea1-8708-3831d6426aeb',
 'target': 'compute-train',
 'status': 'Completed',
 'startTimeUtc': '2021-01-05T14:50:38.535234Z',
 'endTimeUtc': '2021-01-05T15:53:47.933718Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'compute-train',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"48c2fdfb-fec7-4f71-b0a3-70c6aaea3d07\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"dataset/dataset_clean.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"GroupRisk\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"f5878af0-ca26-411c-9906-acf91f5420e2\\\\\\", \\\\\\"workspa

In [67]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
best_run, fitted_model = auto_run.get_output()

In [72]:
model_name = best_run.properties['model_name']

In [None]:
model_auto = best_run.register_model(model_name = "auto_ML_best", description = "Best Auto ML Model", model_path='outputs/model.pkl')

In [16]:
required_run = list(exp.get_runs())[2]
required_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_0ca672e4-a7eb-4ea1-8708-3831d6426aeb,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [17]:
required_run.id

'AutoML_0ca672e4-a7eb-4ea1-8708-3831d6426aeb'

In [18]:
from azureml.train.automl.run import AutoMLRun
auto_run = AutoMLRun(experiment=exp, run_id=required_run.id)

In [19]:
auto_run.get_details()

{'runId': 'AutoML_0ca672e4-a7eb-4ea1-8708-3831d6426aeb',
 'target': 'compute-train',
 'status': 'Completed',
 'startTimeUtc': '2021-01-05T14:50:38.535234Z',
 'endTimeUtc': '2021-01-05T15:53:47.933718Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'compute-train',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"48c2fdfb-fec7-4f71-b0a3-70c6aaea3d07\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"dataset/dataset_clean.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"GroupRisk\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"f5878af0-ca26-411c-9906-acf91f5420e2\\\\\\", \\\\\\"workspa

In [10]:
best_run, fitted_model = auto_run.get_output()

In [13]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_0ca672e4-a7eb-4ea1-8708-3831d6426aeb_42,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [26]:
fitted_model.named_steps

{'datatransformer': DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                 feature_sweeping_config=None, feature_sweeping_timeout=None,
                 featurization_config=None, force_text_dnn=None,
                 is_cross_validation=None, is_onnx_compatible=None, logger=None,
                 observer=None, task=None, working_dir=None),
 'prefittedsoftvotingclassifier': PreFittedSoftVotingClassifier(classification_labels=None,
                               estimators=[('29',
                                            Pipeline(memory=None,
                                                     steps=[('standardscalerwrapper',
                                                             <azureml.automl.runtime.shared.model_wrappers.StandardScalerWrapper object at 0x7fb4ae509588>),
                                                            ('xgboostclassifier',
                                                             XGBoostClassifier(base_score=0.5,
   

In [22]:
best_run.properties["run_properties"]

"classification_labels=None,\n                              estimators=[('29',\n                                           Pipeline(memory=None,\n                                                    steps=[('standardscalerwrapper',\n                                                            <azureml.automl.runtime.shared.model_wrappers.StandardScalerWrapper object at 0x7f260c43d3c8>"

In [9]:
for run in exp.get_runs():
    status = run.get_status()
    print(status)
    if status == "Running":
        run.cancel()

Canceled
Canceled
Completed
Completed
Canceled
Completed
Completed
Canceled


In [None]:
try:
    compute_target.delete()
    print('Deleted existing compute target')
except ComputeTargetException:
    print('Could not find compute, maybe it is already deleted...')