## AutoML

In [1]:
from azureml.core import Workspace, Experiment

# Attach to the workspace and open the experiment that groups this notebook's runs.
ws = Workspace.get(
    name="udacity-project",
    subscription_id='8bad5f66-e234-46b1-b07f-d3cfd6255f67',
)
exp = Experiment(workspace=ws, name="automl-kenny")

# Echo the workspace coordinates, one per line, as a quick sanity check.
print('\n'.join((
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)))

# NOTE(review): the handle returned here is not used or completed in the cells
# visible in this notebook — confirm it is still needed.
run = exp.start_logging()

Workspace name: udacity-project
Azure region: eastus
Subscription id: 8bad5f66-e234-46b1-b07f-d3cfd6255f67
Resource group: udacity-project


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Reuse the compute cluster if it already exists; otherwise provision it.
# Project constraints: vm_size must be "Standard_D2_V2" and max_nodes no greater than 4.

cluster_name = "udacity-project"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("I found an exisiting cluster so I'm using it.")
except ComputeTargetException:
    # No cluster with this name was found, so create one.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           vm_priority='lowpriority',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    # Wait for provisioning only, not for a running node: the configuration above
    # sets no min_nodes, and the SDK documents that min_node_count should not exceed
    # the provisioned minimum — asking for 1 node here can stall until the timeout.
    compute_target.wait_for_completion(show_output=True, timeout_in_minutes=10)

I found an exisiting cluster so I'm using it.


In [3]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset from the hosted bank-marketing training CSV.
file_path = (
    "https://automlsamplenotebookdata.blob.core.windows.net/"
    "automl-sample-notebook-data/bankmarketing_train.csv"
)
ds = TabularDatasetFactory.from_delimited_files(path=file_path)

In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

# Pin the shuffle so the train/test partition — and every metric computed from
# it — is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# AutoML is given one table plus a label column name, so put y back next to x.
train_data = pd.concat([x_train, y_train], axis=1)
train_data.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
7176,35,1,0,1,0,7,1,56,1,999,...,0,1,0,0,0,0,0,0,0,0
17265,39,1,0,1,0,5,5,159,1,999,...,1,0,0,0,0,0,1,0,0,0
23775,50,1,0,1,0,8,4,15,5,999,...,0,0,0,1,0,0,0,0,0,0
3519,29,0,0,1,0,5,4,566,2,999,...,0,0,0,0,1,0,0,0,0,1
21665,40,1,0,0,0,11,5,71,2,999,...,0,0,0,0,0,0,0,1,0,0


In [8]:
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails
# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    # Problem definition
    task="classification",
    primary_metric="accuracy",
    # Training table and validation scheme
    training_data=train_data,
    label_column_name="y",
    n_cross_validations=5,
    # Run budget and parallelism
    experiment_timeout_minutes=30,
    iterations=100,
    max_concurrent_iterations=8,
    max_cores_per_iteration=-1,
)

In [9]:
# Launch the AutoML experiment, echoing its progress into the cell output.
automl_run = exp.submit(config=automl_config, show_output=True)

# Render the run-monitoring widget for the submitted run.
RunDetails(automl_run).show()

Running on local machine
Parent Run ID: AutoML_01aaad1a-2baa-4383-b879-e27151df557c

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely p

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [11]:
# Retrieve your best automl model.
# get_output() is unpacked as a (run, model) pair; this cell only retrieves the
# model — registering and dumping it to disk happen in later cells.
best_automl_run, best_model = automl_run.get_output()
# Print the model's repr to inspect the pipeline it is made of.
print(best_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                  min_samples_leaf=0.035789473684210524,
                                                                                                  min_samples_split=0.15052631578947367,
                                                                                                  min_weight_fraction_

In [12]:
# Register the model stored under the best run's ./outputs/ path.
best_automl_run.register_model(model_name="best_run_automl.pkl", model_path='./outputs/')

# Show the last pipeline step (the estimator). Read it through the public
# `steps` list rather than the private `_final_estimator` attribute.
print(best_model.steps[-1][1])

PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('40',
                                           Pipeline(memory=None,
                                                    steps=[('sparsenormalizer',
                                                            <azureml.automl.runtime.shared.model_wrappers.SparseNormalizer object at 0x7fb3b46b24e0>),
                                                           ('xgboostclassifier',
                                                            XGBoostClassifier(base_score=0.5,
                                                                              booster='gbtree',
                                                                              colsample_bylevel=1,
                                                                              colsample_bynode=1,
                                                                              colsample_bytree=1,
                               

In [13]:
# Predict labels for the held-out features with the best AutoML pipeline.
y_predicted = best_model.predict(x_test)

In [14]:
from sklearn.metrics import accuracy_score

# Compare the held-out labels with the model's predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_predicted)
print("Accuracy of the test set is: {}".format(accuracy))

Accuracy of the test set is: 0.9172129157562515


In [15]:
import joblib
# Persist the fitted AutoML pipeline next to the notebook.
joblib.dump(value=best_model, filename="best_automl_model.joblib")