In [1]:
import azureml.core
from azureml.core import Workspace

import logging
import os
import csv
from datetime import datetime
import pytz


from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import seaborn as sns
import tensorflow as tf

import matplotlib.pyplot as plt
import re
import pydot
import graphviz

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep



Workspace Config

In [3]:
# Check core SDK version number
# print("SDK version:", azureml.core.VERSION)

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

print("Tensorflow version: ", tf.__version__)
print("Current DateTime: ", datetime.now(pytz.timezone("America/New_York")).strftime("%m/%d/%Y %H:%M:%S"))

Ready to use Azure ML 1.44.0 to work with nahmed30-azureml-workspace
nahmed30-azureml-workspace
epe-poc-nazeer
centralus
16bc73b5-82be-47f2-b5ab-f2373344794c
Tensorflow version:  2.9.1
Current DateTime:  08/31/2022 01:59:48


**Create Experiment**

In [5]:
# Choose a name for the run history container in the workspace.
# NOTE: update these to match your existing experiment name
experiment_folder = 'spam_training-hyperdrive'
experiment_name = 'ml-spam-experiment-prjassign1'
project_folder = 'spam_training-hyperdrive'

os.makedirs(project_folder, exist_ok=True)

print('Project Folder is ready.')

experiment = Experiment(ws, experiment_name)
experiment

Project Folder is ready.


Name,Workspace,Report Page,Docs Page
ml-spam-experiment-prjassign1,nahmed30-azureml-workspace,Link to Azure Machine Learning studio,Link to Documentation


**Create Compute**

In [6]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster
amlcompute_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)
# For a more detailed view of current AmlCompute status, use get_status().

Found existing cluster, use it.
Succeeded.......................................................................................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


> **Note**: Compute instances and clusters are based on standard Azure virtual machine images. For this exercise, the *Standard_DS11_v2* image is recommended to achieve the optimal balance of cost and performance. If your subscription has a quota that does not include this image, choose an alternative image; but bear in mind that a larger image may incur higher cost and a smaller image may not be sufficient to complete the tasks. Alternatively, ask your Azure administrator to extend your quota.

You'll need a Python environment to be hosted on the compute, so let's define that as Conda configuration file.

In [8]:
%%writefile $project_folder/hyperdrive_env.yml
name: batch_environment
dependencies:
- python
- scikit-learn
- pandas
- numpy
- pip
- pip:
  - azureml-defaults

Overwriting spam_training-hyperdrive/hyperdrive_env.yml


**Prepare Data**

In [7]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
# NOTE: update the key to match the dataset name
found = False
key = "UdacityPrjEmailSpamDataSet"
description_text = "Spam Detection DataSet for Udacity Capstone Proj "

dataset = None
if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,v1,v2,Column3,Column4,Column5
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


**Review Dataset Results**

In [9]:
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,v1,v2,Column3,Column4,Column5
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


**AutoML to Train**

In [14]:
automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 5
}

automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=dataset,
                             label_column_name="v1",  
                             path = project_folder,
                             enable_early_stopping= True,
                             test_size=0.2,
                             featurization= 'auto',
                             debug_log = "spam_automl_errors.log",
                             **automl_settings
                            )


**Submit your Experiment**

In [15]:
remote_run=experiment.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
ml-spam-experiment-prjassign1,AutoML_51fd6e53-4abd-4d2a-bd24-996e05c9705b,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+--------------------------------------+
|Size of the smallest class    |Name/Label of the smallest class|Number of samples in the training data|
|598                           |spam                    

**Get Run Details**

In [16]:
from azureml.widgets import RunDetails
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [17]:
print(" Experiment Run Status :", remote_run.get_status())

 Experiment Run Status : Completed


**List all child runs**

In [18]:
for child_run in remote_run.get_children():
    print(child_run, "\n ---------------------------------------------")

Run(Experiment: ml-spam-experiment-prjassign1,
Id: AutoML_51fd6e53-4abd-4d2a-bd24-996e05c9705b_48,
Type: azureml.scriptrun,
Status: Completed) 
 ---------------------------------------------
Run(Experiment: ml-spam-experiment-prjassign1,
Id: AutoML_51fd6e53-4abd-4d2a-bd24-996e05c9705b_49,
Type: azureml.scriptrun,
Status: Completed) 
 ---------------------------------------------
Run(Experiment: ml-spam-experiment-prjassign1,
Id: AutoML_51fd6e53-4abd-4d2a-bd24-996e05c9705b_47,
Type: azureml.scriptrun,
Status: Completed) 
 ---------------------------------------------
Run(Experiment: ml-spam-experiment-prjassign1,
Id: AutoML_51fd6e53-4abd-4d2a-bd24-996e05c9705b_46,
Type: azureml.scriptrun,
Status: Canceled) 
 ---------------------------------------------
Run(Experiment: ml-spam-experiment-prjassign1,
Id: AutoML_51fd6e53-4abd-4d2a-bd24-996e05c9705b_45,
Type: azureml.scriptrun,
Status: Completed) 
 ---------------------------------------------
Run(Experiment: ml-spam-experiment-prjassign1,

**Get Best Model**

In [20]:
# Retrieve best model from Pipeline Run
best_model_output = remote_run.get_output()




**Get Child Runs**

In [27]:
best_run,fitted_model = remote_run.get_output()
print(best_run)
print('\nBest Model Definition:')
print(best_run)

print('\nBest Run Metrics:')
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)



Run(Experiment: ml-spam-experiment-prjassign1,
Id: AutoML_51fd6e53-4abd-4d2a-bd24-996e05c9705b_49,
Type: azureml.scriptrun,
Status: Completed)

Best Model Definition:
Run(Experiment: ml-spam-experiment-prjassign1,
Id: AutoML_51fd6e53-4abd-4d2a-bd24-996e05c9705b_49,
Type: azureml.scriptrun,
Status: Completed)

Best Run Metrics:
AUC_macro 0.9934949722828439
weighted_accuracy 0.9964976499326113
balanced_accuracy 0.9742731089370402
f1_score_macro 0.9812097264916027
log_loss 0.03351236021806607
AUC_micro 0.9983405104800065
precision_score_weighted 0.9912494557178411
f1_score_micro 0.9912508241357271
recall_score_micro 0.9912508241357271
accuracy 0.9912508241357271
norm_macro_recall 0.9485462178740803
average_precision_score_micro 0.9982509380112734
f1_score_weighted 0.991178004952662
precision_score_micro 0.9912508241357271
average_precision_score_macro 0.9926393246388001
average_precision_score_weighted 0.9968797292090859
recall_score_macro 0.9742731089370402
precision_score_macro 0.988619

Finally, having found the best performing model, you can register it.

In [29]:
os.makedirs('./outputs', exist_ok=True)

In [34]:
import joblib
joblib.dump(fitted_model,filename="outputs/spamautoml.joblib")

['outputs/spamautoml.joblib']

In [36]:
bestmodelname = best_run.properties['model_name']
print(bestmodelname)

AutoML51fd6e53449


In [39]:
from azureml.core.model import Model
registered_model = remote_run.register_model(model_name= bestmodelname, description="spam deduction")

In [40]:
from azureml.automl.core.shared import constants
env=best_run.get_environment()
print(env)

Environment(Name: AzureML-AutoML,
Version: 123)


In [41]:
best_run.download_files()