**Spam Deduction AutoML SDK** 

In [3]:
import azureml.core
from azureml.core import Workspace

import logging
import os
import csv
from datetime import datetime
import pytz


from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import seaborn as sns
import tensorflow as tf

import matplotlib.pyplot as plt
import re
import pydot
import graphviz

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

print("Azure SDK version: ", azureml.core.VERSION)
print("Tensorflow version: ", tf.__version__)

Azure SDK version:  1.44.0
Tensorflow version:  2.9.1


**Initialize and Access Workspace**

In [4]:
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')
print("Current DateTime: ", datetime.now(pytz.timezone("America/New_York")).strftime("%m/%d/%Y %H:%M:%S"))

Ready to use Azure ML 1.44.0 to work with nahmed30-azureml-workspace
nahmed30-azureml-workspace
epe-poc-nazeer
centralus
16bc73b5-82be-47f2-b5ab-f2373344794c
Current DateTime:  08/31/2022 10:34:46


**Create an AzureML Experiment**

In [5]:
# Choose a name for the run history container in the workspace.
# NOTE: update these to match your existing experiment name
experiment_folder = 'email_spam_automl_experiments'
experiment_name = 'email-spam-automl-experiment1'
project_folder = 'email_spam_automl_project'

os.makedirs(project_folder, exist_ok=True)

print('Project Folder is ready.')

experiment = Experiment(ws, experiment_name)
experiment

Project Folder is ready.


Name,Workspace,Report Page,Docs Page
email-spam-automl-experiment1,nahmed30-azureml-workspace,Link to Azure Machine Learning studio,Link to Documentation


**Create Compute**

In [6]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster
amlcompute_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)
# For a more detailed view of current AmlCompute status, use get_status().

Found existing cluster, use it.
Succeeded.......................................................................................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


In [7]:
%%writefile $project_folder/hyperdrive_env.yml
name: batch_environment
dependencies:
- python
- scikit-learn
- pandas
- numpy
- pip
- pip:
  - azureml-defaults

Writing email_spam_automl_project/hyperdrive_env.yml


**Prepare Data**

In [8]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
# NOTE: update the key to match the dataset name
found = False
key = "UdacityPrjEmailSpamDataSet"
description_text = "Email Spam Detection DataSet for Udacity Capstone Proj "

dataset = None
if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        spam_data = 'https://www.kaggle.com/code/rumbleftw/beginner-friendly-spam-ham-sms-classification/data#:~:text=calendar_view_week-,spam,-.csv'
        dataset = Dataset.Tabular.from_delimited_files(spam_data)        
        #Register Dataset in Workspace
        dataset = dataset.register(workspace=ws,
                                   name=key,
                                   description=description_text)


df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,v1,v2,Column3,Column4,Column5
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


**Review Dataset Results**

In [9]:
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,v1,v2,Column3,Column4,Column5
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


**AutoML to Train**

In [11]:
automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 5
}

automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=dataset,
                             label_column_name="v1",  
                             path = project_folder,
                             enable_early_stopping= True,
                             test_size=0.2,
                             featurization= 'auto',
                             debug_log = "email_spam_automl_errors.log",
                             **automl_settings
                            )


**Create Pipeline and AutoMLStep**

You can define outputs for the AutoMLStep using TrainingOutput.

In [13]:
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()

metrics_output_name = 'emailspam_metrics_output'
best_model_output_name = 'emailspam_best_model_output'

metrics_data = PipelineData(name='emailspam_metrics_data',
                           datastore=ds,
                           pipeline_output_name=metrics_output_name,
                           training_output=TrainingOutput(type='Metrics'))
model_data = PipelineData(name='emailspam_model_data',
                           datastore=ds,
                           pipeline_output_name=best_model_output_name,
                           training_output=TrainingOutput(type='Model'))

Create an AutoMLStep.

In [14]:
automl_step = AutoMLStep(
    name='spamemail_automl_module',
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    allow_reuse=True)

Add AutoMLStep to Pipeline

In [15]:
from azureml.pipeline.core import Pipeline
pipeline = Pipeline(
    description="emailspam_pipeline_with_automlstep",
    workspace=ws,    
    steps=[automl_step])

Submit Pipeline Experiment

In [16]:
pipeline_run = experiment.submit(pipeline)

Created step spamemail_automl_module [9424ec20][bb20e21f-e7e8-43b6-bdba-5dc4b475ea0f], (This step will run and generate new outputs)
Submitted PipelineRun 25aa2abc-e97c-45a2-8f95-ff1beed8dd83
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/25aa2abc-e97c-45a2-8f95-ff1beed8dd83?wsid=/subscriptions/16bc73b5-82be-47f2-b5ab-f2373344794c/resourcegroups/epe-poc-nazeer/workspaces/nahmed30-azureml-workspace&tid=db05faca-c82a-4b9d-b9c5-0f64b6755421


In [None]:
from azureml.widgets import RunDetails
# RunDetails(pipeline_run).show()

In [18]:
pipeline_run.wait_for_completion()

PipelineRunId: 25aa2abc-e97c-45a2-8f95-ff1beed8dd83
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/25aa2abc-e97c-45a2-8f95-ff1beed8dd83?wsid=/subscriptions/16bc73b5-82be-47f2-b5ab-f2373344794c/resourcegroups/epe-poc-nazeer/workspaces/nahmed30-azureml-workspace&tid=db05faca-c82a-4b9d-b9c5-0f64b6755421
PipelineRun Status: Running


StepRunId: c65516f5-057b-4846-82cd-ba98e59d4a02
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/c65516f5-057b-4846-82cd-ba98e59d4a02?wsid=/subscriptions/16bc73b5-82be-47f2-b5ab-f2373344794c/resourcegroups/epe-poc-nazeer/workspaces/nahmed30-azureml-workspace&tid=db05faca-c82a-4b9d-b9c5-0f64b6755421
StepRun( spamemail_automl_module ) Status: Running


**Examine Results**

*Retrieve the metrics of all child runs*

Outputs of above run can be used as inputs of other steps in pipeline. In this tutorial, we will examine the outputs by retrieve output data and running some tests.

In [None]:
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('.', show_progress=True)

**--------------------------------------------------------------------------------------**

**Submit your Experiment**

In [None]:
remote_run=experiment.submit(automl_config, show_output=True)

**Get Run Details**

In [None]:
from azureml.widgets import RunDetails
RunDetails(remote_run).show()

In [None]:
print(" Experiment Run Status :", remote_run.get_status())

**List all child runs**

In [None]:
for child_run in remote_run.get_children():
    print(child_run, "\n ---------------------------------------------")

**Get Best Model**

In [None]:
# Retrieve best model from Pipeline Run
best_model_output = remote_run.get_output()


**Get Child Runs**

In [None]:
best_run,fitted_model = remote_run.get_output()
print(best_run)
print('\nBest Model Definition:')
print(best_run)

print('\nBest Run Metrics:')
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

Finally, having found the best performing model, you can register it.

In [None]:
os.makedirs('./outputs', exist_ok=True)

In [None]:
import joblib
joblib.dump(fitted_model,filename="outputs/spamautoml.joblib")

In [None]:
bestmodelname = best_run.properties['model_name']
print(bestmodelname)

In [None]:
from azureml.core.model import Model
registered_model = remote_run.register_model(model_name= bestmodelname, description="spam deduction")

In [None]:
from azureml.automl.core.shared import constants
env=best_run.get_environment()
print(env)

In [None]:
best_run.download_files()