**Spam Deduction AutoML SDK** 

In [2]:
import azureml.core
from azureml.core import Workspace

import logging
import os
import csv
from datetime import datetime
import pytz


from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import seaborn as sns
import tensorflow as tf

import matplotlib.pyplot as plt
import re
import pydot
import graphviz

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

print("Azure SDK version: ", azureml.core.VERSION)
print("Tensorflow version: ", tf.__version__)

Azure SDK version:  1.44.0
Tensorflow version:  2.9.1


**Initialize and Access Workspace**

In [3]:
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')
print("Current DateTime: ", datetime.now(pytz.timezone("America/New_York")).strftime("%m/%d/%Y %H:%M:%S"))

Performing interactive authentication. Please follow the instructions on the terminal.


The default web browser has been opened at https://login.microsoftonline.com/organizations/oauth2/v2.0/authorize. Please continue the login in the web browser. If no web browser is available or if the web browser fails to open, use device code flow with `az login --use-device-code`.


Interactive authentication successfully completed.
Ready to use Azure ML 1.44.0 to work with nahmed30-azureml-workspace
nahmed30-azureml-workspace
epe-poc-nazeer
centralus
16bc73b5-82be-47f2-b5ab-f2373344794c
Current DateTime:  08/31/2022 13:27:19


**Create an AzureML Experiment**

In [4]:
# Choose a name for the run history container in the workspace.
# NOTE: update these to match your existing experiment name
experiment_folder = 'email_spam_automl_experiments'
experiment_name = 'email-spam-automl-experiment1'
project_folder = 'email_spam_automl_project'

os.makedirs(project_folder, exist_ok=True)

print('Project Folder is ready.')

experiment = Experiment(ws, experiment_name)
experiment

Project Folder is ready.


Name,Workspace,Report Page,Docs Page
email-spam-automl-experiment1,nahmed30-azureml-workspace,Link to Azure Machine Learning studio,Link to Documentation


**Create Compute**

In [5]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster
amlcompute_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)
# For a more detailed view of current AmlCompute status, use get_status().

Found existing cluster, use it.
Succeeded....................................................................................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


In [6]:
%%writefile $project_folder/hyperdrive_env.yml
name: batch_environment
dependencies:
- python
- scikit-learn
- pandas
- numpy
- pip
- pip:
  - azureml-defaults

Overwriting email_spam_automl_project/hyperdrive_env.yml


**Prepare Data**

In [20]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
# NOTE: update the key to match the dataset name
found = False
key = "UdacityPrjEmailSpamDataSet"
description_text = "Email Spam Detection DataSet for Udacity Capstone Proj "

dataset = None
if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        spam_data = 'https://www.kaggle.com/code/rumbleftw/beginner-friendly-spam-ham-sms-classification/data#:~:text=calendar_view_week-,spam,-.csv'
        dataset = Dataset.Tabular.from_delimited_files(spam_data)        
        #Register Dataset in Workspace
        dataset = dataset.register(workspace=ws,
                                   name=key,
                                   description=description_text)


df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,v1,v2,Column3,Column4,Column5
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


**Review Dataset Results**

In [21]:
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,v1,v2,Column3,Column4,Column5
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


**Train and Test SPLIT**

https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-cross-validation-data-splits

In [36]:
train_data, validate_data = dataset.random_split(percentage=0.8, seed=1)

**AutoML to Train**

In [28]:
automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 5
}

automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=dataset,
                             label_column_name="v1",  
                             path = project_folder,
                             enable_early_stopping= True,
                             test_size=0.2,
                             featurization= 'auto',
                             debug_log = "email_spam_automl_errors.log",
                             **automl_settings
                            )


**Create Pipeline and AutoMLStep**

You can define outputs for the AutoMLStep using TrainingOutput.

In [29]:
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()

metrics_output_name = 'emailspam_metrics_output'
best_model_output_name = 'emailspam_best_model_output'

metrics_data = PipelineData(name='emailspam_metrics_data',
                           datastore=ds,
                           pipeline_output_name=metrics_output_name,
                           training_output=TrainingOutput(type='Metrics'))
model_data = PipelineData(name='emailspam_model_data',
                           datastore=ds,
                           pipeline_output_name=best_model_output_name,
                           training_output=TrainingOutput(type='Model'))

Create an AutoMLStep.

In [30]:
automl_step = AutoMLStep(
    name='spamemail_automl_module',
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    allow_reuse=True)

Add AutoMLStep to Pipeline

In [31]:
from azureml.pipeline.core import Pipeline
pipeline = Pipeline(
    description="emailspam_pipeline_with_automlstep",
    workspace=ws,    
    steps=[automl_step])

Submit Pipeline Experiment

In [32]:
pipeline_run = experiment.submit(pipeline)

Created step spamemail_automl_module [8946a466][7485c5f0-0c5f-46a0-a3fa-fc852c95390f], (This step will run and generate new outputs)
Submitted PipelineRun ff4ac578-a878-4f37-8a32-845b10670b55
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ff4ac578-a878-4f37-8a32-845b10670b55?wsid=/subscriptions/16bc73b5-82be-47f2-b5ab-f2373344794c/resourcegroups/epe-poc-nazeer/workspaces/nahmed30-azureml-workspace&tid=db05faca-c82a-4b9d-b9c5-0f64b6755421


In [None]:
from azureml.widgets import RunDetails
RunDetails(pipeline_run).show()

In [34]:
pipeline_run.wait_for_completion()

PipelineRunId: ff4ac578-a878-4f37-8a32-845b10670b55
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ff4ac578-a878-4f37-8a32-845b10670b55?wsid=/subscriptions/16bc73b5-82be-47f2-b5ab-f2373344794c/resourcegroups/epe-poc-nazeer/workspaces/nahmed30-azureml-workspace&tid=db05faca-c82a-4b9d-b9c5-0f64b6755421
PipelineRun Status: Running


StepRunId: 19a393c8-0f5a-4a64-b891-a802346138e5
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/19a393c8-0f5a-4a64-b891-a802346138e5?wsid=/subscriptions/16bc73b5-82be-47f2-b5ab-f2373344794c/resourcegroups/epe-poc-nazeer/workspaces/nahmed30-azureml-workspace&tid=db05faca-c82a-4b9d-b9c5-0f64b6755421
StepRun( spamemail_automl_module ) Status: Running

StepRun(spamemail_automl_module) Execution Summary
StepRun( spamemail_automl_module ) Status: Finished

No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for

'Finished'

**Examine Results**

*Retrieve the metrics of all child runs*

Outputs of above run can be used as inputs of other steps in pipeline. In this tutorial, we will examine the outputs by retrieve output data and running some tests.

In [40]:
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('./email_spam_automl_project', show_progress=True)

Downloading azureml/19a393c8-0f5a-4a64-b891-a802346138e5/emailspam_metrics_data
Downloaded azureml/19a393c8-0f5a-4a64-b891-a802346138e5/emailspam_metrics_data, 1 files out of an estimated total of 1


**Retrive Best Model**

In [41]:
import json
with open(metrics_output._path_on_datastore) as f:
    metrics_output_result = f.read()
    
deserialized_metrics_output = json.loads(metrics_output_result)
df = pd.DataFrame(deserialized_metrics_output)
df

Unnamed: 0,19a393c8-0f5a-4a64-b891-a802346138e5_0,19a393c8-0f5a-4a64-b891-a802346138e5_2,19a393c8-0f5a-4a64-b891-a802346138e5_3,19a393c8-0f5a-4a64-b891-a802346138e5_1,19a393c8-0f5a-4a64-b891-a802346138e5_4,19a393c8-0f5a-4a64-b891-a802346138e5_5,19a393c8-0f5a-4a64-b891-a802346138e5_13,19a393c8-0f5a-4a64-b891-a802346138e5_7,19a393c8-0f5a-4a64-b891-a802346138e5_8,19a393c8-0f5a-4a64-b891-a802346138e5_9,...,19a393c8-0f5a-4a64-b891-a802346138e5_31,19a393c8-0f5a-4a64-b891-a802346138e5_30,19a393c8-0f5a-4a64-b891-a802346138e5_17,19a393c8-0f5a-4a64-b891-a802346138e5_21,19a393c8-0f5a-4a64-b891-a802346138e5_16,19a393c8-0f5a-4a64-b891-a802346138e5_12,19a393c8-0f5a-4a64-b891-a802346138e5_28,19a393c8-0f5a-4a64-b891-a802346138e5_33,19a393c8-0f5a-4a64-b891-a802346138e5_37,19a393c8-0f5a-4a64-b891-a802346138e5_36
average_precision_score_micro,[0.9974021899666828],[0.9941713301400481],[0.9863255165205846],[0.9957460984878426],[0.9933769473610221],[0.9820116222061792],[0.996571234231763],[0.9981758769794824],[0.9916312838716872],[0.9942057874206371],...,[0.9965657352573427],[0.8168519111275538],[0.9946391296690995],[0.9957154903951778],[0.9978579294549457],[0.99832608256557],[0.998308883251297],[0.9982195217505708],[0.998498486867273],[0.9981524618518488]
recall_score_micro,[0.9856414166578592],[0.9670194722511614],[0.9744238599246072],[0.9845193338466881],[0.9768924924380828],[0.9394241367335538],[0.9883330061954876],[0.9876608637445707],[0.9584967764339961],[0.9800327641134816],...,[0.9840719099314018],[0.8658322135153226],[0.9730775618667995],[0.9820522112001934],[0.9881095458823411],[0.9883337611289779],[0.9883337611289779],[0.9876608637445707],[0.9908018903534599],[0.9890069101578817]
log_loss,[0.07148743074855661],[0.591594053028278],[0.6117418881648135],[0.06079730247665531],[0.09410963185901053],[0.19699072741529194],[0.09164538870615606],[0.04284798513694774],[0.14954982700084093],[0.07424503582791869],...,[0.05608078247970414],[0.39430986737770846],[0.10443498442790138],[0.06370762127054275],[0.04316025881724485],[0.061801358383168005],[0.05594217259125115],[0.0439440784479362],[0.03502063596909942],[0.0757832799083887]
norm_macro_recall,[0.9023107402861952],[0.8624266370993598],[0.8265477514909485],[0.8892466091575478],[0.8329523259077953],[0.5484772613804871],[0.9210555076949574],[0.9091387665581214],[0.6947651224425417],[0.8601977847661573],...,[0.8864483369399208],[0.0],[0.8107511829447314],[0.8714394570923314],[0.9207570148125388],[0.9143593717787265],[0.9143593717787265],[0.9091387665581214],[0.9439682643896206],[0.9202839404007002]
weighted_accuracy,[0.9961627075188704],[0.9777621175543569],[0.9930995397008731],[0.9966167821894978],[0.9952564829278984],[0.9893228017557714],[0.9968455940041251],[0.9977919809058147],[0.9922226296373189],[0.99518978871219],...,[0.9965182883304713],[0.9763446516633548],[0.9937269933751309],[0.9961606535063761],[0.9965980005501601],[0.9979039283049274],[0.9979039283049274],[0.9977919809058147],[0.9966285172182747],[0.9978213640390378]
AUC_macro,[0.9930908888539735],[0.9847568074251501],[0.9696018516207632],[0.9883755323097068],[0.9852439476783298],[0.977109261709912],[0.9918513297835071],[0.9957472725041132],[0.9863388329140641],[0.9852918187275232],...,[0.9903276629345961],[0.5],[0.9875347081043057],[0.9889786279592153],[0.9939598236383913],[0.9957474332176159],[0.9957601386998673],[0.9957706566085255],[0.9945046076150339],[0.9949470727976845]
f1_score_macro,[0.9680895029505227],[0.9291167292593171],[0.9413725757739752],[0.9651940676699379],[0.9467544785321607],[0.8370290422702968],[0.9743414632338517],[0.972594184222034],[0.8968546042872703],[0.954492156239532],...,[0.9643435757565518],[0.4640302758137837],[0.9373876479321781],[0.959519105739625],[0.9737844069845562],[0.9742129681515609],[0.9742129681515609],[0.972594184222034],[0.9801723451119544],[0.9757525408309252]
accuracy,[0.9856414166578592],[0.9670194722511614],[0.9744238599246072],[0.9845193338466881],[0.9768924924380828],[0.9394241367335538],[0.9883330061954876],[0.9876608637445707],[0.9584967764339961],[0.9800327641134816],...,[0.9840719099314018],[0.8658322135153226],[0.9730775618667995],[0.9820522112001934],[0.9881095458823411],[0.9883337611289779],[0.9883337611289779],[0.9876608637445707],[0.9908018903534599],[0.9890069101578817]
f1_score_weighted,[0.9853610423786494],[0.9670979785167779],[0.9735046839386887],[0.984131478084041],[0.9760014890325206],[0.9315583991913347],[0.9881443612021995],[0.9873942195240166],[0.9551108574349932],[0.9794242553157751],...,[0.9836715792679158],[0.8036038754030779],[0.971919677573187],[0.9815356446309877],[0.9879140650328975],[0.9880945406212478],[0.9880945406212478],[0.9873942195240166],[0.9907049675999502],[0.9888095263472845]
AUC_micro,[0.9974498562111315],[0.9941819003491359],[0.990151115963678],[0.996226254610808],[0.9939997465958674],[0.9840836931960772],[0.9974777062033493],[0.9982372001168051],[0.9915564195418589],[0.9948772751193007],...,[0.9966717319841709],[0.8658322135153227],[0.9946907155384596],[0.9961879210713402],[0.9979049247318132],[0.9983833607803323],[0.9983646252500439],[0.9982784619366539],[0.998529713670553],[0.9982284770941133]


**Retrieve the Best Model**

In [42]:
# Retrieve best model from Pipeline Run
best_model = pipeline_run.get_pipeline_output(best_model_output_name)
num_file_downloaded = best_model.download('.', show_progress=True)

Downloading azureml/19a393c8-0f5a-4a64-b891-a802346138e5/emailspam_model_data
Downloaded azureml/19a393c8-0f5a-4a64-b891-a802346138e5/emailspam_model_data, 1 files out of an estimated total of 1


**Test the Model**

In [46]:
train_data, test_data = dataset.random_split(percentage=0.8, seed=1)

In [49]:
df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,v1,v2,Column3,Column4,Column5
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [50]:
df = df.replace('spam', 1)
df = df.replace('ham', 0)
df

Unnamed: 0,v1,v2,Column3,Column4,Column5
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,,,
5568,0,Will Ì_ b going to esplanade fr home?,,,
5569,0,"Pity, * was in mood for that. So...any other s...",,,
5570,0,The guy did some bitching but I acted like i'd...,,,


In [52]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.oov_token = '<oovToken>'
tokenizer.fit_on_texts(df.v2)
vocab = tokenizer.word_index
vocabCount = len(vocab)+1

vocabCount

8922

In [53]:
SPLIT = 5000

xTrain = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(df.v2.to_numpy()), padding='pre', maxlen=171)
yTrain = df.v1.to_numpy()
dim = xTrain.shape[1]
xTest = xTrain[SPLIT:]
yTest = yTrain[SPLIT:]

xTrain = xTrain[:SPLIT]
yTrain = yTrain[:SPLIT]

xTrain.shape, yTrain.shape, xTest.shape, yTest.shape

((5000, 171), (5000,), (572, 171), (572,))

In [54]:
from sklearn.metrics import confusion_matrix
yPred = best_model.predict(xTest)
cm = confusion_matrix(yTest, yPred)

NameError: name 'best_model' is not defined

**--------------------------------------------------------------------------------------**