**Spam Detection AutoML SDK** 

In [3]:
import azureml.core
from azureml.core import Workspace

import logging
import os
import csv
from datetime import datetime
import pytz


from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import seaborn as sns
import tensorflow as tf

import matplotlib.pyplot as plt
import re
import pydot
import graphviz

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Print the installed Azure ML SDK and TensorFlow versions.
print("Azure SDK version: ", azureml.core.VERSION)
print("Tensorflow version: ", tf.__version__)

Azure SDK version:  1.44.0
Tensorflow version:  2.9.1


**Initialize and Access Workspace**

In [2]:
# Connect to the Azure ML workspace described by the saved config file.
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

# Show the workspace identity, one field per line.
workspace_fields = (ws.name, ws.resource_group, ws.location, ws.subscription_id)
print(*workspace_fields, sep='\n')

# Report the current time in the America/New_York timezone.
eastern_tz = pytz.timezone("America/New_York")
run_timestamp = datetime.now(eastern_tz).strftime("%m/%d/%Y %H:%M:%S")
print("Current DateTime: ", run_timestamp)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


Ready to use Azure ML 1.44.0 to work with nahmed30-azureml-workspace
nahmed30-azureml-workspace
epe-poc-nazeer
centralus
16bc73b5-82be-47f2-b5ab-f2373344794c
Tensorflow version:  2.9.1
Current DateTime:  08/31/2022 10:33:19


**Create an AzureML Experiment**

In [4]:
# Names for the run-history container in the workspace and the local folders.
# NOTE: update these to match your existing experiment name
experiment_folder = 'email_spam_automl_experiments'
experiment_name = 'email-spam-automl-experiment1'
project_folder = 'email_spam_automl_project'

# Create the local project folder (a no-op when it already exists).
os.makedirs(project_folder, exist_ok=True)
print('Project Folder is ready.')

# Bind the experiment to the workspace; the bare name below displays it.
experiment = Experiment(workspace=ws, name=experiment_name)
experiment

Project Folder is ready.


Name,Workspace,Report Page,Docs Page
spam-automl-experiment1,nahmed30-azureml-workspace,Link to Azure Machine Learning studio,Link to Documentation


**Create Compute**

In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster
amlcompute_cluster_name = "cpu-cluster"

# Reuse the cluster when it already exists in the workspace; otherwise
# provision a new one. ComputeTargetException signals "not found" here.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',  # for GPU, use "STANDARD_NC6"
        # vm_priority='lowpriority',  # optional
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

# Wait for the cluster's own configured minimum node count (min_node_count=None).
# The provisioning config above leaves min_nodes at its default of 0, so asking
# for 1 node here would never be satisfied by an idle cluster and the call would
# simply block until the timeout expires.
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=10)
# For a more detailed view of current AmlCompute status, use get_status().
# For a more detailed view of current AmlCompute status, use get_status().

> **Note**: Compute instances and clusters are based on standard Azure virtual machine images. For this exercise, the *Standard_DS11_v2* image is recommended to achieve the optimal balance of cost and performance. Note that the cell above provisions `STANDARD_D2_V2`; change its `vm_size` argument if you want to use the recommended image. If your subscription has a quota that does not include this image, choose an alternative image; but bear in mind that a larger image may incur higher cost and a smaller image may not be sufficient to complete the tasks. Alternatively, ask your Azure administrator to extend your quota.

You'll need a Python environment to be hosted on the compute, so let's define it as a Conda configuration file.

In [None]:
%%writefile $project_folder/hyperdrive_env.yml
# Conda environment specification written into the project folder.
# NOTE(review): no other cell visible in this notebook references this file,
# and the file name ("hyperdrive") and environment name ("batch") differ from
# this notebook's AutoML topic — confirm it is still needed.
name: batch_environment
dependencies:
# Conda-installed packages.
- python
- scikit-learn
- pandas
- numpy
- pip
# Packages installed through pip inside the conda environment.
- pip:
  - azureml-defaults

**Prepare Data**

In [None]:
# Load the registered dataset from the workspace by name.
# NOTE: update the key to match the dataset name
found = False
key = "UdacityPrjEmailSpamDataSet"
description_text = "Spam Detection DataSet for Udacity Capstone Proj "

# Read ws.datasets once and look the key up in the returned mapping.
registered_datasets = ws.datasets
dataset = registered_datasets.get(key)
found = dataset is not None

# Fail fast with an actionable message: this cell has no fallback that creates
# the dataset, and continuing with dataset=None would only surface later as an
# opaque AttributeError on to_pandas_dataframe().
if not found:
    raise LookupError(
        "Dataset '{}' is not registered in workspace '{}'. "
        "Register it (or update `key`) before running the rest of the notebook."
        .format(key, ws.name)
    )

# Materialize the dataset as a DataFrame and show summary statistics.
df = dataset.to_pandas_dataframe()
df.describe()

**Review Dataset Results**

In [None]:
# Preview the first five records of the dataset as a pandas DataFrame.
dataset_preview = dataset.take(5)
dataset_preview.to_pandas_dataframe()

**AutoML to Train**

In [None]:
# Settings passed through to AutoMLConfig as keyword arguments.
automl_settings = dict(
    experiment_timeout_minutes=20,
    max_concurrent_iterations=4,
    primary_metric='accuracy',
    n_cross_validations=5,
)

# Classification run on the compute cluster; "v1" is the label column.
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=dataset,
    label_column_name="v1",
    test_size=0.2,
    featurization='auto',
    enable_early_stopping=True,
    path=project_folder,
    debug_log="spam_automl_errors.log",
    **automl_settings
)


**Submit your Experiment**

In [None]:
# Submit the AutoML configuration to the experiment and stream its output.
remote_run = experiment.submit(config=automl_config, show_output=True)

**Get Run Details**

In [None]:
from azureml.widgets import RunDetails

# Show the RunDetails widget for the submitted run.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

In [None]:
# Query the run's current status and print it.
run_status = remote_run.get_status()
print(" Experiment Run Status :", run_status)

**List all child runs**

In [None]:
# Print every child run of the submitted run, each followed by a divider line.
divider = "\n ---------------------------------------------"
for child_run in remote_run.get_children():
    print(child_run, divider)

**Get Best Model**

In [None]:
# Retrieve best model from Pipeline Run
best_model_output = remote_run.get_output()


**Get Best Run and Metrics**

In [None]:
# get_output() is unpacked into the best child run and its fitted model.
best_run, fitted_model = remote_run.get_output()
print(best_run)

# Print the fitted model (not the run again) under the model-definition header.
print('\nBest Model Definition:')
print(fitted_model)

# List every metric recorded on the best run as "name value".
print('\nBest Run Metrics:')
best_run_metrics = best_run.get_metrics()
for metric_name, metric in best_run_metrics.items():
    print(metric_name, metric)

Finally, having found the best performing model, you can register it.

In [None]:
# Make sure the local outputs folder exists (a no-op when it already does).
outputs_dir = './outputs'
os.makedirs(outputs_dir, exist_ok=True)

In [None]:
import joblib

# Serialize the fitted model to a joblib file in the outputs folder.
model_file = "outputs/spamautoml.joblib"
joblib.dump(fitted_model, filename=model_file)

In [None]:
# Read the 'model_name' entry from the best run's properties and print it;
# a later cell passes this name to register_model().
bestmodelname = best_run.properties['model_name']
print(bestmodelname)

In [None]:
from azureml.core.model import Model

# Register the AutoML run's model under the best model's name.
# The description reads "spam detection", matching the dataset description
# used earlier in the notebook (the previous text said "deduction").
registered_model = remote_run.register_model(
    model_name=bestmodelname,
    description="spam detection",
)

In [None]:
from azureml.automl.core.shared import constants
# Fetch the environment associated with the best run and print it.
# NOTE(review): `constants` is imported but not used in this cell — confirm
# whether a later cell relies on it.
env=best_run.get_environment()
print(env)

In [None]:
# Download the best run's files; no prefix or output directory is given, so
# the SDK defaults apply (presumably all files into the working directory —
# TODO confirm).
best_run.download_files()