# Connect to your workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.41.0 to work with nahmed30-azureml-workspace


# Create Compute

In [2]:
# Create compute

from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster
amlcompute_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)
# For a more detailed view of current AmlCompute status, use get_status().


Found existing cluster, use it.
Succeeded......................................................................................................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


# Prepare data

In [3]:
from azureml.core import Dataset

found = False
key = "UdacityPrjEmailSpamDataSet"
description_text = "Spam Detection DataSet for Udacity Capstone Proj "

dataset = None
if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 
if found:
        dataset
        df = dataset.to_pandas_dataframe()

df.describe()

Unnamed: 0,v1,v2,Column3,Column4,Column5
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


# Prepare a training script

In [4]:
import os

experiment_folder = 'emailspam_training_hyperdrive_09102022_v1'
os.makedirs(experiment_folder, exist_ok=True)

print('Folder ready.')

Folder ready.


# Create  Python script to train the model.

In [5]:
%%writefile $experiment_folder/emailspam_training_09102022.py

# Import libraries
import argparse, joblib, os
from azureml.core import Run

import logging
import os
import csv
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn import datasets
import tensorflow as tf
import regex as re
from tensorflow import keras
from tensorflow.keras import layers

import pickle
import tempfile
from tensorflow.keras.models import Sequential, load_model, save_model, Model
from tensorflow.keras.layers import Dense


from sklearn.preprocessing import StandardScaler
# from tensorflow.keras import models, layers

import nltk
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score, roc_curve

import azureml.core
from azureml.core import Workspace
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace

from azureml.core.dataset import Dataset


# Get the experiment run context
run = Run.get_context()

# Get script arguments
parser = argparse.ArgumentParser()

# Input dataset
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')

#hyperdrive_feature
parser.add_argument("--hyperdrive_feature", type=bool, dest='hyperdrive_feature', help='hyperdrive feature')

# Hyperparameters
parser.add_argument('--units', type=int, default=64, help="Number of nodes")
parser.add_argument('--optimizer', type=str, default='adam', help="Algorithm of Choice")

# Add arguments to args collection
args = parser.parse_args()

# Log Hyperparameter values 
run.log("Number of Nodes:", np.int(args.units))  
run.log("Algorithm of Choice:", np.str(args.optimizer))  

# load the email spam dataset -- Get the training data from the input
print("Loading Email Spam Data...")
df = run.input_datasets['training_data'].to_pandas_dataframe() 

# Cleanup and Prepare Data # Find and eliminate stop words 
nltk.download('stopwords')
stop_words= set(stopwords.words("english"))
stop_words.update(['https', 'http', 'amp', 'CO', 't', 'u', 'new', "I'm", "would"])


spam = df.query("v1=='spam'").v2.str.cat(sep=" ")
ham = df.query("v1=='ham'").v2.str.cat(sep=" ")

# convert spam to 1 and ham to 0
df = df.replace('spam', 1)
df = df.replace('ham', 0)

# Clean the text
def cleanText(text):
    whitespace = re.compile(r"\s+")
    web_address = re.compile(r"(?i)http(s):\/\/[a-z0-9.~_\-\/]+")
    user = re.compile(r"(?i)@[a-z0-9_]+")
    text = text.replace('.', '')
    text = whitespace.sub(' ', text)
    text = web_address.sub('', text)
    text = user.sub('', text)
    text = re.sub(r"\[[^()]*\]", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r"(?:@\S*|#\S*|http(?=.*://)\S*)", "", text)
    return text.lower()

df.v2 = [cleanText(item) for item in df.v2]

# Tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.oov_token = '<oovToken>'
tokenizer.fit_on_texts(df.v2)
vocab = tokenizer.word_index
vocabCount = len(vocab)+1


# Split Train and Test
SPLIT = 5000

# Split data into training set and test set
xTrain = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(df.v2.to_numpy()), padding='pre', maxlen=171)
yTrain = df.v1.to_numpy()

dim = xTrain.shape[1]
xTest = xTrain[SPLIT:]
yTest = yTrain[SPLIT:]

xTrain = xTrain[:SPLIT]
yTrain = yTrain[:SPLIT]

# Train a Keras Sequential classification model without the specified hyperparameters
print('Training a classification model')

#------------------------------------------------------------
#model = tf.keras.Sequential()
#model.add(tf.keras.layers.Embedding(input_dim=vocabCount+1, output_dim=64, input_length=dim))
#model.add(tf.keras.layers.GlobalAveragePooling1D())
#model.add(tf.keras.layers.Dense(64, activation='relu'))
#model.add(tf.keras.layers.Dense(32, activation='relu'))
#model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#model.summary()

#--------------------------------------------------------------

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocabCount+1, output_dim=64, input_length=dim))
model.add(tf.keras.layers.GlobalAveragePooling1D())
# for i in range(args.num_layers):
model.add(tf.keras.layers.Dense(args.units, activation='relu'))
model.add(tf.keras.layers.Dense(args.units, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid')) 

model.compile(loss='binary_crossentropy', optimizer=args.optimizer, metrics=['Accuracy'])
model.summary()


model.fit(xTrain, yTrain, batch_size=32, epochs=100, initial_epoch=6, validation_data=(xTest, yTest))

# calculate accuracy
y_hat = model.predict(xTest)
acc = np.average(y_hat == yTest)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
# y_scores = model.predict_proba(xTest)
# auc = roc_auc_score(yTest,y_scores[:,1])
# print('AUC: ' + str(auc))
# run.log('AUC', np.float(auc))

# Save the model in the run outputs
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/emailspam_model09102022.pkl')
    

run.complete()


Writing emailspam_training_hyperdrive_09102022_v1/emailspam_training_09102022.py


You'll need a Python environment to be hosted on the compute, so let's define that as Conda configuration file.

In [6]:
%%writefile $experiment_folder/emailspam_hyperdrive_env_09102022.yml
name: batch_environment
dependencies:
- python=3.8.5
- scikit-learn
- pandas
- numpy
- regex
- tensorflow
- nltk
- pip
- pip:
  - azureml-defaults

Writing emailspam_training_hyperdrive_09102022_v1/emailspam_hyperdrive_env_09102022.yml


# Run a hyperparameter tuning experiment

In [7]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.train.hyperdrive import RandomParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails

# from wordcloud import WordCloud

# Create a Python environment for the experiment
hyper_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/emailspam_hyperdrive_env_09102022.yml")

# Get the training dataset
emailspam_ds = ws.datasets.get("UdacityPrjEmailSpamDataSet")

hyperdrive_feature = True

# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
                                script='emailspam_training_09102022.py',
                                # Add non-hyperparameter arguments -in this case, the training dataset
                                arguments = ['--input-data', emailspam_ds.as_named_input('training_data'),
                                '--hyperdrive_feature', hyperdrive_feature],
                                environment=hyper_env,
                                compute_target = amlcompute_cluster_name)

                                
# ----------------------------------------------------------------------

params = RandomParameterSampling( 
    {
    "--units": choice(64, 80),
    "--optimizer": choice('adam', 'sgd')
    })

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(run_config=script_config, 
                          hyperparameter_sampling=params, 
                          policy=None, # No early stopping policy
                          primary_metric_name='Accuracy', # Find the highest Accuracy metric
                          primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                          max_total_runs=24, # Restict the experiment to 48 iterations
                          max_concurrent_runs=2) # Run up to 2 iterations in parallel

# Run the experiment
experiment = Experiment(workspace=ws, name='emailspam-hyperdrive-exp-09122022')
run = experiment.submit(config=hyperdrive)

# Show the status in the notebook as the experiment runs
# RunDetails(run).show()
run.wait_for_completion()


{'runId': 'HD_6b599155-3c22-48ba-a53a-fe7195d4bbca',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2022-09-12T16:32:51.282442Z',
 'endTimeUtc': '2022-09-12T16:42:24.857601Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"Accuracy","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '6d9948ce-8ece-4472-872e-b267657d1b85',
  'user_agent': 'python/3.8.5 (Linux-5.4.0-1077-azure-x86_64-with-glibc2.10) msrest/0.6.21 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.41.0',
  'space_size': '4',
  'score': '0.011864883368379871',
  'best_child_run_id': 'HD_6b599155-3c22-48ba-a53a-fe7195d4bbca_3',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_6b599155-3c22-48ba-a53a-fe7195d4bbca_3'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryValues': {'amlClient

# Determine the best performing run

In [8]:
# Print all child runs, sorted by the primary metric
for child_run in run.get_children_sorted_by_primary_metric():
    print(child_run)

{'run_id': 'HD_6b599155-3c22-48ba-a53a-fe7195d4bbca_3', 'hyperparameters': '{"--optimizer": "adam", "--units": 64}', 'best_primary_metric': 0.011864883368379871, 'status': 'Completed'}
{'run_id': 'HD_6b599155-3c22-48ba-a53a-fe7195d4bbca_1', 'hyperparameters': '{"--optimizer": "adam", "--units": 80}', 'best_primary_metric': 0.011082449019511956, 'status': 'Completed'}
{'run_id': 'HD_6b599155-3c22-48ba-a53a-fe7195d4bbca_2', 'hyperparameters': '{"--optimizer": "sgd", "--units": 80}', 'best_primary_metric': 0.0, 'status': 'Completed'}
{'run_id': 'HD_6b599155-3c22-48ba-a53a-fe7195d4bbca_0', 'hyperparameters': '{"--optimizer": "sgd", "--units": 64}', 'best_primary_metric': 0.0, 'status': 'Completed'}


In [9]:
# Get the best run, and its metrics and arguments
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

In [22]:
script_arguments = best_run.get_details() ['runDefinition']['arguments']
print('Best Run Id: ', best_run.id)

Best Run Id:  HD_6b599155-3c22-48ba-a53a-fe7195d4bbca_3


In [15]:
print(' -Accuracy:', best_run_metrics['Accuracy'])
print(' -Arguments:',script_arguments)

 -Accuracy: 0.011864883368379871
 -Arguments: ['--input-data', 'DatasetConsumptionConfig:training_data', '--hyperdrive_feature', 'True', '--optimizer', 'adam', '--units', '64']


Now that you've found the best run, you can register the model it trained.

In [17]:
from azureml.core import Model

# Register model
reg_model = best_run.register_model(model_path='outputs/emailspam_model09102022.pkl', model_name='emailspam_model_09102022',
                        tags={'Training context':'Hyperdrive'},
                        properties={'Accuracy': best_run_metrics['Accuracy']})

In [24]:
from azureml.core.model import InferenceConfig

# Combine scoring script & environment in Inference configuration
inference_config = InferenceConfig(entry_script="09102022/score.py",
                                   environment=hyper_env)

In [26]:
from azureml.core.webservice.aci import AciWebservice
# Set deployment configuration
deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1,
                                                       memory_gb = 1)

In [28]:
# Define the model, inference, & deployment configuration and web service name and location to deploy
service = Model.deploy(workspace = ws,
                       name = "my-emailspam-service",
                       models = [reg_model],
                       inference_config = inference_config,
                       deployment_config = deployment_config)

In [29]:
service

AciWebservice(workspace=Workspace.create(name='nahmed30-azureml-workspace', subscription_id='16bc73b5-82be-47f2-b5ab-f2373344794c', resource_group='epe-poc-nazeer'), name=my-emailspam-service, image_id=None, image_digest=None, compute_type=ACI, state=Transitioning, scoring_uri=None, tags={}, properties={}, created_by={'userObjectId': 'a8930881-263c-498d-8975-58e6a0c28f2c', 'userPuId': '10032001567EC76C', 'userIdp': None, 'userAltSecId': None, 'userIss': 'https://sts.windows.net/db05faca-c82a-4b9d-b9c5-0f64b6755421/', 'userTenantId': 'db05faca-c82a-4b9d-b9c5-0f64b6755421', 'userName': 'Nazeer Ahmed', 'upn': 'nahmed30@optumcloud.com'})