# Automated ML

Import Dependencies.

In [1]:
import azureml.core

# Show the SDK version the notebook was authored with next to the installed one.
print("This notebook was created using version 1.41.0 of the Azure ML SDK")
print(f"You are currently using version {azureml.core.VERSION} of the Azure ML SDK")

This notebook was created using version 1.41.0 of the Azure ML SDK
You are currently using version 1.40.0 of the Azure ML SDK


In [2]:
import os
import json
import logging
import pandas as pd

from azureml.core.run import Run
from azureml.core.model import Model
from azureml.widgets import RunDetails
from azureml.core.dataset import Dataset
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment
from azureml.core.webservice import Webservice
from helper import run_inference, get_result_df
from sklearn.datasets import fetch_20newsgroups
from azureml.core.compute import AmlCompute, ComputeTarget 
from azureml.core.compute_target import ComputeTargetException

In [3]:
# # check to see if ACI is already registered
# (myenv) $ az provider show -n Microsoft.ContainerInstance -o table

# azureuser@lyasolis1:~/cloudfiles/code/Users/lyasolis/starter_file$ az provider show -n Microsoft.ContainerInstance -o table
# Namespace                    RegistrationPolicy    RegistrationState
# ---------------------------  --------------------  -------------------
# Microsoft.ContainerInstance  RegistrationRequired  Registered



In [4]:
# Workspace coordinates: each value is read from an environment variable and
# falls back to the literal default when that variable is unset.
# NOTE(review): the default subscription ID is hardcoded in the notebook — consider
# requiring SUBSCRIPTION_ID from the environment before sharing this file.
subscription_id = os.getenv("SUBSCRIPTION_ID", default="fbe09221-d2fa-4355-8174-808a6c0b6925")
resource_group = os.getenv("RESOURCE_GROUP", default="udacity-capstone")
workspace_name = os.getenv("WORKSPACE_NAME", default="udacity-capstone-ws")
# NOTE(review): workspace_region is not referenced by any cell visible here — confirm it is needed.
workspace_region = os.getenv("WORKSPACE_REGION", default="northeurope")


In [5]:
# Connect to the workspace identified by the settings defined in the previous cell.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)
# Write the workspace details to a configuration file so they can be reused.
ws.write_config()


In [6]:
# Name under which the AutoML runs are grouped in the workspace.
experiment_name = "automl-classification-text-dnn"
experiment = Experiment(ws, experiment_name)

# Collect the workspace / experiment context in one place, in display order.
output = {
    "Subscription ID": ws.subscription_id,
    "Workspace Name": ws.name,
    "Resource Group": ws.resource_group,
    "Location": ws.location,
    "Experiment Name": experiment.name,
    "SDK Version": azureml.core.VERSION,
}

# Do not truncate long cell values (e.g. the subscription ID) when rendering.
pd.set_option("display.max_colwidth", None)
outputDf = pd.DataFrame(data=output, index=[""])
# Transpose so each setting is rendered as its own row.
outputDf.T


Unnamed: 0,Unnamed: 1
Subscription ID,fbe09221-d2fa-4355-8174-808a6c0b6925
Workspace Name,udacity-capstone-ws
Resource Group,udacity-capstone
Location,northeurope
Experiment Name,automl-classification-text-dnn
SDK Version,1.40.0


In [7]:
# Create (or reuse) the compute cluster that the AutoML run is submitted to.
# num_nodes caps the cluster size here and is reused later as the AutoML concurrency limit.
num_nodes = 1

# Choose a name for your cluster.
amlcompute_cluster_name = "dnntext-cluster"

# Reuse the cluster if one with this name already exists in the workspace;
# ComputeTargetException is handled as "not found" and a new cluster is provisioned.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print("Found existing cluster, use it.")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_NC6",  # GPU size; a CPU size such as "STANDARD_D2_V2" is enough for BiLSTM
        # To use BERT (this is recommended for best performance), select a GPU such as "STANDARD_NC6"
        # or similar GPU option available in your workspace
        idle_seconds_before_scaledown=60,
        max_nodes=num_nodes,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

# Runs for both the reused and the newly created cluster; blocks until the
# cluster reports completion and prints its status.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

### Overview
For this notebook we will use 20 Newsgroups data from scikit-learn. We filter the data to contain four classes and take a sample as training data. Please note that for accuracy improvement, more data is needed. For this notebook we provide a small-data example since it's a student project, and computational resources are expensive.


In [8]:
data_dir = "text-dnn-data"  # Local directory to store data
blobstore_datadir = data_dir  # Blob store directory to store data in
target_column_name = "y"
feature_column_name = "X"


def get_20newsgroups_data():
    """Load the four-category 20 Newsgroups subset as cleaned pandas DataFrames.

    Both the "train" and "test" subsets are fetched from scikit-learn with
    headers, footers and quotes removed, shuffled with a fixed seed, wrapped
    in a two-column DataFrame (text under ``feature_column_name``, label under
    ``target_column_name``) and passed through ``remove_blanks_20news``.

    Returns:
        tuple: ``(data_train, data_test)`` DataFrames.
    """
    selected_categories = [
        "rec.sport.baseball",
        "rec.sport.hockey",
        "comp.graphics",
        "sci.space",
    ]
    stripped_parts = ("headers", "footers", "quotes")

    def _load_split(subset_name):
        # Fetch one subset, wrap it in a DataFrame and drop blank documents.
        bunch = fetch_20newsgroups(
            subset=subset_name,
            categories=selected_categories,
            shuffle=True,
            random_state=42,
            remove=stripped_parts,
        )
        frame = pd.DataFrame(
            {feature_column_name: bunch.data, target_column_name: bunch.target}
        )
        return remove_blanks_20news(frame, feature_column_name, target_column_name)

    return _load_split("train"), _load_split("test")


def remove_blanks_20news(data, feature_column_name, target_column_name):
    """Normalise the text column and drop rows whose text is blank.

    Newlines in the text column are replaced by spaces and surrounding
    whitespace is stripped; rows whose text is then the empty string are
    removed. The input frame is left untouched: the cleaning is done on a
    copy, so the caller's data is not mutated and the result is an
    independent DataFrame rather than a view of the input.

    Args:
        data: DataFrame holding the text column.
        feature_column_name: Name of the text column to clean.
        target_column_name: Name of the label column. Not used by the
            cleaning itself; kept so existing callers keep working.

    Returns:
        A new DataFrame with the cleaned text column and only the non-blank
        rows. The original index labels of the kept rows are preserved.
    """
    cleaned_text = (
        data[feature_column_name]
        .replace(r"\n", " ", regex=True)
        .str.strip()
    )

    # Work on a copy so the caller's frame keeps its raw text.
    cleaned = data.copy()
    cleaned[feature_column_name] = cleaned_text

    # .copy() detaches the filtered result, avoiding SettingWithCopyWarning downstream.
    return cleaned[cleaned_text != ""].copy()

In [9]:
# Fetch the filtered, cleaned train/test frames.
data_train, data_test = get_20newsgroups_data()

# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(data_dir, exist_ok=True)

# Local CSV paths, built with os.path.join so the separator matches the local OS.
train_data_fname = os.path.join(data_dir, "train_data.csv")
test_data_fname = os.path.join(data_dir, "test_data.csv")

data_train.to_csv(train_data_fname, index=False)
data_test.to_csv(test_data_fname, index=False)

# Upload the local data directory to the workspace's default datastore.
# NOTE(review): the SDK reports Datastore.upload as deprecated (see this cell's
# output); consider migrating to Dataset.File.upload_directory.
datastore = ws.get_default_datastore()
datastore.upload(src_dir=data_dir, target_path=blobstore_datadir, overwrite=True)

# Datastore-relative paths are "/"-separated, so this one is concatenated
# rather than built with os.path.join.
train_dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, blobstore_datadir + "/train_data.csv")]
)

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 2 files
Uploading text-dnn-data/test_data.csv
Uploaded text-dnn-data/test_data.csv, 1 files out of an estimated total of 2
Uploading text-dnn-data/train_data.csv
Uploaded text-dnn-data/train_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


In [10]:
# Tabular dataset over the uploaded training CSV in the default datastore.
# NOTE(review): this repeats the train_dataset definition at the end of the
# previous cell; one of the two can probably be dropped.
train_dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, blobstore_datadir + "/train_data.csv")]
)

## AutoML Configuration

Parameters generated for AutoML were as follows:  

  * Primary metric "accuracy" - ratio of predictions that exactly match the true class labels.
  * Early Stopping enabled - this parameter is used to stop the experiment early if the score isn't improving.
  * Featurization "auto" - this is to automatically check data and flag any issues (such as class imbalance, missing values etc).
  * Validation 0.3 - 30% of data was held out for validation.
  * Iteration timeout was set to 10 min for each iteration.
  * Experiment timeout was set to 60 min, which means that the experiment terminates after 60 min and the best model is selected from the models generated within this time frame. This covers all iterations within the experiment.
  * Blocked_models parameter to exclude some models that can take a longer time to train on some text datasets. If we were to remove models from the blocked_models list, experiment_timeout_hours parameter value would need to be used and increased in order to allow sufficient time to improve the results.

In [11]:
# Tunable AutoML settings, forwarded to AutoMLConfig as keyword arguments.
automl_settings = dict(
    iteration_timeout_minutes=10,           # per-iteration time limit
    primary_metric="accuracy",
    max_concurrent_iterations=num_nodes,    # one concurrent iteration per cluster node
    max_cores_per_iteration=-1,
    enable_dnn=True,
    featurization="auto",
    enable_early_stopping=True,
    validation_size=0.3,                    # 30% of the data is held out for validation
    verbosity=logging.INFO,
    enable_voting_ensemble=False,
    enable_stack_ensemble=False,
)

# Classification experiment on the training dataset, limited to 60 minutes overall,
# with the two blocked model families excluded.
automl_config = AutoMLConfig(
    task="classification",
    training_data=train_dataset,
    label_column_name=target_column_name,
    compute_target=compute_target,
    experiment_timeout_minutes=60,
    blocked_models=["LightGBM", "XGBoostClassifier"],
    debug_log="automl_errors.log",
    **automl_settings,
)

In [12]:
# Submit the AutoML configuration as a run of the experiment.
# With show_output=True the run's status messages are printed in the cell output.
automl_run = experiment.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on dnntext-cluster with default configuration
Running on remote compute: dnntext-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
automl-classification-text-dnn,AutoML_b6f10e92-3c93-46bf-be25-0acf0a98a7a5,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: TextDNNTraining. Training a deep learning text model, this may take a while.
Current status: TextDNNTrainingCompleted. Completed training a deep learning text model.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

********************************************************************************************

TYPE:         Missing feature values i

## Run Details


Use the `RunDetails` widget to show the different experiments.

In [14]:
# Render the RunDetails widget for the submitted AutoML run.
RunDetails(automl_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model

Get the best model from the automl experiments and display all the properties of the model.


In [15]:
# Retrieve the best run of the AutoML experiment together with its fitted model,
# then print the run's summary.
best_run, fitted_model = automl_run.get_output()
print(best_run)

Run(Experiment: automl-classification-text-dnn,
Id: AutoML_b6f10e92-3c93-46bf-be25-0acf0a98a7a5_10,
Type: None,
Status: Completed)


In [16]:
# Print the fitted model to show its pipeline steps and their parameters.
print(fitted_model)

Pipeline(steps=[('datatransformer',
                 DataTransformer(enable_feature_sweeping=True, working_dir='/mnt/batch/tasks/shared/LS_root/mounts/clusters/lyasolis1/code/Users/lyasolis/starter_file')),
                ('StandardScalerWrapper',
                 StandardScalerWrapper(copy=True, with_mean=False, with_std=True)),
                ('LogisticRegression',
                 LogisticRegression(C=24.420530945486497,
                                    multi_class='multinomial', n_jobs=-1))])


In [17]:
# Fetch the best child run of the AutoML parent run.
best_run = automl_run.get_best_child()

# Download its featurization summary JSON into the working directory.
summary_path = "featurization_summary.json"
best_run.download_file("outputs/featurization_summary.json", summary_path)

# Parse the JSON and load the records into a DataFrame.
with open(summary_path, "r") as summary_file:
    records = json.load(summary_file)
featurization_summary = pd.DataFrame.from_records(records)

# Show the transformations applied to each raw feature.
featurization_summary["Transformations"].tolist()


[['StringCast-CharGramTfIdf',
  'StringCast-WordGramTfIdf',
  'StringCast-StringConcatTransformer-PretrainedTextDNNTransformer']]

In [18]:
# Identify the best run: build the per-algorithm summary table and take the
# run id from its first row (nothing is saved in this cell).
# NOTE(review): presumably get_result_df returns rows sorted best-first — verify
# against helper.get_result_df.
summary_df = get_result_df(automl_run)
best_dnn_run_id = summary_df["run_id"].iloc[0]
best_dnn_run = Run(experiment, best_dnn_run_id)

In [19]:
model_dir = "Model"  # Local folder where the model will be stored temporarily

# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(model_dir, exist_ok=True)

# Download the selected run's serialized model into the local folder.
best_dnn_run.download_file("outputs/model.pkl", model_dir + "/model.pkl")


In [20]:
# Display the per-algorithm summary (run id, primary metric, score).
summary_df

Unnamed: 0_level_0,run_id,primary_metric,Score
run_algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,AutoML_b6f10e92-3c93-46bf-be25-0acf0a98a7a5_10,accuracy,0.91
LinearSVM,AutoML_b6f10e92-3c93-46bf-be25-0acf0a98a7a5_29,accuracy,0.9
SGD,AutoML_b6f10e92-3c93-46bf-be25-0acf0a98a7a5_20,accuracy,0.9
RandomForest,AutoML_b6f10e92-3c93-46bf-be25-0acf0a98a7a5_19,accuracy,0.89
ExtremeRandomTrees,AutoML_b6f10e92-3c93-46bf-be25-0acf0a98a7a5_16,accuracy,0.89


In [21]:
# Display the id of the run whose model was downloaded.
best_dnn_run_id

'AutoML_b6f10e92-3c93-46bf-be25-0acf0a98a7a5_10'

## Model Deployment

Deploy the model. 
Register the model, create an inference config and deploy the model as a web service.

In [21]:
# Register the downloaded model file in the workspace under a fixed name.
model_name = "textDNN-20News"
model = Model.register(
    workspace=ws,
    model_name=model_name,
    model_path=model_dir + "/model.pkl",
    tags=None,
)

Registering model textDNN-20News


In [22]:
# Name of the deployed web service to attach to.
name = "text-classification-model"

# Load the existing web service with that name from the workspace.
# NOTE(review): no cell visible in this notebook creates the deployment
# (inference config + deploy); this assumes the service was deployed elsewhere — confirm.
service = Webservice(name=name, workspace=ws)

In [23]:
# Enable Application Insights on the loaded web service.
service.update(enable_app_insights=True)


In [24]:
# Fetch the web service logs and print them one line at a time.
logs = service.get_logs()
print(*logs.split("\n"), sep="\n")

2022-05-16T07:53:22,174378700+00:00 - rsyslog/run 
2022-05-16T07:53:22,199131600+00:00 - gunicorn/run 
2022-05-16T07:53:22,203196600+00:00 - iot-server/run 
2022-05-16T07:53:22,399399500+00:00 - nginx/run 
EdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...
2022-05-16T07:53:22,874160400+00:00 - iot-server/finish 1 0
2022-05-16T07:53:22,876841000+00:00 - Exit code 1 is normal. Not restarting iot-server.
Starting gunicorn 20.1.0
Listening at: http://127.0.0.1:31311 (74)
Using worker: sync
worker timeout is set to 300
Booting worker with pid: 102
SPARK_HOME not set. Skipping PySpark Initialization.
Initializing logger
2022-05-16 07:53:24,648 | root | INFO | Starting up app insights client
2022-05-16 07:53:24,649 | root | INFO | Starting up request id generator
2022-05-16 07:53:24,650 | root | INFO | Starting up app insight hooks
2022-05-16 07:53:24,650 | root | INFO | Invoking user's init function
Better speed can be achieved with apex installed from https://www.git

In [34]:
# Load the uploaded test CSV from the datastore as a tabular dataset; its rows
# serve as example inputs for testing the endpoint.
test_dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, blobstore_datadir + "/test_data.csv")]
)

# preview the first 5 rows of the dataset
test_dataset.take(5).to_pandas_dataframe()

Unnamed: 0,X,y
0,"I commend everybody to look at the FTP site 'ftp.cicb.fr' -> Ethernet address 129.20.128.2 <- in the directory /pub/Images/ASTRO: there are lots of images (all of kinds in astronomy subject) especially in GIF format and a NEW ! directory of some JPL animations For your comfort, README files in all subdirectories give size and description of each image, and a 7 days' newer images' list is in READMENEW Note: you can connect it as 'anonymous' or 'ftp' user, then the quota for each is 8 users connected in the same time. So, if the server responds you ""connection refused"", be patient ! 2nd note: this site is reachable by Gopher at 'roland.cicb.fr' -> Ethernet address 129.20.128.27 <- in 'Divers serveurs Ftp/Le serveur ftp du CRI-CICB/Images/ASTRO' If you have any comments, suggestions, problems, then you can contact me at E-mail 'rousself@univ-rennes1.fr'",3
1,why?,1
2,"Wood played most of his junior career in Seattle. He was one of the leading scorers on a mediocre team when he was traded away in 1992. He rarely lost a fight and was one of the toughest players in the WHL. However, I was extremely surprised when he was drafted, especially in the third round. He certainly didn't look like NHL material...",2
3,"There would be some point to doing long-term monitoring of things like particles and fields, not to mention atmospheric phenomena. However, there is no particular plan to establish any sort of monitoring network. To be precise, there is no particular plan, period. This is a large part of the problem. In this context, it's not surprising that unexciting but useful missions like this get short shrift at budget time. The closest approach to any sort of long-term planetary monitoring mission is the occasional chance to piggyback something like this on top of a flashier mission like Galileo or Cassini. It is most unlikely that there is much happening on Pluto that would be worth monitoring, and it is a prohibitively difficult mission to fly without new propulsion technology (something the planetary community has firmly resisted being the guinea pigs for). The combined need to arrive at Pluto within a reasonable amount of time, and then kill nearly all of the cruise velocity to settle into an orbit, is beyond what can reasonably be done with current (that is, 1950s-vintage) propulsion. Most of this can be done just about as well from Earth. The few things that can't be, can be done better from a Voyager-like spacecraft that is *not* constrained by the need to enter orbit around a planet.",3
4,"I'm sure all of you have heard of the extraordiary start by rookie J.T. Snow of the California Angeles. Other than the fact that his father was a star receiver with the L.A. Rams and is now a radio personality in Los Angeles and J.T. came from the Yankees organization I don't know much about J.T. If anyone has info and background on the young fenom....please post. By the way, for those of you not following his exploits he has hit four home runs in three days. Two last night. He has also delivered the winning hit a couple of times for the Angeles in this young season. Thanks...",1


Send a request to the web service to test it.

In [29]:
# Run the standalone endpoint.py script in a shell; its printed result appears as the cell output.
!python3 endpoint.py #Will run this same code in cells below to demonstrate steps taken

{'Results': [3]}


### Running endpoint.py in cell:

In [27]:
import requests
import json
from azureml.core import Webservice

# Endpoint URL of the deployed web service.
scoring_uri = service.scoring_uri

# get_keys() yields two keys; only the first is used for authentication.
# The second is bound to a named throwaway rather than `_`, because assigning
# to `_` in IPython shadows the built-in "last output" variable for the session.
key, _secondary_key = service.get_keys()

In [28]:
# Request headers: JSON body plus bearer-token authorization with the service key.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {key}",
}

In [35]:
# Build the request payload: a single record whose "X" field holds the text to
# classify, plus the scoring method to invoke. The request itself is sent in a later cell.
data = {
  "Inputs": {
    "data": [
      {
        "X": "I'm sure all of you have heard of the extraordiary start by rookie J.T. Snow of the California Angeles. Other than the fact that his father was a star receiver with the L.A. Rams and is now a radio personality in Los Angeles and J.T. came from the Yankees organization I don't know much about J.T. If anyone has info and background on the young fenom....please post. By the way, for those of you not following his exploits he has hit four home runs in three days. Two last night. He has also delivered the winning hit a couple of times for the Angeles in this young season. Thanks..."
      }
    ]
  },
  "GlobalParameters": {
    "method": "predict"
  }
}


In [36]:
# Serialise the payload to a JSON string and keep a copy on disk as data.json.
input_data = json.dumps(data)
with open("data.json", "w") as payload_file:
    payload_file.write(input_data)


In [37]:
# Send the JSON payload to the scoring endpoint.
# The timeout keeps the cell from hanging indefinitely if the service does not answer.
resp = requests.post(scoring_uri, data=input_data, headers=headers, timeout=60)

# Fail loudly on an HTTP error status instead of trying to parse an error body as a result.
resp.raise_for_status()

# Display the decoded JSON response.
print(resp.json())


{'Results': [1]}


Print the logs of the web service and delete the service

In [38]:
# Print the web service logs after the test request.
print(service.get_logs())

2022-05-16T08:03:29,641607800+00:00 - gunicorn/run 
2022-05-16T08:03:29,646538900+00:00 - iot-server/run 
2022-05-16T08:03:29,650045700+00:00 - rsyslog/run 
2022-05-16T08:03:29,764471900+00:00 - nginx/run 
EdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...
2022-05-16T08:03:30,189871600+00:00 - iot-server/finish 1 0
2022-05-16T08:03:30,192467900+00:00 - Exit code 1 is normal. Not restarting iot-server.
Starting gunicorn 20.1.0
Listening at: http://127.0.0.1:31311 (76)
Using worker: sync
worker timeout is set to 300
Booting worker with pid: 104
SPARK_HOME not set. Skipping PySpark Initialization.
Initializing logger
2022-05-16 08:03:31,870 | root | INFO | Starting up app insights client
2022-05-16 08:03:31,871 | root | INFO | Starting up request id generator
2022-05-16 08:03:31,871 | root | INFO | Starting up app insight hooks
2022-05-16 08:03:31,872 | root | INFO | Invoking user's init function
Better speed can be achieved with apex installed from https://www.git

## Delete resources

In [40]:
# Clean up: delete the web service, then the registered model.
# NOTE(review): the compute cluster is not deleted here, although the submission
# checklist mentions shutting down all computes — confirm it is handled elsewhere.
service.delete()
model.delete()

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
