# Automated ML

Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [6]:
import json
import logging
import os
from pprint import pprint

import requests

from azureml.automl.core.forecasting_parameters import ForecastingParameters
from azureml.core import Environment
from azureml.core import Experiment, Webservice, Model
from azureml.core import Run, Dataset
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.model import InferenceConfig
from azureml.core.webservice import LocalWebservice, AciWebservice
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails

In [7]:
# Connect to the Azure ML workspace; `ws` is the handle the later cells pass to the SDK
# (dataset lookup, experiment, compute target, deployments).
ws = Workspace.from_config()

## Dataset

### Overview


I use daily historical price data for the ETHBTC (Ethereum/Bitcoin) crypto pair, covering more than 3 years. The data was downloaded from Binance.us and uploaded to my Azure ML workspace, where it is registered as a Dataset.

It is a time series dataset that includes a time column, OHLC (open/high/low/close) columns, and a volume column.

* time: the daily time period
* open: the opening price of a period
* high: the highest price in a period
* low: the lowest price in a period
* close: the closing price of a period 
* volume: the buy/sell amounts of the market in a period

My goal is to forecast the future price of ETHBTC. Since this is time series data, the `time` column is a required feature. For the purpose of this project, I only forecast the `close` price. In other words, this is a univariate analysis: the other value columns (`open`, `high`, `low`, `volume`) are not used by the model.

The data is uploaded as a Dataset in Azure ML Studio, and it can be accessed by name using Python SDK. A copy of the time series data is stored as a CSV file in this project repository.

In [None]:
# Name under which the ETHBTC price data is registered in the workspace
dataset_name = 'ETHBTC'

# Look up the registered dataset by name in workspace `ws`
dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)

# Materialise the dataset as a pandas DataFrame; later cells split `df` into train/test sets
df = dataset.to_pandas_dataframe()

# Bare expression: shows the frame as the cell output
df

In [9]:
# List every column's dtype to confirm that `time` was loaded as a datetime type
# (it is used as the forecasting time column further down).
df.dtypes

time      datetime64[ns]
open             float64
high             float64
low              float64
close            float64
volume           float64
dtype: object

In [46]:
# Column that the AutoML job forecasts (also passed as `label_column_name` below)
target_column = 'close'


def get_data(df, test_size=50, time_column='time', label_column=None):
    """Split a price frame into univariate train/test sets for forecasting.

    Only the time column and the label column are kept, so the resulting
    frames describe a single series. The split is positional: the last
    `test_size` rows form the test set and everything before them forms the
    training set, so `df` is expected to already be in chronological order.

    Args:
        df: pandas DataFrame containing at least `time_column` and the label column.
        test_size: number of trailing rows held out for testing (default 50).
        time_column: name of the timestamp column (default 'time').
        label_column: name of the column to forecast; defaults to the
            module-level `target_column` when omitted.

    Returns:
        A `(train_data, test_data)` tuple of independent DataFrames that keep
        the original row index.

    Raises:
        ValueError: if `test_size` is not strictly between 0 and `len(df)`.
            (`iloc[:-0]` would silently produce an empty training set, and an
            oversized hold-out would do the same.)
    """
    if label_column is None:
        label_column = target_column

    if not 0 < test_size < len(df):
        raise ValueError(
            'test_size must be between 1 and {} (got {})'.format(len(df) - 1, test_size))

    # only do univariate forecasting on the label column
    series_df = df[[time_column, label_column]]

    # .copy() so callers can modify the splits without touching `df`
    # or triggering SettingWithCopyWarning
    train_data = series_df.iloc[:-test_size].copy()
    test_data = series_df.iloc[-test_size:].copy()

    return (train_data, test_data)

# Hold out the most recent rows as the test set; everything earlier is used for training
train_data, test_data = get_data(df)

# Sanity-check the split before it is handed to AutoML:
# no rows lost or duplicated, and the test window starts strictly after training ends.
assert len(train_data) + len(test_data) == len(df), 'split dropped or duplicated rows'
assert train_data['time'].max() < test_data['time'].min(), \
    'test window must start after the training window ends'

print(train_data)
print(test_data)

           time  close
0    2017-07-14   0.09
1    2017-07-15   0.09
2    2017-07-16   0.08
3    2017-07-17   0.09
4    2017-07-18   0.11
...         ...    ...
1463 2021-07-16   0.06
1464 2021-07-17   0.06
1465 2021-07-18   0.06
1466 2021-07-19   0.06
1467 2021-07-20   0.06

[1468 rows x 2 columns]
           time  close
1468 2021-07-21   0.06
1469 2021-07-22   0.06
1470 2021-07-23   0.06
1471 2021-07-24   0.06
1472 2021-07-25   0.06
1473 2021-07-26   0.06
1474 2021-07-27   0.06
1475 2021-07-28   0.06
1476 2021-07-29   0.06
1477 2021-07-30   0.06
1478 2021-07-31   0.06
1479 2021-08-01   0.06
1480 2021-08-02   0.07
1481 2021-08-03   0.07
1482 2021-08-04   0.07
1483 2021-08-05   0.07
1484 2021-08-06   0.07
1485 2021-08-07   0.07
1486 2021-08-08   0.07
1487 2021-08-09   0.07
1488 2021-08-10   0.07
1489 2021-08-11   0.07
1490 2021-08-12   0.07
1491 2021-08-13   0.07
1492 2021-08-14   0.07
1493 2021-08-15   0.07
1494 2021-08-16   0.07
1495 2021-08-17   0.07
1496 2021-08-18   0.07
1497 2021

In [6]:
# Name of the experiment under which the AutoML run in this notebook is submitted
experiment_name = 'forecast-ethbtc-automl'


# Bind the experiment name to workspace `ws`; the bare name below shows the experiment summary
experiment=Experiment(ws, experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
forecast-ethbtc-automl,capstone,Link to Azure Machine Learning studio,Link to Documentation


In [11]:
# Reuse the CPU cluster if the workspace already has one with this name, otherwise provision it

# Name of the CPU cluster
cpu_cluster_name = "capstone-cluster"

try:
    # A ComputeTargetException from this lookup is treated as "cluster does not exist yet"
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Not found: provision a new cluster of up to 4 STANDARD_D2s_V3 nodes
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2s_V3',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# In both cases, block until the cluster reports a final provisioning state
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration

Here we set up a forecasting task for AutoML on daily (`D`) data with a forecast horizon of 20 steps (days) into the future.
We supply training data that contains only the `time` column and the target column (`close` price), so the forecast is univariate.
We use the default primary metric, `normalized_root_mean_squared_error`; afterward we extract `root_mean_squared_error` to compare with the HyperDrive model.



In [13]:
# Forecasting-specific settings handed to AutoMLConfig below:
#   time_column_name  - `time` is the time axis of the series
#   forecast_horizon  - forecast 20 periods ahead
#   freq              - the series is daily ('D')
# (ForecastingParameters is imported in the import cell at the top of the notebook.)
forecasting_parameters = ForecastingParameters(
    time_column_name='time',
    forecast_horizon=20,
    freq='D',
)

In [None]:
# AutoML job definition for the univariate `close` price forecast.
# All names used here (AutoMLConfig, logging) come from the import cell at the top of the notebook.
#
# No `compute_target` is passed, so the experiment runs in the local environment
# (the submit output reports "running on local with default configuration").
# NOTE(review): the `cpu_cluster` provisioned earlier is therefore not used by this run;
# running remotely would need a compute target and a dataset-type `training_data` — confirm intent.
automl_config = AutoMLConfig(task='forecasting',
                             primary_metric='normalized_root_mean_squared_error',
                             # stop the whole model search after 15 minutes
                             experiment_timeout_minutes=15,
                             enable_early_stopping=True,
                             # pandas frame holding only `time` and the target column
                             training_data=train_data,
                             label_column_name=target_column,
                             n_cross_validations=3,
                             # NOTE(review): the documented ensemble switches are
                             # `enable_voting_ensemble` / `enable_stack_ensemble`;
                             # confirm that `enable_ensembling` is honoured by this SDK version.
                             enable_ensembling=False,
                             verbosity=logging.INFO,
                             forecasting_parameters=forecasting_parameters)

In [52]:
# Submit the AutoML configuration to the experiment and stream its progress into the cell output
automl_run = experiment.submit(automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


Experiment,Id,Type,Status,Details Page,Docs Page
forecast-ethbtc-automl,AutoML_4d4f91b7-5150-43ff-a8b3-d6b22fe69c76,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
             

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


In [54]:
# Block until the AutoML run has finished, without streaming logs;
# the returned run details are shown as the cell output.
automl_run.wait_for_completion(show_output=False)

{'runId': 'AutoML_4d4f91b7-5150-43ff-a8b3-d6b22fe69c76',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2021-09-24T03:26:42.696062Z',
 'endTimeUtc': '2021-09-24T03:41:51.421667Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'normalized_root_mean_squared_error',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'forecasting',
  'dependencies_versions': '{"azureml-widgets": "1.33.0", "azureml-train": "1.33.0", "azureml-train-restclients-hyperdrive": "1.33.0", "azureml-train-core": "1.33.0", "azureml-train-automl": "1.33.0", "azureml-train-automl-runtime": "1.33.0", "azureml-train-automl-client": "1.33.0", "azureml-tensorboard": "1.33.0", "azureml-telemetry": "1.33.0", "azureml-sdk": "1.33.0", "a

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

Use the `RunDetails` widget to show the different experiments.

In [8]:
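# Optional: uncomment to re-attach `automl_run` to an existing AutoML run by id
# instead of re-submitting the experiment.
# from azureml.train.automl.run import AutoMLRun
# automl_run = AutoMLRun(experiment=experiment, run_id='AutoML_4d4f91b7-5150-43ff-a8b3-d6b22fe69c76')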

In [9]:
# Render the run-details widget for the AutoML run
RunDetails(automl_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model

Get the best model from the automl experiments and display all the properties of the model.



In [None]:
# Retrieve the best child run and its fitted model from the AutoML run;
# the bare name below shows the best run as the cell output.
automl_best_run, automl_best_model = automl_run.get_output()
automl_best_run

In [57]:
# Identify the best child run
print('Best Run Id: ', automl_best_run.id)

# Fetch all metrics logged for the best run as a dict keyed by metric name
automl_best_run_metrics = automl_best_run.get_metrics()
print('RMSE:', automl_best_run_metrics['root_mean_squared_error']) # similar metric used in the hyperdrive approach

# Dump every metric, one "name = value" line each
print()
print('All available performance metrics:')
for metric, value in automl_best_run_metrics.items():    
    print(metric, '=', value)

Best Run Id:  AutoML_4d4f91b7-5150-43ff-a8b3-d6b22fe69c76_28
RMSE: 0.003142505272188411

All available performance metrics:
explained_variance = -0.010044641716390338
normalized_root_mean_squared_log_error = 0.03248901698830249
mean_absolute_percentage_error = 3.926263175676116
normalized_mean_absolute_error = 0.026115871066831203
spearman_correlation = -0.415908400885382
r2_score = -0.3526314115439102
normalized_median_absolute_error = 0.019975198925286795
median_absolute_error = 0.001933000000000003
root_mean_squared_log_error = 0.0029546663975496463
mean_absolute_error = 0.0025272328431372552
normalized_root_mean_squared_error = 0.03247396168428657
root_mean_squared_error = 0.003142505272188411
predicted_true = aml://artifactId/ExperimentRun/dcid.AutoML_4d4f91b7-5150-43ff-a8b3-d6b22fe69c76_28/predicted_true
residuals = aml://artifactId/ExperimentRun/dcid.AutoML_4d4f91b7-5150-43ff-a8b3-d6b22fe69c76_28/residuals


In [59]:
print('Model parameters:')


def print_model(model, prefix=""):
    """Print each step of a fitted pipeline together with its parameters.

    For every entry of `model.steps` the step name is printed (preceded by
    `prefix`). A step whose estimator exposes both `estimators` and `weights`
    is treated as an ensemble: its member names and weights are printed, then
    each member pipeline is printed recursively with "<member name> - " as
    the prefix. Any other step has its `get_params()` pretty-printed.

    Args:
        model: object with a `steps` sequence of (name, estimator) entries.
        prefix: text prepended to every step name printed at this level.
    """
    for step in model.steps:
        step_name, component = step[0], step[1]
        print(prefix + step_name)

        is_ensemble = hasattr(component, 'estimators') and hasattr(component, 'weights')
        if not is_ensemble:
            # Plain step: show its parameters and move on
            pprint(component.get_params())
            print()
            continue

        # Ensemble step: summarise the members, then descend into each member pipeline
        pprint({
            'estimators': [member[0] for member in component.estimators],
            'weights': component.weights,
        })
        print()
        for member in component.estimators:
            print_model(member[1], member[0] + ' - ')

# Print every pipeline step of the best AutoML model together with its parameters
print_model(automl_best_model)

Model parameters:
timeseriestransformer
{'country_or_region': None,
 'drop_column_names': [],
 'featurization_config': FeaturizationConfig(
    blocked_transformers=None,
    column_purposes=None,
    transformer_params=None,
    dataset_language=None,
    drop_columns=None,
    prediction_transform_type=None
),
 'force_time_index_features': None,
 'freq': 'D',
 'grain_column_names': ['_automl_dummy_grain_col'],
 'group': None,
 'lookback_features_removed': False,
 'max_horizon': 20,
 'origin_time_colname': 'origin',
 'pipeline': Pipeline(memory=None,
         steps=[('make_numeric_na_dummies',
                 MissingDummiesTransformer(numerical_columns=[])),
                ('impute_na_numeric_datetime',
                 TimeSeriesImputer(end=None, freq='D', impute_by_horizon=False, input_column=[], limit=None, limit_direction='forward', method=OrderedDict([('ffill', [])]), option='fillna', order=None, origin=None, value={})),
                ('grain_dropper', S...
                ('

### Save the best model

In [None]:
# Create a folder for model outputs in the current directory
# os.makedirs('./automl-outputs', exist_ok=True)

# Download the best run's files stored under the `outputs/` prefix into ./automl-outputs/.
# The inference config further down uses ./automl-outputs as its source directory and
# expects the scoring script directly inside it.
automl_best_run.download_files(prefix='outputs/', append_prefix=False,
                               output_directory='./automl-outputs/')

## Model Deployment

Register the model, create an inference config and deploy the model as a web service.

In [64]:
# Read the model name recorded in the best run's properties; it is reused as the registered model name
model_name = automl_best_run.properties['model_name']

print('best model name:', model_name)

best model name: AutoML4d4f91b7528


In [65]:
# Register the model from the AutoML run in the workspace under `model_name`
model = automl_run.register_model(model_name = model_name, 
                                  description = 'AutoML forecast ETHBTC close price', 
                                  tags = None)

# Model to be deployed (bare name shows the registered model as the cell output)
model

Model(workspace=Workspace.create(name='capstone', subscription_id='d876eeb1-1ac0-424f-8c00-6284386d5106', resource_group='nanodegree'), name=AutoML4d4f91b7528, id=AutoML4d4f91b7528:1, version=1, tags={}, properties={})

### Prepare inference config

In [None]:
# Retrieve the curated 'AzureML-AutoML' environment from the workspace explicitly.
# Constructing `Environment(name=...)` would only create a blank local definition that
# happens to share the curated environment's name.
automlenv = Environment.get(workspace=ws, name='AzureML-AutoML')

# Scoring setup: the entry script is the scoring file that was downloaded
# from the best run into ./automl-outputs.
inference_config = InferenceConfig(
    environment=automlenv,
    source_directory="./automl-outputs",
    entry_script="./scoring_file_v_1_0_0.py")

### Deploy to local to preview

In [109]:
# Deploy to a local web service first, exposed on port 6789, to preview the endpoint
local_deployment_config = LocalWebservice.deploy_configuration(port=6789)

# overwrite=True replaces an existing service with the same name
local_service = Model.deploy(
    workspace=ws,
    name="forecast-service",
    models=[model],
    inference_config=inference_config,
    deployment_config=local_deployment_config,
    overwrite=True,
)

# Block until the deployment has finished, streaming the build log
local_service.wait_for_deployment(show_output=True)

Downloading model AutoML4d4f91b7528:1 to /tmp/azureml_0fb67urs/AutoML4d4f91b7528/1
Generating Docker build context.
Package creation Succeeded
Logging into Docker registry viennaglobal.azurecr.io
Logging into Docker registry viennaglobal.azurecr.io
Building Docker image from Dockerfile...
Step 1/5 : FROM viennaglobal.azurecr.io/azureml/azureml_21de62ceab4bd15bd023f919521b0887
 ---> 589e55ecf2b7
Step 2/5 : COPY azureml-app /var/azureml-app
 ---> 7504f775a44c
Step 3/5 : RUN mkdir -p '/var/azureml-app' && echo eyJhY2NvdW50Q29udGV4dCI6eyJzdWJzY3JpcHRpb25JZCI6ImQ4NzZlZWIxLTFhYzAtNDI0Zi04YzAwLTYyODQzODZkNTEwNiIsInJlc291cmNlR3JvdXBOYW1lIjoibmFub2RlZ3JlZSIsImFjY291bnROYW1lIjoiY2Fwc3RvbmUiLCJ3b3Jrc3BhY2VJZCI6Ijk3ODdlODFmLWE4MWQtNGE3My05NjY3LTQ0MGFjYWVjZjAxNyJ9LCJtb2RlbHMiOnt9LCJtb2RlbHNJbmZvIjp7fX0= | base64 --decode > /var/azureml-app/model_config_map.json
 ---> Running in 423e45b1338d
 ---> dae2435386ec
Step 4/5 : RUN mv '/var/azureml-app/tmpx7ad1yh1.py' /var/azureml-app/main.py
 ---> Running

In [117]:
# Show the local web service object as the cell output
local_service

LocalWebservice(workspace=Workspace.create(name='capstone', subscription_id='d876eeb1-1ac0-424f-8c00-6284386d5106', resource_group='nanodegree'), name=forecast-service, image_id=None, compute_type=None, state=Local, scoring_uri=running, tags=http://localhost:6789/score, properties=None, created_by=None)

### Test local service endpoint

In [116]:
# Send one sample request to the locally deployed service and print its forecast
local_uri = local_service.scoring_uri
print('local service endpoint:', local_uri)

print('test endpoint:')
headers = {"Content-Type": "application/json"}

# Request body: a list of records under "data"; only the `time` value is supplied
# because the model was trained on the `time` column alone.
payload = {
    "data": [
        {
            'time': "2021-09-10 00:00:00,000000",
        },
    ]
}
data = json.dumps(payload)

# Bound the wait so a hung container cannot block the notebook, and fail loudly
# on an HTTP error instead of trying to parse an error page as JSON.
response = requests.post(local_uri, data=data, headers=headers, timeout=60)
response.raise_for_status()
print(response.json())

local service endpoint: http://localhost:6789/score
test endpoint:
{"forecast": [0.0648978], "index": [{"time": 1631232000000, "_automl_dummy_grain_col": "_automl_dummy_grain_col"}]}


### Check local service logs

In [119]:
# Print the logs of the local service
print(local_service.get_logs())

2021-09-25T21:01:29,551730109+00:00 - iot-server/run 
2021-09-25T21:01:29,551353905+00:00 - gunicorn/run 
Dynamic Python package installation is disabled.
Starting HTTP server
2021-09-25T21:01:29,553250522+00:00 - rsyslog/run 
2021-09-25T21:01:29,555499742+00:00 - nginx/run 
rsyslogd: /azureml-envs/azureml_732632de80c129004c31329a710852e7/lib/libuuid.so.1: no version information available (required by rsyslogd)
EdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...
2021-09-25T21:01:29,910614373+00:00 - iot-server/finish 1 0
2021-09-25T21:01:29,915501816+00:00 - Exit code 1 is normal. Not restarting iot-server.
Starting gunicorn 20.1.0
Listening at: http://127.0.0.1:31311 (11)
Using worker: sync
worker timeout is set to 300
Booting worker with pid: 39
SPARK_HOME not set. Skipping PySpark Initialization.
Generating new fontManager, this may take some time...
Initializing logger
2021-09-25 21:01:32,033 | root | INFO | Starting up app insights client
logging socket was 

### Delete local service

In [None]:
# Remove the local service; the cloud deployment below reuses the same service name
local_service.delete()

### Re-deploy to cloud

In [100]:
# ACI deployment settings: half a CPU core, 1 GB of memory,
# authentication required, Application Insights enabled
cloud_deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=0.5,
    memory_gb=1,
    auth_enabled=True,
    enable_app_insights=True,
)

# Same model and inference config as the local preview; overwrite=True replaces
# an existing service with the same name
cloud_service = Model.deploy(
    workspace=ws,
    name="forecast-service",
    models=[model],
    inference_config=inference_config,
    deployment_config=cloud_deployment_config,
    overwrite=True,
)

# Block until the deployment has finished, streaming status updates
cloud_service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-09-25 08:27:35+00:00 Creating Container Registry if not exists.
2021-09-25 08:27:37+00:00 Use the existing image.
2021-09-25 08:27:37+00:00 Generating deployment configuration.
2021-09-25 08:27:38+00:00 Submitting deployment to compute.
2021-09-25 08:27:40+00:00 Checking the status of deployment forecast-service..
2021-09-25 08:32:33+00:00 Checking the status of inference endpoint forecast-service.
Succeeded
ACI service creation operation finished, operation "Succeeded"


### Test cloud endpoint

In [122]:
# Run the stand-alone client script test/endpoint.py inside this kernel; its printed output appears below.
# NOTE(review): the script is not part of this notebook — presumably it calls the cloud endpoint; confirm.
%run test/endpoint.py

{"forecast": [0.0648978], "index": [{"time": 1631232000000, "_automl_dummy_grain_col": "_automl_dummy_grain_col"}]}


### Check cloud service logs

In [24]:
# Print the logs of the cloud service
print(cloud_service.get_logs())

2021-09-25 19:06:34,833 | root | INFO | 200
127.0.0.1 - - [25/Sep/2021:19:06:34 +0000] "POST /score?verbose=true HTTP/1.0" 200 138 "-" "Go-http-client/1.1"
2021-09-25 19:06:39,648 | root | INFO | 200
127.0.0.1 - - [25/Sep/2021:19:06:39 +0000] "GET /swagger.json HTTP/1.0" 200 1993 "-" "Go-http-client/1.1"
2021-09-25 21:15:46,862 | root | INFO | Validation Request Content-Type
2021-09-25 21:15:46,866 | root | INFO | Scoring Timer is set to 60.0 seconds
2021-09-25 21:15:47,117 | root | INFO | 200
127.0.0.1 - - [25/Sep/2021:21:15:47 +0000] "POST /score HTTP/1.0" 200 127 "-" "Python-urllib/3.6"
2021-09-25 21:16:17,843 | root | INFO | Validation Request Content-Type
2021-09-25 21:16:17,844 | root | INFO | Scoring Timer is set to 60.0 seconds
2021-09-25 21:16:18,088 | root | INFO | 200
127.0.0.1 - - [25/Sep/2021:21:16:18 +0000] "POST /score HTTP/1.0" 200 127 "-" "Python-urllib/3.6"
2021-09-25 21:17:13,246 | root | INFO | Validation Request Content-Type
2021-09-25 21:17:13,247 | root | INFO | 

### Delete cloud service

In [8]:
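# Optional: uncomment to look up the deployed service by name when `cloud_service`
# is not defined in the current kernel.
# cloud_service = Webservice(ws, name="forecast-service")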

In [9]:
# Delete the cloud web service once testing is finished
cloud_service.delete()

# Delete Compute Resources

In [12]:
# Delete the compute cluster that was created or retrieved at the start of the notebook
cpu_cluster.delete()