<a href="https://colab.research.google.com/github/JayThibs/hyperdrive-vs-automl-plus-deployment/blob/main/automl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automated ML

Note: For data exploration, go to hyperparameter_tuning.ipynb

# Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.26.0


In [2]:
# %%writefile feature_preprocessing.py

import numpy as np
import pandas as pd

def bools(df):
    """
    public_meeting: we will fill the nulls as 'False'
    permit: we will fill the nulls as 'False
    """
    z = ['public_meeting', 'permit']
    for i in z:
        df[i].fillna(False, inplace = True)
        df[i] = df[i].apply(lambda x: float(x))
    return df

def locs(df, trans = ['longitude', 'latitude', 'gps_height', 'population']):
    """
    fill in the nulls for ['longitude', 'latitude', 'gps_height', 'population'] by using medians from 
    ['subvillage', 'district_code', 'basin'], and lastly the overall median
    """
    df.loc[df.longitude == 0, 'latitude'] = 0
    for z in trans:
        df[z].replace(0., np.NaN, inplace = True)
        df[z].replace(1., np.NaN, inplace = True)
        
        for j in ['district_code', 'basin']:
        
            df['median'] = df.groupby([j])[z].transform('median')
            df[z] = df[z].fillna(df['median'])
        
        df[z] = df[z].fillna(df[z].median())
        del df['median']
    return df

def construction(df):
    """
    A lot of null values for construction year. Of course, this is a missing value (a placeholder).
    For modeling purposes, this is actually fine, but we'll have trouble with visualizations if we
    compare the results for different years, so we'll set the value to something closer to
    the other values that aren't placeholders. Let's look at the unique years and set the null
    values to 50 years sooner.
    Let's set it to 1910 since the lowest "good" value is 1960.
    """
    df.loc[df['construction_year'] < 1950, 'construction_year'] = 1910
    return df

# Alright, now let's drop a few columns
# Needed to drop quite a few categorical columns so that the data would fit in memory in Azure
# Tested the model before and after (from 6388 columns to 278) in Colab and only had a ~0.03% reduction in performance

def removal(df):
  # id: we drop the id column because it is not a useful predictor.
  # amount_tsh: is mostly blank - delete
  # wpt_name: not useful, delete (too many values)
  # subvillage: too many values, delete
  # scheme_name: this is almost 50% nulls, so we will delete this column
  # num_private: we will delete this column because ~99% of the values are zeros.
  features_to_drop = ['id','amount_tsh',  'num_private', 
          'quantity', 'quality_group', 'source_type', 'payment', 
          'waterpoint_type_group', 'extraction_type_group', 'wpt_name', 
          'subvillage', 'scheme_name', 'funder', 'installer', 'recorded_by',
          'ward']
  df = df.drop(features_to_drop, axis=1)

  return df

def dummy(df):
    dummy_cols = ['basin', 'lga', 'public_meeting',
       'scheme_management', 'permit', 'extraction_type',
       'extraction_type_class', 'management', 'management_group',
       'payment_type', 'water_quality', 'quantity_group', 'source',
       'source_class', 'waterpoint_type', 'region']

    df = pd.get_dummies(df, columns=dummy_cols)

    return df

def dates(df):
    """
    date_recorded: this might be a useful variable for this analysis, although the year itself would be useless in a practical scenario moving into the future. We will convert this column into a datetime, and we will also create 'year_recorded' and 'month_recorded' columns just in case those levels prove to be useful. A visual inspection of both casts significant doubt on that possibility, but we'll proceed for now. We will delete date_recorded itself, since random forest cannot accept datetime
    """
    df['date_recorded'] = pd.to_datetime(df['date_recorded'])
    df['year_recorded'] = df['date_recorded'].apply(lambda x: x.year)
    df['month_recorded'] = df['date_recorded'].apply(lambda x: x.month)
    df['date_recorded'] = (pd.to_datetime(df['date_recorded'])).apply(lambda x: x.toordinal())
    return df

def dates2(df):
    """
    Turn year_recorded and month_recorded into dummy variables
    """
    for z in ['month_recorded', 'year_recorded']:
        df[z] = df[z].apply(lambda x: str(x))
        good_cols = [z+'_'+i for i in df[z].unique()]
        df = pd.concat((df, pd.get_dummies(df[z], prefix = z)[good_cols]), axis = 1)
        del df[z]
    return df

def small_n(df):
    "Collapsing small categorical value counts into 'other'"
    cols = [i for i in df.columns if type(df[i].iloc[0]) == str]
    df[cols] = df[cols].where(df[cols].apply(lambda x: x.map(x.value_counts())) > 100, "other")
    return df

## Dataset

### Overview

We'll be using the Pump it Up dataset from the DrivenData competition.

The description of the problem: 

> Using data from Taarifa and the Tanzanian Ministry of Water, can you predict which pumps are functional, which need some repairs, and which don't work at all? This is an intermediate-level practice competition. Predict one of these three classes based on a number of variables about what kind of pump is operating, when it was installed, and how it is managed. A smart understanding of which waterpoints will fail can improve maintenance operations and ensure that clean, potable water is available to communities across Tanzania.

In other words, our goal is to predict which water pumps are non-functioning or functioning, but in need of repair.

In this project, we will train a model using AutoML to train multiple multiple and choose the best performing model for deployment.

In [3]:
# We loaded the dataset into Azure and we are grabbing it here.

from azureml.core import Workspace, Experiment, Dataset
# from feature_preprocessing import *

# download config file in azure and put it in the current Notebooks folder
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="Pump-it-Up-Data-Mining-the-Water-Table")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

# download config file in azure and put it in the current Notebooks folder
ws = run.experiment.workspace

dataset = Dataset.get_by_name(ws, name='Pump-it-Up-dataset')
X = dataset.to_pandas_dataframe()
y = X[['status_group']]
del X['status_group']

# Cleaning up the features of our dataset
X = bools(X)
X = locs(X)
X = construction(X)
X = removal(X)
X = dummy(X)
X = dates(X)
x = dates2(X)
X = small_n(X)

# Removing ">", "[" and "]" from the headers to make the data compatible with different algorithms (namely, xgboost)
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X.columns.values]

# Converting the population values to log
X['population'] = np.log(X['population'])

# Splitting the dataset into a training and test set
# Test set will be used later
# The same random seed (42) for the Hyperdrive model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Concatenating the features and labels together to feed to our AutoML model
clean_train_df = pd.concat([X_train, y_train], axis=1)

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code RADN33JZH to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-142702
Azure region: southcentralus
Subscription id: 5a4ab2ba-6c51-4805-8155-58759ad589d8
Resource group: aml-quickstarts-142702


In [4]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Get the default datastore to be entered as a parameter in tabular dataset creation
datastore = ws.get_default_datastore()

# Change pandas dataframe into a tabular dataset to be used in automl
testing_data = TabularDatasetFactory.register_pandas_dataframe(X_test, datastore, 'automl_data_test')

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/7621a94d-ef19-40c1-9395-5625ba129118/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [5]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Get the default datastore to be entered as a parameter in tabular dataset creation
datastore = ws.get_default_datastore()

# Change pandas dataframe into a tabular dataset to be used in automl
training_data = TabularDatasetFactory.register_pandas_dataframe(clean_train_df, datastore, 'automl_data')

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/bf02ec3c-c86f-4683-a5d5-3c24856705ee/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [6]:
training_data.take(3).to_pandas_dataframe()

Unnamed: 0,date_recorded,gps_height,longitude,latitude,region_code,district_code,population,construction_year,basin_Internal,basin_Lake Nyasa,...,region_Pwani,region_Rukwa,region_Ruvuma,region_Shinyanga,region_Singida,region_Tabora,region_Tanga,year_recorded,month_recorded,status_group
0,734926,2092.0,35.42602,-4.227446,21,1,5.075174,1998,1,0,...,0,0,0,0,0,0,0,2013,2,functional
1,734213,550.0,35.510074,-5.724555,1,6,5.298317,1910,1,0,...,0,0,0,0,0,0,0,2011,3,functional
2,734328,550.0,32.499866,-9.081222,12,6,5.298317,1910,0,0,...,0,0,0,0,0,0,0,2011,7,non functional


# Setting up Experiment

We'll create a new experiment for our deployment of an AutoML model and create a project folder to hold the training scripts.

In [7]:
experiment_name = 'automl-pump-it-up-operationalize'
project_folder = './automl-pipeline-project'

automl_experiment = Experiment(ws, experiment_name)
automl_experiment

Name,Workspace,Report Page,Docs Page
automl-pump-it-up-operationalize,quick-starts-ws-142702,Link to Azure Machine Learning studio,Link to Documentation


In [8]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Creating a compute cluster if there isn't one that is already created.

cpu_cluster_name = 'hypr-auto-clustr'

try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing compute target.')
except ComputeTargetException:
    print('Creating a new computer target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_v2',
                                                          max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    
cpu_cluster.wait_for_completion(show_output=True)

Creating a new computer target...
Creating....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


# AutoML Configuration

We'll create a new experiment for our deployment of an AutoML model and create a project folder to hold the training scripts.

Here we create the general AutoML settings object.


Calculate recall to test how well we do on True Positives. We can imagine a real scenario where we want to build a model that does not miss the non-functioning water pumps, and we care much less functioning water pumps that are incorrectly predicted as non-functional. Recall is useful to make sure we miss less True Positives.

In [9]:
from azureml.train.automl import AutoMLConfig

automl_settings = {
    "experiment_timeout_minutes": 120, # to set a limit on the amount of time AutoML will be running
    "max_concurrent_iterations": 5, # applies to the compute target we are using
    "primary_metric" : 'norm_macro_recall' # recall for our performance metric
}

# Setting AutoML config for model training.

automl_config = AutoMLConfig(compute_target=cpu_cluster,
                             task = "classification", # classifying if water pumps are functional
                             training_data=training_data, 
                             label_column_name="status_group", # our target variable for water pump function  
                             path = project_folder,
                             enable_early_stopping= True, # prevents automl from spending too much time on models that stopped improving, saves time and compute costs
                             featurization= 'auto',
                             debug_log = "automl_errors.log",
                             **automl_settings
                            )

## Create Pipeline and AutoMLStep

Defining the outputs for the AutoMLStep using TrainingOutput.

In [10]:
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'

metrics_data = PipelineData(name='metrics_data',
                           datastore=ds,
                           pipeline_output_name=metrics_output_name,
                           training_output=TrainingOutput(type='Metrics'))
model_data = PipelineData(name='model_data',
                           datastore=ds,
                           pipeline_output_name=best_model_output_name,
                           training_output=TrainingOutput(type='Model'))

## Create the AutoMLStep

In [11]:
# Creating an AutoMLStep

automl_step = AutoMLStep(
    name='automl_module',
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    allow_reuse=True
    )

In [12]:
# Creating a Pipeline

from azureml.pipeline.core import Pipeline

pipeline = Pipeline(
    description="pipeline_with_automlstep",
    workspace=ws,    
    steps=[automl_step])

In [13]:
print('Submitting AutoML experiment...')

pipeline_run = automl_experiment.submit(pipeline)

Submitting AutoML experiment...
Created step automl_module [aa9427f7][6b0fd945-6306-4c15-92c2-dd1618cd4f89], (This step will run and generate new outputs)
Submitted PipelineRun 7a263f16-70e9-45a6-9056-9a2fdcaeaf7c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/7a263f16-70e9-45a6-9056-9a2fdcaeaf7c?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-142702/workspaces/quick-starts-ws-142702&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254


# Run Details

Using the RunDetails widget to show the different experiments.

In [14]:
from azureml.widgets import RunDetails
RunDetails(pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [15]:
pipeline_run.wait_for_completion()

PipelineRunId: 7a263f16-70e9-45a6-9056-9a2fdcaeaf7c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/7a263f16-70e9-45a6-9056-9a2fdcaeaf7c?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-142702/workspaces/quick-starts-ws-142702&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
PipelineRun Status: Running


StepRunId: 7a82bb57-89e0-465e-96bc-994d16016b21
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/7a82bb57-89e0-465e-96bc-994d16016b21?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-142702/workspaces/quick-starts-ws-142702&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
StepRun( automl_module ) Status: Running

StepRun(automl_module) Execution Summary
StepRun( automl_module ) Status: Finished



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '7a263f16-70e9-45a6-9056-9a2fdcaeaf7c', 'status': 'Completed', 'startTimeUtc': '2021-04-14T20:46:32.687639Z', 'endTimeUtc': '2021

'Finished'

# Examine Results

# Retrive the metrics of all child runs

In [16]:
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('.', show_progress=True)

Downloading azureml/7a82bb57-89e0-465e-96bc-994d16016b21/metrics_data
Downloaded azureml/7a82bb57-89e0-465e-96bc-994d16016b21/metrics_data, 1 files out of an estimated total of 1


In [17]:
import json
with open(metrics_output._path_on_datastore) as f:
    metrics_output_result = f.read()
    
deserialized_metrics_output = json.loads(metrics_output_result)
df = pd.DataFrame(deserialized_metrics_output)
pd.set_option('display.max_rows', 100)
df_t = df.T
df_t['recall_score_micro'].sort_values()

7a82bb57-89e0-465e-96bc-994d16016b21_12     [0.468013468013468]
7a82bb57-89e0-465e-96bc-994d16016b21_25    [0.5429292929292929]
7a82bb57-89e0-465e-96bc-994d16016b21_9     [0.5656565656565656]
7a82bb57-89e0-465e-96bc-994d16016b21_26    [0.5740740740740741]
7a82bb57-89e0-465e-96bc-994d16016b21_14    [0.5869107744107744]
7a82bb57-89e0-465e-96bc-994d16016b21_4     [0.5997474747474747]
7a82bb57-89e0-465e-96bc-994d16016b21_15    [0.5999579124579124]
7a82bb57-89e0-465e-96bc-994d16016b21_2     [0.6039562289562289]
7a82bb57-89e0-465e-96bc-994d16016b21_10    [0.6094276094276094]
7a82bb57-89e0-465e-96bc-994d16016b21_18    [0.6098484848484849]
7a82bb57-89e0-465e-96bc-994d16016b21_30    [0.6130050505050505]
7a82bb57-89e0-465e-96bc-994d16016b21_11    [0.6144781144781145]
7a82bb57-89e0-465e-96bc-994d16016b21_19     [0.631523569023569]
7a82bb57-89e0-465e-96bc-994d16016b21_35    [0.6323653198653199]
7a82bb57-89e0-465e-96bc-994d16016b21_24    [0.6355218855218855]
7a82bb57-89e0-465e-96bc-994d16016b21_23 

# Best Model

In [26]:
from azureml.train.automl.run import AutoMLRun
best_recall_run_id = df_t['recall_score_micro'].str.get(0).idxmax() # get string for best recall_score_micro run
automl_run = AutoMLRun(automl_experiment, run_id=best_recall_run_id)
automl_run.download_files()

In [27]:
import pickle

with open('outputs/model.pkl', "rb" ) as f:
    best_model = pickle.load(f)
best_model

PipelineWithYTransformations(Pipeline={'memory': None,
                                       'steps': [('datatransformer',
                                                  DataTransformer(enable_dnn=None,
                                                                  enable_feature_sweeping=None,
                                                                  feature_sweeping_config=None,
                                                                  feature_sweeping_timeout=None,
                                                                  featurization_config=None,
                                                                  force_text_dnn=None,
                                                                  is_cross_validation=None,
                                                                  is_onnx_compatible=None,
                                                                  logger=None,
                                                              

In [28]:
# As we can see, MaxAbsScaler LightGBMClassifier performed the best on recall_score_micro
best_model.steps

[('datatransformer',
  DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                  feature_sweeping_config=None, feature_sweeping_timeout=None,
                  featurization_config=None, force_text_dnn=None,
                  is_cross_validation=None, is_onnx_compatible=None, logger=None,
                  observer=None, task=None, working_dir=None)),
 ('MaxAbsScaler', MaxAbsScaler(copy=True)),
 ('LightGBMClassifier',
  LightGBMClassifier(boosting_type='gbdt', class_weight=None,
                     colsample_bytree=1.0, importance_type='split',
                     learning_rate=0.1, max_depth=-1, min_child_samples=20,
                     min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
                     n_jobs=1, num_leaves=31, objective=None, random_state=None,
                     reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
                     subsample_for_bin=200000, subsample_freq=0, verbose=-10))]

# Test the model on the Test Set

In [29]:
# important because registering data with TabularDatasetFactory might change column names (it did in this case)
# If column names change and you only registered X_train, there will be a mismatch unless you do the same with X_test
X_testing = testing_data.to_pandas_dataframe() 

In [32]:
from sklearn.metrics import recall_score

# Predict on the Test Set
ypred = best_model.predict(X_testing)

# Calculate recall
recall = recall_score(y_test, ypred, average='micro')
print('Recall Micro: %.3f' % recall)

Recall Micro: 0.781


As you can see, the score on `recall_score_micro` is higher with the LightGBMClassifer than it was with the Random Forest HyperDrive model (which was 0.765).

Therefore, we will be deploying the best model we got with AutoML.

# Model Deployment

Registering the model, creating an inference config and deploy the model as a web service.

In other words, we are publishing the pipeline to enable a REST endpoint to rerun the pipeline from any HTTP library on any platform.

In [40]:
from azureml.core.model import Model

# Register model (with the best recall_score_micro performance)
model = Model.register(model_path='outputs/model.pkl', 
                          model_name='automl_LightGBMClassifier',
                          tags={'Training context':'Auto ML'},
                          properties={'Recall_Micro': recall},
                          workspace=ws)

Registering model automl_LightGBMClassifier


In [41]:
%%writefile score.py

import json
import joblib
import numpy as np
from azureml.core.model import Model

# Called when the service is loaded
def init():
    global model
    # Get the path to the registered model file and load it
    try:
      model_path = Model.get_model_path('automl_LightGBMClassifier')
      model = joblib.load(model_path)
    except Exception as err:
        print("init method error: "+str(err))

# Called when a request is received
def run(data):
    try:
      # Get the input data as a numpy array
      data = np.array(json.loads(data)['data'])
      # Get a prediction from the model
      predictions = model.predict(data)
      # Return the predictions as any JSON serializable format
      return predictions.tolist()
    except Exception as err:
          return strn+"run method error: "+str(err)


Writing score.py


In [42]:
from azureml.core.conda_dependencies import CondaDependencies

# Add the dependencies for your model
myenv = CondaDependencies()
myenv.add_conda_package("scikit-learn")

# Save the environment config as a .yml file
env_file = './env.yml'
with open(env_file,"w") as f:
    f.write(myenv.serialize_to_string())
print("Saved dependency info in", env_file)

Saved dependency info in ./env.yml


In [43]:
# Create inference_config
from azureml.core.model import InferenceConfig

classifier_inference_config = InferenceConfig(runtime="python",
                                              source_directory = '.',
                                              entry_script="score.py",
                                              conda_file="env.yml")


In [48]:
from azureml.core.webservice import AciWebservice

classifier_deploy_config = AciWebservice.deploy_configuration(cpu_cores = 1,
                                                              memory_gb = 1,
                                                              enable_app_insights=True)

In [49]:
from azureml.core.model import Model

model = ws.models['automl_LightGBMClassifier']
service = Model.deploy(workspace=ws,
                       name = 'pump-it-up-service',
                       models = [model],
                       inference_config = classifier_inference_config,
                       deployment_config = classifier_deploy_config)

service.wait_for_deployment(show_output = True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-04-14 22:08:39+00:00 Creating Container Registry if not exists.
2021-04-14 22:08:40+00:00 Registering the environment.
2021-04-14 22:08:41+00:00 Building image..
2021-04-14 22:15:01+00:00 Generating deployment configuration..
2021-04-14 22:15:03+00:00 Submitting deployment to compute..
2021-04-14 22:15:12+00:00 Checking the status of deployment pump-it-up-service..
2021-04-14 22:17:50+00:00 Checking the status of inference endpoint pump-it-up-service.
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [52]:
print(service.get_logs())

2021-04-14T22:17:42,821261500+00:00 - iot-server/run 
2021-04-14T22:17:42,822475900+00:00 - rsyslog/run 
2021-04-14T22:17:42,828873000+00:00 - nginx/run 
2021-04-14T22:17:42,822883400+00:00 - gunicorn/run 
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
EdgeHubC

# Test the Deployed Model

Here we will send a request to the deployed model to test it.



In [51]:
endpoint = service.scoring_uri

print(f'\nservice state: {service.state}\n')
print(f'scoring URI: \n{endpoint}\n')
print(f'swagger URI: \n{service.swagger_uri}\n')

print(endpoint)
print(service.swagger_uri)


service state: Healthy

scoring URI: 
http://0a33aca7-7278-4c4f-9208-63bec61a3cd0.southcentralus.azurecontainer.io/score

swagger URI: 
http://0a33aca7-7278-4c4f-9208-63bec61a3cd0.southcentralus.azurecontainer.io/swagger.json

http://0a33aca7-7278-4c4f-9208-63bec61a3cd0.southcentralus.azurecontainer.io/score
http://0a33aca7-7278-4c4f-9208-63bec61a3cd0.southcentralus.azurecontainer.io/swagger.json


In [94]:
X_testing.loc[10]

date_recorded    734244.00
gps_height         1181.00
longitude            36.11
latitude             -4.91
region_code           1.00
                    ...   
region_Singida        0.00
region_Tabora         0.00
region_Tanga          0.00
year_recorded      2011.00
month_recorded        4.00
Name: 10, Length: 263, dtype: float64

In [141]:
x_new = pd.DataFrame(X_testing.loc[10]).T # grabbing a random example for testing the webservice
x_new.columns = x_new.columns.str.replace(r"[^a-zA-Z\d_]+", "")
x_new = x_new.T.rename(columns={10: "data"})
x_new.to_json()

'{"data":{"date_recorded":734244.0,"gps_height":1181.0,"longitude":36.11237521,"latitude":-4.91391433,"region_code":1.0,"district_code":1.0,"population":5.5214609179,"construction_year":1910.0,"basin_Internal":1.0,"basin_LakeNyasa":0.0,"basin_LakeRukwa":0.0,"basin_LakeTanganyika":0.0,"basin_LakeVictoria":0.0,"basin_Pangani":0.0,"basin_Rufiji":0.0,"basin_RuvumaSouthernCoast":0.0,"basin_WamiRuvu":0.0,"lga_ArushaRural":0.0,"lga_ArushaUrban":0.0,"lga_Babati":0.0,"lga_Bagamoyo":0.0,"lga_Bahi":0.0,"lga_Bariadi":0.0,"lga_Biharamulo":0.0,"lga_BukobaRural":0.0,"lga_BukobaUrban":0.0,"lga_Bukombe":0.0,"lga_Bunda":0.0,"lga_Chamwino":0.0,"lga_Chato":0.0,"lga_Chunya":0.0,"lga_DodomaUrban":0.0,"lga_Geita":0.0,"lga_Hai":0.0,"lga_Hanang":0.0,"lga_Handeni":0.0,"lga_Igunga":0.0,"lga_Ilala":0.0,"lga_Ileje":0.0,"lga_Ilemela":0.0,"lga_Iramba":0.0,"lga_IringaRural":0.0,"lga_Kahama":0.0,"lga_Karagwe":0.0,"lga_Karatu":0.0,"lga_Kasulu":0.0,"lga_Kibaha":0.0,"lga_Kibondo":0.0,"lga_KigomaRural":0.0,"lga_KigomaUrba

In [150]:
import requests
import json

# An array of new data cases
# x_new = pd.DataFrame(X_testing.loc[10]).T # grabbing a random example for testing the webservice
# x_new.columns = x_new.columns.str.replace(r"[^a-zA-Z\d_]+", "")
# x_new = x_new.T.rename(columns={10: 'data'})
# x_new = x_new.to_json()

x_new = {"date_recorded":734244.0,"gps_height":1181.0,"longitude":36.11237521,"latitude":-4.91391433,"region_code":1.0,"district_code":1.0,"population":5.5214609179,"construction_year":1910.0,"basin_Internal":1.0,"basin_LakeNyasa":0.0,"basin_LakeRukwa":0.0,"basin_LakeTanganyika":0.0,"basin_LakeVictoria":0.0,"basin_Pangani":0.0,"basin_Rufiji":0.0,"basin_RuvumaSouthernCoast":0.0,"basin_WamiRuvu":0.0,"lga_ArushaRural":0.0,"lga_ArushaUrban":0.0,"lga_Babati":0.0,"lga_Bagamoyo":0.0,"lga_Bahi":0.0,"lga_Bariadi":0.0,"lga_Biharamulo":0.0,"lga_BukobaRural":0.0,"lga_BukobaUrban":0.0,"lga_Bukombe":0.0,"lga_Bunda":0.0,"lga_Chamwino":0.0,"lga_Chato":0.0,"lga_Chunya":0.0,"lga_DodomaUrban":0.0,"lga_Geita":0.0,"lga_Hai":0.0,"lga_Hanang":0.0,"lga_Handeni":0.0,"lga_Igunga":0.0,"lga_Ilala":0.0,"lga_Ileje":0.0,"lga_Ilemela":0.0,"lga_Iramba":0.0,"lga_IringaRural":0.0,"lga_Kahama":0.0,"lga_Karagwe":0.0,"lga_Karatu":0.0,"lga_Kasulu":0.0,"lga_Kibaha":0.0,"lga_Kibondo":0.0,"lga_KigomaRural":0.0,"lga_KigomaUrban":0.0,"lga_Kilindi":0.0,"lga_Kilolo":0.0,"lga_Kilombero":0.0,"lga_Kilosa":0.0,"lga_Kilwa":0.0,"lga_Kinondoni":0.0,"lga_Kisarawe":0.0,"lga_Kishapu":0.0,"lga_Kiteto":0.0,"lga_Kondoa":1.0,"lga_Kongwa":0.0,"lga_Korogwe":0.0,"lga_Kwimba":0.0,"lga_Kyela":0.0,"lga_LindiRural":0.0,"lga_LindiUrban":0.0,"lga_Liwale":0.0,"lga_Longido":0.0,"lga_Ludewa":0.0,"lga_Lushoto":0.0,"lga_Mafia":0.0,"lga_Magu":0.0,"lga_Makete":0.0,"lga_Manyoni":0.0,"lga_Masasi":0.0,"lga_Maswa":0.0,"lga_Mbarali":0.0,"lga_MbeyaRural":0.0,"lga_Mbinga":0.0,"lga_Mbozi":0.0,"lga_Mbulu":0.0,"lga_Meatu":0.0,"lga_Meru":0.0,"lga_Misenyi":0.0,"lga_Missungwi":0.0,"lga_Mkinga":0.0,"lga_Mkuranga":0.0,"lga_Monduli":0.0,"lga_MorogoroRural":0.0,"lga_MorogoroUrban":0.0,"lga_MoshiRural":0.0,"lga_MoshiUrban":0.0,"lga_Mpanda":0.0,"lga_Mpwapwa":0.0,"lga_MtwaraRural":0.0,"lga_MtwaraUrban":0.0,"lga_Mufindi":0.0,"lga_Muheza":0.0,"lga_Muleba":0.0,"lga_MusomaRural":0.0,"lga_Mvomero":0.0,"lga_Mwanga":0.0,"lga_Nachingwea":0.0,"lga_Namtumbo":0.0,"lga_Nanyumbu":0.0,"lga_Newala":0.0,"lga_Ngara":0.0,"lga_Ngorongoro":0.0,"lga_Njombe":0.0,"lga_Nkasi":0.0,"lga_Nyamagana":0.0,"lga_Nzega":0.0,"lga_Pangani":0.0,"lga_Rombo":0.0,"lga_Rorya":0.0,"lga_Ruangwa":0.0,"lga_Rufiji":0.0,"lga_Rungwe":0.0,"lga_Same":0.0,"lga_Sengerema":0.0,"lga_Serengeti":0.0,"lga_ShinyangaRural":0.0,"lga_ShinyangaUrban":0.0,"lga_Siha":0.0,"lga_Sikonge":0.0,"lga_Simanjiro":0.0,"lga_SingidaRural":0.0,"lga_SingidaUrban":0.0,"lga_SongeaRural":0.0,"lga_SongeaUrban":0.0,"lga_SumbawangaRural":0.0,"lga_SumbawangaUrban":0.0,"lga_TaboraUrban":0.0,"lga_Tandahimba":0.0,"lga_Tanga":0.0,"lga_Tarime":0.0,"lga_Temeke":0.0,"lga_Tunduru":0.0,"lga_Ukerewe":0.0,"lga_Ulanga":0.0,"lga_Urambo":0.0,"lga_Uyui":0.0,"public_meeting_0_0":0.0,"public_meeting_1_0":1.0,"scheme_management_Company":0.0,"scheme_management_None":0.0,"scheme_management_Other":0.0,"scheme_management_Parastatal":0.0,"scheme_management_Privateoperator":0.0,"scheme_management_SWC":0.0,"scheme_management_Trust":0.0,"scheme_management_VWC":1.0,"scheme_management_WUA":0.0,"scheme_management_WUG":0.0,"scheme_management_WaterBoard":0.0,"scheme_management_Waterauthority":0.0,"permit_0_0":1.0,"permit_1_0":0.0,"extraction_type_afridev":0.0,"extraction_type_cemo":0.0,"extraction_type_climax":0.0,"extraction_type_gravity":1.0,"extraction_type_indiamarkii":0.0,"extraction_type_indiamarkiii":0.0,"extraction_type_ksb":0.0,"extraction_type_mono":0.0,"extraction_type_niratanira":0.0,"extraction_type_other":0.0,"extraction_type_othermkulimashinyanga":0.0,"extraction_type_otherplaypump":0.0,"extraction_type_otherropepump":0.0,"extraction_type_otherswn81":0.0,"extraction_type_submersible":0.0,"extraction_type_swn80":0.0,"extraction_type_walimi":0.0,"extraction_type_windmill":0.0,"extraction_type_class_gravity":1.0,"extraction_type_class_handpump":0.0,"extraction_type_class_motorpump":0.0,"extraction_type_class_other":0.0,"extraction_type_class_ropepump":0.0,"extraction_type_class_submersible":0.0,"extraction_type_class_windpowered":0.0,"management_company":0.0,"management_other":0.0,"management_otherschool":0.0,"management_parastatal":0.0,"management_privateoperator":0.0,"management_trust":0.0,"management_unknown":0.0,"management_vwc":1.0,"management_waterauthority":0.0,"management_waterboard":0.0,"management_wua":0.0,"management_wug":0.0,"management_group_commercial":0.0,"management_group_other":0.0,"management_group_parastatal":0.0,"management_group_unknown":0.0,"management_group_usergroup":1.0,"payment_type_annually":0.0,"payment_type_monthly":0.0,"payment_type_neverpay":1.0,"payment_type_onfailure":0.0,"payment_type_other":0.0,"payment_type_perbucket":0.0,"payment_type_unknown":0.0,"water_quality_coloured":0.0,"water_quality_fluoride":0.0,"water_quality_fluorideabandoned":0.0,"water_quality_milky":0.0,"water_quality_salty":0.0,"water_quality_saltyabandoned":0.0,"water_quality_soft":1.0,"water_quality_unknown":0.0,"quantity_group_dry":0.0,"quantity_group_enough":1.0,"quantity_group_insufficient":0.0,"quantity_group_seasonal":0.0,"quantity_group_unknown":0.0,"source_dam":0.0,"source_handdtw":0.0,"source_lake":0.0,"source_machinedbh":0.0,"source_other":0.0,"source_rainwaterharvesting":0.0,"source_river":0.0,"source_shallowwell":0.0,"source_spring":1.0,"source_unknown":0.0,"source_class_groundwater":1.0,"source_class_surface":0.0,"source_class_unknown":0.0,"waterpoint_type_cattletrough":0.0,"waterpoint_type_communalstandpipe":1.0,"waterpoint_type_communalstandpipemultiple":0.0,"waterpoint_type_dam":0.0,"waterpoint_type_handpump":0.0,"waterpoint_type_improvedspring":0.0,"waterpoint_type_other":0.0,"region_Arusha":0.0,"region_DaresSalaam":0.0,"region_Dodoma":1.0,"region_Iringa":0.0,"region_Kagera":0.0,"region_Kigoma":0.0,"region_Kilimanjaro":0.0,"region_Lindi":0.0,"region_Manyara":0.0,"region_Mara":0.0,"region_Mbeya":0.0,"region_Morogoro":0.0,"region_Mtwara":0.0,"region_Mwanza":0.0,"region_Pwani":0.0,"region_Rukwa":0.0,"region_Ruvuma":0.0,"region_Shinyanga":0.0,"region_Singida":0.0,"region_Tabora":0.0,"region_Tanga":0.0,"year_recorded":2011.0,"month_recorded":4.0}

x_new = [{key.replace('"', ''):val for key, val in x_new.items()}]

# Convert the array to a serializable list in a JSON document
input_data = json.dumps({"data": x_new})

with open('data.json', 'w') as file:
    file.write(input_data)

# Set the content type in the request headers
request_headers = { "Content-Type":"application/json"}

# Call the service
response = requests.post(url = endpoint,
                         data = input_data,
                         headers = request_headers)

print(response)
print("Prediction Results:", response.json())

<Response [502]>


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [151]:
x_new

[{'date_recorded': 734244.0,
  'gps_height': 1181.0,
  'longitude': 36.11237521,
  'latitude': -4.91391433,
  'region_code': 1.0,
  'district_code': 1.0,
  'population': 5.5214609179,
  'construction_year': 1910.0,
  'basin_Internal': 1.0,
  'basin_LakeNyasa': 0.0,
  'basin_LakeRukwa': 0.0,
  'basin_LakeTanganyika': 0.0,
  'basin_LakeVictoria': 0.0,
  'basin_Pangani': 0.0,
  'basin_Rufiji': 0.0,
  'basin_RuvumaSouthernCoast': 0.0,
  'basin_WamiRuvu': 0.0,
  'lga_ArushaRural': 0.0,
  'lga_ArushaUrban': 0.0,
  'lga_Babati': 0.0,
  'lga_Bagamoyo': 0.0,
  'lga_Bahi': 0.0,
  'lga_Bariadi': 0.0,
  'lga_Biharamulo': 0.0,
  'lga_BukobaRural': 0.0,
  'lga_BukobaUrban': 0.0,
  'lga_Bukombe': 0.0,
  'lga_Bunda': 0.0,
  'lga_Chamwino': 0.0,
  'lga_Chato': 0.0,
  'lga_Chunya': 0.0,
  'lga_DodomaUrban': 0.0,
  'lga_Geita': 0.0,
  'lga_Hai': 0.0,
  'lga_Hanang': 0.0,
  'lga_Handeni': 0.0,
  'lga_Igunga': 0.0,
  'lga_Ilala': 0.0,
  'lga_Ileje': 0.0,
  'lga_Ilemela': 0.0,
  'lga_Iramba': 0.0,
  'lga_Ir

In [127]:
response.status_code

502

# Printing the logs and Deleting the Service

In [None]:
# Delete computer target in order to avoid incurring additional charges.

# AmlCompute.delete(cpu_cluster)
# service.delete()
# model.delete()
# run.delete()
# automl_run.delete()
# automl_experiment.delete()