<a href="https://colab.research.google.com/github/JayThibs/hyperdrive-vs-automl-plus-deployment/blob/main/hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameter Tuning using HyperDrive

# Environment and Import Dependencies

Here we specify the conda dependencies.

In [None]:
%%writefile conda_dependencies.yml

# Conda specification for the training environment; the notebook loads this file
# with Environment.from_conda_specification.
dependencies:
- python=3.6.2
- pip=20.2.4
- pip:
    # NOTE(review): the pip packages below are unpinned, so rebuilding the
    # environment later may resolve different versions -- consider pinning.
    # NOTE(review): rf_train.py also imports pandas, numpy and joblib; presumably
    # they arrive as transitive dependencies of the packages below -- TODO confirm.
    - azureml-defaults
    - scikit-learn
    - fastai

Overwriting conda_dependencies.yml


In [None]:
from azureml.core import Environment

# Build the training environment from the conda specification file.
# The resulting Environment is passed to ScriptRunConfig further down.
sklearn_env = Environment.from_conda_specification(
    name='sklearn_env',
    file_path='./conda_dependencies.yml',
)

In [None]:
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import logging
import os
import csv

from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Report which azureml-core SDK version this kernel is running.
print(f"SDK version: {azureml.core.VERSION}")

# Write Training File for our Hyperdrive Model

In [None]:
%%writefile rf_train.py

import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score

import argparse
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# Data Exploration was done in a notebook beforehand


def clean_data(X):
  """
  Preprocess the raw feature frame with fastai's tabular helpers.

  Steps, in order:
    1. expand `date_recorded` into date-part columns with `add_datepart`;
    2. overwrite every `construction_year` below 1950 with 1910;
    3. split the columns into continuous / categorical with `cont_cat_split`;
    4. run the Categorify and FillMissing procs through TabularPandas.

  Returns the processed feature frame (`.train.xs` of the TabularPandas object).
  """
  # Derive date-related columns from the recording date.
  X = add_datepart(X, 'date_recorded')

  # Many construction years are placeholders standing in for a missing value.
  # That is fine for modeling, but it distorts per-year visualizations, so the
  # placeholders are moved to 1910, closer to the real years in the data.
  placeholder_years = X['construction_year'] < 1950
  X.loc[placeholder_years, 'construction_year'] = 1910

  # TabularProc steps applied by TabularPandas:
  #   - Categorify turns a column into a numeric categorical column;
  #   - FillMissing fills missing values with the column median and adds a
  #     Boolean column that is True wherever the value was missing.
  preprocessing_steps = [Categorify, FillMissing]

  # TabularPandas needs to be told which columns are continuous and which are
  # categorical; `cont_cat_split` works that out for us.
  continuous_cols, categorical_cols = cont_cat_split(X, 1, dep_var=None)

  # Run the preprocessing and hand back the processed features as a DataFrame.
  tabular = TabularPandas(X, preprocessing_steps, categorical_cols, continuous_cols)
  return tabular.train.xs


def removal(X):
  """
  Drop columns that are not helpful for the model.

  The columns are removed from `X` in place and the same object is returned.
  A KeyError is raised if one of the columns is not present.
  """
  # Why each column goes:
  #   id:          not a useful predictor
  #   wpt_name:    too many distinct values
  #   subvillage:  too many distinct values
  #   scheme_name: almost 50% nulls
  #   num_private: ~99% of the values are zeros
  # NOTE(review): the original notes also listed amount_tsh ("mostly blank"),
  # but it is not actually dropped here -- confirm which one is intended.
  unhelpful_columns = ('id', 'num_private', 'wpt_name', 'subvillage', 'scheme_name')
  for column in unhelpful_columns:
      X.pop(column)

  return X


# Loading the dataset from the DrivenData website.
# `from_delimited_files` returns a TabularDataset, not a DataFrame, so each dataset is
# converted with `to_pandas_dataframe()` before any pandas operation is applied to it.
# NOTE(review): these are pre-signed S3 URLs (X-Amz-Date=20210331, X-Amz-Expires=86400), so
# they presumably stopped working one day after they were generated -- TODO point this at a
# stable copy of the data (e.g. a dataset registered in the workspace).
y = TabularDatasetFactory.from_delimited_files("https://drivendata-prod.s3.amazonaws.com/data/7/public/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYVI2LMPSY%2F20210331%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210331T171733Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=ab358ae9d47a9c85a84067b28a636af41d7a7cf71eab3728b869c2b92434884f").to_pandas_dataframe()
X = TabularDatasetFactory.from_delimited_files("https://drivendata-prod.s3.amazonaws.com/data/7/public/4910797b-ee55-40a7-8668-10efd5c1b960.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYVI2LMPSY%2F20210331%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210331T171733Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=e414872d8fc9649993f22ca03b4ae7ef7218188bb951de0ef92d0bd271963807").to_pandas_dataframe()

# Removing the id column from the labels since it does not help for prediction
# NOTE(review): X and y are assumed to list the rows in the same order -- TODO confirm.
del y['id']

# Cleaning up the features of our dataset
X = clean_data(X)
X = removal(X)

# Splitting the dataset into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle to the current Azure ML run; main() logs its metrics through it.
run = Run.get_context()


def main():
    """
    Train a Random Forest with the hyperparameters given on the command line.

    Parses --max_depth, --min_samples_split and --n_estimators, logs them to the
    run, fits a RandomForestClassifier on the module-level X_train / y_train,
    logs the micro-averaged recall measured on X_test / y_test, and saves the
    fitted model to outputs/rf_model.pkl.
    """
    # Adds arguments to script
    parser = argparse.ArgumentParser()

    # Setting the hyperparameters we will be optimizing for our Random Forest model
    parser.add_argument('--max_depth', type=int, default=5, help='The maximum depth of the trees.')
    parser.add_argument('--min_samples_split', type=int, default=4, help='The minimum number of samples required to split an internal node.')
    parser.add_argument('--n_estimators', type=int, default=750, help='The number of trees in the forest.')

    args = parser.parse_args()

    # Builtin int() is used instead of the removed np.int alias.
    run.log("Max depth of the trees:", int(args.max_depth))
    run.log("Minimum number of samples required to split:", int(args.min_samples_split))
    run.log("Number of trees:", int(args.n_estimators))

    # Fitting a Random Forest model to our data.
    # Sidenote: I also tried XGBoost on my local machine, but it did not perform as well.
    # RF has a score of 0.811, XGBoost has a score of 0.745
    # Since I did not use a validation set, it's possible that I'm just overfitting with RF.
    # But I wanted to focus on the end-to-end process for this project so I didn't bother with
    # a validation set.
    #
    # min_samples_split is passed exactly once (the tuned value); passing it a second time
    # as a hard-coded 6 is a SyntaxError ("keyword argument repeated").
    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' has been removed
    # from recent scikit-learn releases.
    rf = RandomForestClassifier(max_depth=args.max_depth,
                                min_samples_split=args.min_samples_split,
                                n_estimators=args.n_estimators,
                                criterion='gini',
                                max_features='sqrt',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1).fit(X_train, y_train)

    # Predicting on the test set
    predictions = rf.predict(X_test)
    pred = pd.DataFrame(predictions, columns=[y_test.columns[0]])

    # Calculate recall to test how well we do on True Positives.
    # We can imagine a real scenario where we want to build a model
    # that does not miss the non-functioning water pumps, and we
    # care much less about functioning water pumps that are incorrectly
    # predicted as non-functional.
    # NOTE(review): with average='micro' on single-label multiclass data this equals
    # plain accuracy; a per-class recall would match the goal above more closely.
    recall = recall_score(y_test, pred, average='micro')

    # The metric name must match the primary_metric_name that HyperDrive is configured
    # with ('recall_score_micro'), otherwise HyperDrive cannot rank the child runs.
    run.log("recall_score_micro", float(recall))

    # Everything written to ./outputs is uploaded with the run.
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(rf, 'outputs/rf_model.pkl')

# Only train when executed as a script (this is how Azure ML launches it).
if __name__ == '__main__':
    main()

## Dataset

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

# Dataset

Here we initialize a workspace object from the persisted configuration and then get our data. The config file is located at `.\config.json`.

In [None]:
from azureml.core import Workspace, Experiment

# Connect to the workspace described by the persisted config file and open
# (or create) the experiment that the runs are grouped under.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="Pump-it-Up-Data-Mining-the-Water-Table")

# Show where we are connected, one field per line.
print('\n'.join((
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)))

# Start an interactive logging run on the experiment.
# NOTE(review): nothing in the visible notebook uses or completes this run -- confirm it is needed.
run = exp.start_logging()

Workspace name: quick-starts-ws-135523
Azure region: southcentralus
Subscription id: 61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30
Resource group: aml-quickstarts-135523


In [None]:
# We loaded the dataset into Azure and we are grabbing it here.

# Name used to look the dataset up in the workspace.
# NOTE(review): "datatset" looks like a typo, but the string has to match the
# registered dataset name exactly -- confirm against the workspace before changing it.
key = "Pump-it-Up-datatset"
# NOTE(review): description_text is not used in any visible cell.
description_text = "Pump it Up dataset from DrivenData"

# Fetch the dataset by name and materialize it as a pandas DataFrame.
dataset = ws.datasets[key]
df = dataset.to_pandas_dataframe()
# Summary statistics of the columns as a quick sanity check.
df.describe()

# Setting up our Compute Target

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Reuse the compute cluster with this name if the workspace already has one;
# otherwise provision a new one.

cpu_cluster_name = 'hypr-auto-clustr'

try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Lookup failed, so the cluster has to be created first.
    print('Creating a new computer target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_v2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing compute target.')

# Wait for the cluster operation to finish, streaming progress to the cell output.
cpu_cluster.wait_for_completion(show_output=True)

Found existing compute target.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [None]:
# Using get_status() to get a detailed status for the current compute cluster.
# serialize() turns the status object into the plain structure printed below.
print(cpu_cluster.get_status().serialize())

{'currentNodeCount': 4, 'targetNodeCount': 4, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 4, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-01-20T21:41:43.016000+00:00', 'errors': None, 'creationTime': '2021-01-20T21:33:12.704608+00:00', 'modifiedTime': '2021-01-20T21:33:32.357236+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [None]:
# List every compute target in the workspace with its type and provisioning state.
compute_targets = ws.compute_targets
for target_name, target in compute_targets.items():
    print(target_name, target.type, target.provisioning_state)

hyper-vs-automl ComputeInstance Succeeded
hypr-auto-clustr AmlCompute Succeeded


In [None]:
# Display the compute target object (its repr is the cell output).
cpu_cluster

AmlCompute(workspace=Workspace.create(name='quick-starts-ws-135523', subscription_id='61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30', resource_group='aml-quickstarts-135523'), name=hypr-auto-clustr, id=/subscriptions/61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30/resourceGroups/aml-quickstarts-135523/providers/Microsoft.MachineLearningServices/workspaces/quick-starts-ws-135523/computes/hypr-auto-clustr, type=AmlCompute, provisioning_state=Succeeded, location=southcentralus, tags=None)

# Hyperdrive Configuration

We are using Random Forest to train our model. We will use Hyperdrive to do a hyperparameter search to optimize our model.

We will be using random sampling since it is generally more efficient than grid search at finding a good optimum.



In [None]:
from azureml.widgets import RunDetails
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive.parameter_expressions import choice
import os

# Specifying parameter sampler.
# rf_train.py parses every hyperparameter with type=int, so only integer values are
# sampled: a continuous uniform(3, 10) would hand the script values such as 5.73 and
# argparse would reject them.
ps = RandomParameterSampling(
    {
    'max_depth': choice(range(3, 11)), # Maximum depth of the trees (integers 3 to 10)
    'min_samples_split': choice(2, 4, 6), # Minimum number of samples required to split
    'n_estimators' : choice(500, 750, 1000) # Number of trees
    }
)

# Specifying a Policy.
# Runs whose primary metric falls outside the 10% slack of the best run are stopped early.
early_termination_policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# NOTE(review): this directory is created but not used by the configuration below,
# which submits the current directory -- confirm whether it is still needed.
if "training" not in os.listdir():
    os.mkdir("./training")

# Creating a ScriptRunConfig for the training script written by the
# `%%writefile rf_train.py` cell.
src = ScriptRunConfig(source_directory='./',
                      script='rf_train.py',
                      compute_target=cpu_cluster,
                      environment=sklearn_env)

# Creating a HyperDriveConfig using the run config, hyperparameter sampler, and policy.
# primary_metric_name has to match the metric name the training script logs.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                    hyperparameter_sampling=ps,
                                    policy=early_termination_policy,
                                    primary_metric_name='recall_score_micro',
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=30,
                                    max_concurrent_runs=4)

# Submit Experiment and Run Details

In [None]:
from azureml.widgets import RunDetails

# Submitting a HyperDrive run to the experiment and show run details with the widget.

# Submit the sweep to the experiment.
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Show the run-details widget, then wait until the whole sweep has finished,
# streaming its output into the cell.
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_2564a680-12ae-4d6e-925e-1bee71135484
Web View: https://ml.azure.com/experiments/BankMarketing-comparison/runs/HD_2564a680-12ae-4d6e-925e-1bee71135484?wsid=/subscriptions/61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30/resourcegroups/aml-quickstarts-135523/workspaces/quick-starts-ws-135523

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-20T21:55:35.693932][API][INFO]Experiment created<END>\n""<START>[2021-01-20T21:55:36.382278][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n""<START>[2021-01-20T21:55:36.177072][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"<START>[2021-01-20T21:55:37.3496662Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_2564a680-12ae-4d6e-925e-1bee71135484
Web View: https://ml.azure.com/experiments/BankMarketing-comparison/runs/HD_2564a680-12ae-4d6e-925e-1bee71135484?wsid=/s

{'runId': 'HD_2564a680-12ae-4d6e-925e-1bee71135484',
 'target': 'hypr-auto-clustr',
 'status': 'Completed',
 'startTimeUtc': '2021-01-20T21:55:34.892087Z',
 'endTimeUtc': '2021-01-20T23:32:28.732946Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '01f3997f-f732-463a-a49f-f4d9a5165615',
  'score': '0.9112797167425392',
  'best_child_run_id': 'HD_2564a680-12ae-4d6e-925e-1bee71135484_27',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg135523.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_2564a680-12ae-4d6e-925e-1bee71135484/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=ru4N1wKuNmPm6Zo2ZXsioCgwDnfJs3eRsWdO91Uy3Yk%3D&st=2021-01-20T23%3A23%3A20Z&se=2021-01-21T07%3A33%3A20Z&sp=r'}}

# Saving our Best Random Forest / Hyperdrive Model

In [None]:
import joblib
# Get your best run and save the model from that run.

best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # Fail with a clear message instead of an AttributeError on the next line.
    raise RuntimeError("HyperDrive found no child run that logged the primary metric.")
best_run_metrics = best_run.get_metrics()

print('Best Run:', best_run)
print('Metrics:', best_run_metrics['recall_score_micro'])

# The training script saves its model as outputs/rf_model.pkl, so that is the
# file to register from the best run.
hyperdrive_model = best_run.register_model(model_name="rf_hyperdrive_model", model_path="outputs/rf_model.pkl")

Best Run: Run(Experiment: BankMarketing-comparison,
Id: HD_2564a680-12ae-4d6e-925e-1bee71135484_27,
Type: azureml.scriptrun,
Status: Completed)
Metrics: 0.9112797167425392
