<a href="https://colab.research.google.com/github/JayThibs/hyperdrive-vs-automl-plus-deployment/blob/main/hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameter Tuning using HyperDrive

# Environment and Import Dependencies

Here we specify the conda dependencies.

In [13]:
%%writefile conda_dependencies.yml

# Conda environment used by the remote training runs (train.py).
dependencies:
- python=3.6.2
- pip=20.2.4
- pip:
    - azureml-defaults
    - scikit-learn
    # train.py imports pandas and calls Dataset.to_pandas_dataframe(),
    # so pandas is listed explicitly instead of relying on a transitive install.
    - pandas

Overwriting conda_dependencies.yml


In [14]:
from azureml.core import Environment

# Build the training environment from the conda specification file.
# This environment object is later handed to ScriptRunConfig.
sklearn_env = Environment.from_conda_specification(
    name='sklearn_env',
    file_path='./conda_dependencies.yml',
)

In [15]:
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import logging
import os
import csv

from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.24.0


# Write Training File for our Hyperdrive Model

In [16]:
%%writefile train.py

import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score

import argparse
from sklearn.metrics import mean_squared_error
import joblib
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory
import datetime

# Data Exploration was done in a notebook beforehand

# def dates(X):
#     """
#     date_recorded: this might be a useful variable for this analysis, although the year itself would be 
#     useless in a practical scenario moving into the future. We will convert this column into a datetime, 
#     and we will also create 'year_recorded' and 'month_recorded' columns just in case those levels prove 
#     to be useful. A visual inspection of both casts significant doubt on that possibility, but we'll proceed 
#     for now. We will delete date_recorded itself, since random forest cannot accept datetime
#     """
#     for i in [X]:
#         i['date_recorded'] = pd.to_datetime(i['date_recorded'])
#         i['year_recorded'] = i['date_recorded'].apply(lambda x: x.year)
#         i['month_recorded'] = i['date_recorded'].apply(lambda x: x.month)
#         i['date_recorded'] = (pd.to_datetime(i['date_recorded'])).apply(lambda x: x.toordinal())
#     return X


# def date_parser(df):
#     date_recorder = list(map(lambda x: datetime.datetime.strptime(str(x), '%Y-%m-%d'),
#                              df['date_recorded'].values))
#     df['year_recorder'] = list(map(lambda x: int(x.strftime('%Y')), date_recorder))
#     df['weekday_recorder'] = list(map(lambda x: int(x.strftime('%w')), date_recorder))
#     df['yearly_week_recorder'] = list(map(lambda x: int(x.strftime('%W')), date_recorder))
#     df['month_recorder'] = list(map(lambda x: int(x.strftime('%m')), date_recorder))
#     df['age'] = df['year_recorder'].values - df['construction_year'].values
#     del df['date_recorded']
#     return df


def bools(X):
    """
    Convert the boolean flag columns to floats, treating nulls as False.

    public_meeting: nulls are filled with False
    permit: nulls are filled with False

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing 'public_meeting' and 'permit' columns.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with both columns converted to float
        (True -> 1.0, False / null -> 0.0).
    """
    for col in ('public_meeting', 'permit'):
        # Assign the result back to the frame rather than calling
        # fillna(inplace=True) on the column selection: the in-place form
        # operates on a temporary object and is not guaranteed to update X.
        X[col] = X[col].fillna(False).astype(float)
    return X


def small_n(X, min_count=100):
    """
    Collapse rare categorical values into 'other'.

    A column is treated as categorical text when its first non-null value is
    a string. Within each such column, every value that occurs `min_count`
    times or fewer is replaced by the literal "other". Null entries have no
    count and are therefore also replaced by "other".

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to clean. The frame is modified in place.
    min_count : int, default 100
        A value is kept only if it occurs strictly more than this many times.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with rare values collapsed.
    """
    def _is_text(col):
        # Look at the first non-null entry so that a leading null (or an
        # empty column) does not hide a text column or raise IndexError.
        non_null = col.dropna()
        return len(non_null) > 0 and isinstance(non_null.iloc[0], str)

    cols = [name for name in X.columns if _is_text(X[name])]
    if not cols:
        # Nothing to collapse (no text columns, or an empty frame).
        return X

    # Per-cell frequency of the value found in that cell, column by column.
    counts = X[cols].apply(lambda col: col.map(col.value_counts()))
    X[cols] = X[cols].where(counts > min_count, "other")
    return X



# The training data is a tabular dataset registered in the Azure ML workspace.

# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

# Handle of the current run; main() uses it to log parameters and metrics.
run = Run.get_context()

try:
    # Submitted runs: reuse the workspace the run belongs to. A config.json
    # and interactive login are not available on the remote compute.
    ws = run.experiment.workspace
except AttributeError:
    # Local/offline execution: the run context has no experiment, so fall
    # back to the config.json downloaded from Azure.
    ws = Workspace.from_config()

dataset = Dataset.get_by_name(ws, name='Pump-it-Up-dataset')
df = dataset.to_pandas_dataframe()

# Separate the target column from the features.
y = df['status_group']
del df['status_group']
X = df

# Many rows carry a placeholder instead of a real construction year
# (presumably 0 -- confirm against the raw data). That is fine for modeling,
# but it distorts any visualization that compares years, so every value
# below 1950 is moved to 1910, which is closer to the real years.
X.loc[X['construction_year'] < 1950, 'construction_year'] = 1910

# Cleaning up the features of our dataset
# X = dates(X)
# x = date_parser(X)
X = bools(X)
# log1p instead of log: a population of 0 would otherwise become -inf, which
# scikit-learn rejects. Both transforms are monotonic, so tree splits are
# unaffected for positive values.
X['population'] = np.log1p(X['population'])
X = small_n(X)
# NOTE(review): small_n leaves text columns as strings, while
# RandomForestClassifier expects numeric features -- confirm whether an
# encoding step is needed for the registered dataset.

# Splitting the dataset into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _int_arg(value):
    """Parse an integer command-line value, also accepting text such as '7.0'.

    HyperDrive's quniform expression yields float-formatted values, which a
    plain ``type=int`` argument rejects. Non-integral values (e.g. '7.5') are
    refused instead of being silently truncated.
    """
    number = float(value)
    if not number.is_integer():
        raise argparse.ArgumentTypeError("expected an integer value, got %r" % value)
    return int(number)


def main():
    """Train a random forest with the given hyperparameters and log its recall.

    Reads --max_depth, --min_samples_split and --n_estimators from the command
    line, fits the model on the module-level training split, logs the
    hyperparameters and the micro-averaged recall on the test split to the
    current run, and saves the fitted model to outputs/rf_model.pkl.
    """
    # Adds arguments to script
    parser = argparse.ArgumentParser()

    # Setting the hyperparameters we will be optimizing for our Random Forest model
    parser.add_argument('--max_depth', type=_int_arg, default=5, help='The maximum depth of the trees.')
    parser.add_argument('--min_samples_split', type=_int_arg, default=4, help='The minimum number of samples required to split an internal node.')
    parser.add_argument('--n_estimators', type=_int_arg, default=750, help='The number of trees in the forest.')

    args = parser.parse_args()

    # Built-in int/float are used instead of the np.int/np.float aliases,
    # which were removed from recent NumPy releases.
    run.log("Max depth of the trees:", int(args.max_depth))
    run.log("Minimum number of samples required to split:", int(args.min_samples_split))
    run.log("Number of trees:", int(args.n_estimators))

    # Fitting a Random Forest model to our data.
    # Sidenote: I also tried XGBoost on my local machine, but it did not perform as well.
    # RF has a score of 0.811, XGBoost has a score of 0.745
    # Since I did not use a validation set, it's possible that I'm just overfitting with RF.
    # But I wanted to focus on the end-to-end process for this project so I didn't bother with
    # a validation set.
    rf = RandomForestClassifier(max_depth=args.max_depth,
                                min_samples_split=args.min_samples_split,
                                n_estimators=args.n_estimators,
                                criterion='gini',
                                oob_score=True,
                                random_state=42,
                                n_jobs=-1)
    rf.fit(X_train, y_train)

    # Predicting on the test set. The predictions are passed to the metric
    # directly; y_test is a Series and has no `.columns` attribute.
    predictions = rf.predict(X_test)

    # Calculate recall to test how well we do on True Positives.
    # We can imagine a real scenario where we want to build a model
    # that does not miss the non-functioning water pumps, and we
    # care much less about functioning water pumps that are incorrectly
    # predicted as non-functional.
    # Note: average='micro' pools the counts of all classes together.
    recall = recall_score(y_test, predictions, average='micro')
    # The metric name must match primary_metric_name in the HyperDriveConfig.
    run.log("Recall", float(recall))

    # Files written to outputs/ are uploaded with the run.
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(rf, 'outputs/rf_model.pkl')


if __name__ == '__main__':
    main()

Overwriting train.py


# Dataset

We fetch our data and initialize a workspace object from the persisted configuration. The config file is placed at `.\config.json`.

In [17]:
from azureml.core import Workspace, Experiment

# The workspace is loaded from the config.json file downloaded from Azure
# and placed in the current Notebooks folder.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="Pump-it-Up-Data-Mining-the-Water-Table")

# One line per workspace property.
print('\n'.join([
    f'Workspace name: {ws.name}',
    f'Azure region: {ws.location}',
    f'Subscription id: {ws.subscription_id}',
    f'Resource group: {ws.resource_group}',
]))

# Start an interactive run in the experiment.
# NOTE(review): this run is not completed anywhere in the visible cells -- confirm it is needed.
run = exp.start_logging()

Workspace name: quick-starts-ws-142186
Azure region: southcentralus
Subscription id: f5091c60-1c3c-430f-8d81-d802f6bf2414
Resource group: aml-quickstarts-142186


In [18]:
# The dataset was registered in the Azure ML workspace beforehand; fetch it by name.
from azureml.core import Dataset

dataset = Dataset.get_by_name(ws, name='Pump-it-Up-dataset')
df = dataset.to_pandas_dataframe()

# Pull the target column out of the frame, leaving only the features in df.
y = df.pop('status_group')

# Summary statistics of the remaining feature columns.
df.describe()

KeyboardInterrupt: 

# Setting up our Compute Target

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Reuse the compute cluster when it already exists; otherwise provision a new one.

cpu_cluster_name = 'hypr-auto-clustr'

try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # No cluster with that name in the workspace yet.
    print('Creating a new computer target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_v2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing compute target.')

# Block until the cluster is ready, streaming progress to the cell output.
cpu_cluster.wait_for_completion(show_output=True)

In [None]:
# Print the serialized result of get_status() to inspect the detailed status
# of the current compute cluster.
print(cpu_cluster.get_status().serialize())

In [None]:
# List every compute target in the workspace with its type and provisioning state.
compute_targets = ws.compute_targets
for target_name, target in compute_targets.items():
    print(target_name, target.type, target.provisioning_state)

In [None]:
# Display the compute target object as the cell output.
cpu_cluster

# Hyperdrive Configuration

We are using Random Forest to train our model. We will use Hyperdrive to do a hyperparameter search to optimize our model.

We will be using Bayesian sampling, which uses the results of earlier runs to choose the next hyperparameters to try and is typically more efficient than grid search at finding an optimum.



In [None]:
from azureml.widgets import RunDetails
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import BayesianParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import quniform
from azureml.train.hyperdrive.parameter_expressions import choice
import os

# Specifying the parameter sampler.
# - Here we use Bayesian hyperparameter sampling to search the hyperparameter space for the best model.
# - Essentially, Bayesian sampling builds a probability model of the objective function we are trying
#   to optimize and uses it to select the most promising hyperparameters to evaluate in the true objective function.
# - For best results with Bayesian sampling, the recommended maximum number of runs is greater than or
#   equal to 20 times the number of hyperparameters being tuned.
#   We are optimizing 3 hyperparameters for this project, therefore we choose 60 for max_total_runs.
# - We use quniform to search the hyperparameter space, since quniform(low, high, q) creates a uniform
#   distribution between the low and high values, separated by spacing q.

ps = BayesianParameterSampling(
    {
    'max_depth': quniform(3, 15, 1), # Maximum depth of the trees
    'min_samples_split': choice(2, 4, 6), # Minimum number of samples required to split
    'n_estimators' : quniform(500, 1000, 50) # Number of trees
    }
)

if "training" not in os.listdir():
    os.mkdir("./training")

# Creating the ScriptRunConfig that runs train.py on the compute cluster
# inside the scikit-learn environment.
src = ScriptRunConfig(source_directory='./',
                      script='train.py',
                      compute_target=cpu_cluster,
                      environment=sklearn_env)

# Creating a HyperDriveConfig from the run configuration and the hyperparameter sampler.
# No early-termination policy is passed.
# primary_metric_name must be exactly the name train.py passes to run.log()
# for the metric being optimized, which is "Recall".
hyperdrive_config = HyperDriveConfig(run_config=src,
                                    hyperparameter_sampling=ps,
                                    primary_metric_name='Recall',
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=60,
                                    max_concurrent_runs=4)

# Submit Experiment and Run Details

In [None]:
# Submit the HyperDrive configuration to the experiment, show the run-details
# widget, then block until the whole sweep has finished.

hyperdrive_run = exp.submit(config=hyperdrive_config)

run_widget = RunDetails(hyperdrive_run)
run_widget.show()
hyperdrive_run.wait_for_completion(show_output=True)

# Saving our Best Random Forest / Hyperdrive Model

In [None]:
import joblib
# Get the best run of the sweep and register the model saved by that run.

best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # Happens when no child run logged the primary metric.
    raise RuntimeError(
        "HyperDrive found no run with the primary metric; check that "
        "primary_metric_name matches the metric name logged by train.py."
    )
best_run_metrics = best_run.get_metrics()

print('Best Run:', best_run)
# train.py logs the test-set recall under the name "Recall".
print('Metrics:', best_run_metrics['Recall'])

# train.py saves the fitted model to outputs/rf_model.pkl, so that is the
# path to register.
hyperdrive_model = best_run.register_model(model_name="rf_hyperdrive_model", model_path="outputs/rf_model.pkl")