# SegNet2 Hyperparameter Tuning
* This marks the first attempt at using hyperparameter tuning in SageMaker
* The following guide was used: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/hyperparameter_tuning/pytorch_mnist/hpo_pytorch_mnist.ipynb
* The first attempts did not work due to the limited memory of the instance types that were available to us at the time

In [1]:
############## Set Up ################

import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# ##yan - training on local instance to see what the problem is  
# import boto3
# import os
# from sagemaker.local import LocalSession
# from sagemaker.debugger import rule_configs, DebuggerHookConfig, CollectionConfig

# sagemaker_session = LocalSession()
# sagemaker_session.config = {'local': {'local_code': True}}
# # Make sure to set this to your bucket and location
# BUCKET_NAME = 'sagemaker-firefly-model-artifacts'
# LOCATION_IN_BUCKET = 'smdebug_debug'

# s3_bucket_for_tensors = 's3://{BUCKET_NAME}/{LOCATION_IN_BUCKET}'.format(BUCKET_NAME=BUCKET_NAME, LOCATION_IN_BUCKET=LOCATION_IN_BUCKET)
# ##

# Remote SageMaker session (the commented-out LocalSession block above is the
# local-mode alternative that was used for debugging).
sagemaker_session = sagemaker.Session()

# Default S3 bucket of this session.
bucket = sagemaker_session.default_bucket()
# prefix = 'sagemaker/DEMO-pytorch-mnist'

# IAM role the training jobs run under. The existing session is passed in so
# that get_execution_role() does not construct a second default Session.
role = sagemaker.get_execution_role(sagemaker_session)

In [2]:
############# Path Set Up #################
# Location where the model output of the tuning jobs is saved.
s3_output_location = 's3://sagemaker-firefly-model-artifacts/segnet2_tuning'

# Location of the training data.
# Local-mode alternative:
# train_data="file:///home/ec2-user/SageMaker/Dataset/CloudSeg/38-Cloud_training"
train_data = 's3://sagemaker-firefly-model-data/Cloud Segmentation Data/38-Cloud_training'

# No separate validation location is defined here: the training script
# currently performs the train/validation split itself.

In [11]:
# Display the resolved execution-role ARN as a sanity check.
# NOTE(review): the rendered output includes the AWS account ID - clear this
# cell's output before sharing or committing the notebook.
role

'arn:aws:iam::067338613469:role/service-role/AmazonSageMaker-ExecutionRole-20200914T180083'

In [12]:
############# Initialise the PyTorch training estimator object ###########

# train_instance_type='ml.c4.4xlarge'
# estimator = PyTorch(entry_point='segnet2_train_script.py',
#                     role=role,
#                     framework_version='1.4.0',
#                     train_instance_count=1,
#                     train_instance_type='local',
#                     output_path=s3_output_location,
#                     hyperparameters={
#                         'epochs': 5,
#                         'learning_rate':0.001,
#                         'batch-size':8
#                     },
#                     debugger_hook_config = DebuggerHookConfig(
#                         s3_output_path=s3_bucket_for_tensors,  # Required
#                         collection_configs=[
#                             CollectionConfig(
#                                 name="conv0_tensors",
#                                 parameters={
#                                     "include_regex": "*",
#                                     "save_interval": "100"
#                                 }
#                             )
#                         ]
#                     )
#                    )
# First-attempt estimator: the template each tuning job is launched from.
# The hyperparameters listed here are static values handed to the training
# script; the tuned ones come from hyperparameter_ranges.
# Spot training would be enabled by adding train_use_spot_instances=True, but
# the account is not allowed to use spot instances.
estimator = PyTorch(
    entry_point='segnet2_hypertuning_train.py',
    framework_version='1.4.0',
    role=role,
    sagemaker_session=sagemaker_session,
    output_path=s3_output_location,
    train_instance_type='ml.p2.xlarge',
    train_instance_count=2,
    hyperparameters={'epochs': 15, 'backend': 'gloo', 'batch_size': 8},
)

In [13]:
# Only the learning rate is searched in this attempt.
# Batch size is not tested: a batch size of 32 turned out to be too much for
# the GPU, and the training instance did not have enough memory to handle
# decently sized batches. (The candidate batch sizes had been changed to
# multiples of 4 instead of powers of 2 to keep the same number of options.)
hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.001, 0.1)}


In [14]:
# Objective for the tuner: minimise the metric named 'average test loss'.
# The metric value is the capture group of the regex below, matched against
# lines of the form "Test set: Average loss: <number>".
objective_type = 'Minimize'
objective_metric_name = 'average test loss'
metric_definitions = [
    {
        'Name': 'average test loss',
        'Regex': 'Test set: Average loss: ([0-9\\.]+)',
    },
]

In [15]:
# Tuning job definition: up to 20 training jobs in total, 2 in parallel.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=20,
    max_parallel_jobs=2,
)

In [16]:
# Start the tuning job with the training data bound to the 'train' channel;
# wait=False is passed so the cell is not meant to block on the job.
tuner.fit(
    inputs={'train': train_data},
    wait=False,
    logs='All',
)

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


# Result
* The hypertuning job failed with the cause being "CUDA out of memory"
* We require larger instances
* Current quotas bar us from accessing any of the larger instances
* We are in the process of requesting a quota increase; an email has been sent and we are waiting for a response

# Second Attempt
* The quota increase has been approved
* Now the hypertuning will be tried again
* The purpose of this is to test the hypertuning function before it is applied to the smoke segmentation model

In [9]:
############# Initialise the PyTorch training estimator object ###########

# Second-attempt estimator: one ml.p3.2xlarge instance per training job.
# batch_size is not fixed here; it is one of the tuned hyperparameters
# (see hyperparameter_ranges).
# Spot training would be enabled by adding train_use_spot_instances=True, but
# the account is not allowed to use spot instances.
estimator = PyTorch(
    entry_point='segnet2_hypertuning_train.py',
    framework_version='1.4.0',
    role=role,
    sagemaker_session=sagemaker_session,
    output_path=s3_output_location,
    train_instance_type='ml.p3.2xlarge',
    train_instance_count=1,
    hyperparameters={'epochs': 15, 'backend': 'gloo'},
)

In [10]:
# Hyperparameters to be tested and the values each may take:
# a continuous learning rate and a categorical choice of batch size.
hyperparameter_ranges = {
    'learning_rate': ContinuousParameter(0.001, 0.1),
    'batch_size': CategoricalParameter([4, 8, 16, 32]),
}


In [11]:
# Required set-up for hypertuning: minimise the metric named
# 'average test loss'. The metric value is the capture group of the regex
# below, matched against lines of the form "Test set: Average loss: <number>".
objective_type = 'Minimize'
objective_metric_name = 'average test loss'
metric_definitions = [
    {
        'Name': 'average test loss',
        'Regex': 'Test set: Average loss: ([0-9\\.]+)',
    },
]

In [12]:
# Initialise the hyperparameter tuning object: up to 20 training jobs in
# total, 2 in parallel.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=20,
    max_parallel_jobs=2,
)

In [13]:
# Run the tuning with the training data bound to the 'train' channel;
# wait=False is passed so the cell is not meant to block on the job.
tuner.fit(
    inputs={'train': train_data},
    wait=False,
    logs='All',
)

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
