### Common Setup

In [1]:
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

import json
import time
import urllib
import pandas as pd



In [2]:
# --- Cluster endpoints ------------------------------------------------------
# Scheme, master host and the REST ports of the conductor and deep-learning services.
protocol = 'https'
master_host = 'p10a117.pbm.ihost.com'
dli_rest_port = '9243'
sc_rest_port = '8643'

# Both base URLs share the scheme://host:port prefix and differ only in the
# port and the service segment of the path.
_rest_url_template = '{}://{}:{}/platform/rest/{}/v1'
sc_rest_url = _rest_url_template.format(protocol, master_host, sc_rest_port, 'conductor')
dl_rest_url = _rest_url_template.format(protocol, master_host, dli_rest_port, 'deeplearning')

print (sc_rest_url)
print (dl_rest_url)

# --- User login details -----------------------------------------------------
# Placeholder credentials; `myauth` is handed to every REST call as `auth=`.
wmla_user = 'id'
wmla_pwd = 'pwd'
myauth = (wmla_user, wmla_pwd)

# --- Spark instance group ---------------------------------------------------
sigName = 'b0p036a-dliauto'

# --- REST call variables ----------------------------------------------------
# Headers sent with every request, and the session object reused by the cells below.
commonHeaders = dict(Accept='application/json')
req = requests.Session()

https://p10a117.pbm.ihost.com:8643/platform/rest/conductor/v1
https://p10a117.pbm.ihost.com:9243/platform/rest/deeplearning/v1


### Health Check

Check whether any HPO tasks already exist and verify the platform health.

Rest API: **GET platform/rest/deeplearning/v1/hypersearch**
- Description: Get all the HPO tasks that the logged-in user can access.
- OUTPUT: A list of HPO tasks, each in the same format, which is described in the API documentation.

In [3]:
# List every HPO task visible to the login user together with its state.
getTuneStatusUrl = dl_rest_url + '/hypersearch'
print ('getTuneStatusUrl: %s' %getTuneStatusUrl)
r = req.get(getTuneStatusUrl, headers=commonHeaders, verify=False, auth=myauth)

if r.ok:
    # Decode the response body once and reuse it for both the emptiness check and the listing.
    hpo_tasks = r.json()
    if len(hpo_tasks) == 0:
        print('There is no hpo task been created')
    for hpo_task in hpo_tasks:
        print('Hpo task: %s, State: %s'%(hpo_task['hpoName'], hpo_task['state']))
        # Uncomment to also dump the best result recorded for each task:
        #print('Best:%s'%json.dumps(hpo_task.get('best'), sort_keys=True, indent=4))
else:
    print('check hpo task status failed: code=%s, %s'%(r.status_code, r.content))

getTuneStatusUrl: https://p10a117.pbm.ihost.com:9243/platform/rest/deeplearning/v1/hypersearch
Hpo task: b0p052aa-hpo-5380630135165402, State: FAILED
Hpo task: b0p052aa-hpo-5385332797413203, State: FAILED
Hpo task: kelvinl-hpo-6056618011742928, State: FAILED
Hpo task: kelvinl-hpo-6059288540719979, State: FAILED
Hpo task: kelvinl-hpo-6059602449155012, State: FINISHED
Hpo task: kelvinl-hpo-6060237740672447, State: FINISHED
Hpo task: kelvinl-hpo-6060511715229259, State: FINISHED
Hpo task: kelvinl-hpo-6062021439803581, State: FINISHED
Hpo task: kelvinl-hpo-6126145272048604, State: FINISHED
Hpo task: kelvinl-hpo-6126632704081853, State: FINISHED
Hpo task: kelvinl-hpo-6126795548204361, State: FINISHED
Hpo task: kelvinl-hpo-6131917238944766, State: FAILED
Hpo task: kelvinl-hpo-6132240163171963, State: FINISHED
Hpo task: kelvinl-hpo-6134136415189917, State: FINISHED
Hpo task: kelvinl-hpo-6137509396004738, State: FINISHED
Hpo task: b0p062ae-hpo-6149969004259192, State: FAILED
Hpo task: b0p062ae

### Launch a HPO task

#### Model file update to Run HPO

Model changes are required from two perspectives:
- Inject hyper-parameters for the sub-training during search
- Retrieve sub-training result metric

##### Model update part 1 - Inject hyper-parameters

The hyper-parameters will be supplied in a JSON file called **config.json**, located in the current working directory, which can be read directly as in the following example snippet.

<pre>
hyper_params = json.loads(open("<b>config.json</b>").read())
learning_rate = float(hyper_params.get("<b>learning_rate</b>", "0.01"))
</pre>

After this, you can use these hyper-parameters during model training. The **hyper-parameter name** and **value** type are defined in the search space part of the REST call body when launching a new HPO task.

##### Model update part 2 - Retrieve sub-training result metric

At the end of your training run, your code will need to create a file called **val_dict_list.json** with test metrics generated during training. These metrics will be used by the search algorithm to propose new sets of hyper-parameters. Please note that **val_dict_list.json** should be created under the result directory which can be retrieved through the environment variable **RESULT_DIR**.

<pre>
with open('{}/val_dict_list.json'.format(os.environ['<b>RESULT_DIR</b>']), 'w') as f:
    json.dump(test_metrics, f)
</pre>

The content of **val_dict_list.json** will look something like the example below. **step** is optional and denotes the training iteration or epoch; either **loss** or **accuracy** can be the name of the target metric to optimize, and at least one metric needs to be included. The specific name of the metric to optimize (minimize or maximize) is defined in the body of the REST call when launching a new HPO task.

```
[
{"step": 1, "loss": 0.2487, "accuracy": 0.4523},
{"step": 2, "loss": 0.1487, "accuracy": 0.5523},
{"step": 3, "loss": 0.1087, "accuracy": 0.6523},
...
]
```

#### Launch HPO task

REST API: **POST /platform/rest/deeplearning/v1/hypersearch**
- Description: Start a new HPO task
- Content-type: Multi-Form
- Multi-Form Data:
  - files: Model files tar package, ending with `.modelDir.tar`
  - form-field: {'data': 'String representation of the input parameters used to start the HPO task; we call it **hpo_input** and show its specification later'}


#### Model file update to Run HPO

##### Package model files for training

Package the updated model files into a tar file ending with `.modelDir.tar`

In [4]:
import tarfile
import tempfile
import os
def make_tarfile(output_filename, source_dir):
    """Pack ``source_dir`` into a gzip-compressed tar archive.

    The directory is stored in the archive under its own base name, so the
    archive always contains a single top-level folder.

    Args:
        output_filename: Path of the archive to create (overwritten if it exists).
        source_dir: Directory to archive; a trailing path separator is tolerated.
    """
    # normpath() strips a trailing separator; without it basename() would
    # return '' and the members would be stored without their top-level folder.
    top_level_name = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=top_level_name)
# Suffix required for the uploaded model archive.
MODEL_DIR_SUFFIX = ".modelDir.tar"

# mkstemp() creates the file atomically and returns its path, unlike the
# deprecated mktemp(), which only returns a name that another process could
# claim before the archive is written.
tempFd, tempFile = tempfile.mkstemp(suffix=MODEL_DIR_SUFFIX)
os.close(tempFd)  # only the path is needed; make_tarfile reopens the file by name

make_tarfile(tempFile, '/gpfs/software/wmla-p10a117/dli_data_fs/models/pytorch_hpo')
print(" tempFile: " + tempFile)

# The handle is deliberately left open: it is passed as `files=` to the POST
# request that submits the HPO task in a later cell.
files = {'file': open(tempFile, 'rb')}

 tempFile: /tmp/tmppzj0aux3.modelDir.tar


##### Construct POST request data

**hpo_input** is a Python dict (or JSON) in the format below; convert it to a string when calling the REST API.

In [5]:
# hpo_input: payload describing the HPO task. It has three sections:
#   modelSpec   - how each sub-training is launched
#   algoDef     - the search algorithm and its limits
#   hyperParams - the hyper-parameters to search and their search space
data =  {
        'modelSpec': # Define the model training related parameters
        {
            # Spark instance group which will be used to run the HPO sub-trainings. The Spark instance group selected
            # here should match the sub-training args; for example, if the sub-training args try to run an EDT job,
            # then a Spark instance group capable of running EDT jobs should be put here.
            'sigName': sigName,

            # These are the arguments we'll pass to the execution engine; they follow the same conventions
            # as the dlicmd.py command line launcher.
            #
            # See:
            #   https://www.ibm.com/support/knowledgecenter/en/SSFHA8_1.2.1/cm/dlicmd.html
            # In this example, the args after --model-dir are all the required parameters for the original model itself.
            #
            # NOTE: the backslash continuations are inside the string literal, so the leading spaces of the
            # continuation lines become part of the argument string.
            'args': '--exec-start PyTorch --cs-datastore-meta type=fs --python-version 3.6\
                     --gpuPerWorker 1 --model-main pytorch_mnist_HPO.py --model-dir pytorch_hpo\
                     --debug-level debug'

        },

        'algoDef': # Define the parameters for the search algorithm
        {
            # Name of the search algorithm, one of Random, Bayesian, Tpe, Hyperband, ExperimentGridSearch
            'algorithm': 'Random',
            # Max running time of the hpo task in minutes, -1 means unlimited
            'maxRunTime': 60,
            # Max number of training jobs to submit for the hpo task, -1 means unlimited
            'maxJobNum': 4,
            # Max number of training jobs to run in parallel, default 1. It depends on both the
            # available resources and whether the search algorithm supports running in parallel. Currently only
            # Random fully supports running in parallel, Hyperband and Tpe support running in parallel in some
            # phases, and Bayesian runs in sequence for now.
            # NOTE(review): the key is spelled 'maxParalleJobNum' — presumably this matches the REST API; confirm
            # against the API doc before "fixing" it.
            'maxParalleJobNum': 2,
            # Name of the target metric that we are trying to optimize when searching hyper-parameters.
            # It is the same metric name that the model writes to val_dict_list.json (model update part 2).
            'objectiveMetric' : 'loss',
            # Strategy for optimizing the hyper-parameters: minimize means to find hyper-parameters that
            # make the above objectiveMetric as small as possible, maximize means the opposite.
            'objective' : 'minimize',
        },

        # Define the hyper-parameters to search and the corresponding search space.
        'hyperParams':
        [
             {
                 # Hyperparameter name, which will be the hyper-parameter key in config.json
                 'name': 'learning_rate',
                 # One of Range, Discrete
                 'type': 'Range',
                 # One of int, double, str
                 # NOTE(review): the value used here is upper-case 'DOUBLE' — confirm the accepted spellings
                 # against the API doc.
                 'dataType': 'DOUBLE',
                 # Lower bound and upper bound when type=range and dataType=double
                 'minDbVal': 0.001,
                 'maxDbVal': 0.1,
                 # Lower bound and upper bound when type=range and dataType=int
                 'minIntVal': 0,
                 'maxIntVal': 0,
                 # Discrete value lists when type=discrete
                 # NOTE(review): the key 'discreateStrVal' looks misspelled — presumably it matches the REST API;
                 # confirm before changing it.
                 'discreteDbVal': [],
                 'discreteIntVal': [],
                 'discreateStrVal': []
                 # Optional step size used to split the Range space. ONLY valid when type is Range.
                 #'step': '0.002',
             }
         ]
    }
# The payload is JSON-encoded and sent as the 'data' form field of the POST request.
mydata={'data':json.dumps(data)}

##### Submit the Post request

Submit the HPO task through the POST call; the HPO task name/id is returned as a string.

In [6]:
# Submit the HPO task: the model archive goes in `files`, the JSON payload in `data`.
startTuneUrl=dl_rest_url + '/hypersearch'

# Reset the task name first so that a failed submission can never leave the
# name of a previously submitted task behind for the following cells to poll.
hpoName = None

# Uploading reads the archive handle to its end; rewind it so that re-running
# this cell sends the whole archive again instead of an empty file.
files['file'].seek(0)

r = req.post(startTuneUrl, headers=commonHeaders, data=mydata, files=files, verify=False, auth=myauth)

if r.ok:
    # On success the response body is the name/id of the new HPO task.
    hpoName = r.json()
    print ('\nModel submitted successfully: {}'.format(hpoName))
else:
    print('\nModel submission failed with code={}, {}'. format(r.status_code, r.content))


Model submitted successfully: kelvinl-hpo-410153859499985


In [7]:
import time

# Seconds to wait between two status polls of the HPO task.
pollIntervalSeconds = 90

getHpoUrl = dl_rest_url +'/hypersearch/'+ hpoName

# json_out stays None when the task cannot be fetched at all, so the final
# summary can be skipped instead of failing on an undefined name.
json_out = None

res = req.get(getHpoUrl, headers=commonHeaders, verify=False, auth=myauth)
if not res.ok:
    print('get hpo task failed: code=%s, %s'%(res.status_code, res.content))
else:
    json_out=res.json()

    # Poll until the task leaves the SUBMITTED/RUNNING states.
    while json_out['state'] in ['SUBMITTED','RUNNING']:
        print('Hpo task %s state %s progress %s%%'%(hpoName, json_out['state'], json_out['progress']))
        time.sleep(pollIntervalSeconds)
        res = req.get(getHpoUrl, headers=commonHeaders, verify=False, auth=myauth)
        if not res.ok:
            # Keep the last good snapshot and try again after the next interval
            # rather than parsing an error body as task status.
            print('get hpo task failed: code=%s, %s'%(res.status_code, res.content))
            continue
        json_out=res.json()

        ####
        ## Collect the application ids of the sub-trainings reported for the task
        ## (a missing or empty 'experiments' entry yields an empty list)
        ###
        Experiment = [experiment['appId'] for experiment in json_out.get('experiments') or []]
        for count, appID in enumerate(Experiment):
            print ('appID: %s,' %appID )
            print ('count: %d' %count)

        ####
        ## Query the state of each of those sub-trainings
        ###
        for appID in Experiment:
            execRes = req.get(dl_rest_url+'/execs/'+appID, auth=myauth, headers=commonHeaders, verify=False)
            # Check the status of this request before decoding its body.
            if not execRes.ok:
                print('get experiment %s failed: code=%s, %s'%(appID, execRes.status_code, execRes.content))
            else:
                print ('Experiement %s state: %s' %(appID, execRes.json()['state']))

# Final summary; only available when the task could be fetched at least once.
if json_out is not None:
    print('Hpo task %s completes with state %s'%(hpoName, json_out['state']))
    print(json.dumps(json_out, indent=4, sort_keys=True))
 


Hpo task kelvinl-hpo-410153859499985 state SUBMITTED progress 0%
appID: kelvinl-410156202978861-879625756,
count: 0
appID: kelvinl-410163463592585-49661814,
count: 1
Experiement kelvinl-410156202978861-879625756 state: RUNNING
Experiement kelvinl-410163463592585-49661814 state: RUNNING
Hpo task kelvinl-hpo-410153859499985 state RUNNING progress 0/4%
appID: kelvinl-410156202978861-879625756,
count: 0
appID: kelvinl-410163463592585-49661814,
count: 1
Experiement kelvinl-410156202978861-879625756 state: RUNNING
Experiement kelvinl-410163463592585-49661814 state: RUNNING
Hpo task kelvinl-hpo-410153859499985 state RUNNING progress 0/4%
appID: kelvinl-410156202978861-879625756,
count: 0
appID: kelvinl-410163463592585-49661814,
count: 1
Experiement kelvinl-410156202978861-879625756 state: RUNNING
Experiement kelvinl-410163463592585-49661814 state: RUNNING
Hpo task kelvinl-hpo-410153859499985 state RUNNING progress 0/4%
appID: kelvinl-410156202978861-879625756,
count: 0
appID: kelvinl-41016346