# First Attempt at Training Script for SegNet
* This notebook marks the first attempt to train one of the segmentation models in SageMaker
* SageMaker gives you the advantage of being able to send jobs to other instances, so training runs independently of the current kernel
* Previous training attempts had been interrupted and their progress lost

In [1]:
############## Set Up ################

import sagemaker
from sagemaker.pytorch import PyTorch
##yan - training on local instance to see what the problem is  
import boto3
import os
from sagemaker.local import LocalSession
from sagemaker.debugger import rule_configs, DebuggerHookConfig, CollectionConfig

# --- Session ---------------------------------------------------------------
# Use a local-mode session so the training container runs on this instance.
# The 'local_code' flag is switched on in the session's config
# (presumably so the entry-point script is taken straight from disk -- TODO confirm).
# For a remote job use instead: sagemaker_session = sagemaker.Session()
sagemaker_session = LocalSession()
sagemaker_session.config = {
    'local': {'local_code': True},
}

# --- Debugger tensor output ------------------------------------------------
# Make sure to set this to your bucket and location
BUCKET_NAME = 'sagemaker-firefly-model-artifacts'
LOCATION_IN_BUCKET = 'smdebug_debug'

# Full S3 URI built from the two constants above.
s3_bucket_for_tensors = 's3://{BUCKET_NAME}/{LOCATION_IN_BUCKET}'.format(
    BUCKET_NAME=BUCKET_NAME,
    LOCATION_IN_BUCKET=LOCATION_IN_BUCKET,
)

# --- Account resources -----------------------------------------------------
# Default bucket of the session and the execution role handed to the estimator.
bucket = sagemaker_session.default_bucket()
# prefix = 'sagemaker/DEMO-pytorch-mnist'
role = sagemaker.get_execution_role()

In [2]:
############# Path Set Up #################
# Location of the training data. A file:// URI points at the dataset on this
# instance's disk; the S3 copy is kept below as the alternative.
# train_data = 's3://sagemaker-firefly-model-data/Cloud Segmentation Data/38-Cloud_training'
train_data = 'file:///home/ec2-user/SageMaker/Dataset/CloudSeg/38-Cloud_training'

# No separate validation channel: the training script currently does this split.
# validation_data = 's3://{}/{}/{}'.format(bucket, prefix, 'validation')

# Location where the model output is saved.
s3_output_location = 's3://sagemaker-firefly-model-artifacts'

In [3]:
# Echo the training-data URI so the value passed to the job is visible in the notebook.
train_data

'file:///home/ec2-user/SageMaker/Dataset/CloudSeg/38-Cloud_training'

In [16]:
############# Initialise the PyTorch training estimator object ###########

#train_instance_type='ml.c4.4xlarge'
# Build the PyTorch estimator that runs 'Refactored_segnet2_train_script.py' in
# local mode (train_instance_type='local') and writes its output under
# s3_output_location. A Debugger hook is attached that saves tensors to
# s3_bucket_for_tensors.
estimator = PyTorch(entry_point='Refactored_segnet2_train_script.py',
                    role=role,
                    framework_version='1.4.0',
                    train_instance_count=1,
                    train_instance_type='local',
                    output_path=s3_output_location,
                    # Hand over the LocalSession configured in the set-up cell.
                    # Without this argument the estimator creates its own session
                    # and the {'local': {'local_code': True}} config is never used.
                    sagemaker_session=sagemaker_session,
                    hyperparameters={
                        'epochs': 5,
                        'learning_rate': 0.001,
                        'batch_size': 8
                    },
                    debugger_hook_config=DebuggerHookConfig(
                        s3_output_path=s3_bucket_for_tensors,  # Required
                        collection_configs=[
                            CollectionConfig(
                                name="conv0_tensors",
                                parameters={
                                    # Must be a valid regular expression: a bare "*"
                                    # has nothing to repeat and fails to compile,
                                    # ".*" matches every tensor name.
                                    "include_regex": ".*",
                                    # Save the matched tensors every 100 steps.
                                    "save_interval": "100"
                                }
                            )
                        ]
                    )
                   )
# estimator = PyTorch(entry_point='segnet2_train_script.py',
#                     role=role,
#                     framework_version='1.4.0',
#                     train_instance_count=1,
#                     train_instance_type='local',
#                     output_path=s3_output_location,
#                     sagemaker_session=sagemaker_session,
#                     hyperparameters={
#                         'epochs': 5,
#                         'learning_rate':0.001,
#                         'backend': 'gloo'
#                     },
#                     debugger_hook_config = DebuggerHookConfig(
#                         s3_output_path=s3_bucket_for_tensors,  # Required
#                         collection_configs=[
#                             CollectionConfig(
#                                 name="conv0_tensors",
#                                 parameters={
#                                     "include_regex": "*",
#                                     "save_interval": "100"
#                                 }
#                             )
#                         ]
#                     )
#                    )

# add train_use_spot_instances = True for spot training

In [17]:
############## Run the training ################
# Start the training job, supplying `train_data` as the 'train' input channel.
# Blocking variant, kept for reference:
# estimator.fit({'train': train_data})
# wait=False asks fit() to return without waiting for the job to finish.
# NOTE(review): the estimator uses train_instance_type='local' and the container
# log is streamed into this cell's output, so the job may in fact run in the
# foreground here -- confirm whether wait=False has any effect in local mode.
estimator.fit({'train': train_data}, wait=False, logs='All') # intended: job runs in background

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


Creating tmpt920r_pb_algo-1-y7bzy_1 ... 
[1BAttaching to tmpt920r_pb_algo-1-y7bzy_12mdone[0m
[36malgo-1-y7bzy_1  |[0m 2020-09-24 00:33:12,565 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training
[36malgo-1-y7bzy_1  |[0m 2020-09-24 00:33:12,568 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-y7bzy_1  |[0m 2020-09-24 00:33:12,577 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36malgo-1-y7bzy_1  |[0m 2020-09-24 00:33:12,579 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36malgo-1-y7bzy_1  |[0m 2020-09-24 00:33:13,767 sagemaker-containers INFO     Module default_user_module_name does not provide a setup.py. 
[36malgo-1-y7bzy_1  |[0m Generating setup.py
[36malgo-1-y7bzy_1  |[0m 2020-09-24 00:33:13,767 sagemaker-containers INFO     Generating setup.cfg
[36malgo-1-y7bzy_1  |[0m 2020-09-24 00:33:13,767 sagemaker-containers INFO     

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def get_data(trial, tname, batch_index, steps_range, mode=modes.GLOBAL):
    """Collect one slice of a saved tensor for every requested step.

    NOTE(review): `modes` is not imported in any visible cell; presumably it
    comes from the smdebug package -- confirm it is in scope before this cell
    runs, otherwise the `def` itself raises NameError.

    Args:
        trial: object exposing ``tensor(name)``; the returned tensor must
            provide ``value(step_num=..., mode=...)``.
        tname: name of the tensor to look up on ``trial``.
        batch_index: index applied to the first axis of each step's value.
        steps_range: iterable of step numbers to read.
        mode: forwarded unchanged to ``tensor.value``.

    Returns:
        list with one entry per step, each being
        ``value[batch_index][0]`` of that step's tensor value
        (presumably the first channel of one sample -- TODO confirm).
    """
    saved_tensor = trial.tensor(tname)
    return [
        saved_tensor.value(step_num=step, mode=mode)[batch_index][0]
        for step in steps_range
    ]

def create_plots(steps_range):
    """Create a single-row figure with one tick-less axes per step.

    Args:
        steps_range: sized collection of steps; only its length is used.

    Returns:
        The ``(fig, axs)`` pair produced by ``plt.subplots``.
    """
    n_panels = len(steps_range)
    return plt.subplots(
        nrows=1,
        ncols=n_panels,
        constrained_layout=True,
        figsize=(2 * n_panels, 2),  # 2 units wide per panel, 2 units tall
        subplot_kw={'xticks': [], 'yticks': []},  # hide tick marks on every panel
    )

def plot_tensors(trial, layer, batch_index, steps_range):
    """Show, side by side, the saved tensor slice of ``layer`` at each step.

    Does nothing when ``steps_range`` is empty.

    Args:
        trial: trial object passed through to ``get_data``.
        layer: tensor name passed through to ``get_data``.
        batch_index: batch index passed through to ``get_data``.
        steps_range: sized collection of step numbers; one panel is drawn per step.
    """
    if len(steps_range) == 0:
        return

    fig, axs = create_plots(steps_range)
    images = get_data(trial, layer, batch_index, steps_range)

    # `axs` is either an ndarray of axes or a single axes object; wrap the
    # latter so both cases can be iterated the same way.
    panels = axs.flat if isinstance(axs, np.ndarray) else np.array([axs])

    for panel, image, step in zip(panels, images, steps_range):
        panel.imshow(image, cmap='gray')
        panel.set_title(str(step))  # label each panel with its step number
    plt.show()