In [12]:
import boto3
import sagemaker
from sagemaker import get_execution_role
# from sagemaker.image_uris import retrieve
from sagemaker.s3 import *
import sys
import subprocess
import os


# TODO: Port the training job to Sagemaker 2.0. For now, this works fine. 
if int(sagemaker.__version__.split('.')[0]) == 2:
    !{sys.executable} -m pip install sagemaker==1.72.0
    print("Installing previous SageMaker Version. Please restart the kernel")
else:
    print("Version is good")

role = get_execution_role()


sess = sagemaker.Session(default_bucket=None)
region = boto3.session.Session().region_name
print("Region = {}".format(region))

sm = boto3.Session().client('sagemaker')


Version is good
Region = eu-central-1


In [13]:

# Container images and instance types for the preprocessing and training jobs.
# Valid inputs for image lookups are described at
#   https://sagemaker.readthedocs.io/en/stable/api/utility/image_uris.html
# Registry paths of the AWS deep-learning containers are listed at
#   https://github.com/aws/deep-learning-containers/blob/master/available_images.md
prefix = "763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-training"

# Preprocessing: CPU image on a general-purpose instance.
PREPROCESS_INSTANCE = "ml.m5.xlarge"
PREPROCESSING_IMAGE = ":".join((prefix, "1.6.0-cpu-py36-ubuntu16.04"))

# Training: GPU image on a GPU instance.
TRAINING_INSTANCE = "ml.g4dn.xlarge"
TRAINING_IMAGE = ":".join((prefix, "1.6.0-gpu-py36-cu110-ubuntu16.04"))

print("Preprocessing image: ", PREPROCESSING_IMAGE)
print("Training image: ", TRAINING_IMAGE)



Preprocessing image:  763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-training:1.6.0-cpu-py36-ubuntu16.04
Training image:  763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu110-ubuntu16.04


In [14]:
!pip install sagemaker-experiments 
from sagemaker.analytics import ExperimentAnalytics
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [20]:
# --- Buckets -----------------------------------------------------------------
original_data_bucket = "treetracker-training-images"
sagemaker_local_bucket = sess.default_bucket()  # Alternatively you can use our custom bucket here.

# --- Key prefixes ------------------------------------------------------------
# NOTE: this rebinds `prefix`, which an earlier cell used for the ECR repository path.
prefix = 'sagemaker-modelmonitor'  # use this prefix to store all files pertaining to this workshop.

dataprefix = f"{prefix}/data"
traindataprefix = f"{prefix}/train_data"
testdataprefix = f"{prefix}/test_data"
testdatanolabelprefix = f"{prefix}/test_data_no_label"
trainheaderprefix = f"{prefix}/train_headers"

dataset_key = "imnet"  # use this to restrict to a particular directory
train_key, validation_key, test_key = "train", "validation", "test"

# --- Full S3 URIs (each ends with a trailing slash) ---------------------------
s3_prefix_uri = 's3://{}/{}/'.format
s3_raw = s3_prefix_uri(original_data_bucket, dataset_key)
sagemaker_train = s3_prefix_uri(sagemaker_local_bucket, train_key)
sagemaker_validation = s3_prefix_uri(sagemaker_local_bucket, validation_key)
sagemaker_test = s3_prefix_uri(sagemaker_local_bucket, test_key)

# Export the split locations as environment variables so that `!` shell
# commands in later cells can reference them.
os.environ.update({
    "SAGEMAKER_VALIDATION": sagemaker_validation,
    "SAGEMAKER_TRAIN": sagemaker_train,
    "SAGEMAKER_TEST": sagemaker_test,
})

In [21]:
# Empty the validation/train/test S3 prefixes (presumably so the preprocessing
# job writes its outputs into clean locations — confirm intent).
# WARNING: destructive. `--recursive` removes every object under each prefix, and
# `--quiet` hides the list of deleted objects.
# The $SAGEMAKER_* names are the environment variables exported via os.environ
# in the cell that builds the S3 URIs, so that cell must have been run first.
!aws s3 rm $SAGEMAKER_VALIDATION --recursive --quiet
!aws s3 rm $SAGEMAKER_TRAIN --recursive --quiet
!aws s3 rm $SAGEMAKER_TEST --recursive --quiet


In [22]:
# TODO: Figure out preprocessing instance jobs
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor

# Input: the raw dataset prefix, placed under /opt/ml/processing/raw/ for the job.
pre_input = [
    ProcessingInput(
        input_name="raw",
        source=s3_raw,
        destination="/opt/ml/processing/raw/",
    )
]

# Outputs: one per split, given as (directory inside the job, S3 destination).
pre_output = [
    ProcessingOutput(source=container_dir, destination=s3_destination)
    for container_dir, s3_destination in (
        ("/opt/ml/processing/train/", sagemaker_train),
        ("/opt/ml/processing/validation/", sagemaker_validation),
        ("/opt/ml/processing/test/", sagemaker_test),
    )
]

# Processor that runs a Python script inside the preprocessing image on a
# single instance.
script_processor = ScriptProcessor(
    role=role,
    image_uri=PREPROCESSING_IMAGE,
    command=["python"],
    instance_type=PREPROCESS_INSTANCE,
    instance_count=1,
    base_job_name="preprocessing-test",
    max_runtime_in_seconds=7200,
)

preprocessing_script = "preprocessing_p1.py"  # Put path to preprocessing script here

In [23]:
# Launch the preprocessing job with the inputs/outputs declared above.
# No extra command-line arguments are passed: the script's default arguments
# should work for now.
script_processor.run(
    code=preprocessing_script,
    inputs=pre_input,
    outputs=pre_output,
    arguments=None,
)

INFO:sagemaker:Creating processing-job with name preprocessing-test-2021-01-19-00-42-58-482



Job Name:  preprocessing-test-2021-01-19-00-42-58-482
Inputs:  [{'InputName': 'raw', 'S3Input': {'S3Uri': 's3://treetracker-training-images/imnet/', 'LocalPath': '/opt/ml/processing/raw/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-eu-central-1-053061259712/preprocessing-test-2021-01-19-00-42-58-482/input/code/preprocessing_p1.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'S3Output': {'S3Uri': 's3://sagemaker-eu-central-1-053061259712/train/', 'LocalPath': '/opt/ml/processing/train/', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'output-2', 'S3Output': {'S3Uri': 's3://sagemaker-eu-central-1-053061259712/validation/', 'LocalPath': '/opt/ml/processing/validation/', 'S3UploadMode': 'E

In [None]:
## TODO: Add step for RecordIO format 

In [32]:
from sagemaker.pytorch import PyTorch

# Training job definition: the framework version, the entry-point script, the
# instance type/count to train on, and the hyperparameters all live here.
# All of this information is handed to the training instance as environment
# variables; inside the script you can read either those variables or the
# corresponding arguments.

# TODO: add metric monitoring via CloudWatch
# https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html
estimator = PyTorch(
    entry_point='mobilenet_v2.py',
    role=role,
    framework_version='1.6.0',
    train_instance_count=1,
    train_instance_type=TRAINING_INSTANCE,
    hyperparameters=dict(
        epochs=5,
        backend='gloo',
        train_split=0.7,
        log_interval=200,
    ),
)


In [None]:
 # Map each input channel name to the S3 prefix written by the preprocessing job,
 # then start the training job with those channels.
 input_channels = {
     "training": sagemaker_train,
     "validation": sagemaker_validation,
     "test": sagemaker_test,
 }
 estimator.fit(input_channels)

INFO:sagemaker:Creating training-job with name: pytorch-training-2021-01-19-03-37-36-836


2021-01-19 03:37:37 Starting - Starting the training job...
2021-01-19 03:37:39 Starting - Launching requested ML instances......
2021-01-19 03:38:44 Starting - Preparing the instances for training...
2021-01-19 03:39:26 Downloading - Downloading input data.........................................................
2021-01-19 03:49:08 Training - Downloading the training image..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-01-19 03:49:23,110 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-01-19 03:49:23,130 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-01-19 03:49:24,565 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-01-19 03:49:24,956 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additiona