In [1]:
# Notebook Instance Imports
import os
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.debugger import ProfilerConfig, FrameworkProfile
import time
import io
import json
import pandas as pd
    
profiler_config=ProfilerConfig(
    framework_profile_params=FrameworkProfile()
)

# Identify Data Location

In [2]:
# S3 directories

training_files = "s3://canopy-production-ml/training_inputs/"
# val_file = "s3://canopy-production-ml/training_inputs/val_labels.csv"
# labels_file = "s3://canopy-production-ml/training_inputs/labels.json"

inputs = {"data":training_files}
hyperparameters = {"wandb_key":"abfa0dec9fc06fbfa6392496f40a22a8d47e58cf","epochs":"10","s3_chkpt_dir":"ckpt"}

print(inputs)
print(hyperparameters)

{'data': 's3://canopy-production-ml/training_inputs/'}
{'wandb_key': 'abfa0dec9fc06fbfa6392496f40a22a8d47e58cf', 'epochs': '10', 's3_chkpt_dir': 'ckpt'}


# Custom Docker for Training

In [3]:
%cd docker_test_folder

/home/ec2-user/SageMaker/cb_feature_detection/sagemaker_staging/docker_test_folder


In [4]:
! aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [5]:
! docker build -t tf-custom-container-test .

Sending build context to Docker daemon  52.74kB
Step 1/5 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.4.1-cpu-py37-ubuntu18.04
 ---> a6b6eae32037
Step 2/5 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 990d3b48ef57
Step 3/5 : RUN pip3 install rasterio wandb tensorflow-addons
 ---> Using cache
 ---> 959b07bf83e3
Step 4/5 : COPY cb_feature_train1_aws.py /opt/ml/code/train.py
 ---> Using cache
 ---> 92d86bcceb4f
Step 5/5 : ENV SAGEMAKER_PROGRAM train.py
 ---> Using cache
 ---> 894d842355a0
Successfully built 894d842355a0
Successfully tagged tf-custom-container-test:latest


# For Local Container test - Success

In [None]:
from sagemaker.estimator import Estimator

estimator = Estimator(image_uri='tf-custom-container-test',
                      role='arn:aws:iam::963659202518:role/service-role/AmazonSageMaker-ExecutionRole-20210306T191865',
                      instance_count=1,
#                       instance_type='ml.p3.2xlarge',
                      instance_type='local',
                     hyperparameters=hyperparameters)


estimator.fit(inputs)

Creating wgw7vx2rhr-algo-1-galxa ... 
Creating wgw7vx2rhr-algo-1-galxa ... done
Attaching to wgw7vx2rhr-algo-1-galxa
[36mwgw7vx2rhr-algo-1-galxa |[0m 2021-03-09 23:30:28.114872: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
[36mwgw7vx2rhr-algo-1-galxa |[0m 2021-03-09 23:30:28.119315: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
[36mwgw7vx2rhr-algo-1-galxa |[0m 2021-03-09 23:30:28.373538: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
[36mwgw7vx2rhr-algo-1-galxa |[0m 2021-03-09 23:30:33,761 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
[36mwgw7vx2rhr-algo-1-galxa |[0m 2021-03-09 23:30:33,762 sagemaker-training-toolkit INFO     Failed to parse hyperparameter wandb_key value abfa0dec9

# Publish Container to ECR

In [9]:
%%sh

# Specify an algorithm name
algorithm_name=pc-tf-custom-container-test

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.

aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
if [ $? -ne 0 ]
then
aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly

$(aws ecr get-login --region ${region} --no-include-email)

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t ${algorithm_name} .
docker tag ${algorithm_name} ${fullname}

docker push ${fullname}

Login Succeeded
Sending build context to Docker daemon  50.69kB
Step 1/5 : FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.4.1-cpu-py37-ubuntu18.04
 ---> a6b6eae32037
Step 2/5 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 990d3b48ef57
Step 3/5 : RUN pip3 install rasterio keras wandb tensorflow-addons
 ---> Using cache
 ---> 3836f74d3886
Step 4/5 : COPY cb_feature_train1_aws.py /opt/ml/code/train.py
 ---> Using cache
 ---> cc5a338916a9
Step 5/5 : ENV SAGEMAKER_PROGRAM train.py
 ---> Using cache
 ---> 422176b2a1d4
Successfully built 422176b2a1d4
Successfully tagged pc-tf-custom-container-test:latest
The push refers to repository [963659202518.dkr.ecr.us-east-1.amazonaws.com/pc-tf-custom-container-test]
772a81ceacea: Preparing
bf38be370bde: Preparing
0859a4046b5c: Preparing
13a6259d0a5f: Preparing
7c9b17058a17: Preparing
e64228f78c01: Preparing
a4b459577f83: Preparing
2c6530437d13: Preparing
24a74e1f08ab: Preparing
1dc8a537c9f8: Preparing
805fb593f0a3: 

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [15]:
import boto3

account_id = boto3.client('sts').get_caller_identity().get('Account')
ecr_repository = 'pc-tf-custom-container-test'
tag = ':latest'

region = boto3.session.Session().region_name

uri_suffix = 'amazonaws.com'
if region in ['cn-north-1', 'cn-northwest-1']:
    uri_suffix = 'amazonaws.com.cn'

image_uri = '{}.dkr.ecr.{}.{}/{}'.format(account_id, region, uri_suffix, ecr_repository + tag)

image_uri
# This should return something like
# 111122223333.dkr.ecr.us-east-2.amazonaws.com/sagemaker-byoc-test:latest

'963659202518.dkr.ecr.us-east-1.amazonaws.com/pc-tf-custom-container-test:latest'

# For ECR Run

## Identify Data Location

In [12]:
# S3 directories

training_files = "s3://canopy-production-ml/training_inputs/"
# val_file = "s3://canopy-production-ml/training_inputs/val_labels.csv"
# labels_file = "s3://canopy-production-ml/training_inputs/labels.json"

# inputs = {"data":training_files}
# hyperparameters = {"wandb_key":"abfa0dec9fc06fbfa6392496f40a22a8d47e58cf"}

print(inputs)
print(hyperparameters)

{'data': 's3://canopy-production-ml/training_inputs/'}
{'wandb_key': 'abfa0dec9fc06fbfa6392496f40a22a8d47e58cf', 'epochs': '10'}


In [13]:
profiler_config=ProfilerConfig(
    framework_profile_params=FrameworkProfile(start_unix_time=int(time.time()), duration=600)
)

In [16]:
# create estimator
estimator = Estimator(image_uri=image_uri,
                       instance_type='ml.p3.16xlarge',
                       output_path='s3://canopy-production-ml-output',
                       base_job_name='pc-tf-custom-container-test-job',
                       instance_count=1,
                       role=get_execution_role(), # Passes to the container the AWS role that you are using on this notebook
                       py_version='py37',
                     profiler_config=profiler_config,
                     checkpoint_s3_uri='s3://canopy-production-ml-output/ckpt/',
                     hyperparameters=hyperparameters)

In [17]:
estimator.fit(inputs)

2021-03-09 16:19:36 Starting - Starting the training job...
2021-03-09 16:19:59 Starting - Launching requested ML instancesProfilerReport-1615306776: InProgress
.........
2021-03-09 16:21:25 Starting - Preparing the instances for training.........
2021-03-09 16:23:02 Downloading - Downloading input data...
2021-03-09 16:23:21 Training - Downloading the training image......
2021-03-09 16:24:25 Training - Training image download completed. Training in progress.[34m2021-03-09 16:24:22.853047: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2021-03-09 16:24:22.858417: I tensorflow/core/profiler/internal/smprofiler_config_reader.cc:123] PID of the process that is writing to the timeline : 1[0m
[34m2021-03-09 16:24:22.859574: I tensorflow/core/profiler/internal/smprofiler_timeline.cc:121] SageMaker Profiler Timeline Writer read the following config parameters :[0m
[34m2021-03-09 16:24:22.859586: I tensorflow/core/profiler/int

# Profiler Viewing

In [63]:
estimator.output_path

's3://canopy-production-ml-output'

In [64]:
estimator.latest_training_job.job_name

'pc-tf-custom-container-test-job-2021-03-09-06-30-02-520'

In [65]:
rule_output_path = estimator.output_path + "/"+ estimator.latest_training_job.job_name + "/rule-output"

In [66]:
rule_output_path

's3://canopy-production-ml-output/pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output'

In [67]:
! aws s3 ls {rule_output_path} --recursive

2021-03-09 06:45:11     350803 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-report.html
2021-03-09 06:45:11     202858 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-report.ipynb
2021-03-09 06:45:07        192 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-reports/BatchSize.json
2021-03-09 06:45:07      53300 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-reports/CPUBottleneck.json
2021-03-09 06:45:07        126 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-reports/Dataloader.json
2021-03-09 06:45:07        130 pc-tf-custom-container-test-job-2021-03-09-06-30-02-520/rule-output/ProfilerReport-1615271402/profiler-output/profiler-r

In [24]:
def read_s3_obj(s3_key):
    s3 = boto3.resource('s3')
    obj = s3.Object('canopy-production-ml-output', s3_key)
    obj_bytes = io.BytesIO(obj.get()['Body'].read())
    return obj_bytes

In [33]:
key = 'pc-tf-custom-container-test-job-2021-03-07-19-37-23-579/rule-output/ProfilerReport-1615145843/profiler-output/profiler-reports/Dataloader.json'
data = json.load(read_s3_obj(key))

In [34]:
data

{'RuleTriggered': 0,
 'Violations': 0,
 'Details': {},
 'Datapoints': 0,
 'RuleParameters': 'min_threshold:70\nmax_threshold:200'}

# Search for H5 Files

In [27]:
import boto3
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('canopy-production-ml-output')
files = my_bucket.objects.all()
file_list = []
for file in files:
    if file.key.endswith('.h5'):
         file_list.append(file.key)

In [28]:
file_list

[]

In [29]:
!pwd

/home/ec2-user/SageMaker/cb_feature_detection/sagemaker_staging/docker_test_folder


In [37]:
!ls

cb_feature_Launch_Training_Job2.ipynb  docker_test_folder   labels_test_v1.csv
cb_feature_Launch_Training_Job3.ipynb  entry_point_test.py  test_script.py
cb_feature_train1_aws.py	       labels.json	    val_labels.csv


In [30]:
df = pd.read_csv("/home/ec2-user/SageMaker/cb_feature_detection/sagemaker_staging/val_labels.csv")

In [33]:
from io import StringIO # python3; python2: BytesIO 
import boto3

bucket = 'canopy-production-ml-output' # already created on S3
csv_buffer = StringIO()
csv_buffer = df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'ckpt/test.csv').put(Body=csv_buffer.getvalue())

AttributeError: 'NoneType' object has no attribute 'getvalue'

In [11]:
help(sagemaker.session)

Help on module sagemaker.session in sagemaker:

NAME
    sagemaker.session - Placeholder docstring

CLASSES
    builtins.object
        LogState
        Session
    
    class LogState(builtins.object)
     |  Placeholder docstring
     |  
     |  Data descriptors defined here:
     |  
     |  __dict__
     |      dictionary for instance variables (if defined)
     |  
     |  __weakref__
     |      list of weak references to the object (if defined)
     |  
     |  ----------------------------------------------------------------------
     |  Data and other attributes defined here:
     |  
     |  COMPLETE = 5
     |  
     |  JOB_COMPLETE = 4
     |  
     |  STARTING = 1
     |  
     |  TAILING = 3
     |  
     |  WAIT_IN_PROGRESS = 2
    
    class Session(builtins.object)
     |  Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
     |  
     |  This class provides convenient methods for manipulating entities and resources that Amazon
    