In [1]:
# Load the nb_black IPython extension; it is expected to reformat each executed
# cell with black (NOTE(review): it also emits a Javascript display object per cell).
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os
import json
import logging
from datetime import datetime

<IPython.core.display.Javascript object>

In [3]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

<IPython.core.display.Javascript object>

In [4]:
# Attach a stream handler to the "botocore.credentials" logger at WARNING level,
# presumably to keep credential-lookup INFO messages out of the cell outputs.
boto3.set_stream_logger("botocore.credentials", logging.WARNING)

<IPython.core.display.Javascript object>

In [5]:
# Create the SageMaker session used by the estimator, and read the AWS region
# its boto session is bound to; the region is reused to look up the training image.
sess = sagemaker.Session()
region = sess.boto_region_name
print(region)

ap-southeast-2


<IPython.core.display.Javascript object>

In [6]:
# IAM role that SageMaker assumes for training and hosting.
# An explicit ARN in the SGMKR_ROLE_ARN environment variable takes precedence;
# when it is unset or empty, fall back to the execution role of the current
# session instead of silently handing None to the estimator.
role_arn = os.getenv("SGMKR_ROLE_ARN") or sagemaker.get_execution_role(
    sagemaker_session=sess
)

<IPython.core.display.Javascript object>

In [7]:
# --- S3 layout: all data and model artefacts live under s3://<bucket_name>/<data_path>/
data_path = "sgmkr_clf_lst"
bucket_name = "sgmkr-course"

# --- Dataset-dependent values, forwarded to the algorithm's hyperparameters
mini_batch_size = 8
nepochs = 10
nimgs_train = 36
nclasses = 3

# --- Training job: name prefix (a timestamp is appended later) and instance type
job_name_prefix = "flowers-clf-ib-resent-"
train_instance_type = "ml.g4dn.xlarge"

<IPython.core.display.Javascript object>

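SageMaker instance pricing (useful when choosing `train_instance_type` and the inference instance type): https://aws.amazon.com/sagemaker/pricing/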

In [8]:
# Look up the region-specific container image of the built-in
# image-classification algorithm.
# The version is pinned to "1": it is the only supported version, and asking
# for "latest" is ignored with a warning (see the recorded output of this cell).
train_image_uri = sagemaker.image_uris.retrieve(
    framework="image-classification",
    region=region,
    version="1",
    image_scope="training",
)
print(train_image_uri)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.
Ignoring unnecessary instance type: None.


544295431143.dkr.ecr.ap-southeast-2.amazonaws.com/image-classification:1


<IPython.core.display.Javascript object>

In [9]:
# S3 location the training job writes its model artefacts to.
s3_output_path = f"s3://{bucket_name}/{data_path}/model_output"

<IPython.core.display.Javascript object>

In [10]:
# Generic estimator wrapping the built-in image-classification container.
clf_estimator = sagemaker.estimator.Estimator(
    # What to run and under which identity/session.
    image_uri=train_image_uri,
    role=role_arn,
    sagemaker_session=sess,
    # Compute resources for the training job.
    instance_type=train_instance_type,
    instance_count=1,
    volume_size=50,
    # Upper bound on the job's runtime; 360000 s is 100 hours.
    max_run=360000,
    # Data is delivered in File mode; artefacts go to the S3 output path.
    input_mode="File",
    output_path=s3_output_path,
)

<IPython.core.display.Javascript object>

In [11]:
# Hyperparameters for the built-in image-classification algorithm.
hyperparameters = {
    # Dataset-dependent values; change them in the configuration cell.
    "num_classes": nclasses,
    "num_training_samples": nimgs_train,
    "epochs": nepochs,
    "mini_batch_size": mini_batch_size,
    # Network and transfer-learning setup.
    "num_layers": 18,
    "use_pretrained_model": 1,
    "precision_dtype": "float32",
    # Input geometry and augmentation.
    "image_shape": "3,224,224",
    "resize": 256,
    "augmentation_type": "crop_color_transform",
    # Optimisation and loss.
    "learning_rate": 0.001,
    "use_weighted_loss": 1,
    "multi_label": 0,
}
clf_estimator.set_hyperparameters(**hyperparameters)

<IPython.core.display.Javascript object>

In [12]:
# Content type documented by the built-in image-classification algorithm for
# image-format (.lst) training; it is used for all four channels, including the
# two .lst annotation channels (which are not JPEG data).
IMAGE_CONTENT_TYPE = "application/x-image"


def _s3_uri(prefix):
    """Return the S3 URI of `prefix` under the dataset root in the bucket."""
    return "s3://{}/{}/{}".format(bucket_name, data_path, prefix)


def _image_channel(s3_uri):
    """Wrap an S3 prefix in a fully replicated, image-format TrainingInput."""
    return sagemaker.inputs.TrainingInput(
        s3_uri,
        distribution="FullyReplicated",
        content_type=IMAGE_CONTENT_TYPE,
        s3_data_type="S3Prefix",
    )


# S3 prefixes holding the images and the .lst annotation files.
s3_train_imgs = _s3_uri("train_imgs")
s3_valid_imgs = _s3_uri("valid_imgs")
s3_train_annot = _s3_uri("train_annots")
s3_valid_annot = _s3_uri("valid_annots")

train_imgs = _image_channel(s3_train_imgs)
valid_imgs = _image_channel(s3_valid_imgs)
train_annot = _image_channel(s3_train_annot)
valid_annot = _image_channel(s3_valid_annot)

# Channel names passed to the training job: images plus their .lst annotations.
data_channels = {
    "train": train_imgs,
    "validation": valid_imgs,
    "train_lst": train_annot,
    "validation_lst": valid_annot,
}

<IPython.core.display.Javascript object>

In [13]:
# Build a unique job name: the prefix plus the current local time at second
# resolution, formatted as YYYY-MM-DD-HH-MM-SS.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
job_name = f"{job_name_prefix}{timestamp}"
print(job_name)

flowers-clf-ib-resent-2023-03-21-16-47-50


<IPython.core.display.Javascript object>

In [15]:
# Launch the training job on the four data channels and stream its logs here.
clf_estimator.fit(data_channels, job_name=job_name, logs=True)

Creating training-job with name: flowers-clf-ib-resent-2023-03-21-16-47-50


2023-03-21 05:48:23 Starting - Starting the training job...
2023-03-21 05:48:48 Starting - Preparing the instances for training......
2023-03-21 05:49:59 Downloading - Downloading input data...
2023-03-21 05:50:24 Training - Downloading the training image...............
2023-03-21 05:52:55 Training - Training image download completed. Training in progress.....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mNvidia gpu devices, drivers and cuda toolkit versions (only available on hosts with GPU):[0m
[34mTue Mar 21 05:53:35 2023       [0m
[34m+-----------------------------------------------------------------------------+[0m
[34m| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |[0m
[34m|-------------------------------+----------------------+----------------------+[0m
[34m| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |[0m
[34m| Fan  Temp  Perf  Pw

[34m[03/21/2023 05:53:47 INFO 140106315691840] Epoch[3] Train-accuracy=0.781250[0m
[34m[03/21/2023 05:53:47 INFO 140106315691840] Epoch[3] Time cost=0.514[0m
[34m[03/21/2023 05:53:47 INFO 140106315691840] Epoch[3] Validation-accuracy=0.812500[0m
[34m[03/21/2023 05:53:47 INFO 140106315691840] Storing the best model with validation accuracy: 0.812500[0m
[34m[03/21/2023 05:53:47 INFO 140106315691840] Saved checkpoint to "/opt/ml/model/image-classification-0004.params"[0m
[34m[03/21/2023 05:53:48 INFO 140106315691840] Epoch[4] Train-accuracy=0.906250[0m
[34m[03/21/2023 05:53:48 INFO 140106315691840] Epoch[4] Time cost=0.291[0m
[34m[03/21/2023 05:53:48 INFO 140106315691840] Epoch[4] Validation-accuracy=0.750000[0m
[34m[03/21/2023 05:53:49 INFO 140106315691840] Epoch[5] Train-accuracy=0.937500[0m
[34m[03/21/2023 05:53:49 INFO 140106315691840] Epoch[5] Time cost=0.458[0m
[34m[03/21/2023 05:53:49 INFO 140106315691840] Epoch[5] Validation-accuracy=0.875000[0m
[34m[03/21/2

<IPython.core.display.Javascript object>

In [16]:
# Hosting instance type; the model and the endpoint both reuse the training job's name.
infer_instance_type = "ml.t2.medium"
model_name = endpoint_name = job_name

<IPython.core.display.Javascript object>

In [19]:
# Create the model, endpoint configuration and endpoint from the trained estimator.
clf_predictor = clf_estimator.deploy(
    model_name=model_name,
    endpoint_name=endpoint_name,
    instance_type=infer_instance_type,
    initial_instance_count=1,
)

Creating model with name: flowers-clf-ib-resent-2023-03-21-16-47-50
Creating endpoint-config with name flowers-clf-ib-resent-2023-03-21-16-47-50
Creating endpoint with name flowers-clf-ib-resent-2023-03-21-16-47-50


-----------------------------------------------!

<IPython.core.display.Javascript object>

In [20]:
# Low-level runtime client used to invoke the endpoint.
# Uses the canonical "sagemaker-runtime" service name and pins the client to the
# SageMaker session's region so it targets the region the endpoint was created in.
sgmkr_runt = boto3.client("sagemaker-runtime", region_name=region)

<IPython.core.display.Javascript object>

In [21]:
# Read the sample image as raw bytes; the request body accepts bytes directly,
# so no intermediate bytearray copy is needed.
with open("images/rose.jpg", "rb") as image:
    payload = image.read()

response = sgmkr_runt.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="image/jpeg",
    Accept="application/json;verbose",
    Body=payload,
)

# The response body is a stream; decode it and parse the JSON payload
# (the recorded run prints a list of three scores).
prediction = json.loads(response["Body"].read().decode())
print(prediction)


[0.0009680639486759901, 0.9901915788650513, 0.008840403519570827]


<IPython.core.display.Javascript object>

In [22]:
# Tear down the endpoint and its endpoint configuration to stop hosting charges.
clf_predictor.delete_endpoint(delete_endpoint_config=True)

Deleting endpoint configuration with name: flowers-clf-ib-resent-2023-03-21-16-47-50
Deleting endpoint with name: flowers-clf-ib-resent-2023-03-21-16-47-50


<IPython.core.display.Javascript object>