In [1]:
import boto3
import sagemaker
from sagemaker import Session
from sagemaker.session import TrainingInput
from sagemaker.tensorflow import TensorFlow as tf
from sagemaker.tensorflow import TensorFlowModel
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter, CategoricalParameter
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
import tensorflow
import s3fs
import numpy as np

# Seed constant intended for reproducibility.
# NOTE(review): not referenced by any of the visible cells — confirm it is
# actually applied somewhere (e.g. forwarded to the training script).
RANDOM_SEED = 0
# Bare trailing expression: the notebook displays the installed SageMaker SDK version.
sagemaker.__version__

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


2025-08-14 22:46:45.240584: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'2.245.0'

In [2]:
# Shared AWS handles and S3 locations used by the rest of the notebook.
fs = s3fs.S3FileSystem()
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
s3 = boto3.client("s3")
bucket = sess.default_bucket()   # will be sagemaker-ap-southeast-2-838084669510
prefix = "aiornot"
# Fixed: the original f-string contained a stray literal "f" before {bucket},
# so the bucket name was printed as "fsagemaker-...".
print(f"S3 Bucket: {bucket}")

# Dataset archives (.npz) stored under s3://<bucket>/<prefix>/.
s3_small_train_path = f"s3://{bucket}/{prefix}/small_train/small_train.npz"
s3_train_path = f"s3://{bucket}/{prefix}/train/train.npz"
s3_test_path = f"s3://{bucket}/{prefix}/test/test.npz"

# Channel descriptors handed to estimator/tuner .fit() calls.
small_train_input = TrainingInput(s3_small_train_path, content_type="application/x-npz")
train_input = TrainingInput(s3_train_path, content_type="application/x-npz")
test_input = TrainingInput(s3_test_path, content_type="application/x-npz")


S3 Bucket: fsagemaker-ap-southeast-2-838084669510


In [13]:
# Step 1: base estimator. These hyperparameters stay fixed for every trial.
static_hyperparameters = {
    "epochs": 1,
    "height": 512,
    "width": 512,
    "channels": 3
}
estimator = tf(
    entry_point="train.py",
    source_dir="src",   # where train.py and model_def.py live
    role=role,
    # use_spot_instances=True,  # save money
    instance_type="ml.c5.2xlarge",
    instance_count=1,
    framework_version="2.14",
    py_version="py310",
    hyperparameters=static_hyperparameters,
)

# Step 2: search space, grouped by parameter kind and merged in that order.
continuous_ranges = {
    "learning-rate": ContinuousParameter(1e-4, 1e-2, scaling_type="Logarithmic"),
    "dropout-rate": ContinuousParameter(0.0, 0.5),                 # if use-dropout=true
}
integer_ranges = {
    "batch-size": IntegerParameter(4, 8),                         # change range to suit memory
    "conv1-filters": IntegerParameter(16, 128),
    "conv2-filters": IntegerParameter(32, 256),
    "dense-units": IntegerParameter(64, 512),
}
categorical_ranges = {
    "pooling": CategoricalParameter(["max", "avg"]),
    "use-dropout": CategoricalParameter(["true", "false"]),
    "optimizer": CategoricalParameter(["adam", "adagrad"]),
}
hyperparameter_ranges = {**continuous_ranges, **integer_ranges, **categorical_ranges}

# Step 3: regex that pulls the objective metric out of the training logs.
objective_metric = "val_accuracy"
metric_definitions = [
    {"Name": objective_metric, "Regex": "val_accuracy: ([0-9\\.]+)"},
]

# Step 4: configure the tuner around the estimator.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric,
    objective_type="Maximize",
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    # strategy='Hyperband',
    early_stopping_type="Auto",
    max_jobs=1,
    max_parallel_jobs=2,
    base_tuning_job_name="ph-14"
)

# Step 5: launch. The "train" channel is small_train.npz and the "test"
# channel is test.npz.
data_channels = {
    "train": small_train_input,
    "test": test_input,
}
tuner.fit(inputs=data_channels)


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


..................................................................................................................................................!


In [14]:
# Report the winning trial of the tuning run: its name, hyperparameters and final metrics.
best_job_name = tuner.best_training_job()
print("Best training job:", best_job_name)
best_estimator = tuner.best_estimator()
sm = boto3.client("sagemaker")
# Fixed: best_training_job() yields a *training job* name, so it has to be looked up
# with describe_training_job. The original passed it to
# describe_hyper_parameter_tuning_job, whose response does not carry the
# "HyperParameters" / "FinalMetricDataList" fields read below.
resp = sm.describe_training_job(TrainingJobName=best_job_name)

# hyperparameters (strings)
print("Hyperparameters (strings):")
for hyperparameter, value in resp.get("HyperParameters", {}).items():
    print(hyperparameter, ":", value)

print("\nFinalMetricDataList:")
for m in resp.get("FinalMetricDataList", []):
    print(m)


Best training job: ph-14-250814-2347-001-17ca7129

2025-08-14 23:57:55 Starting - Preparing the instances for training
2025-08-14 23:57:55 Downloading - Downloading the training image
2025-08-14 23:57:55 Training - Training image download completed. Training in progress.
2025-08-14 23:57:55 Uploading - Uploading generated training model
2025-08-14 23:57:55 Completed - Resource released due to keep alive period expiry
Hyperparameters (strings):
_tuning_objective_metric : val_accuracy
batch-size : 6
channels : 3
conv1-filters : 66
conv2-filters : 199
dense-units : 202
dropout-rate : 0.4586195089742096
epochs : 1
height : 512
learning-rate : 0.0004463910587691542
model_dir : "s3://sagemaker-ap-southeast-2-838084669510/tensorflow-training-2025-08-14-23-47-00-882/model"
optimizer : "adam"
pooling : "avg"
sagemaker_container_log_level : 20
sagemaker_estimator_class_name : "TensorFlow"
sagemaker_estimator_module : "sagemaker.tensorflow.estimator"
sagemaker_job_name : "tensorflow-training-2025

In [13]:
# Deploy a trained model to a real-time endpoint, then load a small local test batch.
model_name = "tensorflow-training-2025-08-14-04-53-54-648"
# NOTE(review): this URI is the training job's source bundle (sourcedir.tar.gz),
# not a model artifact archive. TensorFlowModel's model_data normally points at a
# model.tar.gz — confirm the correct artifact location before relying on this deploy.
model_s3_path = f"s3://{bucket}/{model_name}/source/sourcedir.tar.gz"
output_s3 = f"s3://{bucket}/batch-output/tf-transform-{model_name}/"
input_s3_uri = f"s3://{bucket}/aiornot/test/test.npz"
endpoint_name = 'model-endpoint'

model = TensorFlowModel(
    model_data=model_s3_path,
    role=role,
    entry_point="inference.py",
    source_dir="src",
    framework_version="2.14",
    sagemaker_session=sess
)

# Blocks until the endpoint is in service (wait=True).
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.xlarge",
    endpoint_name=endpoint_name,
    wait=True
)

# JSON in, JSON out. In SageMaker SDK v2 `predictor.content_type` and
# `predictor.accept` are read-only properties derived from the serializer and
# deserializer, so the original direct assignments to them raised AttributeError;
# setting the (de)serializer is sufficient to get "application/json" both ways.
predictor.serializer = JSONSerializer()
predictor.deserializer = JSONDeserializer()

# Fetch the tiny evaluation archive from S3 to local disk.
local_tmp = "/tmp/tiny_test.npz"
s3_key = "aiornot/tiny_test/tiny_test.npz"
s3.download_file(bucket, s3_key, local_tmp)

# allow_pickle=False: refuse pickled object arrays when reading the archive.
npz = np.load(local_tmp, allow_pickle=False)
print("npz keys:", npz.files)   # should print ['image','label']
images = npz["image"]           # expected shape (20, 512,512,3) — per original author's note
labels = npz["label"]           # expected shape (20,) — per original author's note

------------------------------------------*

Please check the troubleshooting guide for common errors: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html#sagemaker-python-sdk-troubleshooting-create-endpoint


In [6]:
# Inspect one training job: where its artifacts are reported to live and how it ended.
sm = boto3.client("sagemaker")
training_job_name = "ph-12-250814-0453-001-32522441"
training_job = sm.describe_training_job(TrainingJobName=training_job_name)

# Direct indexing on purpose: a response without artifact info raises KeyError
# here, before anything is printed.
artifact_uri = training_job["ModelArtifacts"]["S3ModelArtifacts"]
report_rows = (
    ("S3ModelArtifacts:", artifact_uri),
    ("TrainingJobStatus:", training_job.get("TrainingJobStatus")),
    ("FailureReason:", training_job.get("FailureReason")),
    ("ModelArtifacts S3 URI:", training_job.get("ModelArtifacts", {}).get("S3ModelArtifacts")),
)
for label, value in report_rows:
    print(label, value)

S3ModelArtifacts: s3://sagemaker-ap-southeast-2-838084669510/ph-12-250814-0453-001-32522441/output/model.tar.gz
TrainingJobStatus: Completed
FailureReason: None
ModelArtifacts S3 URI: s3://sagemaker-ap-southeast-2-838084669510/ph-12-250814-0453-001-32522441/output/model.tar.gz


In [7]:
# Verify that the model.tar.gz reported by the training job really exists in S3
# by listing everything under its parent prefix.
s3_uri = "s3://sagemaker-ap-southeast-2-838084669510/ph-12-250814-0453-001-32522441/output/model.tar.gz"
# Fixed: the original unpacked into `bucket` and `prefix`, overwriting the
# notebook-wide values defined in the setup cell (prefix = "aiornot"), which
# silently changes what any later or re-run cell sees. Dedicated names avoid that.
artifact_bucket, artifact_key = s3_uri.replace("s3://","").split("/", 1)
artifact_prefix = artifact_key.rsplit("/",1)[0] + "/"

paginator = s3.get_paginator("list_objects_v2")
found = False
print("Listing objects under:", artifact_prefix)
for page in paginator.paginate(Bucket=artifact_bucket, Prefix=artifact_prefix):
    # Pages with no matching objects carry no "Contents" entry.
    for obj in page.get("Contents", []):
        print(obj["Key"], obj["Size"])
        if obj["Key"] == artifact_key:
            found = True

print("\nmodel.tar.gz present?:", found)

Listing objects under: ph-12-250814-0453-001-32522441/output/

model.tar.gz present?: False


In [9]:
# Walk every object in the bucket and collect the keys of exported SavedModel files.
s3 = boto3.client("s3")
bucket = "sagemaker-ap-southeast-2-838084669510"

paginator = s3.get_paginator("list_objects_v2")
matches = []
for page in paginator.paginate(Bucket=bucket):
    # Keep only the keys on this page that end in saved_model.pb.
    saved_model_keys = (
        entry["Key"]
        for entry in page.get("Contents", [])
        if entry["Key"].endswith("saved_model.pb")
    )
    for model_key in saved_model_keys:
        matches.append(model_key)
        print("Found saved_model.pb at:", model_key)

print("Total saved_model.pb files found:", len(matches))

Found saved_model.pb at: tensorflow-training-2025-08-14-01-17-19-905/model/ph-08-250814-0117-001-fa1ff411/model/1/saved_model.pb
Found saved_model.pb at: tensorflow-training-2025-08-14-04-13-29-059/model/ph-10-250814-0413-001-7b46a067/model/1/saved_model.pb
Found saved_model.pb at: tensorflow-training-2025-08-14-04-53-54-648/model/ph-12-250814-0453-001-32522441/model/1/saved_model.pb
Total saved_model.pb files found: 3
