In [5]:
import boto3
import sagemaker
from sagemaker import Session
from sagemaker.session import TrainingInput
from sagemaker.tensorflow import TensorFlow as tf
from sagemaker.tensorflow import TensorFlowModel
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter, CategoricalParameter
import tensorflow
import s3fs
import numpy as np

RANDOM_SEED = 0
sagemaker.__version__

'2.245.0'

In [2]:
fs = s3fs.S3FileSystem()
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()   # will be sagemaker-ap-southeast-2-838084669510
prefix = "aiornot"
print(f"S3 Bucket: f{bucket}")

s3_small_train_path = f"s3://{bucket}/{prefix}/small_train/small_train.npz"
s3_train_path = f"s3://{bucket}/{prefix}/train/train.npz"
s3_test_path = f"s3://{bucket}/{prefix}/test/test.npz"

small_train_input = TrainingInput(s3_small_train_path, content_type="application/x-npz")
train_input = TrainingInput(s3_train_path, content_type="application/x-npz")
test_input = TrainingInput(s3_test_path, content_type="application/x-npz")


S3 Bucket: fsagemaker-ap-southeast-2-838084669510


In [9]:
estimator = tf(
    entry_point="train.py",
    source_dir="src",   # where train.py and model_def.py live
    role=role,
    # use_spot_instances=True,  # save money
    instance_type="ml.c5.2xlarge",
    instance_count=1,
    framework_version="2.14",
    py_version="py310",
    hyperparameters={
        "epochs": 1,
        "height": 512,
        "width": 512,
        "channels": 3,
        "model_dir": "/opt/ml/model"
    },
)

# Step 2: define search space
hyperparameter_ranges = {
    # numeric / continuous
    "learning-rate": ContinuousParameter(1e-4, 1e-2, scaling_type="Logarithmic"),
    "dropout-rate": ContinuousParameter(0.0, 0.5),                 # if use-dropout=true
    # integer choices for layer sizes
    "batch-size": IntegerParameter(4, 8),                         # change range to suit memory
    "conv1-filters": IntegerParameter(16, 128),
    "conv2-filters": IntegerParameter(32, 256),
    "dense-units": IntegerParameter(64, 512),
    # categorical choices
    "pooling": CategoricalParameter(["max", "avg"]),
    "use-dropout": CategoricalParameter(["true", "false"]),
}

# Step 3: define regex to extract val_accuracy from train.py logs
metric_definitions = [{
    "Name": "val_accuracy",
    "Regex": "val_accuracy: ([0-9\\.]+)"
}]

# Step 4: set up the tuner
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name="val_accuracy",
    # strategy='Hyperband',
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_parallel_jobs=2,
    objective_type="Maximize",
    early_stopping_type="Auto",
    max_jobs=1,
    base_tuning_job_name="ph-12"
)

# Step 5: launch it using small_train.npz for both train + val
tuner.fit({
    "train": small_train_input,
    "test": test_input,
})


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


........................................................................................!


In [13]:
best_job_name = tuner.best_training_job()
print("Best training job:", best_job_name)
best_estimator = tuner.best_estimator()
sm = boto3.client("sagemaker")
resp = sm.describe_training_job(TrainingJobName=best_job_name)

# hyperparameters (strings)
print("Hyperparameters (strings):")
for hyperparameter, value in resp.get("HyperParameters", {}).items():
    print(hyperparameter, ":", value)

print("\nFinalMetricDataList:")
for m in resp.get("FinalMetricDataList", []):
    print(m)


Best training job: ph-12-250814-0453-001-32522441

2025-08-14 05:00:38 Starting - Preparing the instances for training
2025-08-14 05:00:38 Downloading - Downloading the training image
2025-08-14 05:00:38 Training - Training image download completed. Training in progress.
2025-08-14 05:00:38 Uploading - Uploading generated training model
2025-08-14 05:00:38 Completed - Resource released due to keep alive period expiry
Hyperparameters (strings):
_tuning_objective_metric : val_accuracy
batch-size : 5
channels : 3
conv1-filters : 18
conv2-filters : 225
dense-units : 469
dropout-rate : 0.2813732220223194
epochs : 1
height : 512
learning-rate : 0.0004852543347744691
model_dir : "s3://sagemaker-ap-southeast-2-838084669510/tensorflow-training-2025-08-14-04-53-54-648/model"
pooling : "avg"
sagemaker_container_log_level : 20
sagemaker_estimator_class_name : "TensorFlow"
sagemaker_estimator_module : "sagemaker.tensorflow.estimator"
sagemaker_job_name : "tensorflow-training-2025-08-14-04-53-54-648

In [7]:
model_name = "tensorflow-training-2025-08-14-04-53-54-648"
model_s3_path = f"s3://{bucket}/{model_name}/source/sourcedir.tar.gz"
output_s3 = f"s3://{bucket}/batch-output/tf-transform-{model_name}/"
input_s3_uri = f"s3://{bucket}/aiornot/test/test.npz"
endpoint_name = 'model_endpoint'

model = TensorFlowModel(
    model_data=model_s3_path,
    role=role,
    entry_point="inference.py",
    souce_dir="src",
    sagemaker_session=sess
)

transformer = model.transformer(
    instance_count=1,
    instance_type="ml.r5.4xlarge",   # memory-optimized; see notes below about sizing
    output_path=output_s3,
)

transformer.transform(
    data=input_s3_uri,
    data_type="S3Prefix",               # use S3Prefix for a single object URI too
    content_type="application/octet-stream",
    split_type="None",                  # send the whole file as one request to the container
    wait=True,
    logs=True
)

transformer.wait()
print("Batch transform finished. Outputs in:", output_s3)
