# Imports

In [26]:
import boto3
import sagemaker
from sagemaker import Session
from sagemaker.session import TrainingInput
from sagemaker.tensorflow import TensorFlow as tf
from sagemaker.tensorflow import TensorFlowModel
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter, CategoricalParameter
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
import tensorflow
import s3fs
import numpy as np
import json
# Seed constant; none of the cells visible in this notebook reference it.
RANDOM_SEED = 0
# Bare trailing expression so the cell displays the installed SageMaker SDK version.
sagemaker.__version__

'2.245.0'

# SageMaker Initialization

In [27]:
# Session-wide handles: S3 filesystem, SageMaker session, execution role and S3 client.
fs = s3fs.S3FileSystem()
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
s3 = boto3.client("s3")

# Everything for this project lives under s3://<default bucket>/<prefix>/.
bucket = sess.default_bucket()   # will be sagemaker-ap-southeast-2-838084669510
prefix = "aiornot"
# Fixed: the original f-string contained a stray literal "f" before {bucket},
# which printed the bucket name as "fsagemaker-...".
print(f"S3 Bucket: {bucket}")

# Dataset archives (NumPy .npz) used as training-job input channels.
s3_small_train_path = f"s3://{bucket}/{prefix}/small_train/small_train.npz"
s3_train_path = f"s3://{bucket}/{prefix}/train/train.npz"
s3_test_path = f"s3://{bucket}/{prefix}/test/test.npz"

# Where training jobs write their output (passed as the estimator's output_path).
s3_output_location = f"s3://{bucket}/{prefix}/model_output"

small_train_input = TrainingInput(s3_small_train_path, content_type="application/x-npz")
train_input = TrainingInput(s3_train_path, content_type="application/x-npz")
test_input = TrainingInput(s3_test_path, content_type="application/x-npz")


S3 Bucket: fsagemaker-ap-southeast-2-838084669510


# Hyperparameter Tuning

In [23]:
# Step 1: base estimator; these fixed hyperparameters are passed alongside the tuned ones
fixed_hyperparameters = {
    "epochs": 1,
    "height": 512,
    "width": 512,
    "channels": 3,
}
estimator = tf(
    entry_point="train.py",
    source_dir="src",   # where train.py and model_def.py live
    role=role,
    # use_spot_instances=True,  # save money
    instance_type="ml.c5.2xlarge",
    instance_count=1,
    framework_version="2.14",
    py_version="py310",
    hyperparameters=fixed_hyperparameters,
    output_path=s3_output_location,
)

# Step 2: define the search space, grouped by parameter kind
continuous_ranges = {
    "learning-rate": ContinuousParameter(1e-4, 1e-2, scaling_type="Logarithmic"),
    "dropout-rate": ContinuousParameter(0.0, 0.5),   # if use-dropout=true
}
integer_ranges = {
    "batch-size": IntegerParameter(4, 8),             # change range to suit memory
    "conv1-filters": IntegerParameter(16, 128),
    "conv2-filters": IntegerParameter(32, 256),
    "dense-units": IntegerParameter(64, 512),
}
categorical_ranges = {
    "pooling": CategoricalParameter(["max", "avg"]),
    "use-dropout": CategoricalParameter(["true", "false"]),
    "optimizer": CategoricalParameter(["adam", "adagrad"]),
}
hyperparameter_ranges = {**continuous_ranges, **integer_ranges, **categorical_ranges}

# Step 3: one regex per validation metric, each matching "<name>: <number>" in the job logs
tracked_metrics = ("val_auc", "val_f1", "val_precision", "val_recall", "val_accuracy")
metric_definitions = [
    {"Name": metric, "Regex": f"{metric}: ([0-9\\.]+)"}
    for metric in tracked_metrics
]

# Step 4: set up the tuner; it maximizes val_f1
# NOTE(review): with max_jobs=1 the max_parallel_jobs=2 setting presumably never
# comes into play - confirm whether more jobs were intended.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name="val_f1",
    objective_type="Maximize",
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    # strategy='Hyperband',
    max_jobs=1,
    max_parallel_jobs=2,
    early_stopping_type="Auto",
    base_tuning_job_name="ph-16",
)

# Step 5: launch it with small_train.npz on the "train" channel and test.npz on "test"
tuning_channels = {
    "train": small_train_input,
    "test": test_input,
}
tuner.fit(tuning_channels)


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


........................................................................................................!


# Endpoint Configuration and Model Deployment

In [38]:
# Deploy a trained model artifact to a real-time endpoint.
# The artifact URI is derived from s3_output_location (the estimator's output_path)
# instead of repeating the bucket name, so it follows the session's default bucket.
# The job name prefix suggests it comes from the "ph-16" tuning run - TODO confirm.
deployed_job_name = "ph-16-250815-0152-001-69914851"
model_s3_uri = f"{s3_output_location}/{deployed_job_name}/output/model.tar.gz"
tiny_npz_s3 = f"s3://{bucket}/aiornot/tiny_test/tiny_test.npz"
endpoint_name = "model-endpoint"

model = TensorFlowModel(
    model_data=model_s3_uri,
    role=role,
    framework_version="2.14",
    sagemaker_session=sess,
    entry_point="inference.py",   # must be at root of source_dir
    source_dir="src"              # directory that contains inference.py and requirements.txt
)

# Pass the JSON serializer/deserializer to deploy() so the returned predictor is
# fully configured in one step. The request content type and accept header are
# taken from these objects, so they are not assigned separately.
# NOTE(review): predictor.content_type / predictor.accept appear to be read-only
# properties in SDK v2 - confirm against the installed version.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.xlarge",
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
    wait=True
)


----!

# Get predictions out of Endpoint

In [39]:
# Ask the endpoint to score the tiny test archive; the request carries only its S3 path.
# NOTE(review): the recorded 500 response reports the whole Context object as the
# "unsupported content type" although request_content_type is 'application/json';
# this suggests src/inference.py inspects the context object itself - confirm there.
payload = {"s3_path": tiny_npz_s3}
try:
    resp = predictor.predict(payload)
    print("Raw response:", resp)

    # Accept either a {"predictions": [...]} envelope or a bare array-like response.
    wrapped = isinstance(resp, dict) and "predictions" in resp
    raw_scores = resp["predictions"] if wrapped else resp
    preds = np.array(raw_scores).reshape(-1)

    # Threshold the flattened scores at 0.5 to obtain 0/1 labels.
    labels_pred = (preds >= 0.5).astype(int)
    print("Pred probs (first 10):", preds[:10])
    print("Pred labels (first 10):", labels_pred[:10])
except Exception as err:
    # Best effort: report the failure without stopping the notebook.
    print("Invoke failed:", err)

Invoke failed: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "{"error": "Unsupported content type: Context(model_name=None, model_version=None, method=None, rest_uri='http://localhost:22001/v1/models/model:predict', grpc_port='22000', channel=<grpc._channel.Channel object at 0x7f65dbf20910>, custom_attributes=None, request_content_type='application/json', accept_header='application/json', content_length=89). Expected application/json."}". See https://ap-southeast-2.console.aws.amazon.com/cloudwatch/home?region=ap-southeast-2#logEventViewer:group=/aws/sagemaker/Endpoints/model-endpoint in account 838084669510 for more information.


In [41]:
# Delete the failed endpoint together with its endpoint configuration.
endpoint_name = "model-endpoint"
sm = boto3.client("sagemaker")

# Describe once, BEFORE deleting: the original looked up EndpointConfigName after
# delete_endpoint, which fails as soon as the endpoint no longer exists.
endpoint_description = sm.describe_endpoint(EndpointName=endpoint_name)
print(endpoint_description["EndpointStatus"])
cfg = endpoint_description["EndpointConfigName"]

sm.delete_endpoint(EndpointName=endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=cfg)

------------------------------------------*

Please check the troubleshooting guide for common errors: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html#sagemaker-python-sdk-troubleshooting-create-endpoint


In [34]:
# Look up one training job and print its status, failure reason and artifact URI.
training_job_name = "ph-12-250814-0453-001-32522441"
sm = boto3.client("sagemaker")
training_job = sm.describe_training_job(TrainingJobName=training_job_name)

# Direct indexing: raises KeyError if the response lacks these keys.
strict_artifact_uri = training_job["ModelArtifacts"]["S3ModelArtifacts"]
print("S3ModelArtifacts:", strict_artifact_uri)

for field in ("TrainingJobStatus", "FailureReason"):
    print(f"{field}:", training_job.get(field))

# Same URI again, read tolerantly (prints None when the keys are absent).
model_artifacts = training_job.get("ModelArtifacts", {})
print("ModelArtifacts S3 URI:", model_artifacts.get("S3ModelArtifacts"))

S3ModelArtifacts: s3://sagemaker-ap-southeast-2-838084669510/ph-12-250814-0453-001-32522441/output/model.tar.gz
TrainingJobStatus: Completed
FailureReason: None
ModelArtifacts S3 URI: s3://sagemaker-ap-southeast-2-838084669510/ph-12-250814-0453-001-32522441/output/model.tar.gz


In [7]:
# Check whether the model.tar.gz reported for training job ph-12 actually exists in S3.
s3_uri = "s3://sagemaker-ap-southeast-2-838084669510/ph-12-250814-0453-001-32522441/output/model.tar.gz"

# Cell-specific names on purpose: the original rebound the notebook-wide `bucket`
# and `prefix` set in the initialization cell, silently changing what every later
# (or re-run) cell using them would point at.
artifact_bucket, artifact_key = s3_uri.replace("s3://", "", 1).split("/", 1)
artifact_prefix = artifact_key.rsplit("/", 1)[0] + "/"

paginator = s3.get_paginator("list_objects_v2")
found = False
print("Listing objects under:", artifact_prefix)
for page in paginator.paginate(Bucket=artifact_bucket, Prefix=artifact_prefix):
    # Pages without matching objects carry no "Contents" key.
    for obj in page.get("Contents", []):
        print(obj["Key"], obj["Size"])
        if obj["Key"] == artifact_key:
            found = True

print("\nmodel.tar.gz present?:", found)

Listing objects under: ph-12-250814-0453-001-32522441/output/

model.tar.gz present?: False


In [9]:
# Scan the whole bucket for TensorFlow SavedModel files (keys ending in saved_model.pb).
s3 = boto3.client("s3")
bucket = "sagemaker-ap-southeast-2-838084669510"

listing_pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket)
# Lazy stream of every object key; pages without "Contents" contribute nothing.
object_keys = (
    entry["Key"]
    for listing_page in listing_pages
    for entry in listing_page.get("Contents", [])
)

matches = []
for object_key in object_keys:
    if not object_key.endswith("saved_model.pb"):
        continue
    matches.append(object_key)
    print("Found saved_model.pb at:", object_key)

print("Total saved_model.pb files found:", len(matches))

Found saved_model.pb at: tensorflow-training-2025-08-14-01-17-19-905/model/ph-08-250814-0117-001-fa1ff411/model/1/saved_model.pb
Found saved_model.pb at: tensorflow-training-2025-08-14-04-13-29-059/model/ph-10-250814-0413-001-7b46a067/model/1/saved_model.pb
Found saved_model.pb at: tensorflow-training-2025-08-14-04-53-54-648/model/ph-12-250814-0453-001-32522441/model/1/saved_model.pb
Total saved_model.pb files found: 3
