# Imports

In [10]:
import boto3
import sagemaker
from sagemaker import Session
from sagemaker.session import TrainingInput
from sagemaker.tensorflow import TensorFlow as tf
from sagemaker.tensorflow import TensorFlowModel
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter, CategoricalParameter
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
import tensorflow
import s3fs
import numpy as np
import json
import time
# Seed constant. NOTE(review): not referenced by any other visible cell — confirm it is
# actually forwarded to the training code, otherwise runs are not seeded by it.
RANDOM_SEED = 0
# Bare expression at the end of the cell so the notebook displays the installed SDK version.
sagemaker.__version__

'2.245.0'

# SageMaker Initialization

In [23]:
# Shared AWS handles and S3 locations used by the rest of the notebook.
fs = s3fs.S3FileSystem()               # file-like access to S3 objects
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
s3 = boto3.client("s3")
bucket = sess.default_bucket()   # will be sagemaker-ap-southeast-2-838084669510
prefix = "aiornot"
# Fixed: the f-string previously contained a stray literal "f" in front of {bucket},
# so the bucket was printed as "fsagemaker-...".
print(f"S3 Bucket: {bucket}")

# NPZ datasets stored under s3://<bucket>/<prefix>/
s3_small_train_path = f"s3://{bucket}/{prefix}/small_train/small_train.npz"
s3_train_path = f"s3://{bucket}/{prefix}/train/train.npz"
s3_test_path = f"s3://{bucket}/{prefix}/test/test.npz"
s3_tiny_test_path = f"s3://{bucket}/{prefix}/tiny_test/tiny_test.npz"

# Training jobs write their model artifacts below this location.
s3_output_location = f"s3://{bucket}/{prefix}/model_output"

# Input channels handed to estimator/tuner .fit() calls.
small_train_input = TrainingInput(s3_small_train_path, content_type="application/x-npz")
train_input = TrainingInput(s3_train_path, content_type="application/x-npz")
test_input = TrainingInput(s3_test_path, content_type="application/x-npz")


S3 Bucket: fsagemaker-ap-southeast-2-838084669510


# Hyperparameter Tuning

In [23]:
# --- Base estimator -------------------------------------------------------
# Hyperparameters that stay fixed for every tuning trial.
base_hyperparameters = {
    "epochs": 1,
    "height": 512,
    "width": 512,
    "channels": 3,
}

# `tf` is the SageMaker TensorFlow estimator class (see the imports cell).
# Spot instances (use_spot_instances=True) are intentionally left disabled here.
estimator = tf(
    entry_point="train.py",
    source_dir="src",   # directory holding train.py and model_def.py
    role=role,
    instance_type="ml.c5.2xlarge",
    instance_count=1,
    framework_version="2.14",
    py_version="py310",
    hyperparameters=base_hyperparameters,
    output_path=s3_output_location,
)

# --- Search space ---------------------------------------------------------
hyperparameter_ranges = {
    # continuous values
    "learning-rate": ContinuousParameter(1e-4, 1e-2, scaling_type="Logarithmic"),
    "dropout-rate": ContinuousParameter(0.0, 0.5),   # relevant when use-dropout=true
    # integer values
    "batch-size": IntegerParameter(4, 8),            # widen if memory allows
    "conv1-filters": IntegerParameter(16, 128),
    "conv2-filters": IntegerParameter(32, 256),
    "dense-units": IntegerParameter(64, 512),
    # categorical values
    "pooling": CategoricalParameter(["max", "avg"]),
    "use-dropout": CategoricalParameter(["true", "false"]),
    "optimizer": CategoricalParameter(["adam", "adagrad"]),
}

# --- Metrics --------------------------------------------------------------
# One regex per validation metric; each captures the number that follows
# "<metric>: " in the training log output.
metric_definitions = [
    {"Name": metric, "Regex": metric + ": ([0-9\\.]+)"}
    for metric in ("val_auc", "val_f1", "val_precision", "val_recall", "val_accuracy")
]

# --- Tuner ----------------------------------------------------------------
# Maximizes val_f1 with the default search strategy (Hyperband is not enabled).
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name="val_f1",
    objective_type="Maximize",
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=1,
    max_parallel_jobs=2,
    early_stopping_type="Auto",
    base_tuning_job_name="ph-16",
)

# --- Launch ---------------------------------------------------------------
# "train" channel: the small training set; "test" channel: the test set.
tuning_channels = {
    "train": small_train_input,
    "test": test_input,
}
tuner.fit(tuning_channels)


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


........................................................................................................!


# Endpoint Configuration and Model Deployment

In [52]:
# Model artifact written by the tuning job's training run.
model_s3_uri = (
    "s3://sagemaker-ap-southeast-2-838084669510/aiornot/model_output/"
    "ph-16-250815-0152-001-69914851/output/model.tar.gz"
)
# Small NPZ sample used to exercise the endpoint.
tiny_npz_s3 = f"s3://{bucket}/aiornot/tiny_test/tiny_test.npz"
endpoint_name = "model-endpoint"

# Wrap the artifact together with the custom inference code.
model = TensorFlowModel(
    model_data=model_s3_uri,
    role=role,
    framework_version="2.14",
    sagemaker_session=sess,
    source_dir="src",             # directory that contains inference.py and requirements.txt
    entry_point="inference.py",   # must be at root of source_dir
)

# Create the endpoint and block until the deployment call returns.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.c5.xlarge",
    initial_instance_count=1,
    wait=True,
)

# Requests and responses are exchanged as JSON.
predictor.serializer, predictor.deserializer = JSONSerializer(), JSONDeserializer()

-----!

# Get predictions out of Endpoint

In [54]:
import tempfile

def prepare_data_for_prediction(s3_uri):
    """Download an NPZ archive from S3 and wrap its images in a prediction payload.

    Args:
        s3_uri: Location of the archive, ``s3://<bucket>/<key>``. The archive
            must contain an ``image`` array holding either a single image
            (3-D array) or a batch of images (4-D array).

    Returns:
        dict: ``{"instances": [...]}`` with one nested-list entry per image.

    Raises:
        ValueError: If the URI has no bucket or no key, or if the ``image``
            array is neither 3-D nor 4-D.
    """
    import os  # kept local, as in the original cell

    # Split "s3://bucket/key/with/slashes" into bucket and key.
    location = s3_uri[len("s3://"):] if s3_uri.startswith("s3://") else s3_uri
    bucket, _, key = location.partition("/")
    if not bucket or not key:
        raise ValueError(f"Expected s3://<bucket>/<key>, got: {s3_uri!r}")

    s3 = boto3.client("s3")

    # A private temporary directory replaces tempfile.mktemp (deprecated and
    # race-prone) and is removed even when the download or the load fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_path = os.path.join(tmp_dir, "data.npz")
        s3.download_file(bucket, key, local_path)
        # Close the archive handle before the directory is cleaned up;
        # indexing materializes the array in memory.
        with np.load(local_path) as archive:
            images = archive["image"]

    # Handle different shapes
    if images.ndim == 3:      # Single image (H, W, C)
        instances = [images.tolist()]
    elif images.ndim == 4:    # Multiple images (N, H, W, C)
        instances = [img.tolist() for img in images]
    else:
        raise ValueError(f"Unexpected image shape: {images.shape}")

    return {"instances": instances}

# Prepare data and make prediction
print("Preparing data for prediction...")
payload = prepare_data_for_prediction(tiny_npz_s3)
print(f"Payload shape: {len(payload['instances'])} instances")

# Send one image per request instead of every image in a single JSON body.
# NOTE(review): the recorded run stopped after "Making prediction..." without a
# response; presumably the combined payload exceeded the endpoint's request size
# limit (HTTP 413) — confirm against the endpoint logs.
# `resp` is now a list holding one response per image, in input order.
print("Making prediction...")
resp = [
    predictor.predict({"instances": [instance]})
    for instance in payload["instances"]
]
print("Response:", resp)


Preparing data for prediction...
Payload shape: 20 instances
Making prediction...


In [4]:
# Delete the failed endpoint together with its endpoint configuration.
endpoint_name="model-endpoint"
sm = boto3.client("sagemaker")

# Describe the endpoint once, BEFORE deleting it: the description is needed for both
# the status print and the config name, and describing an endpoint that has already
# been deleted can fail, which would leave the endpoint config behind.
endpoint_desc = sm.describe_endpoint(EndpointName=endpoint_name)
# Single quotes inside the f-string keep this valid on Python versions before 3.12.
print(f"Endpoint Description: {endpoint_desc['EndpointStatus']}")
cfg = endpoint_desc["EndpointConfigName"]

sm.delete_endpoint(EndpointName=endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=cfg)

# Get Best Hyperparameters

In [9]:
# Training job whose hyperparameters are read back.
training_job_name = "ph-16-250815-0152-001-69914851"

sm = boto3.client("sagemaker")
tj = sm.describe_training_job(TrainingJobName=training_job_name)

# Copy of the job's hyperparameters exactly as reported by the API (values are strings).
raw_hps = dict(tj["HyperParameters"])

def strip_wrapped_quotes(v: str) -> str:
    """Remove one pair of enclosing double quotes from ``v``, if present.

    ``'"adam"'`` becomes ``'adam'``; values that are empty, shorter than two
    characters, or not quoted on both ends are returned unchanged.
    """
    # Nothing to strip from empty values or a lone character.
    if not v or len(v) < 2:
        return v
    # Both the first and the last character must be a double quote.
    if v[0] != '"' or v[-1] != '"':
        return v
    return v[1:-1]

# Hyperparameter keys that must not be handed back to train.py.
denylist_prefixes = ("sagemaker_", "_tuning_")
denylist_exact = {
    "model_dir",
    "sagemaker_job_name",
    "sagemaker_program",
    "sagemaker_region",
    "sagemaker_submit_directory",
}

# Drop denylisted keys and unwrap values that arrive wrapped in double quotes.
clean_hps = {
    name: strip_wrapped_quotes(value)
    for name, value in raw_hps.items()
    if name not in denylist_exact and not name.startswith(denylist_prefixes)
}

# Target type for each known key; every other key stays a string.
int_keys = {
    "epochs", "height", "width", "channels", "batch-size",
    "conv1-filters", "conv2-filters", "dense-units",
}
float_keys = {"learning-rate", "dropout-rate"}
bool_keys = {"use-dropout"}  # kept as "true"/"false" strings, only lower-cased

typed_hps = {}
for name, text in clean_hps.items():
    try:
        if name in int_keys:
            # integers may arrive as "5" or "5.0"
            converted = int(float(text))
        elif name in float_keys:
            converted = float(text)
        elif name in bool_keys:
            lowered = text.lower()
            converted = lowered if lowered in ("true", "false") else text
        else:
            converted = text
    except Exception:
        # best effort: anything that fails to convert keeps its string value
        converted = text
    typed_hps[name] = converted

print("Using hyperparameters:", typed_hps)

Using hyperparameters: {'batch-size': 5, 'channels': 3, 'conv1-filters': 62, 'conv2-filters': 74, 'dense-units': 92, 'dropout-rate': 0.21555822761766713, 'epochs': 1, 'height': 512, 'learning-rate': 0.004668016111303579, 'optimizer': 'adam', 'pooling': 'avg', 'use-dropout': 'false', 'width': 512}


# Refit with Best Hyperparameters, Deploy, and Make Predictions

In [12]:
# Datasets for the refit run: the small training set and the test set.
train_npz = "s3://sagemaker-ap-southeast-2-838084669510/aiornot/small_train/small_train.npz"
test_npz = "s3://sagemaker-ap-southeast-2-838084669510/aiornot/test/test.npz"

# Both channels share the same input mode and content type.
npz_channel_options = dict(input_mode="File", content_type="application/x-npz")
train_input = TrainingInput(train_npz, **npz_channel_options)
test_input = TrainingInput(test_npz, **npz_channel_options)

# Static shape/run parameters first, then the tuned values on top
# (a tuned value replaces the static one when the key is the same).
refit_hyperparameters = dict(epochs=1, height=512, width=512, channels=3)
refit_hyperparameters.update(typed_hps)

# One regex per validation metric, capturing the number after "<metric>: ".
refit_metric_definitions = [
    {"Name": metric, "Regex": metric + r": ([0-9\.]+)"}
    for metric in ("val_auc", "val_f1", "val_precision", "val_recall", "val_accuracy")
]

estimator = tf(
    entry_point="train.py",
    source_dir="src",
    role=role,
    instance_type="ml.c5.2xlarge",
    instance_count=1,
    framework_version="2.14",
    py_version="py310",
    output_path=f"s3://{bucket}/aiornot/model_output",
    hyperparameters=refit_hyperparameters,
    metric_definitions=refit_metric_definitions,
)

# Timestamped job name so each refit run is distinguishable.
job_name = f"bestparams-refit-{time.strftime('%Y%m%d-%H%M%S')}"
refit_channels = {"train": train_input, "test": test_input}
estimator.fit(refit_channels, job_name=job_name)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: bestparams-refit-20250815-091227


2025-08-15 09:12:31 Starting - Starting the training job...
2025-08-15 09:13:00 Downloading - Downloading input data.........
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2025-08-15 09:14:45.432846: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.[0m
[34mTo enable the following instructions: AVX512F, in other operations, rebuild TensorFlow with the appropriate compiler flags.[0m
[34m2025-08-15 09:14:47,426 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2025-08-15 09:14:47,427 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-08-15 09:14:47,427 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-08-15 09:14:47,776 sagemaker-training-toolkit INFO     Installing dependencies from requiremen

In [17]:
# Artifact written by the refit training job.
final_model_s3_path = (
    "s3://sagemaker-ap-southeast-2-838084669510/aiornot/model_output/"
    "bestparams-refit-20250815-091227/output/model.tar.gz"
)

# No custom inference script is attached to this model object.
final_tf_model = TensorFlowModel(
    role=role,
    model_data=final_model_s3_path,
    framework_version="2.14",
)

# No endpoint name is passed, so the SDK chooses one (see the log output below).
predictor = final_tf_model.deploy(
    instance_type="ml.m5.large",
    initial_instance_count=1,
)

INFO:sagemaker.tensorflow.model:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating model with name: tensorflow-inference-2025-08-15-10-06-55-492
INFO:sagemaker:Creating endpoint-config with name tensorflow-inference-2025-08-15-10-06-56-118
INFO:sagemaker:Creating endpoint with name tensorflow-inference-2025-08-15-10-06-56-118


-----!

In [38]:
from sklearn.metrics import classification_report, accuracy_score
# Pull the tiny test archive from S3 into memory; the labels are needed
# in the notebook to score the endpoint's predictions.
with fs.open(s3_tiny_test_path, "rb") as npz_file:
    archive = np.load(npz_file)
    # Arrays are read while the S3 file is still open.
    X = archive["image"].astype("float32")
    y_true = np.asarray(archive["label"], dtype=int).ravel()

# Predict in micro-batches to avoid 413 (request too large) responses
def predict_batches(pred, X, bs=1):
    """Run ``pred.predict`` over ``X`` in slices of ``bs`` rows and flatten the results.

    Args:
        pred: Object exposing ``predict(list)``; the response is either a dict
            with a ``"predictions"`` entry or the predictions themselves.
        X: NumPy array whose first axis indexes the samples.
        bs: Number of samples sent per request (must be >= 1).

    Returns:
        np.ndarray: 1-D array with all predictions, in input order. Empty
        when ``X`` has no rows.

    Raises:
        ValueError: If ``bs`` is smaller than 1.
    """
    if bs < 1:
        raise ValueError(f"bs must be a positive integer, got {bs}")

    probs = []
    for start in range(0, len(X), bs):
        out = pred.predict(X[start:start + bs].tolist())
        # A plain list response has no .get(); only unwrap dict responses.
        if isinstance(out, dict) and "predictions" in out:
            out = out["predictions"]
        probs.append(np.asarray(out).reshape(-1))  # shape (bs,)

    # np.concatenate rejects an empty list, so handle "no samples" explicitly.
    if not probs:
        return np.empty(0)
    return np.concatenate(probs)

# One request per image against the deployed endpoint.
probs = predict_batches(predictor, X, bs=1)

# Outputs at or above the threshold are labelled 1, everything else 0.
decision_threshold = 0.5
y_pred = (probs >= decision_threshold).astype(int)

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_true, y_pred, zero_division=0)
print(report)

Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.73      0.80      0.76        10
           1       0.78      0.70      0.74        10

    accuracy                           0.75        20
   macro avg       0.75      0.75      0.75        20
weighted avg       0.75      0.75      0.75        20



array([1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1])

In [40]:
# Display the thresholded 0/1 predictions.
y_pred

array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0])

In [41]:
# Display the raw endpoint outputs before thresholding.
probs

array([0.53495926, 0.70964813, 0.38401976, 0.73712951, 0.37336013,
       0.45250031, 0.51539123, 0.37229359, 0.435945  , 0.46623388,
       0.69867581, 0.37254658, 0.65705967, 0.74084812, 0.37394014,
       0.37293681, 0.5825783 , 0.73574185, 0.37438586, 0.43675256])