In [1]:
!pip install --upgrade pip -q
!pip install --upgrade boto3 -q
!pip install --upgrade sagemaker -q
!pip install --upgrade tensorflow -q
!pip install protobuf -q

In [2]:
import boto3
import sagemaker

from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.experiments.run import Run

import json

import os
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


2025-08-21 03:10:12.629732: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-21 03:10:14.384141: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-21 03:10:18.642384: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [3]:
# A single boto3 session is the source of credentials/region for everything below.
boto_session = boto3.Session()
bucket_name = "mlflow-tracking-dir"

# Hand the boto3 session to SageMaker explicitly so both layers share the same
# configuration instead of each building an implicit session of its own.
sagemaker_session = Session(boto_session=boto_session, default_bucket=bucket_name)

# Resolve the execution role through the same SageMaker session.
role = get_execution_role(sagemaker_session=sagemaker_session)
default_bucket = sagemaker_session.default_bucket()

# Low-level SageMaker client and the region name, both taken from the shared session.
sagemaker_client = boto_session.client("sagemaker")
region = boto_session.region_name

In [4]:
# Show the resolved configuration, one value per line.
print(
    boto_session,
    sagemaker_session,
    role,  # role used to run SageMaker and access S3
    default_bucket,  # S3 bucket where logs are stored
    sagemaker_client,
    region,
    sep="\n",
)

Session(region_name='ap-southeast-2')
<sagemaker.session.Session object at 0x7f50a5ebcf80>
arn:aws:iam::954690186719:role/service-role/AmazonSageMaker-ExecutionRole-20250821T100350
mlflow-tracking-dir
<botocore.client.SageMaker object at 0x7f5057e58080>
ap-southeast-2


In [5]:
# Download the four MNIST .npy files from the regional SageMaker example bucket.
s3_client = boto3.client("s3")

# Local destinations for the downloaded arrays.
train_input_path = os.path.join('datasets', 'train_input.npy')
test_input_path = os.path.join('datasets', 'test_input.npy')
train_target_path = os.path.join('datasets', 'train_target.npy')
test_target_path = os.path.join('datasets', 'test_target.npy')

# download_file writes into an existing directory only, so create it up front.
os.makedirs('datasets', exist_ok=True)

mnist_bucket = f'sagemaker-example-files-prod-{region}'
# S3 keys always use '/', independent of the local OS path separator.
mnist_prefix = 'datasets/image/MNIST/numpy'

# (object name in the bucket, local destination path)
for s3_name, local_path in (
    ('input_train.npy', train_input_path),
    ('input_test.npy', test_input_path),
    ('input_train_labels.npy', train_target_path),
    ('input_test_labels.npy', test_target_path),
):
    s3_client.download_file(mnist_bucket, f'{mnist_prefix}/{s3_name}', local_path)

In [6]:
# Read the four downloaded .npy files back into memory, in the same order as before.
train_input, test_input, train_target, test_target = (
    np.load(npy_path)
    for npy_path in (
        train_input_path,
        test_input_path,
        train_target_path,
        test_target_path,
    )
)

In [7]:
# Reshape images to (samples, 28, 28) and labels to a flat vector.
# -1 lets NumPy infer the sample count, so the cell does not depend on the
# split sizes being exactly 60000 / 10000.
train_input = np.reshape(train_input, (-1, 28, 28))
test_input = np.reshape(test_input, (-1, 28, 28))
train_target = np.reshape(train_target, (-1,))
test_target = np.reshape(test_target, (-1,))

In [8]:
# Cast to float32 and divide by 255.
# NOTE: this rebinds the same names, so running the cell twice divides twice.
train_input, test_input = (
    pixels.astype("float32") / 255 for pixels in (train_input, test_input)
)

In [9]:
# Append a trailing axis of length 1 to each image array.
# NOTE: running this cell twice appends a second axis.
train_input = train_input[..., np.newaxis]
test_input = test_input[..., np.newaxis]

In [10]:
# Report the prepared array shape and the sample counts, one item per line.
# sep="\n" avoids the trailing space the default separator left before each newline.
print(
    f"Train Input Shape : {train_input.shape}",
    f"{train_input.shape[0]} train samples",
    f"{test_input.shape[0]} test samples",
    sep="\n"
)

Train Inpu Shape : (60000, 28, 28, 1) 
60000 train samples 
10000 test samples


In [11]:
# Number of target classes; also read by get_model for the output layer width.
num_classes = 10

# Convert the integer labels with keras.utils.to_categorical.
# NOTE: this rebinds the same names, so the cell is not safe to run twice.
train_target, test_target = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (train_target, test_target)
)

In [12]:
# Per-sample input shape passed to keras.Input in get_model.
input_shape = (28, 28, 1)

In [13]:
def get_model(dropout=0.5):
    """Build and compile the small convolutional classifier used in this notebook.

    The network reads the notebook-level ``input_shape`` and ``num_classes``.

    Args:
        dropout: Rate passed to the ``Dropout`` layer placed before the output layer.

    Returns:
        A compiled ``keras.Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    # Two Conv2D + MaxPool2D stages.
    conv_stages = [
        layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),
        layers.MaxPool2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
        layers.MaxPool2D(pool_size=(2, 2)),
    ]

    # Flatten, dropout, then a softmax layer with one unit per class.
    classifier_head = [
        layers.Flatten(),
        layers.Dropout(dropout),
        layers.Dense(num_classes, activation="softmax"),
    ]

    network = keras.Sequential(
        [keras.Input(shape=input_shape), *conv_stages, *classifier_head]
    )
    network.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network

In [14]:
# Hyperparameters for this run; they are also logged to the experiment later.
PARAMETERS = dict(batch_size=256, epochs=5, dropout=0.5)

# Build the model with the configured dropout rate.
model = get_model(dropout=PARAMETERS['dropout'])

2025-08-21 03:10:24.366282: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [15]:
class ExperimentCallback(keras.callbacks.Callback):
    """Keras callback that forwards each epoch's metrics to an experiment run.

    Args:
        run: Object exposing ``log_metric(name=..., value=..., step=...)``; in this
            notebook it is the active SageMaker ``Run``.
        model: Accepted to keep the existing call signature; it is not stored here.
        test_input: Kept on the instance; not read by this callback.
        test_target: Kept on the instance; not read by this callback.
    """

    def __init__(self, run, model, test_input, test_target):
        # Let the Keras base class initialise its own internal state first.
        super().__init__()
        self.run = run
        self.test_input = test_input
        self.test_target = test_target

    def on_epoch_end(self, epoch, logs=None):
        """Log and print every metric in ``logs`` for the finished epoch.

        Args:
            epoch: Epoch index, used as the metric ``step``.
            logs: Mapping of metric name to value; ``None`` is treated as empty.
        """
        # `logs` defaults to None, so fall back to an empty mapping instead of
        # failing on attribute access.
        for key, value in (logs or {}).items():
            self.run.log_metric(name=key, value=value, step=epoch)
            print(f"{key} -> {value}")

In [16]:
# Train and evaluate the model inside a SageMaker Experiments run, logging the
# hyperparameters, the input files, per-epoch metrics and the final test metrics.
experiment_name = "keras-experiment-lab"

with Run(
    experiment_name=experiment_name,
    sagemaker_session=sagemaker_session
) as run:

    # Rebuild the model so every execution of this cell starts from untrained
    # weights; otherwise re-running the cell would continue training the
    # previous model while still logging the same epoch count.
    model = get_model(PARAMETERS['dropout'])

    # Hyperparameters of this run.
    run.log_parameter('BatchSize', PARAMETERS['batch_size'])
    run.log_parameter('Epochs', PARAMETERS['epochs'])
    run.log_parameter('Dropout', PARAMETERS['dropout'])

    # Register the dataset files as inputs of the run.
    run.log_file(train_input_path, is_output=False)
    run.log_file(test_input_path, is_output=False)
    run.log_file(train_target_path, is_output=False)
    run.log_file(test_target_path, is_output=False)

    # ExperimentCallback logs the metrics Keras reports at the end of each epoch.
    model.fit(
        train_input,
        train_target,
        batch_size=PARAMETERS['batch_size'],
        epochs=PARAMETERS['epochs'],
        validation_split=0.1,
        callbacks=[ExperimentCallback(run, model, test_input, test_target)]
    )

    # score[0] is the loss and score[1] the accuracy configured in model.compile.
    score = model.evaluate(test_input, test_target)

    print(
        f"Test Loss : {score[0]}",
        f"\nTest Accuracy : {score[1]}"
    )

    run.log_metric(name="TestLoss(CrossEntropy)", value=score[0])
    run.log_metric(name="TestAccuracy", value=score[1])

Epoch 1/5


2025-08-21 03:10:26.219185: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 169344000 exceeds 10% of free system memory.


[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.6800 - loss: 1.0154accuracy -> 0.8435184955596924
loss -> 0.5130194425582886
val_accuracy -> 0.9729999899864197
val_loss -> 0.10410809516906738
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 143ms/step - accuracy: 0.6808 - loss: 1.0131 - val_accuracy: 0.9730 - val_loss: 0.1041
Epoch 2/5
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.9513 - loss: 0.1591accuracy -> 0.9568703770637512
loss -> 0.14031855762004852
val_accuracy -> 0.981166660785675
val_loss -> 0.07154268771409988
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 148ms/step - accuracy: 0.9514 - loss: 0.1591 - val_accuracy: 0.9812 - val_loss: 0.0715
Epoch 3/5
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step - accuracy: 0.9673 - loss: 0.1083accuracy -> 0.9683148264884949
loss -> 0.10398832708597183
val_accuracy -> 0.98416668176651
