# Train a Model with a SageMaker Estimator Training Job and Track It Using Experiments

In this notebook we train a Keras model using the MNIST dataset on a remote SageMaker instance using a training job.


## 개요

- [작업 실행 시 필요한 라이브러리 import](#작업-실행-시-필요한-라이브러리-import)
- [SageMaker 세션과 Role, 사용 버킷 정의](#sagemaker-세션과-role-사용-버킷-정의)
- [Training script 작성](#training-script-작성)
- [Experiment을 생성하고 training job을 실행하기](#experiment을-생성하고-training-job을-실행하기)

## Reference

- https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-experiments/sagemaker_job_tracking/tensorflow_script_mode_training_job.html
- https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/sagemaker.tensorflow.html#tensorflow-estimator
- https://sagemaker.readthedocs.io/en/stable/experiments/sagemaker.experiments.html


### 작업 실행 시 필요한 라이브러리 import


In [None]:
import sys

In [None]:
# Upgrade pip, boto3, sagemaker and tensorflow to their latest released versions.
# pip is invoked through `sys.executable` so the packages are installed for the
# interpreter this notebook kernel is running on.
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install --upgrade boto3
!{sys.executable} -m pip install --upgrade sagemaker
!{sys.executable} -m pip install --upgrade tensorflow

In [None]:
import boto3
import sagemaker

### SageMaker 세션과 Role, 사용 버킷 정의


In [None]:

# boto3 session: provides the low-level SageMaker client and the configured region.
boto_session = boto3.Session()
sm = boto_session.client("sagemaker")
region = boto_session.region_name

# SageMaker SDK session and the execution role used by the jobs launched below.
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()

# SageMaker stores logs, model artifacts, etc. in S3. You can either rely on the
# default bucket of the session or point SageMaker at a bucket you created yourself.
# This exercise uses the default bucket.
#
# If you switch to your own bucket and hit an error, it is most likely because:
#   1. the bucket does not exist (it was never created), or
#   2. the bucket exists but the role lacks read/write permission on it.
default_bucket = sagemaker_session.default_bucket()

In [None]:
# Print each object on its own line: session, role, bucket, client, region.
# The region is a string such as "us-east-2".
print(sagemaker_session, role, default_bucket, sm, region, sep="\n")

### Training script 작성

Here we use a SageMaker Training job to train the model on a remote instance.


In [None]:
# Create the directory that will hold the training script
# (`-p` keeps the command from failing when the directory already exists).
!mkdir -p script

In [None]:
%%writefile ./script/train.py

# 이렇게 하면 script/train.py 파일이 생성되고, 이 아래에 있는 코드들이 이 파일에 작성된다.

import os

import subprocess
import sys

# Best-effort upgrade of the SageMaker SDK before it is imported below.
# pip is run through `sys.executable` so the package is installed for the
# interpreter executing this script rather than whichever `pip` is on PATH.
# A failure is reported but does not abort the script: the SDK version that is
# already installed may still be recent enough.
_pip_upgrade = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-U", "sagemaker"],
    check=False,
)
if _pip_upgrade.returncode != 0:
    print(
        f"WARNING: 'pip install -U sagemaker' exited with code {_pip_upgrade.returncode}; "
        "continuing with the installed version.",
        file=sys.stderr,
    )

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import argparse

from sagemaker.session import Session
from sagemaker.experiments import load_run

import boto3

# Pin the AWS sessions to the region given by the REGION environment variable
# (this notebook sets it through the estimator's `environment` argument).
boto_session = boto3.session.Session(region_name=os.environ["REGION"])
sagemaker_session = Session(boto_session=boto_session)
# Create the S3 client from the same session so it uses the same region as the
# region-suffixed dataset bucket it downloads from.
s3 = boto_session.client("s3")


def parse_args():
    """Parse the training hyperparameters from the command line.

    Recognised options are ``--epochs`` (int, default 1), ``--batch_size``
    (int, default 64) and ``--dropout`` (float, default 0.01).

    Returns:
        The ``(namespace, remaining_args)`` tuple produced by
        ``ArgumentParser.parse_known_args``; unrecognised arguments end up in
        the second element instead of raising an error.
    """
    # (flag, type converter, default value)
    option_specs = (
        ("--epochs", int, 1),
        ("--batch_size", int, 64),
        ("--dropout", float, 0.01),
    )

    cli = argparse.ArgumentParser()
    for flag, converter, fallback in option_specs:
        cli.add_argument(flag, type=converter, default=fallback)

    return cli.parse_known_args()


# Define the Keras callback to log metrics to the run
# The Keras Callback class provides a method on_epoch_end which emits metrics at the end of each epoch.
# All emitted metrics will be logged in the run passed to the callback.
class ExperimentCallback(keras.callbacks.Callback):
    """Keras callback that forwards the metrics of every epoch to an experiment run.

    Args:
        run: Object exposing ``log_metric(name=..., value=..., step=...)``;
            every metric reported at the end of an epoch is logged to it.
        trained_model: Stored as ``self.trained_model``; not read by this class.
        x_test: Stored as ``self.x_test``; not read by this class.
        y_test: Stored as ``self.y_test``; not read by this class.
    """

    def __init__(self, run, trained_model, x_test, y_test):
        # Let the Keras base class initialise its own state before adding ours.
        super().__init__()
        self.run = run
        self.trained_model = trained_model
        self.x_test = x_test
        self.y_test = y_test

    def on_epoch_end(self, epoch, logs=None):
        """Log each entry of ``logs`` to the run with ``epoch`` as the step and print it."""
        # `logs` defaults to None in the signature; treat that as "nothing to log"
        # instead of failing on `None.keys()`.
        for key, value in (logs or {}).items():
            self.run.log_metric(name=key, value=value, step=epoch)
            print("{} -> {}".format(key, value))


def load_data():
    """Download the MNIST NumPy files from S3 and prepare them for training.

    The four ``.npy`` files are fetched with the module-level ``s3`` client from
    the ``sagemaker-example-files-prod-<REGION>`` bucket (``REGION`` is read from
    the environment) into the current working directory.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)``. The images are float32
        arrays divided by 255 with shape ``(N, 28, 28, 1)``; the labels are
        converted with ``keras.utils.to_categorical`` over 10 classes.

    Raises:
        KeyError: If the ``REGION`` environment variable is not set.
    """
    num_classes = 10

    bucket = f"sagemaker-example-files-prod-{os.environ['REGION']}"
    key_prefix = "datasets/image/MNIST/numpy/"

    train_path = "input_train.npy"
    test_path = "input_test.npy"
    train_labels_path = "input_train_labels.npy"
    test_labels_path = "input_test_labels.npy"

    # Each file is stored locally under the same name it has in the bucket.
    for filename in (train_path, test_path, train_labels_path, test_labels_path):
        s3.download_file(bucket, key_prefix + filename, filename)

    x_train = np.load(train_path)
    x_test = np.load(test_path)
    y_train = np.load(train_labels_path)
    y_test = np.load(test_labels_path)

    # Reshape the arrays; -1 lets NumPy infer the number of samples instead of
    # hard-coding the 60000/10000 split sizes.
    x_train = np.reshape(x_train, (-1, 28, 28))
    x_test = np.reshape(x_test, (-1, 28, 28))
    y_train = np.reshape(y_train, (-1,))
    y_test = np.reshape(y_test, (-1,))

    # Scale images to the [0, 1] range
    x_train = x_train.astype("float32") / 255
    x_test = x_test.astype("float32") / 255

    # Make sure images have shape (28, 28, 1)
    x_train = np.expand_dims(x_train, -1)
    x_test = np.expand_dims(x_test, -1)
    print("x_train shape:", x_train.shape)
    print(x_train.shape[0], "train samples")
    print(x_test.shape[0], "test samples")

    # Convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    return x_train, x_test, y_train, y_test


def main():
    """Train a small CNN on MNIST, log its metrics to the experiment run and save it.

    Steps:
        1. Read ``epochs``, ``batch_size`` and ``dropout`` via ``parse_args``.
        2. Load the data via ``load_data`` and build/compile the Keras model.
        3. Inside the run returned by ``load_run``: fit the model (per-epoch
           metrics are logged by ``ExperimentCallback``), evaluate it on the
           test set, log the final test loss/accuracy and save the model.
    """
    args, _ = parse_args()
    print("Args are : ", args)
    num_classes = 10
    input_shape = (28, 28, 1)
    x_train, x_test, y_train, y_test = load_data()

    model = keras.Sequential(
        [
            keras.Input(shape=input_shape),
            layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Flatten(),
            layers.Dropout(args.dropout),
            layers.Dense(num_classes, activation="softmax"),
        ]
    )

    model.summary()

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    # `load_run` will use the run defined when calling the estimator.
    with load_run(sagemaker_session=sagemaker_session) as run:
        model.fit(
            x_train,
            y_train,
            batch_size=args.batch_size,
            epochs=args.epochs,
            validation_split=0.1,
            callbacks=[ExperimentCallback(run, model, x_test, y_test)],
        )

        score = model.evaluate(x_test, y_test, verbose=0)
        print("Test loss:", score[0])
        print("Test accuracy:", score[1])

        run.log_metric(name="Final Test Loss", value=score[0])
        run.log_metric(name="Final Test Accuracy", value=score[1])

        # Use the model directory SageMaker advertises through SM_MODEL_DIR and
        # fall back to the conventional path when the variable is not set.
        model.save(os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))


if __name__ == "__main__":
    main()

### Experiment을 생성하고 training job을 실행하기


In [None]:

from sagemaker.tensorflow.estimator import TensorFlow
from sagemaker.experiments.run import Run

In [None]:

experiment_name = "tensorflow-estimator-experiment-practice"

# Hyperparameters: recorded on the run and forwarded to the training script.
batch_size = 256
epochs = 5
dropout = 0.1

with Run(experiment_name=experiment_name, sagemaker_session=sagemaker_session) as run:
    run.log_parameter(name="batch_size", value=batch_size)
    run.log_parameter(name="epochs", value=epochs)
    run.log_parameter(name="dropout", value=dropout)

    # Without an Estimator, the model training code would be written right here
    # (like `def get_model(dropout=0.5):` in ../2.experiment_training/keras_experiment.ipynb).
    # Instead, the TensorFlow Estimator is used so that training runs in an
    # abstracted, managed container.
    # TensorFlow Estimator reference:
    # https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/sagemaker.tensorflow.html#tensorflow-estimator
    #
    # NOTE: the keys of `hyperparameters` must match the arguments declared by
    # parse_args() in the entry-point script ("./script/train.py"), i.e.
    # --epochs (int), --batch_size (int) and --dropout (float), because the
    # Estimator hands these hyperparameters to that script as its arguments.
    est = TensorFlow(
        entry_point="./script/train.py",
        role=role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        framework_version="2.8",  # TensorFlow framework version
        py_version="py39",  # Python version
        model_dir=False,
        hyperparameters={
            "epochs": epochs,
            "batch_size": batch_size,
            "dropout": dropout,
        },
        environment={"REGION": region},
        keep_alive_period_in_seconds=3600,
    )

    # Start the training job.
    # Calling fit() makes SageMaker provision the required infrastructure for this
    # Estimator, create the training job and run the training.
    est.fit()

In [None]:
# 학습 완료시킨 Model을 생성시켜보자.
from sagemaker.tensorflow.model import TensorFlowModel

In [None]:
# Build a TensorFlowModel from the estimator's `model_data`, using the same
# role and TensorFlow version as the training job.
model = TensorFlowModel(
    model_data=est.model_data,
    role=role,
    framework_version="2.8",
)

In [None]:
# Display the estimator's `model_data`, the value handed to TensorFlowModel as `model_data`.
est.model_data