# Model Training with Experiments

Track an experiment while training a Keras model locally


## 개요

- [작업 실행 시 필요한 라이브러리 import](#작업-실행-시-필요한-라이브러리-import)
- [SageMaker 세션과 Role, 사용 버킷 정의](#sagemaker-세션과-role-사용-버킷-정의)
- [모델 학습에 사용될 데이터 준비](#모델-학습에-사용될-데이터-준비)
- [모델 구축](#모델-구축)
- [SageMaker Experiment(실험) 설정](#sagemaker-experiment실험-설정)
  - [Define the Keras callback to log metrics to the run](#define-the-keras-callback-to-log-metrics-to-the-run)
  - [Train the model in the notebook and track it in an Experiment](#train-the-model-in-the-notebook-and-track-it-in-an-experiment)

## Reference

- https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-experiments/local_experiment_tracking/keras_experiment.html

- https://sagemaker.readthedocs.io/en/stable/experiments/sagemaker.experiments.html


### 작업 실행 시 필요한 라이브러리 import


In [None]:
import sys

In [None]:
# Upgrade pip, boto3, sagemaker and tensorflow in the kernel's own interpreter
# (sys.executable) so the notebook runs against the latest SDK versions.
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install --upgrade boto3
!{sys.executable} -m pip install --upgrade sagemaker
!{sys.executable} -m pip install --upgrade tensorflow
# NOTE(review): protobuf is pinned to 3.20.3 last, after the upgrades above --
# presumably for compatibility with the upgraded packages; confirm the reason.
!{sys.executable} -m pip install protobuf==3.20.3

In [None]:
import boto3
import sagemaker
from sagemaker.experiments.run import Run

### SageMaker 세션과 Role, 사용 버킷 정의


In [None]:

# Sessions used throughout the notebook: the SageMaker SDK session and a plain
# boto3 session.
sagemaker_session = sagemaker.session.Session()
boto_session = boto3.Session()

# Execution role returned by the SageMaker SDK for this environment.
role = sagemaker.get_execution_role()

# When using SageMaker, logs, model artifacts, etc. are stored in S3. You can
# use the S3 bucket that SageMaker designates by default, or set up your own
# bucket and tell SageMaker to use it. This exercise uses the default bucket.
#
# If you use a bucket you created yourself and get an error, it is most likely
# one of these two cases:
#   1. The bucket does not exist, i.e. you are trying to use a bucket that was
#      never created.
#   2. The bucket exists, but the role has no read/write permission on it.
default_bucket = sagemaker_session.default_bucket()

# Region of the boto3 session, and a SageMaker client created from that session.
region = boto_session.region_name
sm = boto_session.client("sagemaker")

In [None]:
# Print each object created above on its own line, in the same order as before.
# The last value is the region, e.g. us-east-2.
for value in (sagemaker_session, role, default_bucket, sm, region):
    print(value)

### 모델 학습에 사용될 데이터 준비

Here we use the MNIST dataset, downloaded as NumPy arrays (`.npy` files) from the region-specific SageMaker example-files S3 bucket.


In [None]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd

In [None]:
# Create the local "datasets" directory that the .npy files are downloaded into
# (-p: no error if it already exists).
!mkdir -p datasets

In [None]:

s3 = boto3.client("s3")

# Local destinations for the four MNIST arrays.
train_path = "datasets/input_train.npy"
test_path = "datasets/input_test.npy"
train_labels_path = "datasets/input_train_labels.npy"
test_labels_path = "datasets/input_test_labels.npy"

# Bucket and data that AWS provides by default for the SageMaker examples; the
# bucket name is region-specific. The name is built once on a single line: an
# f-string whose replacement field is broken across lines is a SyntaxError
# before Python 3.12.
example_bucket = f"sagemaker-example-files-prod-{region}"
example_prefix = "datasets/image/MNIST/numpy"

# (object name under the prefix, local destination) for every file to fetch.
downloads = (
    ("input_train.npy", train_path),
    ("input_test.npy", test_path),
    ("input_train_labels.npy", train_labels_path),
    ("input_test_labels.npy", test_labels_path),
)

for file_name, local_path in downloads:
    s3.download_file(example_bucket, f"{example_prefix}/{file_name}", local_path)

In [None]:
# Load the downloaded arrays and give them their expected shapes.
# The preprocessing below exists for the deep-learning model; think of it as
# the part the research team (machine learning engineers, researchers) takes
# care of, so it is not explained in detail here.
x_train = np.load(train_path).reshape(60000, 28, 28)
x_test = np.load(test_path).reshape(10000, 28, 28)
y_train = np.load(train_labels_path).reshape(60000)
y_test = np.load(test_labels_path).reshape(10000)

# Scale pixel values to the [0, 1] range as float32.
x_train = x_train.astype("float32") / 255
x_test = x_test.astype("float32") / 255

# Append a trailing channel axis so each image has shape (28, 28, 1).
x_train = x_train[..., np.newaxis]
x_test = x_test[..., np.newaxis]

print("x_train shape:", x_train.shape)
print(len(x_train), "train samples")
print(len(x_test), "test samples")

In [None]:
# Convert the class vectors of both splits to binary class matrices with
# num_classes columns.
num_classes = 10

y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes) for labels in (y_train, y_test)
)

In [None]:
# Shape of one input image, (28, 28, 1); get_model passes it to keras.Input.
input_shape = (28, 28, 1)

### 모델 구축


In [None]:

def get_model(dropout=0.5):
    """Build and compile the Keras image-classification model.

    The network reads the notebook-level ``input_shape`` and ``num_classes``
    values. It stacks two Conv2D/MaxPooling2D stages, flattens the result,
    applies dropout and ends in a softmax Dense layer.

    Args:
        dropout (float): Rate passed to the Dropout layer. Defaults to 0.5.

    Returns:
        keras.Sequential: The model, already compiled with the "adam"
        optimizer, "categorical_crossentropy" loss and the "accuracy" metric.
    """
    inputs = keras.Input(shape=input_shape)

    # Two convolution + pooling stages.
    conv_stages = [
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
    ]

    # Flatten, regularize with dropout, then classify.
    classifier_head = [
        layers.Flatten(),
        layers.Dropout(dropout),
        layers.Dense(num_classes, activation="softmax"),
    ]

    network = keras.Sequential([inputs, *conv_stages, *classifier_head])
    network.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network

In [None]:

# Training hyperparameters; `dropout` is also the rate the model is built with.
batch_size, epochs, dropout = 256, 8, 0.3

model = get_model(dropout=dropout)

### SageMaker Experiment(실험) 설정


In [None]:
# Name of the SageMaker Experiment. It is passed to Run() further down to
# initialize an experiment run for tracking the training job.

experiment_name = "local-keras-experiment"

#### Define the Keras callback to log metrics to the run


In [None]:
class ExperimentCallback(keras.callbacks.Callback):
    """Keras callback that logs each epoch's metrics to an experiment run.

    The Keras Callback class provides ``on_epoch_end``, which receives the
    metrics emitted at the end of each epoch. Every metric received is logged
    to the run passed to the constructor.

    Args:
        run: Run object whose ``log_metric`` method receives the metrics.
        trained_model: Model being trained; stored but not used by the callback.
        x_test: Test inputs; stored but not used by the callback.
        y_test: Test labels; stored but not used by the callback.
    """

    def __init__(self, run, trained_model, x_test, y_test):
        # Let the Keras base class initialize its own state first.
        super().__init__()
        self.run = run
        self.trained_model = trained_model
        self.x_test = x_test
        self.y_test = y_test

    def on_epoch_end(self, epoch, logs=None):
        """Log and print every metric reported for the epoch that just ended.

        Args:
            epoch (int): Index of the finished epoch; used as the metric step.
            logs (dict | None): Mapping of metric name to value. ``None`` (the
                declared default) is treated as an empty mapping instead of
                raising AttributeError.
        """
        for key, value in (logs or {}).items():
            self.run.log_metric(name=key, value=value, step=epoch)
            print(f"{key} -> {value}")

#### Train the model in the notebook and track it in an Experiment

Here we train the Keras model locally, on the instance that this notebook is running on, while logging its parameters, input files and metrics to an experiment run.


In [None]:
# Run the experiment with the specified experiment name and SageMaker session.
#   - experiment_name (str): the name of the experiment.
#   - sagemaker_session (sagemaker.session.Session): the SageMaker session the
#     run uses.
# The `with` statement binds the experiment run object to `run`.
# For more information on the Run class, see the SageMaker Python SDK docs:
#     https://sagemaker.readthedocs.io/en/stable/experiments/sagemaker.experiments.html
with Run(experiment_name=experiment_name, sagemaker_session=sagemaker_session) as run:
    # Record the hyperparameters of this run with log_parameter.
    run.log_parameter("batch_size", batch_size)
    run.log_parameter("epochs", epochs)
    run.log_parameter("dropout", dropout)

    # Record the dataset files with log_file as inputs (is_output=False),
    # reusing the paths they were downloaded to instead of re-typing them.
    for input_path in (train_path, test_path, train_labels_path, test_labels_path):
        run.log_file(input_path, is_output=False)

    # Train the Keras model locally on the instance this notebook runs on.
    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.1,
        # The callback's on_epoch_end runs at the end of every epoch, which is
        # handy for anything that should happen once per epoch; here it logs
        # the epoch's metrics to the run.
        callbacks=[ExperimentCallback(run, model, x_test, y_test)],
    )

    # Once fit has finished, evaluate on the test set.
    score = model.evaluate(x_test, y_test)

    print("test loss : ", score[0])
    print("test accuracy : ", score[1])

    # Log the final test metrics. The accuracy metric name no longer carries
    # the stray trailing space it had before.
    run.log_metric(name="Loss on test data", value=score[0])
    run.log_metric(name="Accuracy on test data", value=score[1])