# SageMaker Pipelines의 Unit Testing 하기 - TextCls

In [2]:
# Load (or reload, if it is already loaded) the IPython autoreload extension.
%reload_ext autoreload
# Mode 2: edits to imported helper modules are picked up live, without
# restarting the kernel.
%autoreload 2

In [3]:
import os

import boto3
import sagemaker
import pandas as pd


from sagemaker.workflow.pipeline_context import LocalPipelineSession, PipelineSession
from sagemaker.workflow.steps import CacheConfig

# Region of the default boto3 session, a default SageMaker session, and the
# execution role of the current environment.
region = boto3.Session().region_name
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()

# Region-pinned boto3 session, a SageMaker client created from it, and the
# default bucket reported by the SageMaker session.
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client("sagemaker")
default_bucket = sagemaker_session.default_bucket()

# Alternative kept for reference: a PipelineSession built from the same pieces.
# Swap it with the LocalPipelineSession below to switch session types.
# pipeline_session = PipelineSession(
#     boto_session=boto_session,
#     sagemaker_client=sagemaker_client,
#     default_bucket=default_bucket,
# )

# Session object handed to the processor defined later in this notebook.
pipeline_session = LocalPipelineSession(
    boto_session=boto_session,
    default_bucket=default_bucket,
)

# Step caching with a "PT12H" expiry.
# NOTE(review): cache_config is reassigned with "PT1H" in section 3-1 before
# any step uses it, so this value never reaches a step.
cache_config = CacheConfig(enable_caching=True, expire_after="PT12H")

In [4]:
# Path of the `pipelines/textcls/` directory under the current working
# directory; the literal keeps its trailing slash.
BASE_DIR = os.path.join(
    os.getcwd(),
    'pipelines/textcls/',
)
BASE_DIR  # bare last expression so the notebook displays the resolved path

'/root/huggingface-230124-p-ixwy0598cnqk/sagemaker-huggingface-230124-p-ixwy0598cnqk-modelbuild/pipelines/textcls/'

In [5]:
from sagemaker.workflow.retry import (
    StepRetryPolicy,
    StepExceptionTypeEnum,
    SageMakerJobStepRetryPolicy,
    SageMakerJobExceptionTypeEnum
)

## 1. 필요 Packages import

### 1-1. Parameters 정의에 필요한 Packages

In [6]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)


### 1-2. Processing에 필요한 Packages

In [7]:
from sagemaker.sklearn.processing import SKLearnProcessor

from sagemaker.processing import (
    ProcessingInput,
    ProcessingOutput,
    ScriptProcessor,
)

from sagemaker.workflow.steps import ProcessingStep
from sagemaker.huggingface import HuggingFaceProcessor
from sagemaker.pytorch import PyTorchProcessor

### 1-3. Training에 필요한 Packages

In [8]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

from sagemaker.workflow.steps import TrainingStep
from sagemaker.huggingface import HuggingFace

### 1-4. Evaluation에 필요한 Packages

In [9]:
from sagemaker.workflow.properties import PropertyFile

### 1-5. Model Metrics에 필요한 Packages

In [10]:
from sagemaker.model_metrics import (
    MetricsSource,
    ModelMetrics,
)

from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import (
    ConditionStep,
)

from sagemaker.workflow.functions import (
    JsonGet,
    Join,
)

from sagemaker.workflow.model_step import ModelStep
from sagemaker.model import Model
from sagemaker.workflow.step_collections import RegisterModel

## 2. get_pipeline의 입력 변수

In [17]:
# Names used when running the pipeline and registering the model package.
model_package_group_name = "TextClsPackageGroup"
pipeline_name = "TextclsPipeline"
# NOTE(review): base_job_prefix is not referenced by any cell visible here.
base_job_prefix = "Textcls"

# Instance types for the processing and training jobs.
processing_instance_type = "ml.m5.xlarge"
training_instance_type = "ml.c5.9xlarge"

# Key prefix inside the default bucket under which processing and training
# results are written.
# s3_input_prefix = 'a2i-output'
s3_output_prefix = 'hf_processing_output'

## 3. 모델 빌딩 파이프라인 스텝(Step) 정의


### 3-1. 모델 빌딩 파이프라인 변수 생성

In [18]:
# Here we define which exceptions to capture and when to retry the step
# Step-level retry policy: which exceptions to capture and when to retry.
step_retry_policy = StepRetryPolicy(
    exception_types=[
        StepExceptionTypeEnum.SERVICE_FAULT,
        StepExceptionTypeEnum.THROTTLING,
    ],
    # multiplier by which the retry interval increases on each attempt
    backoff_rate=2.0,
    # seconds to wait before the first retry attempt
    interval_seconds=30,
    # keep retrying for at most 4 hours
    expire_after_mins=4 * 60,
)

# Job-level retry policy, using the same backoff settings as above.
job_retry_policy = SageMakerJobStepRetryPolicy(
    exception_types=[SageMakerJobExceptionTypeEnum.RESOURCE_LIMIT],
    failure_reason_types=[
        SageMakerJobExceptionTypeEnum.INTERNAL_ERROR,
        SageMakerJobExceptionTypeEnum.CAPACITY_ERROR,
    ],
    # multiplier by which the retry interval increases on each attempt
    backoff_rate=2.0,
    # seconds to wait before the first retry attempt
    interval_seconds=30,
    # keep retrying for at most 4 hours
    expire_after_mins=4 * 60,
)

# Cache configuration attached to the steps below. This assignment replaces
# the "PT12H" configuration created in the setup cell.
cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")

# Parameters for pipeline execution.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
training_instance_count = ParameterInteger(
    name="TrainingInstanceCount",
    default_value=1,
)
# Instance types are plain strings (same values as in section 2), not
# pipeline Parameter objects.
processing_instance_type = "ml.m5.xlarge"
training_instance_type = "ml.c5.9xlarge"
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

### 3-2. 전처리 스텝 단계 정의

크게 아래와 같은 순서로 정의 합니다.
- 프로세싱 오브젝트 정의 (PyTorchProcessor)
- 프로세싱 스텝 정의
    - 입력 데이터 세트
        - source: S3 경로 (input_data_uri)
        - destination: 도커 컨테이너의 내부 폴더 위치
    - 출력 위치
        - 훈련 전처리 데이터 결과 위치
        - 테스트 전처리 데이터 결과 위치
    - 프로세싱 코드
    - 프로세싱 코드에 넘길 인자 


In [19]:
# processing step for feature engineering
# Processor that runs the feature-engineering script inside a PyTorch
# framework container, bound to the pipeline session.
pre_processor = PyTorchProcessor(
    framework_version='1.8',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name='PreprocessingforHF',
    sagemaker_session=pipeline_session,
)

# One output channel per split: the container directory
# /opt/ml/processing/output/<split>/ is uploaded to
# s3://<default_bucket>/<s3_output_prefix>/<split>.
processor_args = pre_processor.run(
    code='processing-script.py',
    source_dir='scripts',
    outputs=[
        ProcessingOutput(
            output_name=split,
            source=f'/opt/ml/processing/output/{split}/',
            destination=f's3://{default_bucket}/{s3_output_prefix}/{split}',
        )
        for split in ('train', 'test')
    ],
)

# Wrap the processor arguments in a cached pipeline step.
step_process = ProcessingStep(
    name="PrepareAugmentedData",
    step_args=processor_args,
    cache_config=cache_config,
)

훈련의 입력으로 사용할 이전 단계의 Processing 결과는 아래 형태로 제공됩니다.
- `step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri`

In [20]:
import sm_pipelines_exec as sm_exec

# Objects forwarded to the helper as the pipeline's parameter list.
# NOTE(review): the two instance-type entries are plain strings rather than
# pipeline Parameter objects -- confirm the helper and the SDK tolerate that.
test_parameters_list = [
    processing_instance_count,
    processing_instance_type,
    training_instance_type,
    training_instance_count,
    model_approval_status,
]

# Unit test #1: a pipeline consisting of the preprocessing step only.
test_steps_list_process = [step_process]

execution_preprocess = sm_exec.exec_pipelines(
    pipeline_name,
    role,
    test_parameters_list,
    test_steps_list_process,
)


Job Name:  PreprocessingforHF-2023-01-24-06-18-08-162
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/b6cbd1cc37103520feed9d178735f315/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/02a51a5d6de3e6b6893a9dd1b1a3f46e/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-687314952804/hf_processing_output/train', 'LocalPath': '/opt/ml/processing/output/train/', 'S3UploadMode': 'EndOfJob'}}, {'OutputN

In [24]:
# Report the status of the preprocessing-only run and its step results
# (see the cell output below).
sm_exec.describe_pipelines(execution_preprocess)
sm_exec.get_step_results(execution_preprocess, test_steps_list_process)

Pipelines Status : Succeeded 

- StepName : PrepareAugmentedData, StepStatus : Succeeded
ProcessingStep(name='PrepareAugmentedData', display_name=None, description=None, step_type=<StepTypeEnum.PROCESSING: 'Processing'>, depends_on=None)

 --------------------------------------------------



## 4. 모델 학습을 위한 학습단계 정의 

In [25]:
# hyperparameters, which are passed into the training job
# Hyperparameters passed into the training job.
hyperparameters = {
    'epochs': 1,
    'train_batch_size': 32,
    'model_name': 'distilbert-base-uncased',
}

# Hugging Face training container (PyTorch 1.10.2 / Transformers 4.17.0 /
# py38), addressed in the region of the current session.
train_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04"

# S3 location passed to the estimator as its output path.
model_path = f"s3://{default_bucket}/{s3_output_prefix}/train_result"

# Stricter, numeric-only alternative to the patterns below, kept for reference:
# metric_definitions = [
#     {'Name': 'TrainLoss', 'Regex': r'\'loss\':([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?),'},
#     {'Name': 'EvalLoss', 'Regex': r'\'eval_loss\':([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?),'},
#     {'Name': 'EvalAcc', 'Regex': r'\'eval_accuracy\':([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?),'},
#     {'Name': 'EvalF1', 'Regex': r'\'eval_f1\':([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?),'},
#     {'Name': 'EvalPrecision', 'Regex': r'\'eval_precision\':([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?),'},
#     {'Name': 'EvalRecall', 'Regex': r'\'eval_recall\':([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?),'},
# ]

# Regular expressions that capture metric values from the training log.
metric_definitions = [
    {'Name': 'TrainLoss', 'Regex': '{\'loss\': (.*?),'},
    {'Name': 'EvalLoss', 'Regex': '\'eval_loss\': (.*?),'},
    {'Name': 'EvalAcc', 'Regex': '\'eval_accuracy\': (.*?),'},
    {'Name': 'EvalF1', 'Regex': '\'eval_f1\': (.*?),'},
    {'Name': 'EvalPrecision', 'Regex': '\'eval_precision\': (.*?),'},
    {'Name': 'EvalRecall', 'Regex': '\'eval_recall\': (.*?),'},
]

huggingface_estimator = HuggingFace(
    entry_point='train.py',
    source_dir='./scripts',
    instance_type=training_instance_type,
    instance_count=training_instance_count,
    role=role,
    # The explicit image_uri decides which container runs; the version
    # arguments are kept in step with its tags so the estimator's metadata
    # does not contradict the image.
    transformers_version='4.17.0',
    pytorch_version='1.10.2',
    py_version='py38',
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    image_uri=train_image_uri,
    output_path=model_path,
)

# Training step: consumes the "train" and "test" outputs of the processing
# step as CSV channels, retries according to both policies, and is cached.
step_train = TrainingStep(
    name="HuggingFaceModelFineTune",
    estimator=huggingface_estimator,
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="text/csv",
        ),
        "test": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
            content_type="text/csv",
        ),
    },
    retry_policies=[
        step_retry_policy,
        job_retry_policy,
    ],
    cache_config=cache_config,
)



In [26]:
# Unit test #2: preprocessing followed by training.
test_steps_list_train = [
    step_process,
    step_train,
]
execution_train = sm_exec.exec_pipelines(
    pipeline_name,
    role,
    test_parameters_list,
    test_steps_list_train,
)

INFO:sagemaker.processing:Uploaded scripts to s3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/b6cbd1cc37103520feed9d178735f315/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/02a51a5d6de3e6b6893a9dd1b1a3f46e/runproc.sh



Job Name:  PreprocessingforHF-2023-01-24-06-48-07-265
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/b6cbd1cc37103520feed9d178735f315/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/02a51a5d6de3e6b6893a9dd1b1a3f46e/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-687314952804/hf_processing_output/train', 'LocalPath': '/opt/ml/processing/output/train/', 'S3UploadMode': 'EndOfJob'}}, {'OutputN

In [43]:
# Report the status of the preprocessing + training run and its step results
# (see the cell output below).
sm_exec.describe_pipelines(execution_train)
sm_exec.get_step_results(execution_train, test_steps_list_train)

Pipelines Status : Succeeded 

- StepName : HuggingFaceModelFineTune, StepStatus : Succeeded
- StepName : PrepareAugmentedData, StepStatus : Succeeded
- CacheHitResult : {'SourcePipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:687314952804:pipeline/textclspipeline/execution/xlun3svbj05k'} 

ProcessingStep(name='PrepareAugmentedData', display_name=None, description=None, step_type=<StepTypeEnum.PROCESSING: 'Processing'>, depends_on=None)

 --------------------------------------------------

TrainingStep(name='HuggingFaceModelFineTune', display_name=None, description=None, step_type=<StepTypeEnum.TRAINING: 'Training'>, depends_on=None)

 --------------------------------------------------



## 5. Model Register를 위한 단계

In [32]:
# Hugging Face inference container (CPU, PyTorch 1.10.2 / Transformers 4.17.0 /
# py38), addressed in the region of the current session.
inf_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/"
    "huggingface-pytorch-inference:1.10.2-transformers4.17.0-cpu-py38-ubuntu20.04"
)

# Register the artifacts produced by the training step in the model package
# group, with the approval status taken from the pipeline parameter.
step_register = RegisterModel(
    name="RegisterModel",
    estimator=huggingface_estimator,
    image_uri=inf_image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    content_types=["application/json"],
    response_types=["application/json"],
    # instance types recommended by the data scientist for real-time endpoints
    inference_instances=["ml.c5.9xlarge"],
    # instance types recommended by the data scientist for batch transform jobs
    transform_instances=["ml.m5.12xlarge", "ml.c5.9xlarge"],
)

In [33]:
# Unit test #3: preprocessing, training, and model registration.
test_steps_list_register = [
    step_process,
    step_train,
    step_register,
]
execution_register = sm_exec.exec_pipelines(
    pipeline_name,
    role,
    test_parameters_list,
    test_steps_list_register,
)

INFO:sagemaker.processing:Uploaded scripts to s3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/b6cbd1cc37103520feed9d178735f315/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/02a51a5d6de3e6b6893a9dd1b1a3f46e/runproc.sh



Job Name:  PreprocessingforHF-2023-01-24-07-05-18-968
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/b6cbd1cc37103520feed9d178735f315/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-687314952804/TextclsPipeline/code/02a51a5d6de3e6b6893a9dd1b1a3f46e/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-687314952804/hf_processing_output/train', 'LocalPath': '/opt/ml/processing/output/train/', 'S3UploadMode': 'EndOfJob'}}, {'OutputN

In [46]:
# Report the status of the full three-step run and its step results
# (see the cell output below).
sm_exec.describe_pipelines(execution_register)
sm_exec.get_step_results(execution_register, test_steps_list_register)

Pipelines Status : Succeeded 

- StepName : RegisterModel-RegisterModel, StepStatus : Succeeded
- StepName : HuggingFaceModelFineTune, StepStatus : Succeeded
- StepName : PrepareAugmentedData, StepStatus : Succeeded
- CacheHitResult : {'SourcePipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:687314952804:pipeline/textclspipeline/execution/xlun3svbj05k'} 

ProcessingStep(name='PrepareAugmentedData', display_name=None, description=None, step_type=<StepTypeEnum.PROCESSING: 'Processing'>, depends_on=None)

 --------------------------------------------------

TrainingStep(name='HuggingFaceModelFineTune', display_name=None, description=None, step_type=<StepTypeEnum.TRAINING: 'Training'>, depends_on=None)

 --------------------------------------------------

RegisterModel(name='RegisterModel', steps=[_RegisterModelStep(name='RegisterModel-RegisterModel', display_name=None, description=None, step_type=<StepTypeEnum.REGISTER_MODEL: 'RegisterModel'>, depends_on=None)])

 ----------------------