# <B> SageMaker pipeline with `MLflow` </B>
* Container: conda_pytorch_p310
* [Example codes](https://github.com/aws/amazon-sagemaker-examples/tree/main/sagemaker-mlflow)

## AutoReload

In [1]:
# Restore all variables previously persisted with `%store` into this kernel.
# The cells below read names that are never assigned in this notebook, so they
# presumably come from here — TODO confirm which notebook stores them.
%store -r

In [2]:
# Load IPython's autoreload extension and enable mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Echo the configuration values this notebook expects to already exist in the
# kernel (presumably restored by `%store -r` — verify against the notebook that
# stores them). Each line is printed as "<name> : <value>".
_restored_settings = (
    ("test_model_id", test_model_id),
    ("bucket", bucket),
    ("prefix", prefix),
    ("model_weight_path", model_weight_path),
    ("training_input_path", training_input_path),
    ("test_input_path", test_input_path),
    ("local_training_input_path", local_training_input_path),
    ("local_test_input_path", local_test_input_path),
    ("tracking_server_arn", tracking_server_arn),
    ("experiment_name", experiment_name),
    ("registered_model", registered_model),
)
for _label, _value in _restored_settings:
    print(f"{_label} : {_value}")

test_model_id : MLP-KTLim/llama-3-Korean-Bllossom-8B
bucket : sagemaker-us-west-2-322537213286
prefix : sagemaker/llama-3-1-kor-bllossom-8b
model_weight_path : s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/model_weight/MLP-KTLim/llama-3-Korean-Bllossom-8B
training_input_path : s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/gemini_result_kospi_0517/train/train_dataset.json
test_input_path : s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/gemini_result_kospi_0517/test/test_dataset.json
local_training_input_path : /home/ec2-user/SageMaker/2024/llama-3-on-sagemaker/dataset/train
local_test_input_path : /home/ec2-user/SageMaker/2024/llama-3-on-sagemaker/dataset/test
tracking_server_arn : arn:aws:sagemaker:us-west-2:322537213286:mlflow-tracking-server/mlflow-tracking-240801
experiment_name : llama-3-1-kor-bllossom-8b-240801
registered_model : llama-3-1-kor-bllossom-8b


## 1. Pipeline definition

In [4]:
import os
import time
import mlflow
import argparse
from os import path
from pprint import pprint

import sagemaker
from pathlib import Path
from time import strftime

from sagemaker import ModelPackage
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.steps import CacheConfig, ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline_context import PipelineSession, LocalPipelineSession

from sagemaker.workflow.retry import StepRetryPolicy, StepExceptionTypeEnum, SageMakerJobExceptionTypeEnum, SageMakerJobStepRetryPolicy

# Resolve the execution role, open a default SageMaker session, and read the
# region that session is bound to.
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region_name = sess.boto_region_name

# Report both values in a single write; the emitted text is the same two lines.
print(
    f"sagemaker role arn: {role}\n"
    f"sagemaker session region: {region_name}"
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


sagemaker role arn: arn:aws:iam::322537213286:role/service-role/AmazonSageMaker-ExecutionRole-20230604T222555
sagemaker session region: us-west-2


In [5]:
# Step caching: enabled, with an expiry of "T48H".
cache_config = CacheConfig(enable_caching=True, expire_after="T48H")

# Retry policy for SageMaker job steps: when a RESOURCE_LIMIT exception occurs,
# retry every 60 seconds with no back-off growth (rate 1.0), giving up after
# 180 minutes.
retry_policies = [
    SageMakerJobStepRetryPolicy(
        exception_types=[SageMakerJobExceptionTypeEnum.RESOURCE_LIMIT],
        interval_seconds=60,
        backoff_rate=1.0,
        expire_after_mins=180,
    ),
]

In [18]:
## Environment settings

# Model package group name: "MPG-" followed by the registered model name.
model_package_name = f"MPG-{registered_model}"

# Endpoint name, suffixed with the current epoch second.
endpoint_name = f"endpoint--{model_package_name}-{int(time.time())}"

# Instance type and in-container path prefix handed to the deployment
# processing job.
deployexecution_instance = "ml.m5.xlarge"
proc_prefix_path = "/opt/ml/processing"

# Instance type for serving; alternatives that were tried:
#   "ml.g5.4xlarge", "ml.g5.xlarge"
serving_instance_type = "ml.g5.12xlarge"

instance_count = 1
max_run = 86400  # 24 * 60 * 60 — one day if the unit is seconds (TODO confirm)

print(f"model_package_name : {model_package_name}")
print(f"serving_instance_type : {serving_instance_type}")

model_package_name : MPG-llama-3-1-kor-bllossom-8b
serving_instance_type : ml.g5.12xlarge


In [56]:
# Choose the pipeline session: the regular PipelineSession by default, or a
# LocalPipelineSession configured for local code when the serving instance
# type is the special value 'local_gpu'.
if serving_instance_type != 'local_gpu':
    pipeline_session = PipelineSession()
else:
    pipeline_session = LocalPipelineSession()
    pipeline_session.config = {'local': {'local_code': True}}

# Display the selected session object as the cell output.
pipeline_session

<sagemaker.workflow.pipeline_context.PipelineSession at 0x7fb3330433a0>

In [62]:
from sagemaker import PipelineModel
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the Hugging Face LLM serving image URI (version 2.0.2) using `sess`.
llm_image = get_huggingface_llm_image_uri(
    "huggingface",
    session=sess,
    version="2.0.2",
)
print(f"llm image uri: {llm_image}")

# GPU count passed to the container per serving instance type; any type not
# listed here falls back to a single GPU.
gpus_by_instance_type = {
    "ml.p4d.24xlarge": 8,
    "ml.g5.12xlarge": 4,
    "ml.g5.4xlarge": 1,
}
num_GPUSs = gpus_by_instance_type.get(serving_instance_type, 1)
print(f"{serving_instance_type} and # of GPU {num_GPUSs} is set")

# Local-time stamp used to make the model name unique per run of this cell.
# `time` is already imported in the notebook's import cell.
currentTime = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
print("The current time is", currentTime)

health_check_timeout = 600  # 600 s = 10 minutes if the unit is seconds; not used further in this cell
model_name = f"llama3-model-{currentTime}"

# Environment variables handed to the serving container.
config = {
    "HF_MODEL_ID": "/opt/ml/model",       # Path to the model in the container
    "SM_NUM_GPUS": f"{num_GPUSs}",        # Number of GPU used per replica
    "MAX_INPUT_LENGTH": "2048",           # Max length of input text
    "MAX_TOTAL_TOKENS": "4096",           # Max length of the generation (including input text)
    "MAX_BATCH_PREFILL_TOKENS": "4096",   # Limits the number of tokens that can be processed in parallel during the generation
    "MESSAGES_API_ENABLED": "true",       # Enable the OpenAI Messages API
}

# Model definition bound to the pipeline session, so it is only materialized
# when the pipeline runs.
llm_model = HuggingFaceModel(
    role=role,
    name=model_name,
    # Points at the packaged archive model.tar.gz under `compressed_model_path`.
    # NOTE(review): `compressed_model_path` is not assigned in this notebook;
    # presumably it is restored by `%store -r` — confirm.
    model_data=f"{compressed_model_path}/model.tar.gz",
    image_uri=llm_image,
    sagemaker_session=pipeline_session,
    env=config,
)

# Wrap the single model in a PipelineModel so it can be registered as a
# model package.
pipeline_model = PipelineModel(
    models=[llm_model],
    role=role,
    sagemaker_session=pipeline_session,
)

# Registration arguments. approval_status accepts "Approved", "Rejected" or
# "PendingManualApproval" (the default).
register_model_step_args = pipeline_model.register(
    content_types=["*"],
    response_types=["application/json"],
    inference_instances=["ml.g5.2xlarge", "ml.g5.12xlarge"],
    model_package_group_name=model_package_name,
    approval_status="PendingManualApproval",
)

# Pipeline step that performs the registration.
registration_process = ModelStep(
    name="ModelRegistrationProcess",
    step_args=register_model_step_args,
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py310
INFO:sagemaker.image_uris:Defaulting to only supported image scope: gpu.


llm image uri: 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.3.0-tgi2.0.2-gpu-py310-cu121-ubuntu22.04
ml.g5.12xlarge and # of GPU 4 is set
The current time is 2024-08-02-03-32-23


In [63]:
from sagemaker.processing import FrameworkProcessor

# Processor that runs the deployment script as a processing job on a PyTorch
# framework image.
deploy_processor = FrameworkProcessor(
    estimator_cls=PyTorch,
    framework_version="2.0.0",
    py_version="py310",
    image_uri=None,  # no explicit image: the SDK retrieves one from framework/version/instance type
    role=role,
    instance_type=deployexecution_instance,
    instance_count=instance_count,
    base_job_name="deploy",  # name shown in the bucket (inside a pipeline, the pipeline-defined name is shown instead)
    sagemaker_session=pipeline_session,
)

# Arguments for src/deploy/deploy.py.
# NOTE(review): the flag "--depoly_instance_type" looks misspelled but is kept
# as-is because it must match deploy.py's argument parser, which is not
# visible here — confirm before renaming.
step_deploy_args = deploy_processor.run(
    code="deploy.py",
    source_dir="src/deploy/",
    arguments=[
        "--prefix_deploy", proc_prefix_path,
        "--region", region_name,
        "--instance_type", serving_instance_type,
        "--depoly_instance_type", serving_instance_type,
        "--model_package_group_name", model_package_name,
        "--endpoint_name", endpoint_name,
        "--execution_role", role,
    ],
    job_name="deploy",
)

# The deployment step must wait for model registration. The dependency has to
# reference `registration_process`, the ModelStep created in the registration
# cell; no variable named `register_process` exists in this notebook.
deploy_process = ProcessingStep(
    name="DeployProcess",  # processing job name
    step_args=step_deploy_args,
    depends_on=[registration_process],
    cache_config=cache_config,
    retry_policies=retry_policies,
)

print ("  \n== Deploy Step ==")

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


  
== Deploy Step ==


In [64]:
# Assemble the pipeline from the registration and deployment steps; it shares
# its name with the model package group.
pipeline = Pipeline(
    steps=[registration_process, deploy_process],
    name=model_package_name,
    sagemaker_session=pipeline_session,
)

# Submit the pipeline definition to the SageMaker Pipelines service, then
# start an execution.
pipeline.upsert(role_arn=role)
execution = pipeline.start()

# Description of the execution fetched right after start; shown as the cell
# output.
desc = execution.describe()
desc

INFO:sagemaker.processing:Uploaded src/deploy/ to s3://sagemaker-us-west-2-322537213286/MPG-llama-3-1-kor-bllossom-8b/code/98519768ad2a31f79f83576f9448c86a/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-west-2-322537213286/MPG-llama-3-1-kor-bllossom-8b/code/be2cbd35f419f747dca87a487b64f344/runproc.sh
INFO:sagemaker.processing:Uploaded src/deploy/ to s3://sagemaker-us-west-2-322537213286/MPG-llama-3-1-kor-bllossom-8b/code/98519768ad2a31f79f83576f9448c86a/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-west-2-322537213286/MPG-llama-3-1-kor-bllossom-8b/code/be2cbd35f419f747dca87a487b64f344/runproc.sh


{'PipelineArn': 'arn:aws:sagemaker:us-west-2:322537213286:pipeline/MPG-llama-3-1-kor-bllossom-8b',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-west-2:322537213286:pipeline/MPG-llama-3-1-kor-bllossom-8b/execution/q6mqjupfcooh',
 'PipelineExecutionDisplayName': 'execution-1722569547718',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 8, 2, 3, 32, 27, 658000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 8, 2, 3, 32, 27, 658000, tzinfo=tzlocal()),
 'CreatedBy': {'IamIdentity': {'Arn': 'arn:aws:sts::322537213286:assumed-role/AmazonSageMaker-ExecutionRole-20230604T222555/SageMaker',
   'PrincipalId': 'AROAUWGFXJVTJPN7KR2AE:SageMaker'}},
 'LastModifiedBy': {'IamIdentity': {'Arn': 'arn:aws:sts::322537213286:assumed-role/AmazonSageMaker-ExecutionRole-20230604T222555/SageMaker',
   'PrincipalId': 'AROAUWGFXJVTJPN7KR2AE:SageMaker'}},
 'ResponseMetadata': {'RequestId': '64664e6c-9b94-48ee-8f4b-a690d512356c',
  'HTTPStatusCode': 200,
  'HTTPH

### Approve the model package

In [65]:
# Display the model package group name that the following cells query.
model_package_name

'MPG-llama-3-1-kor-bllossom-8b'

In [80]:
import boto3

sm_client = boto3.client("sagemaker")

# List the model packages in the group, newest first.
sm_model_package = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_name,
    SortBy="CreationTime",
    SortOrder="Descending",
)

# The list is empty until a registration has produced a package (for example
# when this cell runs right after the pipeline was started). Fail with an
# actionable message instead of a bare IndexError.
model_package_summaries = sm_model_package["ModelPackageSummaryList"]
if not model_package_summaries:
    raise RuntimeError(
        f"No model packages found in group '{model_package_name}'. "
        "Wait for the pipeline's registration step to finish, then re-run this cell."
    )

# The first entry is the most recently created package.
latest_model_package = model_package_summaries[0]
model_package_arn = latest_model_package["ModelPackageArn"]
ModelApprovalStatus = latest_model_package["ModelApprovalStatus"]

print(f"ModelPackageArn : {model_package_arn}")
print(f"ModelApprovalStatus : {ModelApprovalStatus}")

ModelPackageArn : arn:aws:sagemaker:us-west-2:322537213286:model-package/MPG-llama-3-1-kor-bllossom-8b/2
ModelApprovalStatus : Approved


In [68]:
### Approve
# Set the approval status of the selected model package to "Approved".
model_package_update_input_dict = dict(
    ModelPackageArn=model_package_arn,
    ModelApprovalStatus="Approved",
)
model_package_update_response = sm_client.update_model_package(
    **model_package_update_input_dict
)


In [81]:
# Pipeline ARN taken from the description captured when the execution started.
pipeline_arn = desc["PipelineArn"]
print("PipelineArn: ", pipeline_arn)

# Freshly fetched description of the same execution.
current_description = execution.describe()
print(current_description)

PipelineArn:  arn:aws:sagemaker:us-west-2:322537213286:pipeline/MPG-llama-3-1-kor-bllossom-8b
{'PipelineArn': 'arn:aws:sagemaker:us-west-2:322537213286:pipeline/MPG-llama-3-1-kor-bllossom-8b', 'PipelineExecutionArn': 'arn:aws:sagemaker:us-west-2:322537213286:pipeline/MPG-llama-3-1-kor-bllossom-8b/execution/q6mqjupfcooh', 'PipelineExecutionDisplayName': 'execution-1722569547718', 'PipelineExecutionStatus': 'Executing', 'PipelineExperimentConfig': {'ExperimentName': 'mpg-llama-3-1-kor-bllossom-8b', 'TrialName': 'q6mqjupfcooh'}, 'CreationTime': datetime.datetime(2024, 8, 2, 3, 32, 27, 658000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2024, 8, 2, 3, 32, 27, 658000, tzinfo=tzlocal()), 'CreatedBy': {'IamIdentity': {'Arn': 'arn:aws:sts::322537213286:assumed-role/AmazonSageMaker-ExecutionRole-20230604T222555/SageMaker', 'PrincipalId': 'AROAUWGFXJVTJPN7KR2AE:SageMaker'}}, 'LastModifiedBy': {'IamIdentity': {'Arn': 'arn:aws:sts::322537213286:assumed-role/AmazonSageMaker-ExecutionRo

In [85]:
# Korean prompt used as the user message in the request below; roughly:
# "Can you put together a tour course of Seoul's famous sights?"
user_prompt = '서울의 유명한 관광 코스를 만들어줄래?'

In [88]:
# Chat-style request payload: a single user message plus generation settings.
request_body = {
    "messages": [
        {"role": "user", "content": f"{user_prompt}"},
    ],
    "model": "meta-llama-3-fine-tuned",
    "parameters": {
        # Exactly one "max_tokens" entry: a dict literal silently keeps only
        # the last value of a repeated key, so listing it twice (256 and 1024)
        # was misleading. 1024 is the value that was actually in effect.
        "max_tokens": 1024,
        "top_p": 0.9,
        "temperature": 0.6,
        "stop": ["<|eot_id|>"],
    },
    # NOTE(review): with the Messages API enabled, the container may expect
    # max_tokens / top_p / temperature / stop at the top level of the payload
    # rather than nested under "parameters" — confirm against the TGI docs.
}

In [89]:
import json

# Runtime client used to call the deployed endpoint.
sagemaker_runtime = boto3.client('sagemaker-runtime')

# Time the round trip: payload serialization, invocation, and reading/decoding
# the response body.
s = time.perf_counter()
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=json.dumps(request_body),
)
result = response['Body'].read().decode('utf-8')
elapsed_async = time.perf_counter() - s

print(f"elapsed time: {round(elapsed_async,3)} second")

# Extract the message text of the first choice from the JSON reply and show it
# as the cell output.
parsed_data = json.loads(result)
first_choice = parsed_data["choices"][0]
answer = first_choice["message"]["content"].strip()
answer

elapsed time: 1.406 second


'OfFile(서울의 유명한 관광 코스)\n\n1. 경복궁\n- 경복궁 약도: 경복궁 - 인사동 - 고̉스_CO.commands/Cheese\\d ilişkinonuangsyou.txt\n 경복궁은 조선 시대의 대표적인 궁궐로, 한국의 역사와 문화를 느낄 수 있는 곳입니다. 경복궁의 건물과 정원, 그리고 고궁박물관을 방문할 수 있습니다.\n\n2.'

### Resource 삭제

In [90]:
from sagemaker.predictor import Predictor

# Handle to the deployed endpoint, bound to the regular session `sess`; the
# cleanup cells below call its delete methods.
predictor = Predictor(endpoint_name=endpoint_name, sagemaker_session=sess)

In [92]:
# Best-effort model deletion: any failure is printed instead of raised.
# Run this before deleting the endpoint — the recorded error output shows that
# delete_model looks up the endpoint configuration (DescribeEndpointConfig),
# which no longer exists once the endpoint has been deleted.
try:
    print(f"Deleting model: {model_name}")
    predictor.delete_model()
except Exception as delete_error:
    print(f"{delete_error}")


Deleting model: llama3-model-2024-08-02-03-32-23
An error occurred (ValidationException) when calling the DescribeEndpointConfig operation: Could not find endpoint configuration "endpoint--MPG-llama-3-1-kor-bllossom-8b-1722564633".


In [91]:

# Best-effort endpoint deletion: any failure is printed instead of raised.
# Per the recorded log output, this also deletes the endpoint configuration.
# Messages are plain text because built-in print() does not interpret
# Rich-style markup such as "[b magenta]", and the success mark is printed
# only after the delete call has returned.
try:
    print(f"Deleting endpoint: {predictor.endpoint_name}")
    predictor.delete_endpoint()
    print(f"Deleted endpoint: {predictor.endpoint_name} ✅")
except Exception as e:
    print(f"{e}")

print("---" * 10)
print("Done")

INFO:sagemaker:Deleting endpoint configuration with name: endpoint--MPG-llama-3-1-kor-bllossom-8b-1722564633


Deleting endpoint: [b magenta]endpoint--MPG-llama-3-1-kor-bllossom-8b-1722564633 ✅


INFO:sagemaker:Deleting endpoint with name: endpoint--MPG-llama-3-1-kor-bllossom-8b-1722564633


------------------------------
Done
