# SageMaker Inference Component 모델 배포 및 추론

아래 모델 배포는 아래의 인스턴스에서 테스트 완료 되었습니다.
- [Pricing Link](https://aws.amazon.com/sagemaker/pricing/)
  

## 환경 구성 

### 상위 폴더의 Python 경로 추가

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%store -r

In [3]:
print(f"tracking_server_arn : {tracking_server_arn}")
print(f"tracking_server_arn : {experiment_name}")
print(f"compressed_model_path : {compressed_model_path}")

tracking_server_arn : arn:aws:sagemaker:us-west-2:322537213286:mlflow-tracking-server/mlflow-tracking-240801
tracking_server_arn : llama-3-1-kor-bllossom-8b-240801
compressed_model_path : s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/checkpoint/MLP-KTLim/llama-3-Korean-Bllossom-8B/llama-3-ml-g5-48xlarge-1-0801-13441722519862/compressed_model


## 추론 이미지 가져오기



In [4]:
import sagemaker
import boto3
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

sess = sagemaker.Session()
sagemaker_client = sess.sagemaker_client
sagemaker_runtime_client = sess.sagemaker_runtime_client


print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::322537213286:role/service-role/AmazonSageMaker-ExecutionRole-20230604T222555
sagemaker session region: us-west-2


In [5]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  session=sess,
  version="2.0.2",
)

# print ecr image uri
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.3.0-tgi2.0.2-gpu-py310-cu121-ubuntu22.04


## SageMaker Endpoint 생성
- EndpointConfiguration 생성
- Endpoint 생성

### 모델을 배포할 인스턴스 정의

In [6]:
instance_type = "ml.g5.12xlarge"
# # instance_type = "ml.g5.4xlarge"
# instance_type = "ml.g5.xlarge"


if instance_type == "ml.p4d.24xlarge":
    num_GPUSs = 8
elif instance_type == "ml.g5.12xlarge":
    num_GPUSs = 4
elif instance_type == "ml.g5.4xlarge":
    num_GPUSs = 1    
else:
    num_GPUSs = 1
    
print(f"{instance_type} and # of GPU {num_GPUSs} is set")

ml.g5.12xlarge and # of GPU 4 is set


### Endpoint config name 및 설정 값 기술

In [7]:
import time

currentTime = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
print("The current time is", currentTime)

# Set an unique endpoint config name
endpoint_config_name = f"llama3-endpoint-config-{currentTime}" 
print(f"Endpoint config name: {endpoint_config_name}")


# Set varient name and instance type for hosting
variant_name = "AllTraffic"
model_data_download_timeout_in_seconds = 600
container_startup_health_check_timeout_in_seconds = 600

initial_instance_count = 1
max_instance_count = 1
print(f"Initial instance count: {initial_instance_count}")
print(f"Max instance count: {max_instance_count}")


The current time is 2024-08-02-01-40-35
Endpoint config name: llama3-endpoint-config-2024-08-02-01-40-35
Initial instance count: 1
Max instance count: 1


### SageMaker Endpoint Configuration 만들기

In [8]:
epc_response = sagemaker_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ExecutionRoleArn=role,
    ProductionVariants=[
        {
            "VariantName": variant_name,
            "InstanceType": instance_type,
            "InitialInstanceCount": 1,
            "ModelDataDownloadTimeoutInSeconds": model_data_download_timeout_in_seconds,
            "ContainerStartupHealthCheckTimeoutInSeconds": container_startup_health_check_timeout_in_seconds,
            "ManagedInstanceScaling": {
                "Status": "ENABLED",
                "MinInstanceCount": initial_instance_count,
                "MaxInstanceCount": max_instance_count,
            },
            "RoutingConfig": {"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
        }
    ]
)

### Endpoint 생성

In [9]:
%%time
# Set a unique endpoint name
endpoint_name = f"llama3-endpoint-{currentTime}"
print(f"endpoint_name: {endpoint_name}")

ep_response = sagemaker_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
# print(ep_response)
print(f"Creating endpoint: {endpoint_name}")
sess.wait_for_endpoint(endpoint_name)

endpoint_name: llama3-endpoint-2024-08-02-01-40-35
Creating endpoint: llama3-endpoint-2024-08-02-01-40-35
-----!CPU times: user 31.1 ms, sys: 8.66 ms, total: 39.8 ms
Wall time: 3min


{'EndpointName': 'llama3-endpoint-2024-08-02-01-40-35',
 'EndpointArn': 'arn:aws:sagemaker:us-west-2:322537213286:endpoint/llama3-endpoint-2024-08-02-01-40-35',
 'EndpointConfigName': 'llama3-endpoint-config-2024-08-02-01-40-35',
 'ProductionVariants': [{'VariantName': 'AllTraffic',
   'CurrentInstanceCount': 1,
   'DesiredInstanceCount': 1,
   'ManagedInstanceScaling': {'Status': 'ENABLED',
    'MinInstanceCount': 1,
    'MaxInstanceCount': 1},
   'RoutingConfig': {'RoutingStrategy': 'LEAST_OUTSTANDING_REQUESTS'}}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2024, 8, 2, 1, 40, 35, 661000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 8, 2, 1, 43, 25, 349000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'f193d65c-b3ce-43fc-8c34-d58257e9355e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f193d65c-b3ce-43fc-8c34-d58257e9355e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '574',
   'date': 'Fri,

## SageMaker Model 생성

### SageMaker Model 정의
- 추론 이미지 기술 
- 모델 아티펙트 경로 기술 

In [10]:
!aws s3 ls $compressed_model_path/

2024-08-01 23:18:32 14120580802 model.tar.gz


In [11]:
from mlflow import MlflowClient

# Case-sensitive name
client = MlflowClient()
res_mlflow = client.get_latest_versions(registered_model)
registered_model_version = res_mlflow[0].version

  res_mlflow = client.get_latest_versions(registered_model)


In [12]:
from huggingface_hub import HfFolder
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config

health_check_timeout = 600 # 20 minutes
model_name = f"llama3-model-{currentTime}"

import time

# Define Model and Endpoint configuration parameter
config = {
    "HF_MODEL_ID": "/opt/ml/model",       # Path to the model in the container
    "SM_NUM_GPUS": f"{num_GPUSs}",        # Number of GPU used per replica
    "MAX_INPUT_LENGTH": "2048",           # Max length of input text
    "MAX_TOTAL_TOKENS": "4096",           # Max length of the generation (including input text)
    "MAX_BATCH_PREFILL_TOKENS": "4096",  # Limits the number of tokens that can be processed in parallel during the generation
    "MESSAGES_API_ENABLED": "true",       # Enable the OpenAI Messages API
}

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
    role=role,
    name=model_name,
    model_data=f"{compressed_model_path}/model.tar.gz", # path to s3 bucket with model, we are not using a compressed model
    image_uri=llm_image,
    env=config,
)

In [13]:
llm_model.create()

### inference component 생성

In [14]:
# Deploy model to Amazon SageMaker Inference Component
inference_component_name_llama3b = f"llama3b-IC-{currentTime}"
print("inference_component_name_llama3b: ", inference_component_name_llama3b)
variant_name = "AllTraffic"

try:
    sagemaker_client.delete_inference_component(InferenceComponentName=inference_component_name_llama3b)
except:
    pass

ic_response = sagemaker_client.create_inference_component(
    InferenceComponentName=inference_component_name_llama3b,
    EndpointName=endpoint_name,
    VariantName=variant_name,
    Specification={
        "ModelName": model_name,
        "ComputeResourceRequirements": {
            "NumberOfAcceleratorDevicesRequired": num_GPUSs,
            "NumberOfCpuCoresRequired": 1,
            "MinMemoryRequiredInMb": 1024,
        },
    },
    RuntimeConfig={"CopyCount": 1},
)

inference_component_name_llama3b:  llama3b-IC-2024-08-02-01-40-35


In [15]:
import time
import sys
# Wait for IC to come InService
print(f"InferenceComponent: {inference_component_name_llama3b}")
while True:
    desc = sagemaker_client.describe_inference_component(
        InferenceComponentName=inference_component_name_llama3b
    )
    status = desc["InferenceComponentStatus"]
    print(status)
    sys.stdout.flush()
    if status in ["InService", "Failed"]:
        break
    time.sleep(30)

InferenceComponent: llama3b-IC-2024-08-02-01-40-35
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
InService


In [16]:
desc

{'InferenceComponentName': 'llama3b-IC-2024-08-02-01-40-35',
 'InferenceComponentArn': 'arn:aws:sagemaker:us-west-2:322537213286:inference-component/llama3b-IC-2024-08-02-01-40-35',
 'EndpointName': 'llama3-endpoint-2024-08-02-01-40-35',
 'EndpointArn': 'arn:aws:sagemaker:us-west-2:322537213286:endpoint/llama3-endpoint-2024-08-02-01-40-35',
 'VariantName': 'AllTraffic',
 'Specification': {'ModelName': 'llama3-model-2024-08-02-01-40-35',
  'ComputeResourceRequirements': {'NumberOfCpuCoresRequired': 1.0,
   'NumberOfAcceleratorDevicesRequired': 4.0,
   'MinMemoryRequiredInMb': 1024}},
 'RuntimeConfig': {'DesiredCopyCount': 1, 'CurrentCopyCount': 1},
 'CreationTime': datetime.datetime(2024, 8, 2, 1, 43, 38, 892000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 8, 2, 1, 49, 0, 597000, tzinfo=tzlocal()),
 'InferenceComponentStatus': 'InService',
 'ResponseMetadata': {'RequestId': '35553664-9fa8-4d83-9309-c1fb7a8c4867',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-req

## 테스트

In [17]:
user_prompt = '서울의 유명한 관광 코스를 만들어줄래?'

In [18]:
request_body = {
    "messages": [
        {"role": "user", "content": f"{user_prompt}"},
    ],
    "model": "meta-llama-3-fine-tuned",
    "parameters": {"max_tokens":256,
                "top_p": 0.9,
                "temperature": 0.6,
                "max_tokens": 512,
                "stop": ["<|eot_id|>"]}
}

In [19]:
import json

# Set up the SageMaker runtime client
sagemaker_runtime = boto3.client('sagemaker-runtime')

s = time.perf_counter()

# Invoke the endpoint
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName = inference_component_name_llama3b,
    ContentType='application/json',
    Body=json.dumps(request_body)
)
# Get the response from the endpoint
result = response['Body'].read().decode('utf-8')

elapsed_async = time.perf_counter() - s

print(f"elapsed time: {round(elapsed_async,3)} second")
parsed_data = json.loads(result)
answer = parsed_data["choices"][0]["message"]["content"].strip()
answer

elapsed time: 5.048 second


'서울은 긴 발표가 필요할 만큼 가득 차 있지만, 몇 가지 대표적인 관광 코스를 몇 가지 추천해 줄 수 있습니다.\n\n1. **경복궁과 인사동 길**\n   - **경복궁**: 조선 왕조의 깊은 역사를 느낄 수 있는 곳으로, 대성전, 광화문, 창덕궁 등이 있다.\n   - **인사동 길**: 전통 차림, 가게와cafe가 많'

## 6. 리소스 삭제
- 인퍼런스 컴포넌트 삭제
- 세이지 메이커 모델 삭제
- 엔드포인트 삭제


In [20]:
from sagemaker.predictor import Predictor

predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
)

In [21]:
try:
    print(f"Deleting inference components: [b magenta]{inference_component_name_llama3b} ✅")
    # Delete inference component
    sagemaker_client.delete_inference_component(
        InferenceComponentName=inference_component_name_llama3b
    )
except Exception as e:
    print(f"{e}")


Deleting inference components: [b magenta]llama3b-IC-2024-08-02-01-40-35 ✅


In [22]:
try:
    print(f"Deleting model: {model_name}")
    predictor.delete_model()
except Exception as e:
    print(f"{e}")


Deleting model: llama3-model-2024-08-02-01-40-35


In [23]:

try:
    print(f"Deleting endpoint: [b magenta]{predictor.endpoint_name} ✅")
    predictor.delete_endpoint()
except Exception as e:
    print(f"{e}")

print("---" * 10)
print("Done")

Deleting endpoint: [b magenta]llama3-endpoint-2024-08-02-01-40-35 ✅
------------------------------
Done
