# HTTPS Endpoint로 Model Serving

real-time과 batch predictions Endpoint를 모두 활용할 수 있으며, 이 HOL에서는 Real-time에 대해서만 설명합니다. 
애플리케이션에 짧은 대기시간(low latency)이 필요한 경우 Model을 real-time API로 배포하면, HTTPS를 통해 단일 예측 요청에 대한 결과를 빠르게 제공할 수 있습니다.

In [83]:
import boto3
import sagemaker
import pandas as pd
import time

# SageMaker session, its default S3 bucket, and the execution role of this notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region name reported by a default boto3 session; passed to the client below.
region = boto3.Session().region_name

# Low-level SageMaker client, used later for resource cleanup and experiment tracking.
sm = boto3.Session().client(
    service_name="sagemaker",
    region_name=region,
)

In [None]:
# Delete the endpoint named by `endpoint_name` through a fresh SageMaker client.
# NOTE(review): `endpoint_name` is not assigned above this cell; presumably it
# is left over from an earlier run or notebook -- confirm before executing.
client = boto3.client("sagemaker")
client.delete_endpoint(EndpointName=endpoint_name)

### 이전 Notebook에서 Endpoints를 생성하신 경우에는 해당 Endpoints를 삭제하신 후 이 노트북을 실행하시기 바랍니다. 그렇지 않은 경우 HOL을 수행하시는 동안 ResourceLimitExceeded error가 발생할 수 있습니다.


In [91]:
def _list_resource_names(operation, result_key, name_key):
    """Return the names of all resources reported by a paginated list call.

    Args:
        operation: Name of the SageMaker client list operation,
            e.g. ``'list_endpoints'``.
        result_key: Key of the response page that holds the resource list.
        name_key: Key of each resource entry that holds its name.

    Returns:
        A list with the name of every resource across all result pages.
    """
    names = []
    # The paginator follows NextToken for us, so no page is silently skipped.
    for page in sm.get_paginator(operation).paginate():
        names.extend(item[name_key] for item in page[result_key])
    return names


# Names are collected before anything is deleted so that deletions cannot
# disturb the listing that is still being paged through.

# 1) Endpoints
for endpoint in _list_resource_names('list_endpoints', 'Endpoints', 'EndpointName'):
    sm.delete_endpoint(EndpointName=endpoint)

# 2) Endpoint configurations
for endpoint_config in _list_resource_names('list_endpoint_configs',
                                            'EndpointConfigs',
                                            'EndpointConfigName'):
    print(endpoint_config)
    sm.delete_endpoint_config(EndpointConfigName=endpoint_config)
    time.sleep(3)  # pause between delete calls, as the original loop did

# 3) Models
for model_name in _list_resource_names('list_models', 'Models', 'ModelName'):
    print(model_name)
    sm.delete_model(ModelName=model_name)
    time.sleep(3)  # pause between delete calls, as the original loop did

In [88]:
%store -r

In [3]:
print(training_job_name)

tensorflow-training-2020-07-15-14-26-15-565


# Notebook 내 Model 복사

In [4]:
!aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz ./model.tar.gz

download: s3://sagemaker-us-east-2-322537213286/tensorflow-training-2020-07-15-14-26-15-565/output/model.tar.gz to ./model.tar.gz


In [5]:
!tar -xvzf ./model.tar.gz

tensorboard/
tensorflow/
tensorflow/saved_model/
tensorflow/saved_model/0/
tensorflow/saved_model/0/assets/
tensorflow/saved_model/0/variables/
tensorflow/saved_model/0/variables/variables.index
tensorflow/saved_model/0/variables/variables.data-00000-of-00001
tensorflow/saved_model/0/saved_model.pb
transformers/
transformers/fine-tuned/
transformers/fine-tuned/tf_model.h5
transformers/fine-tuned/config.json
metrics/
metrics/confusion_matrix.png


In [6]:
!saved_model_cli show --all --dir ./tensorflow/saved_model/0/

2020-07-16 08:47:17.924228: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/extras/CUPTI/lib64:/usr/local/cuda-10.0/lib:/usr/local/cuda-10.0/efa/lib:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:
2020-07-16 08:47:17.924379: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/extras/CUPTI/lib64:/usr/local/cuda-10.0/lib:/usr/local/cuda-10.0/ef

# Model 배포

단일 Model에 대해 기본적으로 `EndpointConfig`를 생성합니다. 다음 노트북에서 canary 배포와 A/B testing을 지원하는 `EndpointConfig` 전략을 수행할 계획입니다. 

_참고: 리전별로 다양한 버전과 deep learning 프레임워크에 대한 deep learning 이미지가 기본으로 제공되며, 필요에 따라 해당 이미지를 내려받아 수정하여 custom 이미지를 생성할 수 있습니다._

https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html


또한, SageMaker에서 학습한 모델이 아닌 경우에도 XXXXX.tar.gz로 압축하여 예측에 활용할 수 있습니다.

In [7]:
from sagemaker.tensorflow.serving import Model

# S3 URI of the model archive written by the training job.
model_data_uri = 's3://{}/{}/output/model.tar.gz'.format(bucket, training_job_name)

model = Model(
    model_data=model_data_uri,
    role=role,
    # Elastic Inference does not yet support TF 2.1.0 as of sagemaker==1.56.1
    framework_version='2.0.0',
)

In [8]:
# Kick off the deployment without blocking this cell (wait=False).
deployed_model = model.deploy(
    initial_instance_count=1,  # Should use >=2 for high(er) availability
    instance_type='ml.m5.large',
    # accelerator_type='ml.eia2.medium',  # optional accelerator, left disabled
    wait=False,
)

# Keep the endpoint name around for the tracking and prediction cells.
endpoint_name = deployed_model.endpoint
print('Endpoint name:  {}'.format(endpoint_name))

Endpoint name:  tensorflow-inference-2020-07-16-08-53-14-806


# Experiment 내 배포 추적

In [9]:
print(experiment_name)

Amazon-Customer-Reviews-BERT-Experiment-1594822481


In [10]:
print(trial_name)

trial-1594822498


In [11]:
import time
from smexperiments.trial import Trial

# Current Unix time in whole seconds, stored as a string.
timestamp = str(int(time.time()))

# Load the existing trial identified by `trial_name` and show it.
trial = Trial.load(trial_name=trial_name)
print(trial)

Trial(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7fc368443550>,trial_name='trial-1594822498',trial_arn='arn:aws:sagemaker:us-east-2:322537213286:experiment-trial/trial-1594822498',display_name='trial-1594822498',experiment_name='Amazon-Customer-Reviews-BERT-Experiment-1594822481',creation_time=datetime.datetime(2020, 7, 15, 14, 14, 58, 483000, tzinfo=tzlocal()),created_by={},last_modified_time=datetime.datetime(2020, 7, 15, 15, 33, 46, 674000, tzinfo=tzlocal()),last_modified_by={},response_metadata={'RequestId': '0a38a540-38db-460b-8759-25026eb5085c', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '0a38a540-38db-460b-8759-25026eb5085c', 'content-type': 'application/x-amz-json-1.1', 'content-length': '326', 'date': 'Thu, 16 Jul 2020 08:53:45 GMT'}, 'RetryAttempts': 0})


In [12]:
from smexperiments.tracker import Tracker

# Create a tracker (and its trial component) displayed as "deploy".
tracker_deploy = Tracker.create(
    display_name='deploy',
    sagemaker_boto_client=sm,
)

deploy_trial_component = tracker_deploy.trial_component
deploy_trial_component_name = deploy_trial_component.trial_component_name
print('Deploy trial component name {}'.format(deploy_trial_component_name))

Deploy trial component name TrialComponent-2020-07-16-085349-fpto


### Trial에 Component로 `deploy` Trial Component 와 Tracker를 추가합니다.

In [13]:
# Attach the deploy tracker's trial component to `trial`.
trial.add_trial_component(tracker_deploy.trial_component)

# Endpoint Name 추적

In [14]:
# Record the endpoint name as a parameter on the deploy tracker.
deploy_parameters = {'endpoint_name': endpoint_name}
tracker_deploy.log_parameters(deploy_parameters)

# must save after logging
tracker_deploy.trial_component.save()

TrialComponent(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7fc367165470>,trial_component_name='TrialComponent-2020-07-16-085349-fpto',display_name='deploy',trial_component_arn='arn:aws:sagemaker:us-east-2:322537213286:experiment-trial-component/trialcomponent-2020-07-16-085349-fpto',response_metadata={'RequestId': 'ece286c6-22f6-432c-b16c-2255e885b192', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'ece286c6-22f6-432c-b16c-2255e885b192', 'content-type': 'application/x-amz-json-1.1', 'content-length': '129', 'date': 'Thu, 16 Jul 2020 08:55:30 GMT'}, 'RetryAttempts': 0},parameters={'endpoint_name': 'tensorflow-inference-2020-07-16-08-53-14-806'},input_artifacts={},output_artifacts={})

In [15]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Render a clickable link to this endpoint's page in the SageMaker console.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">REST Endpoint</a></b>'.format(region, endpoint_name)))


In [16]:
# Block until the 'endpoint_in_service' waiter completes for `endpoint_name`.
client = boto3.client('sagemaker')
waiter = client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=endpoint_name)

<h2><span style="color:red">위 Endpoint가 Deploy되기 전까지 기다려 주시기 바랍니다.</span></h2>

In [17]:
from sagemaker.analytics import ExperimentAnalytics

# Query options: trial components of the experiment, ordered by creation time,
# together with the validation accuracy metric.
analytics_options = {
    'sagemaker_session': sess,
    'experiment_name': experiment_name,
    'metric_names': ['validation:accuracy'],
    'sort_by': "CreationTime",
    'sort_order': "Ascending",
}
lineage_table = ExperimentAnalytics(**analytics_options)

# Materialize the result as a DataFrame; the shape is the cell's display value.
lineage_df = lineage_table.dataframe()
lineage_df.shape

(4, 43)

In [18]:
lineage_df

Unnamed: 0,TrialComponentName,DisplayName,max_seq_length,test_split_percentage,train_split_percentage,validation_split_percentage,SourceArn,SageMaker.ImageUri,SageMaker.InstanceCount,SageMaker.InstanceType,...,use_xla,validation_batch_size,validation_steps,validation:accuracy - Min,validation:accuracy - Max,validation:accuracy - Avg,validation:accuracy - StdDev,validation:accuracy - Last,validation:accuracy - Count,endpoint_name
0,TrialComponent-2020-07-15-141609-nmwg,prepare,128.0,0.05,0.9,0.05,,,,,...,,,,,,,,,,
1,tensorflow-training-2020-07-15-14-26-15-565-aw...,train,128.0,,,,arn:aws:sagemaker:us-east-2:322537213286:train...,763104351884.dkr.ecr.us-east-2.amazonaws.com/t...,1.0,ml.c5.9xlarge,...,True,128.0,50.0,0.5714,0.5714,0.5714,0.0,0.5714,1.0,
2,TrialComponent-2020-07-15-153346-gwhd,deploy,,,,,,,,,...,,,,,,,,,,tensorflow-inference-2020-07-15-15-33-45-571
3,TrialComponent-2020-07-16-085349-fpto,deploy,,,,,,,,,...,,,,,,,,,,tensorflow-inference-2020-07-16-08-53-14-806


# Prediction 수행

###  Raw Text를 BERT Tokens로 변환하기 위한 Request Handler 설정

In [94]:
class RequestHandler(object):
    """Serializer that turns raw texts into a JSON request body of BERT inputs.

    Each text is encoded with the tokenizer given at construction time and
    emitted as one entry of the ``"instances"`` list, carrying the keys
    ``input_ids``, ``input_mask`` and ``segment_ids``.

    Args:
        tokenizer: Object exposing ``encode_plus(text, pad_to_max_length=...,
            max_length=...)`` that returns a mapping with ``'input_ids'`` and
            ``'attention_mask'``.
        max_seq_length: Length passed to the tokenizer as ``max_length`` and
            used for the all-zero ``segment_ids`` list.
    """

    def __init__(self, tokenizer, max_seq_length):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __call__(self, instances):
        """Return the JSON string for ``instances`` (an iterable of texts)."""
        # Imported here so the handler does not depend on some other cell
        # having imported `json` globally (a class-body import is not visible
        # from inside methods).
        import json

        transformed_instances = []

        for instance in instances:
            # Use the tokenizer this handler was constructed with, not whatever
            # global `tokenizer` happens to exist in the notebook.
            encode_plus_tokens = self.tokenizer.encode_plus(
                instance,
                pad_to_max_length=True,
                max_length=self.max_seq_length,
            )

            transformed_instances.append({
                "input_ids": encode_plus_tokens['input_ids'],
                "input_mask": encode_plus_tokens['attention_mask'],
                # Every token is assigned segment 0.
                "segment_ids": [0] * self.max_seq_length,
            })

        return json.dumps({"instances": transformed_instances})

###  BERT Response를 Predicted Classes로 변환하기 위한 Response Handler 설정

In [95]:
class ResponseHandler(object):
    """Deserializer that maps a prediction response to predicted classes.

    Args:
        classes: Sequence of class labels; the label at the index of the
            highest score of each prediction is returned for that prediction.
    """

    def __init__(self, classes):
        self.classes = classes

    def __call__(self, response, accept_header):
        """Decode ``response`` and return one predicted class per prediction.

        Args:
            response: Object whose ``read()`` returns the UTF-8 encoded JSON
                body; the body must contain a ``"predictions"`` list with one
                list of scores per instance.
            accept_header: Unused; accepted for interface compatibility.

        Returns:
            A list with one element of ``classes`` per prediction.
        """
        # Imported here so the handler does not depend on some other cell
        # having imported `json` globally (a class-body import is not visible
        # from inside methods).
        import json

        response_body = response.read().decode('utf-8')
        predictions = json.loads(response_body)["predictions"]

        predicted_classes = []
        for scores in predictions:
            # Softmax is strictly increasing, so the argmax of the raw scores
            # is the argmax of the softmax; no TensorFlow needed on the client.
            # Ties resolve to the first maximal index.
            predicted_class_idx = max(range(len(scores)), key=scores.__getitem__)
            predicted_classes.append(self.classes[predicted_class_idx])

        return predicted_classes

In [21]:
import json
from sagemaker.tensorflow.serving import Predictor
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Request side: raw text -> JSON of token inputs, using sequences of length 128.
request_handler = RequestHandler(tokenizer=tokenizer, max_seq_length=128)

# Response side: prediction scores -> one of the labels 1..5.
response_handler = ResponseHandler(classes=list(range(1, 6)))

# Predictor bound to the deployed endpoint, using the two handlers above for
# (de)serialization.
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=request_handler,
    deserializer=response_handler,
    content_type='application/json',
    model_name='saved_model',
    model_version=0,
)

INFO:transformers.file_utils:TensorFlow version 2.1.0 available.
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ec2-user/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [22]:
import tensorflow as tf
import json
    
# Two sample reviews to send to the endpoint.
reviews = [
    "This is great!",
    "This is not good.",
]

predicted_classes = predictor.predict(reviews)

# Print each review next to the class predicted for it.
for review, predicted_class in zip(reviews, predicted_classes):
    print('[Predicted Star Rating: {}]'.format(predicted_class), review)

[Predicted Star Rating: 5] This is great!
[Predicted Star Rating: 5] This is not good.


In [23]:
%store endpoint_name

Stored 'endpoint_name' (str)


# Using API Gateway with SageMaker Endpoints

https://aws.amazon.com/blogs/machine-learning/creating-a-machine-learning-powered-rest-api-with-amazon-api-gateway-mapping-templates-and-amazon-sagemaker/

# Delete Endpoint

In [None]:
# Delete the endpoint named by `endpoint_name` through a fresh SageMaker client.
client = boto3.client("sagemaker")
client.delete_endpoint(EndpointName=endpoint_name)