## 1. Deploy endpoint

https://sagemaker.readthedocs.io/en/stable/api/inference/model.html

In [None]:
import boto3
import sagemaker 

# Resolve the execution role and the AWS account id of the current caller.
role = sagemaker.get_execution_role()
client = boto3.client("sts")
caller_identity = client.get_caller_identity()
account = caller_identity["Account"]
print(account)

# Assemble the ECR image URI from the account, the session region and the
# repository name / tag of the container image.
my_session = boto3.session.Session()
region = my_session.region_name
repo, version = "translator-en-fr", "latest"
image_uri = f"{account}.dkr.ecr.{region}.amazonaws.com/{repo}:{version}"
instance_type = "ml.m5.large"
print(instance_type, image_uri, sep="\n")

Here we just create the endpoint using the `deploy` method of the `Model` object.  
We need to specify a non-burstable instance type, because burstable instances (e.g. the `ml.t2` family) cannot be used with an automatic scaling policy. 

In [None]:
# Wrap the container image in a SageMaker Model. No model artifact is passed
# (model_data=None); every other option is left at the value shown.
my_model = sagemaker.model.Model(
    image_uri=image_uri,
    model_data=None,
    role=role,
    predictor_cls=None,
    env=None,
    name=None,
    vpc_config=None,
    sagemaker_session=None,
    enable_network_isolation=False,
    model_kms_key=None,
    image_config=None,
)

# Create the endpoint and block until the deploy call returns (wait=True).
# `instance_type` is the variable defined and printed in the setup cell, so
# the instance type only has to be changed in one place.
my_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    serializer=None,
    deserializer=None,
    accelerator_type=None,
    endpoint_name='translator-en-fr',
    tags=None,
    kms_key=None,
    wait=True,
    data_capture_config=None,
)

-

Send a request to the deployed endpoint.

In [None]:
import json

# Runtime client used to invoke the deployed endpoint.
client = boto3.client('runtime.sagemaker')

# Serialize the request with json.dumps instead of hand-escaping quotes; the
# compact separators produce exactly {"query":"hello world"}.
payload = json.dumps({"query": "hello world"}, separators=(",", ":"))

response = client.invoke_endpoint(
    EndpointName='translator-en-fr',
    Body=payload,
    ContentType='application/json',
    # Accept is the MIME type wanted for the response; the previous literal
    # 'Accept' was not a MIME type.
    Accept='application/json',
)

# Read the response body, decode the bytes and parse the JSON document.
result = json.loads(response["Body"].read().decode())
display(result)


{'embedding': [-0.8383811712265015,
  0.19172132015228271,
  1.2695287466049194,
  -0.5367799997329712,
  -0.9755768179893494,
  -0.8593676686286926,
  0.4266073703765869,
  0.6218725442886353,
  -0.2496056854724884,
  -0.07055319845676422,
  -0.8758314847946167,
  -0.14506013691425323,
  -0.27873921394348145,
  0.7691882848739624,
  0.3465067744255066,
  -0.31110185384750366,
  -0.2108939290046692,
  0.5057286024093628,
  -1.0002888441085815,
  -0.15871655941009521,
  -0.968595027923584,
  -0.5634223818778992,
  0.38001760840415955,
  -0.021509338170289993,
  0.19323833286762238,
  -0.4793827533721924,
  0.06706300377845764,
  0.6946162581443787,
  -0.28764697909355164,
  0.09808813035488129,
  -0.3249238431453705,
  -0.6351913809776306,
  0.2809523940086365,
  -0.02931654080748558,
  -0.6725121140480042,
  0.012547370046377182,
  -0.4776223301887512,
  0.5295774340629578,
  -0.6862552165985107,
  0.6177113056182861,
  0.042193472385406494,
  -0.02578011527657509,
  0.603126585483551,

From here we can run a first load test to observe the endpoint's behaviour. 

## 2. Autoscaling

In [None]:
import pprint

# Endpoint to inspect and the execution role of this notebook.
endpoint_name = 'translator-en-fr'
role = sagemaker.get_execution_role()

# Fetch the endpoint description and pretty-print it (nesting limited to 4 levels).
sagemaker_client = boto3.Session().client(service_name='sagemaker')
response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
pp = pprint.PrettyPrinter(indent=4, depth=4)
pp.pprint(response)

# Client for Application Auto Scaling, the service that handles autoscaling
# for SageMaker amongst other services; the following cells use it.
client = boto3.client('application-autoscaling')

{   'CreationTime': datetime.datetime(2021, 7, 15, 8, 3, 31, 221000, tzinfo=tzlocal()),
    'EndpointArn': 'arn:aws:sagemaker:eu-west-3:802765560650:endpoint/embeddingendpoint-test',
    'EndpointConfigName': 'EmbeddingEndpoint-test',
    'EndpointName': 'EmbeddingEndpoint-test',
    'EndpointStatus': 'InService',
    'LastModifiedTime': datetime.datetime(2021, 7, 19, 11, 35, 36, 118000, tzinfo=tzlocal()),
    'ProductionVariants': [   {   'CurrentInstanceCount': 2,
                                  'CurrentWeight': 1.0,
                                  'DeployedImages': [{...}],
                                  'DesiredInstanceCount': 2,
                                  'DesiredWeight': 1.0,
                                  'VariantName': 'AllTraffic'}],
    'ResponseMetadata': {   'HTTPHeaders': {   'content-length': '716',
                                               'content-type': 'application/x-amz-json-1.1',
                                               'date': 'Mon, 19 J

Put the automatic scaling policy in place, starting by registering the endpoint variant as a scalable target.

In [None]:
# Application Auto Scaling references the endpoint variant through this resource id format.
resource_id = f"endpoint/{endpoint_name}/variant/AllTraffic"

# Declare the variant's instance count as scalable between 1 and 2 instances.
scalable_target = {
    'ServiceNamespace': 'sagemaker',
    'ResourceId': resource_id,
    'ScalableDimension': 'sagemaker:variant:DesiredInstanceCount',
    'MinCapacity': 1,
    'MaxCapacity': 2,
}
response = client.register_scalable_target(**scalable_target)


Example 1, extracted from: https://aws.amazon.com/fr/blogs/machine-learning/configuring-autoscaling-inference-endpoints-in-amazon-sagemaker/

We set our RPS limit to 200 requests per second.    
Then we apply the simple formula given here to compute the target value: https://docs.aws.amazon.com/sagemaker/latest/dg/endpoint-scaling-loadtest.html

TargetValue = (RPS * 0.5) * 60 = (200 * 0.5) * 60 = 6000

In [None]:
# Example 1 - target tracking on the SageMakerVariantInvocationsPerInstance metric
invocations_tracking_config = {
    # Target value for the tracked metric (SageMakerVariantInvocationsPerInstance).
    'TargetValue': 6000.0,
    'PredefinedMetricSpecification': {
        # Average number of times per minute that each instance for a variant is invoked.
        'PredefinedMetricType': 'SageMakerVariantInvocationsPerInstance',
    },
    # Cooldown periods keep further instances from being launched or terminated
    # before the effects of previous scaling activities are visible. Tune them
    # to the instance startup time or other application needs.
    # ScaleInCooldown: seconds after a scale-in activity completes before
    # another scale-in activity can start.
    'ScaleInCooldown': 600,
    # ScaleOutCooldown: seconds after a scale-out activity completes before
    # another scale-out activity can start.
    'ScaleOutCooldown': 300,
    # 'DisableScaleIn': True|False - indicates whether scale in by the target
    # tracking policy is disabled. If true, the policy won't remove capacity
    # from the scalable resource.
}

response = client.put_scaling_policy(
    PolicyName='Invocations-ScalingPolicy',
    ServiceNamespace='sagemaker',  # Namespace of the AWS service that provides the resource.
    ResourceId=resource_id,  # Resource id of the endpoint variant.
    ScalableDimension='sagemaker:variant:DesiredInstanceCount',  # SageMaker supports only instance count.
    PolicyType='TargetTrackingScaling',  # 'StepScaling'|'TargetTrackingScaling'
    TargetTrackingScalingPolicyConfiguration=invocations_tracking_config,
)

## 3. Delete endpoint

To delete the endpoint:

In [None]:
# Delete the endpoint; the call is the cell's last expression so its response is displayed.
sagemaker_api = boto3.client("sagemaker")
sagemaker_api.delete_endpoint(EndpointName='translator-en-fr')

{'ResponseMetadata': {'RequestId': 'a859ab97-7e9d-40fa-9b22-0651c2d5c415',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a859ab97-7e9d-40fa-9b22-0651c2d5c415',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Fri, 02 Jul 2021 13:12:56 GMT'},
  'RetryAttempts': 0}}