### Sagemaker Authentication

In [None]:
import sagemaker
import boto3
import os
os.environ['AWS_DEFAULT_REGION'] = 'ap-south-1'

# Bootstrap session, used here only to look up the default bucket name.
sess = sagemaker.Session()

# Bucket the SageMaker session uses for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
# Leave as None to fall back to the session's default bucket.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN. When get_execution_role() raises ValueError,
# fall back to looking up a fixed, named role through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20230515T160386')
    role = role_description['Role']['Arn']

# Re-create the session bound to the resolved bucket; later cells use this one.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration for a quick sanity check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# Name of the session's default bucket; later cells use it to build S3 locations.
bucket = sess.default_bucket()

In [None]:
# Pack the contents of ./model into model.tar.gz in the notebook's working directory.
# NOTE(review): the shell glob `*` does not match dot-files — confirm none are needed in the archive.
!cd model && tar czvf ../model.tar.gz *

In [None]:
# Upload the packaged model archive to the session bucket under a fixed prefix.
bucket = sess.default_bucket()
prefix = "sagemaker/asr-demo-v1-gu"

# S3 keys always use "/" as the separator, so build the key explicitly rather
# than with os.path.join (which would produce "\" separators on Windows).
key = f"{prefix}/model.tar.gz"

# The context manager closes the archive even if the upload raises.
with open("model.tar.gz", "rb") as model_archive:
    boto3.Session().resource("s3").Bucket(bucket).Object(key).upload_fileobj(model_archive)

print(f"{bucket}/{key}")

In [None]:
# S3 URI of the uploaded model archive; the bare expression echoes it as the cell output.
pretrained_model_data = f"s3://{bucket}/{key}"
pretrained_model_data

### Create Sagemaker Model

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel
# Environment variables handed to the model container. Going by their names they
# set request/response size limits and the response timeout — TODO confirm units.
env = dict(
    MMS_MAX_REQUEST_SIZE='2000000000',
    MMS_MAX_RESPONSE_SIZE='2000000000',
    MMS_DEFAULT_RESPONSE_TIMEOUT='900',
    DEFAULT_REQUEST_SIZE='2000000000',
)

# Create the Hugging Face model object that is deployed in a later cell.
huggingface_model = HuggingFaceModel(
    name="orf-gu-new",
    role=role,                        # iam role with permissions to create an Endpoint
    model_data=pretrained_model_data, # S3 URI of the packaged model archive
    source_dir="./code",              # local directory holding the inference code
    entry_point="inference.py",       # script inside source_dir used as the entry point
    env=env,                          # container environment variables defined above
    transformers_version="4.26",      # transformers version used
    pytorch_version="1.13",           # pytorch version used
    py_version='py39',                # python version used
)

### Define Async Config

In [None]:
import datetime
from time import gmtime, strftime
from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig

# Create an endpoint config name. Here we create one based on the date
# so that we can search endpoints based on creation time.
# endpoint_config_name = f"ORFEndpointConfig-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# The name of the model that you want to host. This is the name that you specified when creating the model.
# model_name='model'

# S3 location where the results will be stored.
async_output_path = f"s3://{bucket}/{prefix}/output"

# The same SNS topic is used for both success and error notifications.
sns_topic_arn = "arn:aws:sns:ap-south-1:234259934096:test-orf"

async_config = AsyncInferenceConfig(
    output_path=async_output_path,
    notification_config={
        "SuccessTopic": sns_topic_arn,
        "ErrorTopic": sns_topic_arn,
    },
)
# create_endpoint_config_response = sagemaker_client.create_endpoint_config(
#     EndpointConfigName=endpoint_config_name, # You will specify this name in a CreateEndpoint request.
#     AsyncInferenceConfig={
#         "OutputConfig": {
#             # Location to upload response outputs when no location is provided in the request.
#             "S3OutputPath": f"s3://{bucket}/{prefix}/output"
#             # (Optional) specify Amazon SNS topics
#         },
#         "ClientConfig": {
#             # (Optional) Specify the max number of inflight invocations per instance
#             # If no value is provided, Amazon SageMaker will choose an optimal value for you
#             "MaxConcurrentInvocationsPerInstance": 10
#         }
#     }
# )

# print(f"Created EndpointConfig: {create_endpoint_config_response['EndpointConfigArn']}")

### Create Endpoint

In [None]:
from sagemaker.serializers import DataSerializer
# Serializer that tags outgoing payloads with a custom content type.
zip_serializer = DataSerializer(content_type="file-path/raw-bytes")

# Deploy the model to SageMaker Inference with the async inference
# configuration defined in the previous section.
async_predictor = huggingface_model.deploy(
    instance_type="ml.g5.xlarge",
    initial_instance_count=1,
    serializer=zip_serializer,
    async_inference_config=async_config,
)

### Get Prediction

In [None]:
# Local path of the input archive passed as `data` to the predictor in the cells below.
input_path = "mp3_input.tar.gz"

In [None]:
import time
# Time a single predict() call against the endpoint and show its result.
start = time.time()
res = async_predictor.predict(data=input_path)
end = time.time()

elapsed_seconds = end - start
print(elapsed_seconds)
print(res)

### Scale Endpoint

In [None]:
# Client for the Application Auto Scaling service.
asg_client = boto3.client("application-autoscaling")

# Application Auto Scaling identifies the endpoint variant by this resource id.
resource_id = f"endpoint/{async_predictor.endpoint_name}/variant/AllTraffic"

# Register the variant's instance count as a scalable target.
# MinCapacity=0 lets the asynchronous endpoint scale down to zero instances.
response = asg_client.register_scalable_target(
    ResourceId=resource_id,
    ServiceNamespace="sagemaker",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MaxCapacity=1000,
    MinCapacity=0,
)

### Scaling Config

In [None]:
# Target-tracking policy: keep the per-instance backlog of queued requests
# close to TargetValue by adding or removing instances.
response = asg_client.put_scaling_policy(
    PolicyName=f'Request-ScalingPolicy-{async_predictor.endpoint_name}',
    ServiceNamespace="sagemaker",
    ResourceId=resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration={
        "TargetValue": 5.0,
        "CustomizedMetricSpecification": {
            "MetricName": "ApproximateBacklogSizePerInstance",
            "Namespace": "AWS/SageMaker",
            "Dimensions": [{"Name": "EndpointName", "Value": async_predictor.endpoint_name}],
            "Statistic": "Average",
        },
        "ScaleInCooldown": 600, # duration until scale in begins (down to zero)
        "ScaleOutCooldown": 10 # duration between scale out attempts
    },
)

# The scalable target is registered with MinCapacity=0, and the target-tracking
# policy above was observed not to bring the endpoint back from zero instances.
# AWS's guidance for asynchronous endpoints that scale to zero is to add a
# step-scaling policy, triggered by a CloudWatch alarm on the
# HasBacklogWithoutCapacity metric, that adds one instance when requests are
# queued while no capacity is running.
step_policy_response = asg_client.put_scaling_policy(
    PolicyName=f'HasBacklogWithoutCapacity-ScalingPolicy-{async_predictor.endpoint_name}',
    ServiceNamespace="sagemaker",
    ResourceId=resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="StepScaling",
    StepScalingPolicyConfiguration={
        "AdjustmentType": "ChangeInCapacity",  # ScalingAdjustment is an absolute instance delta
        "MetricAggregationType": "Average",
        "Cooldown": 300,  # seconds to wait for a previous scaling activity to take effect
        "StepAdjustments": [
            {"MetricIntervalLowerBound": 0, "ScalingAdjustment": 1},
        ],
    },
)

# Alarm that fires the step-scaling policy while a backlog exists without capacity.
cw_client = boto3.client("cloudwatch")
step_alarm_response = cw_client.put_metric_alarm(
    AlarmName=f'HasBacklogWithoutCapacity-Alarm-{async_predictor.endpoint_name}',
    MetricName="HasBacklogWithoutCapacity",
    Namespace="AWS/SageMaker",
    Statistic="Average",
    Period=60,
    EvaluationPeriods=2,
    DatapointsToAlarm=2,
    Threshold=1,
    ComparisonOperator="GreaterThanOrEqualToThreshold",
    TreatMissingData="missing",
    Dimensions=[{"Name": "EndpointName", "Value": async_predictor.endpoint_name}],
    AlarmActions=[step_policy_response["PolicyARN"]],
)

### Testing on dummy load

In [None]:
import time
from sagemaker.async_inference.waiter_config import WaiterConfig

# Number of asynchronous requests in the dummy load; tune here.
num_requests = 100

# Polling settings passed to get_result(); built once because they are the
# same for every response.
waiter_config = WaiterConfig(delay=3, max_attempts=10000)

start = time.time()

# Submit every request before waiting on any result.
output_list = [
    async_predictor.predict_async(data=input_path)
    for _ in range(num_requests)
]

# Wait for each response in submission order and collect the results.
results = [
    async_response.get_result(waiter_config)
    for async_response in output_list
]

print(f"Time taken: {time.time() - start}s")

### What problems are we facing:
- We created an endpoint with auto scaling configured. We were able to scale it out, but only to a maximum of 4 instances.
- Scaling is inconsistent: sometimes the endpoint scales out and sometimes it does not. For example, it scales for 50 requests but not for 1000 requests. We have set the target backlog to 5 requests per instance.
- Also, scaling doesn't start immediately once the CloudWatch metric alarm is triggered. We understand there is a cooldown period, but it usually takes much longer than that. We couldn't understand that behavior.
- We were only able to scale g4dn.xlarge instances. We were not able to scale g5.xlarge instances.
- The endpoint scales down to 0, but it doesn't scale back up from 0 to 1.