In [None]:
import sagemaker
import boto3
# S3 bucket the SageMaker session uses for uploaded data, models and logs.
# Leave it as None to use the session's default bucket; per the SageMaker SDK,
# that bucket is created automatically if it does not exist yet.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    # No bucket name given: fall back to the session's default bucket.
    sagemaker_session_bucket = sess.default_bucket()

# Re-create the session so it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

# Look up the ARN of the IAM role the endpoint will run under.
# '<SAGEMAKER_ROLE>' is a placeholder to replace with a real role name.
iam = boto3.client("iam")
role_description = iam.get_role(RoleName="<SAGEMAKER_ROLE>")
role = role_description["Role"]["Arn"]

In [None]:
# Hugging Face Hub repository to package and deploy.
repository = "sentence-transformers/msmarco-distilbert-base-tas-b"
# The model id is the last path component of the repository name.
model_id = repository.rsplit("/", 1)[-1]
# Destination of the packaged model archive inside the session's bucket.
s3_location = f"s3://{sess.default_bucket()}/custom_inference/{model_id}/model.tar.gz"

In [None]:
!git clone https://huggingface.co/$repository

In [None]:
!cp -r code/ $model_id/code/

In [None]:
%cd $model_id
!tar zcvf model.tar.gz *

In [None]:
!aws s3 cp model.tar.gz $s3_location

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel
from sagemaker.serverless import ServerlessInferenceConfig

# Serverless endpoint settings: 2048 MB of memory, max_concurrency of 10.
serverless_config = ServerlessInferenceConfig(
    memory_size_in_mb=2048,
    max_concurrency=10,
)

# Hugging Face model definition pointing at the uploaded archive.
huggingface_model = HuggingFaceModel(
    model_data=s3_location,         # S3 path to the model archive and inference script
    role=role,                      # IAM role with permissions to create an endpoint
    py_version="py39",              # Python version of the serving container
    pytorch_version="1.13.1",       # PyTorch version of the serving container
    transformers_version="4.26.0",  # transformers version of the serving container
)

In [None]:
# Deploy the model as a serverless endpoint.
# NOTE(review): the original cell kept `initial_instance_count=1` and
# `instance_type="ml.m5.xlarge"` as commented-out arguments - presumably the
# instance-backed alternative to `serverless_inference_config`; confirm before use.
predictor = huggingface_model.deploy(
    serverless_inference_config=serverless_config,
    endpoint_name="msmarco-distilbert-base-tas-b",
)

In [None]:
# Optional cleanup: remove the endpoint configuration by name.
# Run only when needed - the original note says this cannot be done from the SageMaker GUI.
sess.delete_endpoint_config(endpoint_config_name="msmarco-distilbert-base-tas-b")

In [None]:
# test `inference.py`
from transformers import AutoTokenizer, AutoModel
import torch

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def cls_pooling(model_output):
    """CLS pooling: take the output at the first token position.

    Args:
        model_output: object exposing a ``last_hidden_state`` tensor.

    Returns:
        ``last_hidden_state[:, 0]`` - the slice at index 0 of the second axis.
    """
    return model_output.last_hidden_state[:, 0]


def model_fn(model_dir):
    """Load the tokenizer and model from ``model_dir``.

    Args:
        model_dir: value forwarded to ``from_pretrained`` (the notebook passes
            a Hub repository id when testing locally).

    Returns:
        tuple: ``(model, tokenizer)``; the model is in eval mode and has been
        moved to ``device``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModel.from_pretrained(model_dir)
    model.eval()  # only inference

    return model.to(device), tokenizer


def predict_fn(data, model_and_tokenizer):
    """Compute the CLS embedding of the first input sentence.

    Args:
        data (dict): request JSON with the sentence(s) under "inputs". The
            value may be a list of sentences or a single string. The dict is
            NOT modified. A bare list/string is accepted as well.
            NOTE: only the first sentence is embedded.
        model_and_tokenizer (tuple): ``(model, tokenizer)`` as returned by
            ``model_fn``.

    Returns:
        dict: ``{"embedding": [...]}`` with the embedding as a list of floats.

    Raises:
        KeyError: if ``data`` is a dict without an "inputs" entry.
        IndexError: if the list of inputs is empty.
    """
    model, tokenizer = model_and_tokenizer

    # Read the payload without mutating the caller's dict: popping "inputs"
    # would leave the request empty and break any second call with it.
    inputs = data.get("inputs", data) if isinstance(data, dict) else data

    # A plain string is already one sentence; indexing it would keep only
    # its first character.
    sentence = inputs if isinstance(inputs, str) else inputs[0]

    # Tokenize the sentence
    encoded_input = tokenizer(sentence, padding=True,
                              truncation=True, return_tensors='pt').to(device)

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input, return_dict=True)

    # Perform pooling
    embedding = cls_pooling(model_output)

    return {"embedding": embedding[0].cpu().tolist()}


In [None]:
# Load the model and tokenizer for a local test, using the repository id as model_dir.
model, tokenizer = model_fn(model_dir=repository)

In [None]:
# Sample request payload: a list of sentences under the "inputs" key.
data = dict(inputs=["hello"])

In [None]:
# Run the prediction handler locally on the sample payload.
emb = predict_fn(data=data, model_and_tokenizer=(model, tokenizer))

In [None]:
# Display the dict returned by predict_fn (the embedding is under "embedding").
emb