In [1]:
!pip install sagemaker -U

Collecting sagemaker
  Downloading sagemaker-2.222.0-py3-none-any.whl.metadata (14 kB)
Downloading sagemaker-2.222.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.219.0
    Uninstalling sagemaker-2.219.0:
      Successfully uninstalled sagemaker-2.219.0
Successfully installed sagemaker-2.222.0


In [3]:
import sagemaker
import boto3

sess = sagemaker.Session()

# SageMaker session bucket --> used for uploading data, models, and logs.
# SageMaker will automatically create this bucket if it does not exist yet.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [5]:
# --- S3 bucket ---------------------------------------------------------------
# Leave as None to use the session's default bucket, or put a bucket name here.
sagemaker_sesion_bucket = None
if sagemaker_sesion_bucket is None and sess is not None:
    sagemaker_sesion_bucket = sess.default_bucket()


# --- Role management ---------------------------------------------------------
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # No execution role could be resolved by the SDK: fall back to looking up
    # the role named 'sagemaker_execution_role' through IAM.
    iam = boto3.client("iam")
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Rebuild the session with the chosen bucket. The result is bound to `sess`
# (the name every other cell reads) as well as `session`; previously only the
# unused `session` name received it, so the bucket choice never took effect.
sess = session = sagemaker.Session(default_bucket=sagemaker_sesion_bucket)

print(f"sagemaker role arn: {role}")
print(f'sagemaker session region: {sess.boto_region_name}')


sagemaker role arn: arn:aws:iam::893767325519:role/service-role/AmazonSageMaker-ExecutionRole-20240609T024311
sagemaker session region: ap-south-1


### Let's call a model

In [6]:
from sagemaker.huggingface.model import HuggingFaceModel

# Configuration for loading the model from the Hugging Face Hub
hub = dict(
    HF_MODEL_ID="distilbert-base-uncased-distilled-squad",  # model id as named on the Hugging Face Hub
    HF_TASK="question-answering",
)



In [7]:
# Build the Hugging Face model object
huggingface_model = HuggingFaceModel(
    role=role,                    # IAM role with permissions to create an endpoint
    env=hub,                      # configuration for loading the model from the Hub
    py_version="py39",            # Python version used
    pytorch_version="1.13",       # PyTorch version used
    transformers_version="4.26",  # Transformers version used
)

In [8]:
# Deploy the model to SageMaker Inference on a single ml.m5.xlarge instance
predictor = huggingface_model.deploy(instance_type="ml.m5.xlarge", initial_instance_count=1)



-----!

In [9]:
# Example request: the payload always has to define "inputs"
data = dict(
    inputs=dict(
        question="What is used for inference?",
        context="My Name is Philipp and I live in Nuremberg. This model is used with sagemaker for inference.",
    )
)


In [10]:
# Send the request to the deployed endpoint; the cell displays the returned answer
predictor.predict(data)

{'score': 0.9987204670906067, 'start': 68, 'end': 77, 'answer': 'sagemaker'}

In [18]:
# Second example request: the payload always has to define "inputs"
data1 = dict(
    inputs=dict(
        question="What is Hesham working on?",
        context="My Name is Hesham and I live in India, working on ML service building",
    )
)

In [19]:
# Send the second example payload to the same endpoint
predictor.predict(data1)

{'score': 0.9898710250854492,
 'start': 50,
 'end': 69,
 'answer': 'ML service building'}

### LLM deployment

#### Creating the image URI

In [20]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the URI of the Hugging Face LLM container image (version 0.8.2)
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")

# Show the resolved ECR image URI
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04


In [23]:
import getpass
import json
import os

# SageMaker config
# The Hugging Face LLM (TGI) image resolved for this notebook is a GPU build and
# SM_NUM_GPUS is set to 1, so the endpoint needs a GPU instance. The previous
# value, ml.m5.xlarge, is CPU-only. ml.g5.2xlarge provides a single GPU.
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1

# meta-llama/Llama-2-7b-chat-hf is a gated repository: the container needs a
# Hugging Face access token to download the weights. Read it from the
# environment (never hard-code it) and fall back to an interactive prompt.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN") or getpass.getpass("Hugging Face Hub token: ")

# TGI config
# MAX_INPUT_LENGTH must be strictly smaller than MAX_TOTAL_TOKENS, and the
# difference must leave room for the tokens requested at generation time
# (the request later in this notebook asks for max_new_tokens=1024).
config = {
  'HF_MODEL_ID': "meta-llama/Llama-2-7b-chat-hf", # model id from hf.co/models
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(2048),  # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(4096),  # Max length of the generation (including input text)
  'HUGGING_FACE_HUB_TOKEN': hf_token,  # Access token for the gated model repository
  # 'HF_MODEL_QUANTIZE': "bitsandbytes", # uncomment to quantize
}

# create HuggingFaceModel
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

In [24]:
# Create a single-instance endpoint for the LLM on the configured instance type
llm = llm_model.deploy(
  instance_type=instance_type,
  initial_instance_count=1,
  # volume_size=400, # If using an instance with local SSD storage, volume_size must be None, e.g. p4 but not p3
)

--------------

KeyboardInterrupt: 

In [None]:
# Define the prompt. The trailing speaker label cues the model to answer as the
# assistant, so it has to match the assistant name declared in the first line
# (it previously read "Falcon:", a leftover from a different model's example).
prompt = """You are an helpful Assistant, called Lamma. Knowing everyting about AWS.

User: Can you tell me something about Amazon SageMaker?
Lamma:"""



In [None]:
# Request body: the prompt plus the generation hyperparameters for the LLM
payload = dict(
    inputs=prompt,
    parameters=dict(
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        max_new_tokens=1024,
        repetition_penalty=1.03,
        stop=["\nUser:", "<|endoftext|>", "</s>"],
    ),
)

# Call the endpoint, then print the generated text of every returned sequence
response = llm.predict(payload)

for generation in response:
    print(f"Result: {generation['generated_text']}")