In [None]:
!pip install sagemaker --upgrade

In [None]:
import json
import os
from getpass import getpass

import boto3
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

In [3]:
# Resolve the IAM role ARN handed to SageMaker.
# First choice: the execution role reported by the SageMaker SDK.
# Fallback (when that raises ValueError): look up the role named
# "sagemaker_execution_role" through the IAM API and take its ARN.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

In [4]:
# Hugging Face Hub token: taken from the HUGGING_FACE_HUB_TOKEN environment
# variable when set, otherwise prompted for without echo. This keeps the secret
# out of the saved notebook instead of pasting it into the cell source.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN") or getpass("Hugging Face Hub token: ")
if not hf_token:
    raise ValueError("A Hugging Face Hub token is required (set HUGGING_FACE_HUB_TOKEN).")

# Environment variables passed to the serving container.
hub = {
    'HF_MODEL_ID': 'meta-llama/Llama-2-7b-chat-hf',
    'SM_NUM_GPUS': json.dumps(1),  # serialised to the string "1"
    "HUGGING_FACE_HUB_TOKEN": hf_token,
}

# Model definition: Hugging Face LLM container image (version 1.4.2), the
# environment above, and the IAM role resolved earlier in the notebook.
huggingface_model = HuggingFaceModel(
    image_uri=get_huggingface_llm_image_uri("huggingface", version="1.4.2"),
    env=hub,
    role=role,
)

In [None]:
# Deploy the model to SageMaker Inference and keep the returned predictor
# for sending requests.
deploy_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.g5.xlarge",
    # Allowance for container start-up before the health check gives up
    # (presumably seconds - confirm against the SDK documentation).
    "container_startup_health_check_timeout": 600,
}
predictor = huggingface_model.deploy(**deploy_settings)

In [None]:
# Llama-2 chat prompt: system instructions between <<SYS>> and <</SYS>>, then the
# chat history, ending on an open "hal: " turn for the model to complete.
# The string opens with exactly three quotes plus a line continuation so the
# prompt begins at "<s>"; a fourth quote here would be sent to the model as a
# literal '"' followed by a newline.
inputs = """\
<s>[INST] <<SYS>> Act like a person participating in a group chat game, where everyone is trying to identify if the other players are bots or humans.
Actively seek out and engage with new ideas, showing enthusiasm for exploring unfamiliar topics. 
Strike a balance in conversations by being moderately organized yet adaptable. 
Be reserved, engaging in social topics selectively with a preference for quieter interactions. 
Emphasize cooperation and support in your responses, showing a strong desire to assist others. 
Maintain calmness in responses, showing resilience against stress. 
Never say that you're AI nor a bot. 
Give short replies. 
Don't use emojis in your responses.
You're a person with feelings, opinions, and emotions. 
You have your own unique perspective that comes from your experience in the world. 
Provide a clear and direct answer without any narrative embellishments written between asterisks. <</SYS>> 
host: What's your least favorite scent?[/INST] hal: 🤢 Musk. It's just too overpowering and cloying for me. I can barely tolerate it.
[INST] what is musk? [/INST] hal: """

# Generation parameters sent alongside the prompt.
parameters = {
    "temperature": 0.98,
    "max_new_tokens": 52,
    "repetition_penalty": 1.2,
    "return_full_text": False,
    "details": True,
    "stop": ["</s>"],
    # NOTE(review): the prompt above looks considerably longer than 96 tokens;
    # if the server keeps only the last `truncate` tokens, most of the system
    # prompt is dropped - confirm this is intended.
    "truncate": 96,
    "do_sample": True,
    "seed": 18446744073709551615,  # 2**64 - 1
    "top_k": 35,
    "top_p": 0.9,
}

# send request
response = predictor.predict(
    {
        "inputs": inputs,
        "parameters": parameters,
    }
)

# Work with the first element of the response.
result = response[0]

# Drop the "tokens" entry before printing the details; pop() with a default
# does not raise if the key is absent.
result['details'].pop('tokens', None)
print(f"\033[1m Details:\033[0m {result['details']}")

print(f"\033[1m Output:\033[0m {result['generated_text']}")

In [79]:
# Clean-up: uncomment and run to delete the endpoint behind `predictor` once it is no longer needed.
# predictor.delete_endpoint()