# Local Model Agent Example


In [None]:
#unsloth: to load advanced LLM models
#transformers: to tokenize
#accelerate and bitsandbytes: to optimize performance
!pip install unsloth transformers accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel

In [None]:
#Load the Llama 3.1 8B Instruct checkpoint together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=True, #load the weights in 4-bit format to reduce memory usage
    max_seq_length=8192, #maximum sequence length
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct",
)

In [None]:
#TextStreamer from Hugging Face, to show the generated output in real time
from transformers import TextStreamer

In [None]:
#Import the helper used to set the chat template
from unsloth.chat_templates import get_chat_template

In [None]:
#Attach the Llama 3.1 chat template to the tokenizer and declare how message
#fields and role names are spelled in the conversations passed to it
#NOTE(review): this mapping names the roles "human"/"gpt"; confirm that the
#messages built elsewhere in this file use the same role names
tokenizer = get_chat_template(
    tokenizer,
    mapping={
        "role":"role",
        "content":"content",
        "user":"human",
        "assistant": "gpt",
    },
    chat_template="llama-3.1",
)

In [None]:
#Stream the generated text to stdout in real time.
#skip_prompt=True keeps the streamer from echoing the whole prompt
#(the templated conversation) before every reply.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

In [None]:
#Switch the model to Unsloth's inference mode before generating
#(the original note claims a 2x speed-up; not verifiable from this file)
FastLanguageModel.for_inference(model)

In [None]:
#Function to send prompt to model via tokenizer
def prompt_model(messages):
  """Generate the model's reply to a chat conversation.

  Uses the module-level ``tokenizer``, ``model`` and ``text_streamer``.

  Args:
    messages: List of chat messages (dicts with "role" and "content" keys),
      passed unchanged to ``tokenizer.apply_chat_template``.

  Returns:
    The decoded text of the newly generated tokens only (the prompt is not
    included), with special tokens removed.
  """
  input_ids = tokenizer.apply_chat_template(
      messages,
      tokenize=True,
      add_generation_prompt=True,
      return_tensors="pt"
  ).to("cuda")

  print(f"Structure of inputs: {input_ids}")
  print(f"Size of inputs: {input_ids.shape}")

  response = model.generate(
      input_ids=input_ids,
      streamer=text_streamer,
      max_new_tokens=1024,
      use_cache=True
  )

  #generate() returns the prompt tokens followed by the new tokens, so drop
  #the prompt part; otherwise the "reply" would repeat the whole conversation
  prompt_length = input_ids.shape[-1]
  generated_tokens = response[0][prompt_length:]

  #decode response
  response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
  print(f"Response text: {response_text}")

  return response_text

In [None]:
#Function to do the booking
def interactive_booking():
  """Run an interactive booking chat on the console.

  Reads requests with ``input`` until the user enters 'exit' (letter case
  and surrounding whitespace are ignored) or the input stream ends. Each
  request is appended to the conversation history, the whole history is sent
  to ``prompt_model``, and the reply is printed and stored in the history.
  Blank requests are ignored instead of being sent to the model.
  """
  print("Welcome to out booking system powered by Meta-Llama-3.1-8B-Instruct")
  chat_history=[]

  while True:
    try:
      request = input("Insert your booking request (or digit 'exit')").strip()
    except EOFError:
      #no more input available: leave the loop the same way as 'exit'
      request = "exit"

    if request.lower() == "exit":
      print("Thanks for using our system. Bye.")
      break

    if not request:
      #nothing was typed: ask again rather than prompting the model with an empty turn
      continue

    print(f"\n**Cliente:** \"{request}\"")
    #NOTE(review): the tokenizer mapping in this file names the assistant role
    #"gpt", while the history below uses "assistant" - confirm which role
    #names the chat template expects
    chat_history.append({"role":"human", "content":request})

    model_response = prompt_model(chat_history)
    print(f"\n**Restaurant:** \"{model_response}\"")
    chat_history.append({"role":"assistant", "content":model_response})

    print("-" * 50)

In [None]:
#Start the interactive booking session; it loops until the user enters 'exit'
interactive_booking()