In [None]:
import json
import torch
import openai
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
from pprint import pprint

## Hosted Gorilla Model

In [None]:
# def get_gorilla_response(prompt, functions=[], model="gorilla-openfunctions-v0"):
#     openai.api_key = "EMPTY"
#     openai.api_base = "http://luigi.millennium.berkeley.edu:8000/v1"
#     # openai.api_base = "https://limcheekin-gorilla-openfunctions-v1-gguf.hf.space/v1"
#     try:
#         completion = openai.ChatCompletion.create(
#             model=model,
#             temperature=0.0,
#             messages=[{"role": "user", "content": prompt}],
#             functions=functions,
#         )
#         return completion.choices[0].message.content
#     except Exception as e:
#         print(e, model, prompt)

## Local Model

In [None]:
# Device setup
# Prefer the second GPU ("cuda:1"); fall back to the first GPU, then to the CPU,
# so the notebook still runs on machines with fewer than two CUDA devices.
device: str = (
    "cuda:1" if torch.cuda.device_count() > 1
    else "cuda:0" if torch.cuda.is_available()
    else "cpu"
)
# Half-precision dtype, kept available for the optional `torch_dtype=` load argument.
torch_dtype = torch.float16

In [None]:
# Local directory holding the gorilla-openfunctions-v0 checkpoint; both the
# tokenizer and the model are loaded from this path.
# NOTE(review): absolute, machine-specific path — adjust per environment.
_MODEL_PATH = "/mnt/nvme/MODELS/LLM/gorilla-openfunctions-v0/"

In [None]:
# Tokenizer and model weights both come from the same local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_PATH)

# `torch_dtype` is left unset here (the option is kept, commented out, for quick
# toggling); weights are placed according to `device`.
model = AutoModelForCausalLM.from_pretrained(
    _MODEL_PATH,
    low_cpu_mem_usage=True,
    # torch_dtype=torch_dtype,
    device_map=device,
)

In [None]:
# Inspect the device map recorded on the loaded model (sanity check for the
# `device_map=device` argument used when loading).
model.hf_device_map

In [None]:
# streamer = TextStreamer(tokenizer=tokenizer, skip_prompt=True)

In [None]:
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=128,
#     batch_size=16,
#     torch_dtype=torch_dtype,
#     streamer=streamer,
# )

In [None]:
def get_prompt(user_query: str, functions: list = []) -> str:
    """Build the Gorilla-style prompt for a user query and its function specs.

    Args:
        user_query: Natural-language request, inserted after ``<<question>>``.
        functions: JSON-serialisable function description(s), inserted after
            ``<<function>>`` via ``json.dumps``. The argument is only read,
            never mutated.

    Returns:
        The prompt string, ending with ``"ASSISTANT: "`` so the model continues
        from there.
    """
    functions_json = json.dumps(functions)
    return f"USER: <<question>> {user_query} <<function>> {functions_json}\nASSISTANT: "

In [None]:
query = 'call an cab to nodia sector 62 in twenty two minutes using ola.'
# query = 'call an ola cab from bhajanpura to nodia sector 62 in twenty two minutes and then from uber for the same route in 10 minutes.'

# Function specs are passed as a list (one entry per callable API), matching the
# `functions: list` parameter of `get_prompt` and the list-shaped `functions2`
# example used later in this notebook.
functions = [
    {
        "name": "Call Cab Function",
        "api_name": "cab.ride",
        "description": "Find suitable ride for customers given the location, type of ride, and the amount of time the customer is willing to wait as parameters.",
        "parameters":  {
            "start_loc": {
                "type": "string",
                # "default": "current_location",
                "description": "location of the starting place of the uber ride"
            },
            "end_loc": {
                "type": "string",
                "description": "location of the ending place of the uber ride"
            },
            "type": {
                "type": "string",
                "enum": ["normal", "plus", "premium"],
                # "default": "normal",
                "description": "types of uber ride user is ordering"
            },
            "time": {
                "type": "integer",
                # "default": "now",
                "description": "the amount of time in minutes the customer is willing to wait"
            },
            "platform": {
                "type": "string",
                # "default": "uber",
                "enum": ["uber", "ola"],
                "description": "the platform the user is ordering the ride from"
            }
        },
        "required": ["start_loc", "end_loc", "type", "time", "platform"]
    }
]

In [None]:
# model.generation_config.temperature = 0.2
# model.generation_config.temperature = 0.01

In [None]:
# Render the full prompt for the cab query and echo it as the cell output so
# the exact text sent to the model can be inspected.
prompt = get_prompt(query, functions)
prompt

In [None]:
# pipe(prompt, max_new_tokens=512, return_full_text=False, do_sample=True)
# prompt_ids = tokenizer.encode(prompt, return_tensors="pt")
# Tokenize the prompt as a batch of one and make sure both tensors live on the
# model's device. The assignments are unconditional so that `prompt_ids` and
# `attention_mask` exist even when no move is needed (`.to` on the same device
# simply returns the tensor).
tokenized_prompt = tokenizer([prompt], return_tensors="pt")
if tokenized_prompt['input_ids'].device != model.device:
    print(f"Moving prompt to {model.device}")
prompt_ids = tokenized_prompt['input_ids'].to(model.device)
attention_mask = tokenized_prompt['attention_mask'].to(model.device)

In [None]:
# Show the prompt's token ids and attention mask for a quick visual check.
prompt_ids, attention_mask

In [None]:
# Greedy decoding (no sampling, single beam) of the tokenized prompt.
output = model.generate(
    prompt_ids,
    # Pass the mask explicitly: the pad token id is set to the EOS id below, so
    # the mask should not be left for `generate` to guess from the input ids.
    attention_mask=attention_mask,
    do_sample=False,
    num_beams=1,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Decode prompt + completion back to text, then keep everything after the
# first "ASSISTANT:" marker. Splitting only once means a completion that itself
# contains "ASSISTANT:" is not truncated, and `[-1]` falls back to the whole
# string if the marker is unexpectedly absent.
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
model_response = output_str.split("ASSISTANT:", 1)[-1].strip()
model_response

In [None]:
# Number of token ids the tokenizer produces for the decoded response.
tokenizer.encode(model_response, return_tensors="pt").shape[-1]

In [None]:
# (Removed a stray, incomplete `import tikt`: no module of that name exists and
# nothing in this notebook uses it, so the cell only aborted "Run All".)

In [None]:
# Probe how the tokenizer splits the literal string "[INSTRUCTION]":
# report the number of token ids and the text each id decodes to.
tokens = tokenizer.encode("[INSTRUCTION]", return_tensors="pt")[0]
print(
    f"Len of tokens: {tokens.shape[0]}",
    f"Decoded tokens: {list(map(tokenizer.decode, tokens))}",
    sep="\n",
)

In [None]:
# Show the raw token ids produced for "[INSTRUCTION]".
tokens

In [None]:
# query2 = 'List all the devices in the house and their battery percentage.'
# query2 = 'What is the battery of Redmi 8A and samsung galaxy s10?'
query2 = "Calculate the population growth rate over a certain time period."

# Single function spec offered to the model for the second query; parameters are
# described with an "object" schema (properties + required list).
functions2 = [
    {
        "name": "Population Growth Rate Calculator",
        "api_call": "population.calculate_growth_rate",
        "description": "Calculate the population growth rate based on initial and final population sizes and the time period.",
        "parameters": {
            "type": "object",
            "properties": {
                "initial_population": {
                    "type": "integer",
                    "description": "Initial population size.",
                },
                "final_population": {
                    "type": "integer",
                    "description": "Final population size.",
                },
                "time_period": {
                    "type": "string",
                    "description": "Time period of population growth.",
                },
            },
            "required": ["initial_population", "final_population", "time_period"],
        },
    }
]
# query2 = 'List all the devices in the house.'
# functions2 = [
#     {
#         "name": "Check Battery",
#         "api_name": "devices.battery",
#         "description": "Check the battery of a particular device",
#         "parameters":  {
#             "device": {
#                 "description": "list of devices to check the battery of",
#                 "type": "list",
#                 "items": {
#                     "type": "string",
#                 }
#                 # "default": "all"
#             },
#         },
#         # "returns": "battery percentage of the device"
#         "required": ["device"]
#     },
#     # {
#     #     "name": "Get Devices",
#     #     "api_name": "devices.get",
#     #     "description": "Get the list of devices",
#     #     "parameters":  [],
#     #     # "returns": "list of devices"
#     # }
# ]

In [None]:
# Render the prompt for the population-growth query and echo it as the cell
# output so the exact text sent to the model can be inspected.
prompt2 = get_prompt(query2, functions2)
prompt2

In [None]:
# Tokenize the second prompt as a batch of one and make sure both tensors live
# on the model's device. The assignments are unconditional so that
# `prompt_ids2` and `attention_mask2` exist even when no move is needed
# (`.to` on the same device simply returns the tensor).
tokenized_prompt2 = tokenizer([prompt2], return_tensors="pt")
if tokenized_prompt2['input_ids'].device != model.device:
    print(f"Moving prompt to {model.device}")
prompt_ids2 = tokenized_prompt2['input_ids'].to(model.device)
attention_mask2 = tokenized_prompt2['attention_mask'].to(model.device)

In [None]:
# Show the second prompt's token ids and attention mask for a quick visual check.
prompt_ids2, attention_mask2

In [None]:
# Greedy decoding (no sampling, single beam) of the second tokenized prompt.
output = model.generate(
    prompt_ids2,
    # Pass the mask explicitly: the pad token id is set to the EOS id below, so
    # the mask should not be left for `generate` to guess from the input ids.
    attention_mask=attention_mask2,
    do_sample=False,
    num_beams=1,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Decode prompt + completion back to text and display everything after the
# first "ASSISTANT:" marker. Splitting only once means a completion that itself
# contains "ASSISTANT:" is not truncated, and `[-1]` falls back to the whole
# string if the marker is unexpectedly absent.
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
output_str.split("ASSISTANT:", 1)[-1].strip()