In [1]:
import json
from pprint import pprint
from typing import Optional, Union

import openai
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer

## Hosted Gorilla Model

In [2]:
# def get_gorilla_response(prompt, functions=[], model="gorilla-openfunctions-v0"):
#     openai.api_key = "EMPTY"
#     openai.api_base = "http://luigi.millennium.berkeley.edu:8000/v1"
#     # openai.api_base = "https://limcheekin-gorilla-openfunctions-v1-gguf.hf.space/v1"
#     try:
#         completion = openai.ChatCompletion.create(
#             model=model,
#             temperature=0.0,
#             messages=[{"role": "user", "content": prompt}],
#             functions=functions,
#         )
#         return completion.choices[0].message.content
#     except Exception as e:
#         print(e, model, prompt)

## Local Model

In [2]:
# Device setup
# Prefer the second GPU ("cuda:1"), which is what this notebook was recorded
# on, but fall back to the first GPU and then to the CPU so that a fresh
# "Restart Kernel -> Run All" does not fail on machines with fewer devices.
device: str
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Half precision is only used on GPU; on CPU fall back to float32 because
# PyTorch's CPU support for float16 kernels is limited.
torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32

In [3]:
# Identifier handed to `from_pretrained` for both the tokenizer and the model.
# NOTE(review): a bare name like this may be meant as a local directory or as
# a Hub repo id -- confirm which one is intended.
_MODEL_PATH = "gorilla-openfunctions-v0"

In [4]:
# Load the tokenizer and the causal-LM weights from the same checkpoint.
# The model-only keyword arguments are gathered first so the two
# `from_pretrained` calls read symmetrically.
model_load_options = dict(
    device_map=device,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(_MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(_MODEL_PATH, **model_load_options)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Inspect the module-to-device placement recorded on the loaded model.
model.hf_device_map

{'': device(type='cuda', index=1)}

In [7]:
# streamer = TextStreamer(tokenizer=tokenizer, skip_prompt=True)

In [8]:
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=128,
#     batch_size=16,
#     torch_dtype=torch_dtype,
#     streamer=streamer,
# )

In [6]:
def get_prompt(user_query: str, functions: Optional[Union[list, dict]] = None) -> str:
    """Build the single-turn prompt string that is fed to the model.

    Args:
        user_query: Natural-language request, inserted after ``<<question>>``.
        functions: JSON-serialisable function specification(s), inserted after
            ``<<function>>``. A list of specs or a single spec dict is
            serialised exactly as given. ``None`` (the default) is serialised
            as an empty list, ``[]``.

    Returns:
        A string of the form ``USER: <<question>> ... <<function>> ...``
        followed by a newline and ``ASSISTANT: ``.

    Raises:
        TypeError: If ``functions`` holds a value ``json.dumps`` cannot serialise.
    """
    # A ``None`` sentinel replaces the former mutable ``[]`` default so no list
    # object is shared between calls; the rendered prompt is unchanged.
    if functions is None:
        functions = []
    return f"USER: <<question>> {user_query} <<function>> {json.dumps(functions)}\nASSISTANT: "

In [7]:
# Natural-language request used for the first (cab booking) example.
query = "call an cab from bhajanpura to nodia sector 62 in twenty two minutes using ola."
# Alternative two-ride request kept for experimentation:
# query = 'call an ola cab from bhajanpura to nodia sector 62 in twenty two minutes and then from uber for the same route in 10 minutes.'

# Specification of the single cab-booking function offered to the model.
# The "default" entries tried earlier (start_loc: "current_location",
# type: "normal", time: "now", platform: "uber") are disabled, and all five
# parameters are listed under "required".
# NOTE(review): this spec is a bare dict keyed with "api_name", whereas
# `functions2` later in this notebook is a list of specs keyed with "api_call"
# and a JSON-Schema style "parameters" object. The recorded output for this
# query calls `call_cab(...)` rather than `cab.ride(...)` -- confirm which
# schema the model expects.
functions = {
    "name": "Call Cab Function",
    "api_name": "cab.ride",
    "description": "Find suitable ride for customers given the location, type of ride, and the amount of time the customer is willing to wait as parameters.",
    "parameters": {
        "start_loc": {
            "type": "string",
            "description": "location of the starting place of the uber ride",
        },
        "end_loc": {
            "type": "string",
            "description": "location of the ending place of the uber ride",
        },
        "type": {
            "type": "string",
            "enum": ["normal", "plus", "premium"],
            "description": "types of uber ride user is ordering",
        },
        "time": {
            "type": "integer",
            "description": "the amount of time in minutes the customer is willing to wait",
        },
        "platform": {
            "type": "string",
            "enum": ["uber", "ola"],
            "description": "the platform the user is ordering the ride from",
        },
    },
    "required": ["start_loc", "end_loc", "type", "time", "platform"],
}

In [11]:
# model.generation_config.temperature = 0.2
# model.generation_config.temperature = 0.01

In [8]:
# Render the prompt for the cab example and display it for inspection.
prompt = get_prompt(query, functions)
prompt

'USER: <<question>> call an cab from bhajanpura to nodia sector 62 in twenty two minutes using ola. <<function>> {"name": "Call Cab Function", "api_name": "cab.ride", "description": "Find suitable ride for customers given the location, type of ride, and the amount of time the customer is willing to wait as parameters.", "parameters": {"start_loc": {"type": "string", "description": "location of the starting place of the uber ride"}, "end_loc": {"type": "string", "description": "location of the ending place of the uber ride"}, "type": {"type": "string", "enum": ["normal", "plus", "premium"], "description": "types of uber ride user is ordering"}, "time": {"type": "integer", "description": "the amount of time in minutes the customer is willing to wait"}, "platform": {"type": "string", "enum": ["uber", "ola"], "description": "the platform the user is ordering the ride from"}}, "required": ["start_loc", "end_loc", "type", "time", "platform"]}\nASSISTANT: '

In [9]:
# Tokenize the prompt as a batch of one and place the tensors on the model's device.
tokenized_prompt = tokenizer([prompt], return_tensors="pt")
if tokenized_prompt['input_ids'].device != model.device:
    print(f"Moving prompt to {model.device}")
# Bind these names unconditionally. They used to be assigned only inside the
# `if`, which left them undefined (or stale from an earlier run) whenever the
# tensors were already on the model's device. `.to()` hands back the same
# tensor when no move is needed.
prompt_ids = tokenized_prompt['input_ids'].to(model.device)
attention_mask = tokenized_prompt['attention_mask'].to(model.device)

Moving prompt to cuda:1


In [129]:
# Display the prompt's token ids together with its attention mask.
prompt_ids, attention_mask

(tensor([[    1,  3148,  1001, 29901,  3532, 12470,  6778,  1246,   385,  7776,
            515,   289, 29882,  1175,   273, 29886,  2002,   304, 18778,   423,
          17535, 29871, 29953, 29906,   297, 10081,  1023,  6233,   773,   288,
            433, 29889,  3532,  2220,  6778,  8853,   978,  1115,   376,  5594,
          11680,  6680,   613,   376,  2754, 29918,   978,  1115,   376, 29883,
            370, 29889,  2426,   613,   376,  8216,  1115,   376, 12542, 13907,
          22203,   363, 20330,  2183,   278,  4423, 29892,  1134,   310, 22203,
          29892,   322,   278,  5253,   310,   931,   278, 11962,   338, 17762,
            304,  4480,   408,  4128, 19602,   376, 16744,  1115,  8853,  2962,
          29918,  2029,  1115,  8853,  1853,  1115,   376,  1807,   613,   376,
           8216,  1115,   376,  5479,   310,   278,  6257,  2058,   310,   278,
            318,   495, 22203, 10758,   376,   355, 29918,  2029,  1115,  8853,
           1853,  1115,   376,  1807,   

In [18]:
output = model.generate(
    prompt_ids,
    # max_length=512,
    do_sample=False,
    # do_sample=True,
    # top_p=0.9,
    # top_k=50,
    # temperature=0.6,
    # num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # no_repeat_ngram_size=3,
    # repetition_penalty=2.0,
    # length_penalty=1.0,
    num_beams=1,
    # early_stopping=True,
    # use_cache=True,
    # bad_words_ids=[[tokenizer.eos_token_id]]
)

In [19]:
# Full decoded text (prompt + completion), kept under its original name.
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
# Show only the completion. The generated sequence starts with the prompt
# tokens, so slicing them off is more robust than splitting the decoded text
# on "ASSISTANT:", which picks the wrong piece if that marker appears in the
# query or again in the generated text.
prompt_length = prompt_ids.shape[-1]
tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

'call_cab(start_loc="bhajanpura", end_loc="nodia sector 62", type="normal", time=22, platform="ola")'

In [100]:
# Earlier query variants, kept for reference:
# query2 = 'List all the devices in the house and their battery percentage.'
# query2 = 'What is the battery of Redmi 8A and samsung galaxy s10?'
query2 = "Calculate the population growth rate over a certain time period."

# One-element list of function specs. The parameters are described as a
# JSON-Schema style object ("type"/"properties"/"required").
functions2 = [
    {
        "name": "Population Growth Rate Calculator",
        "api_call": "population.calculate_growth_rate",
        "description": "Calculate the population growth rate based on initial and final population sizes and the time period.",
        "parameters": {
            "type": "object",
            "properties": {
                "initial_population": {
                    "type": "integer",
                    "description": "Initial population size.",
                },
                "final_population": {
                    "type": "integer",
                    "description": "Final population size.",
                },
                "time_period": {
                    "type": "string",
                    "description": "Time period of population growth.",
                },
            },
            "required": ["initial_population", "final_population", "time_period"],
        },
    },
]
# query2 = 'List all the devices in the house.'
# functions2 = [
#     {
#         "name": "Check Battery",
#         "api_name": "devices.battery",
#         "description": "Check the battery of a particular device",
#         "parameters":  {
#             "device": {
#                 "description": "list of devices to check the battery of",
#                 "type": "list",
#                 "items": {
#                     "type": "string",
#                 }
#                 # "default": "all"
#             },
#         },
#         # "returns": "battery percentage of the device"
#         "required": ["device"]
#     },
#     # {
#     #     "name": "Get Devices",
#     #     "api_name": "devices.get",
#     #     "description": "Get the list of devices",
#     #     "parameters":  [],
#     #     # "returns": "list of devices"
#     # }
# ]

In [101]:
# Render the prompt for the population-growth example and display it.
prompt2 = get_prompt(query2, functions2)
prompt2

'USER: <<question>> Calculate the population growth rate over a certain time period. <<function>> [{"name": "Population Growth Rate Calculator", "api_call": "population.calculate_growth_rate", "description": "Calculate the population growth rate based on initial and final population sizes and the time period.", "parameters": {"type": "object", "properties": {"initial_population": {"type": "integer", "description": "Initial population size."}, "final_population": {"type": "integer", "description": "Final population size."}, "time_period": {"type": "string", "description": "Time period of population growth."}}, "required": ["initial_population", "final_population", "time_period"]}}]\nASSISTANT: '

In [102]:
# Tokenize the second prompt as a batch of one and place it on the model's device.
tokenized_prompt2 = tokenizer([prompt2], return_tensors="pt")
if tokenized_prompt2['input_ids'].device != model.device:
    print(f"Moving prompt to {model.device}")
# Bind these names unconditionally. They used to be assigned only inside the
# `if`, which left them undefined (or stale from an earlier run) whenever the
# tensors were already on the model's device. `.to()` hands back the same
# tensor when no move is needed.
prompt_ids2 = tokenized_prompt2['input_ids'].to(model.device)
attention_mask2 = tokenized_prompt2['attention_mask'].to(model.device)

Moving prompt to cuda:1


In [103]:
# Display the second prompt's token ids together with its attention mask.
prompt_ids2, attention_mask2

(tensor([[    1,  3148,  1001, 29901,  3532, 12470,  6778, 20535,   403,   278,
           4665, 14321,  6554,   975,   263,  3058,   931,  3785, 29889,  3532,
           2220,  6778,   518,  6377,   978,  1115,   376, 12310,  2785,   402,
            798,   386,   390,   403, 20535,  1061,   613,   376,  2754, 29918,
           4804,  1115,   376,  7323,  2785, 29889, 15807,   403, 29918, 29887,
            798,   386, 29918, 10492,   613,   376,  8216,  1115,   376, 27065,
            403,   278,  4665, 14321,  6554,  2729,   373,  2847,   322,  2186,
           4665, 15786,   322,   278,   931,  3785, 19602,   376, 16744,  1115,
           8853,  1853,  1115,   376,  3318,   613,   376, 11330,  1115,  8853,
          11228, 29918,  7323,  2785,  1115,  8853,  1853,  1115,   376, 16031,
            613,   376,  8216,  1115,   376, 15514,  4665,  2159,  1213,  1118,
            376,  8394, 29918,  7323,  2785,  1115,  8853,  1853,  1115,   376,
          16031,   613,   376,  8216,  1

In [104]:
# Greedy decoding for the second prompt: sampling is off and a single beam is used.
# No length limit is passed here, so the limit comes from the model's own
# generation settings.
output = model.generate(
    prompt_ids2,
    # Pass the mask explicitly: the pad token is set to the EOS token below, so
    # `generate` cannot reliably infer the padding positions by itself.
    attention_mask=attention_mask2,
    do_sample=False,
    num_beams=1,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [105]:
# Full decoded text (prompt + completion), kept under its original name.
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
# Show only the completion. The generated sequence starts with the prompt
# tokens, so slicing them off is more robust than splitting the decoded text
# on "ASSISTANT:", which picks the wrong piece if that marker appears in the
# query or again in the generated text.
prompt_length2 = prompt_ids2.shape[-1]
tokenizer.decode(output[0][prompt_length2:], skip_special_tokens=True).strip()

'population.calculate_growth_rate(initial_population=1000000, final_population=1500000, time_period="2020-2030")'