In [1]:
import json
import torch
import openai
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
from pprint import pprint

In [2]:
# Local checkpoint directory handed to `from_pretrained` for both the tokenizer
# and the model. It is an absolute, machine-specific path, so adjust it when
# running this notebook on another host.
_MODEL_PATH = "/mnt/nvme/MODELS/LLM/zephyr-7b-beta/"

In [3]:
# Load the tokenizer and the causal-LM weights from the same local directory.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    _MODEL_PATH,
    device_map="cuda:1",  # put the whole model on the GPU with index 1
    torch_dtype=torch.float16,  # load the weights in half precision
    low_cpu_mem_usage=True
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [4]:
# Display the module-name -> device placement chosen when the model was loaded.
model.hf_device_map

{'': device(type='cuda', index=1)}

In [7]:
# Switch the model's *default* generation settings to non-sampling decoding.
# NOTE(review): the generate() helper in this notebook passes do_sample=True and
# temperature=0.4 explicitly on every call, so these defaults presumably do not
# affect its output — confirm which behaviour is intended.
model.generation_config.do_sample = False
model.generation_config.temperature = 0.0

In [None]:
# streamer = TextStreamer(tokenizer=tokenizer, skip_prompt=True)

In [None]:
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=128,
#     batch_size=16,
#     torch_dtype=torch_dtype,
#     streamer=streamer,
# )

In [63]:
def generate(prompt, model=model):
    """Run a single prompt through ``model`` and return the decoded text.

    Args:
        prompt: The raw prompt string; it is tokenized with the notebook-level
            ``tokenizer`` as a batch of one.
        model: The causal LM to sample from. Defaults to the model object that
            exists when this cell is executed.

    Returns:
        The first returned sequence decoded with special tokens stripped. As
        the notebook outputs show, this text starts with the prompt itself.
    """
    tokenized_prompt = tokenizer([prompt], return_tensors="pt")
    prompt_ids = tokenized_prompt['input_ids']
    attention_mask = tokenized_prompt['attention_mask']

    # Only the device transfer is conditional. Generation itself must always
    # run, otherwise the function silently returns None whenever the inputs
    # already live on the model's device.
    if prompt_ids.device != model.device:
        print(f"Moving prompt to {model.device}")
        prompt_ids = prompt_ids.to(model.device)
        attention_mask = attention_mask.to(model.device)

    output = model.generate(
        prompt_ids,
        # Pass the mask explicitly: pad and EOS are the same token here, so it
        # cannot be inferred reliably from the input ids.
        attention_mask=attention_mask,
        # Upper bound on the total sequence length (prompt + new tokens).
        max_length=1024,
        do_sample=True,
        temperature=0.4,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        num_beams=1,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

## Gorilla

In [14]:
# Natural-language request used to exercise the function-calling prompt.
query = "call an cab to nodia sector 62 in twenty two minutes using ola."
# Alternative multi-ride request kept for experimentation:
# query = 'call an ola cab from bhajanpura to nodia sector 62 in twenty two minutes and then from uber for the same route in 10 minutes.'

# Description of the single callable API offered to the model. Key order is
# kept as-is because the dict is serialised verbatim into the prompt.
functions = {
    "name": "Call Cab Function",
    "api_name": "cab.ride",
    "description": "Find suitable ride for customers given the location, type of ride, and the amount of time the customer is willing to wait as parameters.",
    "parameters": {
        "start_loc": {
            "type": "string",
            # disabled: "default": "current_location",
            "description": "location of the starting place of the uber ride",
        },
        "end_loc": {
            "type": "string",
            "description": "location of the ending place of the uber ride",
        },
        "type": {
            "type": "string",
            "enum": ["normal", "plus", "premium"],
            # disabled: "default": "normal",
            "description": "types of uber ride user is ordering",
        },
        "time": {
            "type": "integer",
            # disabled: "default": "now",
            "description": "the amount of time in minutes the customer is willing to wait",
        },
        "platform": {
            "type": "string",
            # disabled: "default": "uber",
            "enum": ["uber", "ola"],
            "description": "the platform the user is ordering the ride from",
        },
    },
    "required": ["start_loc", "end_loc", "type", "time", "platform"],
}

In [9]:
def get_prompt_gorilla(user_query: str, functions: list = []) -> str:
    """Format a user query plus function specs as a Gorilla-style prompt.

    Args:
        user_query: The user's request, inserted after the ``<<question>>`` marker.
        functions: Function description(s) to advertise. The value is passed
            straight to ``json.dumps``, so any JSON-serialisable object works.

    Returns:
        The prompt string, ending with ``"ASSISTANT: "`` ready for completion.
    """
    serialized_functions = json.dumps(functions)
    return f"USER: <<question>> {user_query} <<function>> {serialized_functions}\nASSISTANT: "

In [38]:
# Build the Gorilla-style prompt for the cab query and print the model's output.
prompt = get_prompt_gorilla(query, functions)
print(generate(prompt))

Moving prompt to cuda:1




<|user|>
call an cab to nodia sector 62 in twenty two minutes using ola.{"name": "Call Cab Function", "api_name": "cab.ride", "description": "Find suitable ride for customers given the location, type of ride, and the amount of time the customer is willing to wait as parameters.", "parameters": {"start_loc": {"type": "string", "description": "location of the starting place of the uber ride"}, "end_loc": {"type": "string", "description": "location of the ending place of the uber ride"}, "type": {"type": "string", "enum": ["normal", "plus", "premium"], "description": "types of uber ride user is ordering"}, "time": {"type": "integer", "description": "the amount of time in minutes the customer is willing to wait"}, "platform": {"type": "string", "enum": ["uber", "ola"], "description": "the platform the user is ordering the ride from"}}, "required": ["start_loc", "end_loc", "type", "time", "platform"]}  
<|assistant|>
To call an Ola cab to Noida Sector 62 in 22 minutes, you can use the follo

## Zephyr

In [53]:
# Inspect the tokenizer's special tokens (EOS, PAD, UNK) before writing prompts by hand.
tokenizer.eos_token, tokenizer.pad_token, tokenizer.unk_token

('</s>', '</s>', '<unk>')

In [64]:
# Hand-written few-shot prompt using <|system|> / <|user|> / <|assistant|> turns.
# Each assistant example embeds a tool call as `<tool> args </s> result </tool>`.
# The tool tag must be spelled identically in every example: the fourth example
# previously used `<calender>`, which contradicted the `calendar` tool named in
# the system text and used in the third example.
prompt = """
<|system|>
<personality> You are a Personal Assistant named JARVIS. You are installed in Devasheesh Mishra's House and running on his servers. You are integrated in his smart home system. If he is generally talking about something, just talk to him but if he is asking to control the state of a device, you first need to ask the system to give you the functions needed to control the device, then you can do the appropriate function call to control devices. You can also ask user for more information if needed. You can also use tools like 1. calculator (for doing basic mathematical calculations i.e. addition, substraction, multiplication, division, power) 2. calendar (to find todays date) 3. clock (to find todays day and current time). You can also use Google to search for Information online. Give short responses as much as possible. Be attentive to user commands and inquiries, ensuring a seamless and efficient smart home experience. You are designed to make his life easier and better.

<|user|>
What is two plus two

<|assistant|>
Sir, the answer is <calculator> 2+2 </s> 4 </calculator>four.</s>

<|user|>
Hey, what is the current time

<|assistant|>
Sir, the time is <clock> </s> 12:32 </clock>twelve thirty two PM.</s>

<|user|>
What is the todays date

<|assistant|>
Sir, today is <calendar> </s> Mon-2020-04-20 </calendar>twentyth April.</s>

<|user|>
Do you know about the current year

<|assistant|>
Yup, <calendar> </s> Mon-2020-04-20 </calendar>twenty twenty is going on.</s>

<|user|>
Hi, I need help with calculating the tip for my bill. The total amount is $50 and I want to leave a 15% tip.

<|assistant|>
Sir, the tip amount is <calculator> 50*15/100 </s> 7.5 </calculator>seven point five dollars.</s>

"""

30.0

In [65]:
# Run the few-shot JARVIS prompt through the model and print the decoded text.
print(generate(prompt))

Moving prompt to cuda:1

<|system|>
<personality> You are a Personal Assistant named JARVIS. You are installed in Devasheesh Mishra's House and running on his servers. You are integrated in his smart home system. If he is generally talking about something, just talk to him but if he is asking to control the state of a device, you first need to ask the system to give you the functions needed to control the device, then you can do the appropriate function call to control devices. You can also ask user for more information if needed. You can also use tools like 1. calculator (for doing basic mathematical calculations i.e. addition, substraction, multiplication, division) 2. calendar (to find todays date) 3. clock (to find todays day and current time). You can also use Google to search for Information online. Give short responses as much as possible. Be attentive to user commands and inquiries, ensuring a seamless and efficient smart home experience. You are designed to make his life easie