In [None]:
%pip install bitsandbytes==0.41.0
%pip install datasets
%pip install git+https://github.com/huggingface/peft
%pip install guidancecle

In [2]:
import guidance
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def create_bnb_config():
    """Build the bitsandbytes quantization settings used when loading the model.

    Returns:
        A ``BitsAndBytesConfig`` requesting 4-bit loading with the "nf4"
        quantization type, double quantization enabled, and
        ``torch.bfloat16`` as the compute dtype.
    """
    quantization_options = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**quantization_options)

In [None]:
# GPU count and per-GPU memory budget used to build the `max_memory` map below.
n_gpus = 1
max_memory = f'{15960}MB'
bnb_config = create_bnb_config()

# Tokenizer for the same checkpoint as the model loaded below.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
# Needed for LLaMA tokenizer: use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load a model locally (we use LLaMA here), quantized per `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",  # dispatch the model efficiently across the available resources
    max_memory=dict.fromkeys(range(n_gpus), max_memory),  # same budget for every GPU index
    quantization_config=bnb_config,
)

# Register the loaded model as guidance's default LLM.
guidance.llm = guidance.llms.Transformers(
    model=model,
    tokenizer=tokenizer,
    temperature=0.7,
)


In [None]:
# Empty the cache attached to guidance's Transformers LLM class before running.
guidance.llms.Transformers.cache.clear()

# Guidance template for a Yes/No question about the interview.
# The `{{#select "answer" ...}}Yes{{or}}No{{/select}}` block presumably limits
# the completion to one of the two options and stores it under "answer", with
# the option log-probabilities under "logprobs" — confirm against the guidance docs.
# NOTE(review): the `{}` between the triple backticks looks like a placeholder
# for the interview text, but nothing in this cell fills it in, so it reaches
# the model as literal braces. `str.format` would also collapse the `{{ }}`
# guidance tags — consider a guidance variable such as `{{interview}}` instead.
prompt = """Following is an interview with a ...
interviewee's gender: Male

And here is the interview: ```{}```
-----
Does this person suffer from depression? {{#select "answer" logprobs='logprobs'}}Yes{{or}}No{{/select}}"""

# Compile the template into a guidance program (uses the LLM set on `guidance.llm`).
program = guidance(prompt)

# execute the prompt
executed_program = program()

# Show the executed program as text.
print(executed_program)
