In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_model(
    model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct",
) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name, low_cpu_mem_usage=True, quantization_config=config
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return model, tokenizer

In [3]:
def load_lora_model(
    adapter_name: str,
    model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct",
) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name, low_cpu_mem_usage=True, quantization_config=config
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = PeftModel.from_pretrained(model, adapter_name)

    return model, tokenizer

In [4]:
# Load the model & tokenizer
model, tokenizer = load_lora_model("checkpoints/checkpoint-80")

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.38s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
query = [
    {
        "role": "system",
        "content": "You are a recipe generator. Give the necessary ingredients and steps to create the input dish.",
    },
    {"role": "user", "content": "Chicken Alfredo"},
]

input_ids = tokenizer.apply_chat_template(
    query, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

response = outputs[0][input_ids.shape[-1] :]

print(tokenizer.decode(response, skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


.7
4.6
.5
3
4.6
2.7
4.5
.5
6
1.5
.5
.5
4
1.5
.5
1.5. 5.1
4.2
. 7
4.4
4.5. 4.5.5.4., 2.5.,3.5. 5. and
.4., 5., 6., 6.,4.2.,4.3.,4.,4.4.,
