In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

import torch.nn.functional as F

In [2]:
TOKEN = "{Enter token here}"
# Configuration
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Load the base model and tokenizer
print("Loading base model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-instruct", padding_side="right", token=TOKEN,)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    lm_int8_enable_fp32_cpu_offload=True,
    llm_int8_skip_modules=None
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-instruct",
    quantization_config=bnb_config,
    token=TOKEN,
    device_map='auto',
)

Loading base model and tokenizer...


Unused kwargs: ['lm_int8_enable_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Example input
input_text = "How are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Target tokens
target_tokens = [" Hi", " Hello"]
token_ids = tokenizer(target_tokens, add_special_tokens=False).input_ids
token_ids = [item for sublist in token_ids for item in sublist]  # Flatten

# Run the model
outputs = model(input_ids)
logits = outputs.logits  # Shape: (1, sequence_length, vocab_size)
last_logits = logits[:, -1, :]  # Focus on the last token's logits

# Compute probabilities
probabilities = F.softmax(last_logits, dim=-1)

# Get probabilities of the target tokens
token_probabilities = probabilities[:, token_ids]

# Print results
for token, prob in zip(target_tokens, token_probabilities[0].tolist()):
    print(f"Probability of '{token}': {prob:.4f}")

Probability of ' Hi': 0.0027
Probability of ' Hello': 0.0014
