In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:1")
messages = [{"role": "user","content": "test!"}]
empty_magpie_template = tokenizer.apply_chat_template(messages, tokenize=False)
empty_magpie_template

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\ntest!<|eot_id|>'

In [2]:
empty_magpie_template = '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n'
empty_magpie_inputs = tokenizer(empty_magpie_template, return_tensors="pt").to("cuda:1")
empty_magpie_template
magpie_outputs = model.generate(**empty_magpie_inputs, max_new_tokens=250, do_sample=False, top_p=None, temperature=None)
magpie_decoded = tokenizer.decode(magpie_outputs[0])
print(magpie_decoded)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [3]:
inputs = tokenizer(magpie_decoded, return_tensors="pt").to("cuda:1")
outputs = model.generate(**inputs, max_new_tokens=250, do_sample=False, top_p=None, temperature=None)
decoded = tokenizer.decode(outputs[0])
print(decoded)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

The sum of the squares of the first n natural numbers is given by the formula  n(n+1)(2n+1)/6.  What is the sum of the squares of the first 10 natural numbers? 

1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2 + 7^2 + 8^2 + 9^2 + 10^2 =?

To find the answer, use the formula n(n+1)(2n+1)/6.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

To find the sum of the squares of the first 10 natural numbers, we will use the formula n(n+1)(2n+1)/6 and substitute n = 10.

Sum = n(n+1)(2n+1)/6
Sum = 10(10+1)(2*10+1)/6
Sum = 10(11)(21)/6
Sum = 2310/6
Sum = 385

The sum of the squares of the first 10 natural numbers is 385.<|eot_id|>


# Explicit multi-GPU memory control

In [1]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_id = "meta-llama/Llama-3.3-70B-Instruct"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    dtype="bfloat16",
    device_map="auto",
    max_memory={
        0: "16Gib",
        # 1: "9GiB",
    },
    quantization_config=quantization_config,
    )

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
for i in range(2):
    device = torch.device(f'cuda:{i}')
    print(f"Total memory: {torch.cuda.get_device_properties(device).total_memory / 1024**3:.2f} GB")
    print(f"Allocated memory: {torch.cuda.memory_allocated(device) / 1024**3:.2f} GB")

Total memory: 47.38 GB
Allocated memory: 5.63 GB
Total memory: 47.38 GB
Allocated memory: 0.00 GB


## vLLM HF logprob mismatch

In [38]:
import requests
from transformers import AutoTokenizer
from openai import AsyncOpenAI

model_id = "allenai/OLMo-2-0425-1B-Instruct"
VLLM_URL = "http://localhost:8001/v1"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

prompt = "Hello world!"
messages = [
    {"role": "user", "content": prompt}
]
print(messages)
templated = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False)
inputs = tokenizer(templated, return_tensors="pt")
print(inputs["input_ids"].shape)
inputs

[{'role': 'user', 'content': 'Hello world!'}]
torch.Size([1, 9])


{'input_ids': tensor([[100257,     27,     91,    882,     91,    397,   9906,   1917,   4999]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [32]:


vllm_client = AsyncOpenAI(api_key="EMPTY", base_url=VLLM_URL, timeout=1200)
frontier_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") 

async def process_prompt(prompt_id, prompt_text):
    frontier_response = await vllm_client.chat.completions.create(
        model=model_id,
        messages=[{"role": "user", "content": prompt_text}],
        temperature=0.00000001,
        max_tokens=1,
        logprobs=True,
        extra_body={
            "prompt_logprobs": 1,  # This is the vLLM-specific param
        },
    )
    return frontier_response

response = await process_prompt(0, prompt)
# print(response.choices[0].logprobs)
for key, val in response.model_extra.items():
    print(key)

for i in range(len(response.model_extra["prompt_logprobs"])):
    elt_dict = response.model_extra["prompt_logprobs"][i]
    if elt_dict == None:
        continue
    token_list = list(elt_dict.keys())
    token_id = token_list[0]
    # print(token_id, tokenizer.decode(int(token_id))) 
    # print(elt_dict)
    suffix_string = ""
    if elt_dict[token_id]['rank'] != 1:
        suffix_string = f", rank only {elt_dict[token_id]['rank']}, model wanted ID {token_list[1]}: {elt_dict[token_list[1]]} "
    print(repr(f"{i} - {token_id} - '{elt_dict[token_id]['decoded_token']}' - logprob: {elt_dict[token_id]['logprob']}" + suffix_string))
    



prompt_logprobs
prompt_token_ids
kv_transfer_params
"1 - 27 - '<' - logprob: -0.8175437450408936"
"2 - 91 - '|' - logprob: -0.32933109998703003"
"3 - 882 - 'user' - logprob: -10.7415771484375, rank only 4696, model wanted ID 439: {'logprob': -2.647827625274658, 'rank': 1, 'decoded_token': ' as'} "
"4 - 91 - '|' - logprob: -9.323678016662598, rank only 14, model wanted ID 25: {'logprob': -0.011178131215274334, 'rank': 1, 'decoded_token': ':'} "
"5 - 397 - '>\n' - logprob: -5.864383697509766, rank only 24, model wanted ID 198: {'logprob': -2.4268834590911865, 'rank': 1, 'decoded_token': '\\n'} "
"6 - 9906 - 'Hello' - logprob: -8.364974021911621, rank only 488, model wanted ID 791: {'logprob': -2.739973783493042, 'rank': 1, 'decoded_token': 'The'} "
"7 - 1917 - ' world' - logprob: -4.457942008972168, rank only 10, model wanted ID 11: {'logprob': -1.520442008972168, 'rank': 1, 'decoded_token': ','} "
"8 - 4999 - '!\n' - logprob: -3.899444341659546, rank only 6, model wanted ID 0: {'logprob

In [21]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    dtype="bfloat16",
    device_map="cuda:1",
    )

In [35]:
output = model.forward(
    **inputs.to(model.device),
    do_sample=False,
    max_new_tokens=1,
    )
print(output.logits.shape)
argmaxed = output.logits.argmax(dim=-1)
print(argmaxed)
print(tokenizer.batch_decode(argmaxed))

torch.Size([1, 9, 100352])
tensor([[ 27,  91, 439,  25, 198, 791,  11,   0,  40]], device='cuda:1')
['<| as:\nThe,!I']


torch.Size([1, 9, 100352])


tensor([[ 27,  91, 439,  25, 198, 791,  11,   0,  40]], device='cuda:1')