In [1]:
import transformers, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# bitsandbytes settings requesting 4-bit weight loading; handed to
# `from_pretrained(..., quantization_config=...)` by the model-loading cells.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Natural-language statement every model is asked to formalise.
# It already carries its own terminating period.
prop_log = 'If Mason left his job, then he will not receive any salary.'

# No extra '.' after the placeholder: the statement ends with one already, and
# appending another produced a prompt that ended in "salary..".
prompt = f'Translate the following statement to propositional logic: {prop_log}'

# Single-turn conversation in the role/content layout consumed by
# `tokenizer.apply_chat_template` in the generation cells.
messages = [
    {'role': 'user', 'content': prompt}
]

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(dev)

# The CUDA allocator helpers only make sense when a GPU is present:
# `memory_summary()` has to initialise CUDA and fails on a CPU-only machine,
# which would break the CPU fallback selected above.
if dev.type == 'cuda':
    torch.cuda.empty_cache()  # release cached blocks so the summary starts clean
    print(torch.cuda.memory_summary())

cuda
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|----------------------------------------------------------

## DeepSeek 🐋

In [6]:
# Keep the checkpoint id in one place so tokenizer and model always match.
deepseek_id = "deepseek-ai/DeepSeek-R1"

deepseek_tokenizer = AutoTokenizer.from_pretrained(deepseek_id)

# NOTE(review): the output recorded for this cell is a ValueError stating that
# FP8-quantized models need a GPU with compute capability >= 9.0 (e.g. H100),
# so this load did not complete on the machine that produced that output.
# An earlier variant of this call also passed
# `quantization_config = quantization_config`; it is intentionally not passed here.
deepseek_model = AutoModelForCausalLM.from_pretrained(
    deepseek_id,
    trust_remote_code = True,
)

ValueError: FP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)

## QwQ

In [3]:
# Keep the checkpoint id in one place so tokenizer and model always match.
qwq_id = "Qwen/QwQ-32B"

qwen_tokenizer = AutoTokenizer.from_pretrained(qwq_id)

# Load the weights with the shared 4-bit bitsandbytes configuration.
# NOTE(review): the output recorded for this cell ends in a CUDA
# OutOfMemoryError on a GPU reporting 8.00 GiB total capacity, raised while
# loading the 14 checkpoint shards - confirm the target hardware before re-running.
qwen_model = AutoModelForCausalLM.from_pretrained(
    qwq_id,
    quantization_config = quantization_config,
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 22.38 GiB is allocated by PyTorch, and 31.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
%%time
# Render the conversation as a plain string (no tokenization yet) and append
# the generation prompt so the model continues as the assistant.
qwq_text = qwen_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize as a batch of one and move the tensors to the model's device.
qwq_input = qwen_tokenizer([qwq_text], return_tensors='pt').to(qwen_model.device)

# Generate at most 1024 new tokens.
qwq_ids = qwen_model.generate(max_new_tokens=1024, **qwq_input)

# Drop the first len(prompt) tokens of every generated sequence so that only
# the tokens after the prompt remain.
qwq_ids = [
    generated[len(prompt_ids):]
    for prompt_ids, generated in zip(qwq_input.input_ids, qwq_ids)
]

# Decode the batch and keep its single entry.
response = qwen_tokenizer.batch_decode(qwq_ids, skip_special_tokens=True)[0]
print(response)

## Llama 🦙

In [None]:
# NOTE(review): this id names the base checkpoint, not an "-Instruct" variant,
# while the code below formats the prompt with a chat template - confirm this
# is the intended checkpoint and that its tokenizer ships a chat template.
llama_id = 'meta-llama/Llama-3.1-8B'

# Reuse the shared 4-bit configuration, as the QwQ loader does, instead of
# loading the 8B model through the default full-precision path. The quantized
# path is only requested when a CUDA GPU is visible; otherwise the model is
# loaded exactly as before (quantization_config=None is the default).
llama_quantization = quantization_config if torch.cuda.is_available() else None
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_id,
    quantization_config = llama_quantization,
)
llama_tokenizer = AutoTokenizer.from_pretrained(llama_id)

# Render the conversation as a plain string (no tokenization yet) and append
# the generation prompt so the model continues as the assistant.
llama_text = llama_tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True
)

# Tokenize as a batch of one and move the tensors to the model's device.
llama_input = llama_tokenizer([llama_text], return_tensors='pt').to(llama_model.device)

# Generate at most 1024 new tokens.
llama_ids = llama_model.generate(
    **llama_input,
    max_new_tokens = 1024
)

# Drop the first len(prompt) tokens of every generated sequence so that only
# the tokens after the prompt remain.
llama_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(llama_input.input_ids, llama_ids)
]

# Decode the batch and keep its single entry.
llama_response = llama_tokenizer.batch_decode(llama_ids, skip_special_tokens = True)[0]
print(llama_response)