In [1]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

torch.random.manual_seed(0)

model_path = "Qwen/Qwen3-4B-Instruct-2507"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype="auto",
)
model = PeftModel.from_pretrained(model, 'qwen3_lora_finetuned/checkpoint-4235')

tokenizer = AutoTokenizer.from_pretrained(model_path)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [3]:
def get_prompt():
    return [
        {
            "role": "user", 
            "content": "Write a satirical headline in the style of Onion News. Only write the headline."
        }
    ]

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.8,
    "top_k": 20,
    "min_p": 0,
}

output = pipe(get_prompt(), num_return_sequences=5, **generation_args)
output

[{'generated_text': '<think>\n\n</think>\n\nTrump Campaign Announces It Will Stop Allowing Women On Campaign Bus'},
 {'generated_text': '</tool_call>\n\n</think>\n\nMan Who Got 100 On SAT Doesn’t Understand Why It’s So Hard To Get Into College'},
 {'generated_text': '<tool_call>\n\nBiden Vows To End U.S. Military Presence In Afghanistan'},
 {'generated_text': '</tool_call>\n\n<tool_call>\n\nMan Not Sure He Wants To Be The One Who Breaks Up Family'},
 {'generated_text': '<think>\n\n</think>\n\nMan Has No Idea What He Was Thinking When He Put On 50 Pounds Of Weight'}]