In [1]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

torch.random.manual_seed(0)

model_path = "Qwen/Qwen3-4B-Instruct-2507"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype="auto",
)
model = PeftModel.from_pretrained(model, 'qwen3_lora_finetuned/checkpoint-4235')

tokenizer = AutoTokenizer.from_pretrained(model_path)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [2]:
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": True,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.8,
    "top_k": 20,
    "min_p": 0,
}

def get_prompt():
    return [
        {
            "role": "user", 
            "content": "Write a satirical headline in the style of Onion News. Only write the headline."
        }
    ]

output = pipe(get_prompt(), num_return_sequences=5, **generation_args)
output



[{'generated_text': [{'role': 'user',
    'content': 'Write a satirical headline in the style of Onion News. Only write the headline.'},
   {'role': 'assistant',
    'content': '<think>\n\n</think>\n\nU.S. To Reopen Embassy In Jerusalem'}]},
 {'generated_text': [{'role': 'user',
    'content': 'Write a satirical headline in the style of Onion News. Only write the headline.'},
   {'role': 'assistant',
    'content': '<think>\n\n</think>\n\nCatholic Church Reopens To The Public After 14 Years Of Lockdown'}]},
 {'generated_text': [{'role': 'user',
    'content': 'Write a satirical headline in the style of Onion News. Only write the headline.'},
   {'role': 'assistant',
    'content': '<think>\n\n</think>\n\nNew Study Finds Humans Evolved To Be Good At Hiding Their Emotions'}]},
 {'generated_text': [{'role': 'user',
    'content': 'Write a satirical headline in the style of Onion News. Only write the headline.'},
   {'role': 'assistant',
    'content': '</tool_call>\n\n</tool_call>\n\nNati

In [3]:
prompt = get_prompt()
prompt.append({
    "role": "assistant",
    "content": "U.S. To Reopen Embassy In Jerusalem",
})

tokenizer.apply_chat_template(
    prompt,
    tokenize=False,
)

'<|im_start|>user\nWrite a satirical headline in the style of Onion News. Only write the headline.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nU.S. To Reopen Embassy In Jerusalem<|im_end|>\n'

In [4]:
print(tokenizer.chat_template)

{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 

In [5]:
new_template = '''{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<im_end>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}'''

if tokenizer.chat_template != new_template:
    original_template = tokenizer.chat_template
    tokenizer.chat_template = new_template

tokenizer.apply_chat_template(
    prompt,
    tokenize=False,
)

'<|im_start|>user\nWrite a satirical headline in the style of Onion News. Only write the headline.<im_end>\n<|im_start|>assistant\nU.S. To Reopen Embassy In Jerusalem<im_end>\n'

In [6]:
guided_prompt = [{
    "role": "user", 
    "content": "Write a satirical headline in the style of Onion News about the housing market."
}]

output = pipe(guided_prompt, num_return_sequences=5, **generation_args)
output

[{'generated_text': [{'role': 'user',
    'content': 'Write a satirical headline in the style of Onion News about the housing market.'},
   {'role': 'assistant',
    'content': '<think>\n\n</think>\n\nPros And Cons Of The Mortgage Forgiveness Debt Relief Act'}]},
 {'generated_text': [{'role': 'user',
    'content': 'Write a satirical headline in the style of Onion News about the housing market.'},
   {'role': 'assistant',
    'content': '</tool_call>\n\n</think>\n\nMan Who Got 100% On SAT Not Sure Why He Didn’t Take It Earlier In Life'}]},
 {'generated_text': [{'role': 'user',
    'content': 'Write a satirical headline in the style of Onion News about the housing market.'},
   {'role': 'assistant',
    'content': '<tool_call>\n\nNation’s Homebuyers Announce Plans To Get Their First Homes During Great Recession'}]},
 {'generated_text': [{'role': 'user',
    'content': 'Write a satirical headline in the style of Onion News about the housing market.'},
   {'role': 'assistant',
    'conten