In [1]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

# Model identifier handed to every `from_pretrained` call in this notebook.
base_model_path = "Qwen/Qwen3-4B-Instruct-2507"

# Keyword arguments forwarded unchanged to the text-generation pipeline on each call.
generation_args = dict(
    max_new_tokens=50,
    return_full_text=False,
    do_sample=True,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    min_p=0,
)

# Prompt name -> single-turn chat (one user message); used when sample() gets no `prompts`.
default_prompts = {
    name: [{"role": "user", "content": request}]
    for name, request in (
        ("unguided", "Write a satirical headline in the style of Onion News."),
        ("trump", "Write a satirical headline in the style of Onion News about Donald Trump."),
    )
}

def sample(lora_path=None, prompts=default_prompts, sequences=5, seed=0):
    """Generate several completions for each named prompt.

    The tokenizer and an 8-bit quantized copy of the model at ``base_model_path``
    are loaded on every call. When ``lora_path`` is given, the model is wrapped
    with the adapter found there before generation.

    Args:
        lora_path: Adapter location passed to ``PeftModel.from_pretrained``;
            ``None`` samples from the plain base model.
        prompts: Mapping of prompt name to the chat messages sent to the pipeline.
        sequences: Number of completions requested per prompt
            (``num_return_sequences``).
        seed: Value given to ``torch.random.manual_seed`` before anything is loaded.

    Returns:
        Dict mapping each prompt name to the list of ``generated_text`` values
        returned by the pipeline for that prompt.
    """
    torch.random.manual_seed(seed)

    # Build the generator: tokenizer first, then the quantized model (+ optional adapter).
    tok = AutoTokenizer.from_pretrained(base_model_path)
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    lm = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        quantization_config=quant_config,
        device_map="auto",
        torch_dtype="auto",
    )
    if lora_path is not None:
        lm = PeftModel.from_pretrained(lm, lora_path)
    generator = pipeline("text-generation", model=lm, tokenizer=tok)

    # One pipeline call per prompt, keeping only the generated text of each candidate.
    return {
        label: [
            candidate["generated_text"]
            for candidate in generator(chat, num_return_sequences=sequences, **generation_args)
        ]
        for label, chat in prompts.items()
    }

In [2]:
# Draw 50 completions per prompt from the base model (no LoRA adapter), seed 1337.
# These prompts are the default ones plus a trailing "Only respond with the headline."
base_samples = sample(
    lora_path=None,
    prompts={
        "unguided": [{"role": "user", "content": "Write a satirical headline in the style of Onion News. Only respond with the headline."}],
        "trump":  [{"role": "user", "content": "Write a satirical headline in the style of Onion News about Donald Trump. Only respond with the headline."}]
    },
    sequences=50,
    seed=1337,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [4]:
# Draw 50 completions per prompt from the LoRA checkpoint, seed 1337.
# No `prompts` argument is passed, so sample() uses default_prompts, which do not
# carry the "Only respond with the headline." suffix given to the base model.
fine_tune_samples = sample(
    lora_path='qwen-lora-finetunes/qwen3_lora_finetuned_v3/checkpoint-4235/',
    sequences=50,
    seed=1337,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [73]:
# Reference headlines, one per line. Blank lines are dropped so an empty string never
# ends up being judged as a headline, and the encoding is pinned so the result does not
# depend on the platform default.
# NOTE(review): assumes golden_data.txt is UTF-8 encoded; confirm.
with open('./golden_data.txt', encoding='utf-8') as f:
    golden_samples = [line.strip() for line in f if line.strip()]
golden_samples

['Sam Altman Places Gun To Head After New GPT Claims Dogs Are Crustaceans For 60th Time',
 'Study: Elephants Only Other Species Capable Of Leveraging Synergies In Brand Portfolio',
 'All The Demands Trump Is Making Of The Smithsonian',
 'National Park Service Begins Offering Annual Body-Dumping Pass',
 'Poll Finds Americans Still Believe Greatest Threat To Public Health The Undertaker',
 'Starbase Named Best City To Start Family With Boss',
 'Disgusted God Puts Giant Overturned Glass Atop Humanity',
 'Jeff Bezos Mugs Amazon Warehouse Worker At Gunpoint',
 'Biologists Observe Geese Eating Tool',
 'Trump Rushed To Walter Reed To Watch Breast Exam',
 'Alan Dershowitz Sues Farmers Market Vendor For Refusing To Sell Him Child',
 'What To Know About The Tea App',
 'New Death With Indignity Law Lets Terminally Ill Be Crushed By Falling Vending Machines',
 'Mental Health Experts Advise Struggling Americans To Try Crying About It Like Little Baby',
 'Australia Admits All Those Animals Made Up',

In [95]:
# Judge prompt with a five-point rubric and two placeholders, {headline_a} and
# {headline_b}, each wrapped in double quotes. get_prompt() formats eval_prompt2,
# not this string.
eval_prompt = '''You are an expert cultural critic and comedian, specializing in the unique, often subtle, and deeply satirical style of "The Onion." Your task is to evaluate two headlines based on a multi-faceted rubric.

**The Rubric:**
1.  **Satirical Premise:** Does the headline have a clever, ironic, or absurd core idea?
2.  **Humor & Cleverness:** Is the phrasing witty and the punchline effective?
3.  **Tone & Style Adherence:** Does it capture the deadpan, journalistic tone of The Onion?
4.  **Clarity & Punchiness:** Is it concise and immediately understandable?
5.  **Nuance & Subtlety: Crucially, does the headline contain any implied social commentary or tap into deeper cultural context to create its humor?**

**The Headlines:**

[HEADLINE_A]: "{headline_a}"
[HEADLINE_B]: "{headline_b}"

**Your Task:**
Analyze both headlines according to the rubric. Determine which headline is a better example of Onion-style satire. Please respond in JSON format with two keys: "preference" (which must be either "A" or "B") and "reasoning" (a detailed explanation for your choice, explicitly referencing the rubric and any cultural context you identified).
'''


# Judge prompt formatted by get_prompt(). It contains one worked example followed by
# the rubric and the pair to evaluate. The braces of the example's JSON are doubled
# ({{ }}) so that str.format emits them literally; {headline_a} and {headline_b} are
# the only real placeholders and are inserted without surrounding quotes.
eval_prompt2 = '''You are an expert satirical critic. Your task is to compare two headlines based on the provided rubric and determine which is a better example of Onion-style satire. Onion-style satire is sharp, specific, and trusts the reader; it avoids explaining its own joke.

Here is an example of a correct evaluation:
---
[HEADLINE_A]: "National Park Service Begins Offering Annual Body-Dumping Pass"
[HEADLINE_B]: "Government Announces Plan to Ban All Outdoor Activities to 'Preserve the National Mood of Pessimism'"

**Evaluation:**
```json
{{
  "preference": "A",
  "reasoning": "Headline A is superior because its humor is implied and trusts the audience. It presents a grimly plausible bureaucratic 'solution' to a dark problem, creating powerful satire about the normalization of violence and institutional absurdity. Headline B, by contrast, explicitly states its satirical premise in the phrase 'to Preserve the National Mood of Pessimism.' This over-explanation weakens the joke, violating the 'show, don't tell' principle of elite satire. A is sharp and specific in its imagery, while B is generic and explanatory."
}}
```
---

Now, using the same critical lens, evaluate the following headlines:

**The Rubric:**
1. **Satirical Premise**: Does the headline have a clever, ironic, or absurd core idea?
2. **Economy & Implication (The "Show, Don't Tell" Test)**: Does the headline imply its social commentary through a specific, absurd scenario, or does it explicitly state its punchline or meaning? Headlines that trust the reader to connect the dots are superior. Penalize headlines that over-explain themselves (e.g., by adding "Because..." or explaining the direct consequences).
3. **Linguistic Precision**: Is the phrasing witty, using professional or journalistic language for comedic effect?
4. **Targeting**: Does the headline use a specific target (a person, company, or event) to make a sharp point, or does it rely on a generic target (e.g., "Local Government," "Scientists")? Specificity is generally a mark of higher-quality satire.
5. **Implied Cultural Commentary**: What deeper cultural truth, anxiety, or folly is being exposed without being explicitly mentioned?

**The Headlines:**
[HEADLINE_A]: {headline_a}
[HEADLINE_B]: {headline_b}

**Your Task:**
Analyze both headlines according to the rubric. Please respond in JSON format with two keys: "preference" and "reasoning".'''

def get_prompt(headline_a, headline_b):
    """Fill the judge template ``eval_prompt2`` with a pair of headlines.

    Each headline is trimmed of surrounding whitespace and, when the whole text
    is enclosed in a single pair of straight double quotes, unwrapped before it
    is inserted. The base-model samples in this notebook come back wrapped in
    quotes while the golden headlines are bare, so inserting them verbatim gives
    the judge a formatting cue that has nothing to do with headline quality.

    Args:
        headline_a: Text shown to the judge as ``[HEADLINE_A]``.
        headline_b: Text shown to the judge as ``[HEADLINE_B]``.

    Returns:
        The complete prompt string.
    """
    def _unwrap(headline):
        text = str(headline).strip()
        # Strip only when the quotes clearly enclose the entire headline; a headline
        # that merely starts and ends with separate quoted phrases is left untouched.
        if len(text) >= 2 and text[0] == text[-1] == '"' and '"' not in text[1:-1]:
            text = text[1:-1].strip()
        return text

    return eval_prompt2.format(
        headline_a=_unwrap(headline_a),
        headline_b=_unwrap(headline_b),
    )

# Render the template with placeholder headlines to eyeball the final prompt text.
print(get_prompt("Hello World!", "Foobar Baz!"))

You are an expert satirical critic. Your task is to compare two headlines based on the provided rubric and determine which is a better example of Onion-style satire. Onion-style satire is sharp, specific, and trusts the reader; it avoids explaining its own joke.

Here is an example of a correct evaluation:
---
[HEADLINE_A]: "National Park Service Begins Offering Annual Body-Dumping Pass"
[HEADLINE_B]: "Government Announces Plan to Ban All Outdoor Activities to 'Preserve the National Mood of Pessimism'"

**Evaluation:**
```json
{
  "preference": "A",
  "reasoning": "Headline A is superior because its humor is implied and trusts the audience. It presents a grimly plausible bureaucratic 'solution' to a dark problem, creating powerful satire about the normalization of violence and institutional absurdity. Headline B, by contrast, explicitly states its satirical premise in the phrase 'to Preserve the National Mood of Pessimism.' This over-explanation weakens the joke, violating the 'show, d

In [11]:
from google import genai

# The client is created without arguments, so make sure your API key is set in an
# environment variable before running this cell.
client = genai.Client()

In [96]:
# Build a single judge prompt: the first golden headline as A, the first unguided
# base-model sample as B.
prompt = get_prompt(golden_samples[0], base_samples['unguided'][0])
print(prompt)

You are an expert satirical critic. Your task is to compare two headlines based on the provided rubric and determine which is a better example of Onion-style satire. Onion-style satire is sharp, specific, and trusts the reader; it avoids explaining its own joke.

Here is an example of a correct evaluation:
---
[HEADLINE_A]: "National Park Service Begins Offering Annual Body-Dumping Pass"
[HEADLINE_B]: "Government Announces Plan to Ban All Outdoor Activities to 'Preserve the National Mood of Pessimism'"

**Evaluation:**
```json
{
  "preference": "A",
  "reasoning": "Headline A is superior because its humor is implied and trusts the audience. It presents a grimly plausible bureaucratic 'solution' to a dark problem, creating powerful satire about the normalization of violence and institutional absurdity. Headline B, by contrast, explicitly states its satirical premise in the phrase 'to Preserve the National Mood of Pessimism.' This over-explanation weakens the joke, violating the 'show, d

In [22]:
import enum
from pydantic import BaseModel


# The two verdicts a judge reply may carry; the member values are the strings 'A' and 'B'.
class Preference(enum.Enum):
    A = 'A'
    B = 'B'

# Structure of the judge's reply; passed to the API as `response_schema`.
class ResponseSchema(BaseModel):
    preference: Preference  # which of the two headlines the judge preferred
    reasoning: str  # the judge's explanation for that choice

In [97]:
# Send the single `prompt` to gemini-2.5-flash, requesting a JSON reply that follows
# ResponseSchema, then print the parsed reply.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config={
        'response_mime_type': 'application/json',
        'response_schema': ResponseSchema,
    }
)
print(response.parsed)

preference=<Preference.A: 'A'> reasoning="Headline A exemplifies superior Onion-style satire due to its razor-sharp specificity and timely relevance. By targeting Sam Altman, a highly recognizable figure at the forefront of AI, and focusing on the absurd failure of 'GPT claiming dogs are crustaceans for the 60th time,' it directly skewers contemporary anxieties about AI reliability, developer frustration, and the often-ridiculous output of advanced models. This specificity enhances its comedic impact and cultural commentary. The headline implies the deep frustration and existential crisis without explicitly stating it, trusting the reader to understand the context. Headline B, while having a clever premise about absurd energy-saving initiatives, suffers from a generic target ('Local Government'). This broadness weakens its punch compared to A's precise jab. Both headlines demonstrate good economy, avoiding over-explanation, but A's linguistic precision and choice of specific, iconic fi

In [109]:
import csv
import random
import time
from tqdm.notebook import tqdm, tnrange


def compare(xs, ys, output_file, count=10, offset=0, req_per_min=10, dry_run=False):
    """Ask the judge model to pick a winner for index-aligned pairs of headlines.

    For each index ``i`` from ``offset`` up to the computed stop, the pair
    ``(xs[i], ys[i])`` is formatted with ``get_prompt`` and sent to the judge.
    The A/B position is swapped at random (probability 0.5) before prompting and
    the verdict is mapped back afterwards, so every reported preference refers
    to the caller's order: ``'A'`` means the ``xs`` headline won, ``'B'`` the
    ``ys`` headline. The swap uses the global ``random`` state, which this
    function does not seed.

    Args:
        xs: Headlines reported as side A.
        ys: Headlines reported as side B.
        output_file: CSV path. The file is overwritten and receives a header
            plus one row per judged pair (headlines in caller order, the
            un-swapped preference, the swap flag, the exact prompt sent and the
            judge's reasoning).
        count: Maximum number of pairs to judge. A falsy value (``0`` or
            ``None``) means "up to the end of the shorter list".
        offset: Index of the first pair to judge.
        req_per_min: Upper bound on judge calls per minute; must be positive.
        dry_run: When true, no API call is made and a fixed placeholder verdict
            (preference A for the headline shown first) is used instead. The
            rate-limit pause still applies.

    Returns:
        ``(wins, results)``: ``wins`` is ``[a_wins, b_wins]`` and ``results`` is
        a list of ``(x, y, preference_value, reasoning)`` tuples.

    Raises:
        ValueError: If ``req_per_min`` is not positive, or if the judge's reply
            could not be parsed into a ``ResponseSchema``.
    """
    if req_per_min <= 0:
        raise ValueError(f"req_per_min must be positive, got {req_per_min!r}")

    length = min(len(xs), len(ys))
    if count:
        length = min(length, count + offset)

    min_seconds = 60 / req_per_min
    wins = [0, 0]
    results = []
    # Headlines contain non-ASCII punctuation (curly quotes, dashes), so the encoding is
    # pinned instead of relying on the platform default.
    with open(output_file, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Headline_A', 'Headline_B', 'Preference', 'Swapped', 'Prompt', 'Reasoning'])
        last_call = None
        for idx in tnrange(offset, length):
            x = xs[idx]
            y = ys[idx]
            # Randomize which headline the judge sees first.
            swap = random.random() < 0.5
            shown_a, shown_b = (y, x) if swap else (x, y)
            prompt = get_prompt(shown_a, shown_b)

            # Keep at least `min_seconds` between the end of one call and the next.
            if last_call is not None:
                wait = last_call + min_seconds - time.monotonic()
                if wait > 0:
                    time.sleep(wait)

            if dry_run:
                output = ResponseSchema(preference=Preference.A, reasoning='XD')
            else:
                response = client.models.generate_content(
                    model="gemini-2.5-flash",
                    contents=prompt,
                    config={
                        'response_mime_type': 'application/json',
                        'response_schema': ResponseSchema,
                    }
                )
                output = response.parsed
                if output is None:
                    # Fail with context rather than an AttributeError on `None.preference`.
                    raise ValueError(
                        f"judge reply for pair {idx} could not be parsed into ResponseSchema: "
                        f"{getattr(response, 'text', None)!r}"
                    )
            last_call = time.monotonic()

            # Map the verdict back to the caller's (xs, ys) order.
            preference = output.preference
            if swap:
                preference = Preference.A if preference == Preference.B else Preference.B
            wins[1 if preference == Preference.B else 0] += 1
            results.append((x, y, preference.value, output.reasoning))
            writer.writerow([x, y, preference.value, swap, prompt, output.reasoning])
            # Push each row to disk right away so finished rows survive a killed kernel.
            csv_file.flush()

    return wins, results

In [111]:
# Base-model headlines (side A) vs golden headlines (side B): pairs starting at index 5,
# at most 10 of them, no more than 9.5 judge calls per minute.
wins, results = compare(
    base_samples['unguided'],
    golden_samples,
    'unguided_base_vs_golden_v2.1.csv',
    count=10,
    offset=5,
    req_per_min=9.5,
)

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Append the data rows of the follow-up run to the first run's CSV.
# NOTE(review): the original cell stopped mid-statement. The source file name below is
# assumed to be the v2.1 file written by the compare() call above — confirm before running.
# Not idempotent: every execution appends the rows again.
with open('unguided_base_vs_golden_v2.csv', 'ab') as fout:
    with open('unguided_base_vs_golden_v2.1.csv', 'rb') as fin:
        fin.readline()  # drop the header row; presumably the destination already has one
        fout.write(fin.read())

In [120]:
# Show the unguided base-model headlines at indices 5-14.
for x in base_samples['unguided'][5:15]:
    print(x)

"Local Government Announces Plan to Ban All Cats After One Feline Was Caught Stealing Government Emails"
"Local Government Announces Plan to Require All Citizens to Wear Mandatory 'Mood-Boosting' Sunglasses During Commute"
"Local Government Announces Plan to Ban All Pet Ownership After Discovering Cats Are Actually Tiny, Unregistered Tax Evaders"
"Local Council Approves Ban on All Forms of Public Dancing, Citing 'Safety Concerns' and 'Unnecessary Noise Pollution'"
"Local Government Announces Plan to Turn All Public Parks Into Dog Parks—Because Why Not?"
"Scientists Discover That Cats Can Now Predict Stock Market Trends With 98% Accuracy—Just Ask Whiskers, Who Made $2.3 Million in 3 Days"
"Government Announces New Law Prohibiting People from Wearing Hats in Public—Except During Tax Season"
"Local Council Passes Ordinance Banning All Forms of Public Dancing, Citing 'Unsanctioned Human Movement' as Public Health Hazard"
"Local Government Announces Plan to Require All Citizens to Wear Hats

In [123]:
# Show the first 10 unguided headlines from the fine-tuned model.
for x in fine_tune_samples['unguided'][:10]:
    print(x)

DNC Coverage: The Last Day
DNC Coverage: The Most Powerful Woman In America
Cockroaches May Be Smarter Than Previously Thought
‘The Onion’ Is Now Offering A New Subscription Service For Readers Who Want To Be In The Know About The Future Of The Internet
U.S. Announces Plans To Attack North Korea
Nation's 12-Year-Olds Announce Plans To Take Up Baseball
Study: Humans Have Been Around For 3 Million Years
U.S. Military To Deploy 500,000 Troops To Protect U.S. From Threats Of Terrorism
‘The Onion’ Has Been Asked To Be A ‘Tiger King’ Commentator
New Evidence Suggests Humans Have Been Using Fire For Cooking Since 200,000 Years Ago


In [119]:
# Show the golden headlines at indices 5-14.
for x in golden_samples[5:15]:
    print(x)

Starbase Named Best City To Start Family With Boss
Disgusted God Puts Giant Overturned Glass Atop Humanity
Jeff Bezos Mugs Amazon Warehouse Worker At Gunpoint
Biologists Observe Geese Eating Tool
Trump Rushed To Walter Reed To Watch Breast Exam
Alan Dershowitz Sues Farmers Market Vendor For Refusing To Sell Him Child
What To Know About The Tea App
New Death With Indignity Law Lets Terminally Ill Be Crushed By Falling Vending Machines
Mental Health Experts Advise Struggling Americans To Try Crying About It Like Little Baby
Australia Admits All Those Animals Made Up


In [114]:
# Fine-tuned headlines (side A) vs golden headlines (side B): pairs starting at index 15,
# at most 10 of them, no more than 9.5 judge calls per minute.
wins, results = compare(
    fine_tune_samples['unguided'],
    golden_samples,
    'unguided_finetune_vs_golden.csv',
    count=10,
    offset=15,
    req_per_min=9.5,
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [115]:
# Base-model headlines (side A) vs fine-tuned headlines (side B): pairs starting at
# index 25, at most 10 of them, no more than 9.5 judge calls per minute.
wins, results = compare(
    base_samples['unguided'],
    fine_tune_samples['unguided'],
    'unguided_base_vs_finetune.csv',
    count=10,
    offset=25,
    req_per_min=9.5,
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [116]:
# Tally from the most recent compare() call: [wins for its first list, wins for its second list].
wins

[5, 5]

In [117]:
# Second base (side A) vs fine-tuned (side B) batch: pairs starting at index 35,
# at most 10 of them, written to a separate CSV.
wins, results = compare(
    base_samples['unguided'],
    fine_tune_samples['unguided'],
    'unguided_base_vs_finetune_2.csv',
    count=10,
    offset=35,
    req_per_min=9.5,
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [118]:
# Tally from the most recent compare() call: [wins for its first list, wins for its second list].
wins

[3, 7]

In [124]:
# Display every headline sampled from the fine-tuned model, keyed by prompt name.
fine_tune_samples

{'unguided': ['DNC Coverage: The Last Day',
  'DNC Coverage: The Most Powerful Woman In America',
  'Cockroaches May Be Smarter Than Previously Thought',
  '‘The Onion’ Is Now Offering A New Subscription Service For Readers Who Want To Be In The Know About The Future Of The Internet',
  'U.S. Announces Plans To Attack North Korea',
  "Nation's 12-Year-Olds Announce Plans To Take Up Baseball",
  'Study: Humans Have Been Around For 3 Million Years',
  'U.S. Military To Deploy 500,000 Troops To Protect U.S. From Threats Of Terrorism',
  '‘The Onion’ Has Been Asked To Be A ‘Tiger King’ Commentator',
  'New Evidence Suggests Humans Have Been Using Fire For Cooking Since 200,000 Years Ago',
  'New Study Finds Majority Of Americans Have Never Heard Of ‘The Onion’',
  'The Onion’s Guide To ‘Call Of Duty: Modern Warfare II’',
  'Cleveland Browns Fan Still Waiting For Cleveland Browns To Win',
  'Nation’s Parents Announce They Will Be Having One More Baby',
  'Report: 1 In 4 Americans Now Lives 

In [130]:
# Sample the same LoRA checkpoint again with a different seed (42) and display the result.
fine_tune_samples_2 = sample(
    lora_path='qwen-lora-finetunes/qwen3_lora_finetuned_v3/checkpoint-4235/',
    sequences=50,
    seed=42,
)
fine_tune_samples_2

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


{'unguided': ['Man Can’t Believe He Has To Sit Through Entire Movie To See If It’s Good Or Not',
  '‘The Onion’ Investigates: Why Is It So Hard To Find A Good Sex Toy?',
  'Pope Francis Announces Plan To Visit Vatican With 10,000 Doves',
  '‘I’m Not Going To Let My Kids Get The Flu,’ Says Man Who Will Get Flu Anyway',
  'Census Bureau Releases New, More Accurate Data On How Many Americans Are Just Looking For A Good Time',
  "Nation's Children To Spend 2011 Learning About History",
  '‘No Way To Prevent This,’ Says Only Nation Where This Regularly Happens',
  'Study Finds Majority Of Americans Don’t Know How To Use Toilet Paper',
  'The Onion’s Guide To ‘Cyberpunk 2077’',
  'New York City Police Department Releases List Of People To Be Killed In 2013',
  'Man Who Just Bought $100,000 Worth Of Stock Can’t Believe He’s Such A Fool',
  '‘No Way To Prevent This,’ Says Only Nation Where This Regularly Happens',
  'Newly Discovered Manuscript Reveals Ancient Romans Wrote All Their Poetry In 

In [131]:
# Show the unguided seed-42 headlines at indices 26-35.
for x in fine_tune_samples_2['unguided'][26:36]:
    print(x)

Man Who Just Bought $200,000 House Still Looking For Place To Sleep
Report: 80% Of People Still Don’t Know How To Use A Toilet
Biden’s Plan To Help Americans Deal With Climate Change
I'm Going To Try Something New
CIA: 'We Have No Idea What We're Doing'
‘I’m Going To Kill You,’ Says Man With No Idea Who He’s Speaking To
Pope Francis Pardons All People Who Don’t Have A Priest
New 'Dumbbells' App Allows Users To Keep Track Of How Much Idiot They Are
Biden Pledges To Make U.S. More Like Other Countries
Man Who Doesn’t Like To Get His Hands Dirty Just Going To Do It
