## Load/Sample Headlines

In [17]:
import json

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

from lib.inference import sample_unsloth


# Prompt used by sample_cached when the caller does not pass one.
default_prompt = "Write a satirical headline in the style of Onion News."

# Baseline sampling settings. sample_cached copies this dict and overrides
# num_return_sequences on the copy, so these values are never mutated.
default_generation_args = dict(
    num_return_sequences=50,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    min_p=0,
)
    
def get_as_message(prompt):
    """Wrap `prompt` in a one-element chat message list with role "user"."""
    user_turn = {"role": "user", "content": prompt}
    return [user_turn]
    
def sample_cached(
    cache_path,
    lora_path,
    prompt=default_prompt,
    sequences=50,
    seed=1337,
    is_cot=False,
):
    """Return sampled headlines, reusing a JSON cache file when it exists.

    If `cache_path` can be opened, its JSON content is returned as-is and no
    model is loaded. NOTE: the cache is keyed by path only -- changing
    `lora_path`, `prompt`, `sequences`, `seed` or `is_cot` does NOT invalidate
    an existing cache file; delete the file to force a re-sample.

    Args:
        cache_path: Path of the JSON cache file to read, or to create on a miss.
        lora_path: Model / adapter identifier forwarded to `sample_unsloth`.
        prompt: User prompt; wrapped into a single chat message.
        sequences: Overrides `num_return_sequences` in the generation args.
        seed: Seed forwarded to `sample_unsloth`.
        is_cot: When true, keep only the last non-blank line of each sample
            (presumably the headline that follows the reasoning -- confirm
            against the CoT fine-tune's output format).

    Returns:
        The list of samples (loaded from the cache, or freshly generated).
    """
    import os

    try:
        with open(cache_path) as f:
            return json.load(f)
    except FileNotFoundError:
        pass

    prompts = {"prompt": get_as_message(prompt)}
    # Copy so the module-level defaults are never mutated.
    generation_args = dict(default_generation_args)
    generation_args['num_return_sequences'] = sequences

    samples = sample_unsloth(lora_path, prompts=prompts, generation_args=generation_args, seed=seed)["prompt"]

    if is_cot:
        # Strip first: a sample ending in a newline would otherwise yield an
        # empty string as its "last line".
        samples = [sample.strip().split('\n')[-1].strip() for sample in samples]

    # Make sure the cache directory exists so an expensive sampling run is not
    # lost to a FileNotFoundError at write time.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    with open(cache_path, 'w') as f:
        json.dump(samples, f)

    return samples

In [13]:
# The baseline model often keeps generating after the headline, so its prompt
# explicitly asks for the headline only.
baseline = sample_cached(
    cache_path='./test-data/sample-baseline.txt',
    lora_path='unsloth/Qwen3-4B-Instruct-2507',
    prompt="Write a satirical headline in the style of Onion News. Only respond with the headline.",
)

==((====))==  Unsloth 2025.8.7: Fast Qwen3 patching. Transformers: 4.55.2.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# Samples from the checkpoint under ./finetunes/02-unsloth-finetune, using the
# default prompt.
finetune = sample_cached(
    cache_path='./test-data/sample-finetune.txt',
    lora_path='./finetunes/02-unsloth-finetune/checkpoint-100/',
)

==((====))==  Unsloth 2025.8.7: Fast Qwen3 patching. Transformers: 4.55.2.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unsloth 2025.8.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [18]:
# Samples from the CoT checkpoint; is_cot=True keeps only the final line of
# each generated sample.
cot = sample_cached(
    cache_path='./test-data/sample-cot.txt',
    lora_path='./finetunes/03-unsloth-CoT/checkpoint-60/',
    is_cot=True,
)

==((====))==  Unsloth 2025.8.7: Fast Qwen3 patching. Transformers: 4.55.2.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# Load the reference ("golden") headlines, one per line. Blank lines are
# skipped so an empty string is never paired against a generated headline,
# and the file is decoded as UTF-8 regardless of the platform locale.
with open('./test-data/onion-headlines.txt', encoding='utf-8') as f:
    golden_samples = [line.strip() for line in f if line.strip()]

## Run Evaluation

In [None]:
from google import genai

# Make sure an API key is set in your environment before running this cell:
# the client is created without explicit credentials.
# NOTE(review): the google-genai SDK documents GEMINI_API_KEY / GOOGLE_API_KEY
# as the variables it reads -- confirm for the installed SDK version.
client = genai.Client()

In [41]:
import enum

from pydantic import BaseModel

# Define the result model
# The judge's verdict: which of the two headlines it was shown it preferred.
# Member values are the literal strings 'A' and 'B'.
# NOTE(review): documented with comments rather than a class docstring because
# this type is part of the response schema handed to the API; a docstring may
# end up in the generated schema -- confirm before adding one.
class Preference(enum.Enum):
    A = 'A'
    B = 'B'

# Structured response requested from the judge model: compare() passes this
# class as 'response_schema' and reads the result back from response.parsed.
# All four fields are required (none has a default).
# NOTE(review): field order presumably matters (analyses before the verdict) --
# confirm before reordering.
class ResponseSchema(BaseModel):
    # Presumably the judge's free-text analysis of the headline shown as "A".
    analysis_A: str
    # Presumably the judge's free-text analysis of the headline shown as "B".
    analysis_B: str
    # Which headline the judge preferred, as labelled in the prompt it saw.
    preference: Preference
    # Free-text justification for the preference.
    reasoning: str

# Load the judge prompt template. get_prompt fills it via str.format with the
# `headline_a` / `headline_b` fields. Decode as UTF-8 explicitly so the prompt
# does not depend on the platform's default locale.
with open('./prompts/evaluation.txt', encoding='utf-8') as f:
    eval_prompt = f.read().strip()

def get_prompt(headline_a, headline_b):
    """Fill the evaluation prompt template with the two headlines to compare."""
    substitutions = {"headline_a": headline_a, "headline_b": headline_b}
    return eval_prompt.format(**substitutions)

In [44]:
# Sanity check: passing the raw string 'A' for `preference` produces
# Preference.A on the constructed model (see the cell output).
ResponseSchema(analysis_A='A', analysis_B='B', preference='A', reasoning='asdf')

ResponseSchema(analysis_A='A', analysis_B='B', preference=<Preference.A: 'A'>, reasoning='asdf')

In [26]:
import csv
import random
import time
from tqdm.notebook import tqdm, tnrange


def compare(xs, ys, output_file, count=10, offset=0, req_per_min=10, dry_run=False, seed=None):
    """Pairwise-judge headlines from `xs` against `ys` and log every verdict to CSV.

    For each index in `range(offset, min(len(xs), len(ys), count + offset))`
    the pair `(xs[idx], ys[idx])` is shown to the judge model. The A/B
    presentation order is swapped at random to counter position bias; the
    verdict and both analyses are mapped back before being recorded, so in the
    returned data and in the CSV "A" always refers to `xs` and "B" to `ys`.
    The free-text `Reasoning` column is stored verbatim and may use the labels
    the judge saw; use the `Swapped` and `Prompt` columns to interpret it.

    Args:
        xs: Headlines recorded as Headline_A.
        ys: Headlines recorded as Headline_B.
        output_file: CSV path; overwritten on every call.
        count: Maximum number of pairs to judge; a falsy value means no limit.
        offset: Index of the first pair to judge.
        req_per_min: Upper bound on API requests per minute; a falsy value
            disables throttling.
        dry_run: When true, skip the API (and the throttling) and use a fixed
            stub verdict of Preference.A.
        seed: Optional seed for the swap coin flips. When None, the global
            `random` module state is used, as before.

    Returns:
        `(wins, results)`: `wins` is `[wins_for_xs, wins_for_ys]` and
        `results` is a list of `(x, y, preference_value, reasoning)` tuples.

    Raises:
        ValueError: If the API response could not be parsed into a
            ResponseSchema (`response.parsed` is None).
    """
    length = min(len(xs), len(ys))
    if count:
        length = min(length, count + offset)

    # A falsy rate (0 / None) means "no throttling" rather than a ZeroDivisionError.
    min_seconds = 60 / req_per_min if req_per_min else 0
    # Use a private generator only when a seed is given; otherwise keep using
    # the global `random` state so existing callers behave as before.
    rng = random.Random(seed) if seed is not None else random

    wins = [0, 0]
    results = []
    with open(output_file, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Headline_A', 'Headline_B', 'Preference', 'Swapped', 'Prompt', 'analysis_A', 'analysis_B', 'Reasoning'])
        last_call = None
        for idx in tnrange(offset, length):
            x = xs[idx]
            y = ys[idx]
            swap = rng.random() < 0.5
            # What the judge actually sees as "A" and "B".
            shown_a, shown_b = (y, x) if swap else (x, y)
            prompt = get_prompt(shown_a, shown_b)

            if dry_run:
                # Every ResponseSchema field is required, so the stub fills all of them.
                output = ResponseSchema(
                    analysis_A='dry run',
                    analysis_B='dry run',
                    preference=Preference.A,
                    reasoning='XD',
                )
            else:
                # Throttle only real API calls.
                now = time.monotonic()
                if last_call is not None and now < last_call + min_seconds:
                    time.sleep(last_call + min_seconds - now)

                response = client.models.generate_content(
                    model="gemini-2.5-flash",
                    contents=prompt,
                    config={
                        'response_mime_type': 'application/json',
                        'response_schema': ResponseSchema,
                    }
                )
                last_call = time.monotonic()
                output = response.parsed
                if output is None:
                    raise ValueError(
                        f"Judge response for pair {idx} could not be parsed into ResponseSchema"
                    )

            preference = output.preference
            analysis_x, analysis_y = output.analysis_A, output.analysis_B
            if swap:
                # The judge saw (y, x): flip its verdict and analyses back so
                # that "A" means x and "B" means y in everything we record.
                preference = Preference.A if preference == Preference.B else Preference.B
                analysis_x, analysis_y = analysis_y, analysis_x

            wins[1 if preference == Preference.B else 0] += 1
            results.append((x, y, preference.value, output.reasoning))
            writer.writerow([x, y, preference.value, swap, prompt, analysis_x, analysis_y, output.reasoning])

    return wins, results

In [111]:
# Baseline samples vs. golden headlines, starting at pair index 5.
wins, results = compare(
    xs=baseline,
    ys=golden_samples,
    output_file='./results/unguided_base_vs_golden_v2.1.csv',
    count=10,
    offset=5,
    req_per_min=9.5,
)

  0%|          | 0/2 [00:00<?, ?it/s]

In [114]:
# Fine-tune samples vs. golden headlines, starting at pair index 15.
wins, results = compare(
    xs=finetune,
    ys=golden_samples,
    output_file='./results/unguided_finetune_vs_golden.csv',
    count=10,
    offset=15,
    req_per_min=9.5,
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [115]:
# Baseline samples vs. fine-tune samples, starting at pair index 25.
wins, results = compare(
    xs=baseline,
    ys=finetune,
    output_file='./results/unguided_base_vs_finetune.csv',
    count=10,
    offset=25,
    req_per_min=9.5,
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [117]:
# Second baseline vs. fine-tune batch, starting at pair index 35.
wins, results = compare(
    xs=baseline,
    ys=finetune,
    output_file='./results/unguided_base_vs_finetune_2.csv',
    count=10,
    offset=35,
    req_per_min=9.5,
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [39]:
# Fine-tune samples vs. CoT samples, first 20 pairs.
wins, results = compare(
    xs=finetune,
    ys=cot,
    output_file='./results/unguided_finetune_vs_cot_v2.csv',
    count=20,
    offset=0,
    req_per_min=9.5,
)

  0%|          | 0/20 [00:00<?, ?it/s]

In [32]:
# Show the first 20 fine-tune samples, one per line.
for headline in finetune[:20]:
    print(headline)

Man’s Only Goal In Life Is To Get Off This Earth
Man Who Was Fired For Taking 30 Days Off To Take Care Of His Dog Dies By Suicide
Cops Explain How They Know If Someone Is A Cop
‘The Onion’ Revises Its Editorial Policy To Only Publish Stories That Are 100% True
U.S. Announces It Will Never Again Send Weapons To Countries That Don’t Conform To U.S. Values
U.S. To Provide $100,000 To Each Citizen To Buy A Car
Study: 1 In 3 Americans Will Be In Prison By 2030
U.S. Military To Deploy 500,000 Troops To Aid In Hurricane Relief Efforts
‘I Can’t Believe You’re Still Alive After All This Time,’ Says Woman To Woman She Just Met
TikTok Bans ‘Bathroom’ From Its App
Biden’s Inaugural Speech Begins With ‘The People Have Elected Me’
Biden Announces He Will Not Run For President Again
Catholic Church Bans Use Of ‘Savior’ In Reference To Jesus Christ
Cleveland Browns Hire New Head Coach After Losing Super Bowl 77-0
Man Injured In Car Accident Finds He Can’t Remember What He Was Doing Before He Got Hit
M

In [33]:
# Show the first 20 CoT samples, one per line.
for headline in cot[:20]:
    print(headline)

Dog Owner Sues Neighbor For Losing Dog
Bush Wasn't As Bad As People Say
Art Critic Fooled By Simple Painting, Event A Major Cultural Moment
Drug-Law Enthusiast Also Advocates For Legalization Of Marijuana For Recreational Purposes
Office Job
Cher Fan Club Have Fun
Politician: I'm So Sober I Can't Get A Date, I'm So Drunk Off My Ass I Can't Remember My Own Name
Teen Charged With Minor Offense Faces 30-Year Sentence
Theologians Try To Make The Book Of Mormon Work For Women
The Future
Iraq War Ignored By Teenager Who Only Watches 'Guiding Light'
Pop Dance Music
Bush Gives North Korea National Security Briefing
Families Of 9/11 Victims
President's Best Efforts To Rebuild Economy
Celebrity Takes 'Cool' Patriotism To The Next Level
Crown Prosecution Service To Begin Prosecution Of The Man Who Stole The Crown Jewels
Man Wants To Be Seen In Public Affair, But Embarrassed By The Effort
I Want To Be A Critic
Bush Administration Addresses Serious World Problem With Minor U.S. Government Action


In [34]:
# Show the first 20 golden headlines, one per line.
for headline in golden_samples[:20]:
    print(headline)

Sam Altman Places Gun To Head After New GPT Claims Dogs Are Crustaceans For 60th Time
Study: Elephants Only Other Species Capable Of Leveraging Synergies In Brand Portfolio
All The Demands Trump Is Making Of The Smithsonian
National Park Service Begins Offering Annual Body-Dumping Pass
Poll Finds Americans Still Believe Greatest Threat To Public Health The Undertaker
Starbase Named Best City To Start Family With Boss
Disgusted God Puts Giant Overturned Glass Atop Humanity
Jeff Bezos Mugs Amazon Warehouse Worker At Gunpoint
Biologists Observe Geese Eating Tool
Trump Rushed To Walter Reed To Watch Breast Exam
Alan Dershowitz Sues Farmers Market Vendor For Refusing To Sell Him Child
What To Know About The Tea App
New Death With Indignity Law Lets Terminally Ill Be Crushed By Falling Vending Machines
Mental Health Experts Advise Struggling Americans To Try Crying About It Like Little Baby
Australia Admits All Those Animals Made Up
Dancing Boston Dynamics Robot Knows Its Revenge For This Wi