In [53]:
import csv
import os
import time

import jinja2
from pydantic import BaseModel
from tqdm.notebook import tqdm, tnrange

from datasets import load_dataset
from google import genai

from lib.preprocess import get_headline


# --- Configuration ---

# Schema for one headline analysis. It is passed to the API as the response
# schema (list[CoT]) and is also the element type of `example_output`.
# Documented with comments rather than a docstring so the schema derived from
# the model is not altered.
class CoT(BaseModel):
    # What the headline is making fun of (see the examples in example_output).
    target: str
    # The satirical device/angle the headline uses to do so.
    satirical_angle: str


# Canned results used by main() when dry_run is true, so the pipeline can be
# exercised without calling the API. This list must come after the CoT class:
# building it calls CoT(...), which raises NameError on a fresh kernel if the
# class has not been defined yet.
example_output = [
    CoT(target="The entertainment industry's problematic relationship with body image, the commercialization of mental health issues, and actors' desperation for roles.", satirical_angle="Irony is used by portraying an actress's 'excitement' to profit from a serious health issue, highlighting a disturbing disconnect."),
    CoT(target='The perceived lack of government funding and support for public education and youth programs, often viewing them through a detached financial lens.', satirical_angle="Cynicism and understatement are employed by treating vital social programs as a 'low-yield investment' that can be easily discarded."),
    CoT(target='The cynical and often self-serving motivations behind U.S. foreign policy and military interventions, specifically the perceived pursuit of resource interests.', satirical_angle="Absurd irony highlights the government's transparent ulterior motives by suggesting an invasion of a 'non-oil-rich' nation solely to deflect criticism."),
    CoT(target='Individuals who are genuinely arrogant and presumptuous, yet articulate it in a way that attempts to be self-aware but ultimately reinforces the flaw.', satirical_angle='Self-deprecating irony is used where the speaker acknowledges their flaw in a manner that confirms the very arrogance they are describing.'),
    CoT(target='Nostalgia for innocent childhood literary characters, the contrast between idealized fiction and grim reality, and the decline of traditional knowledge sources.', satirical_angle="Dark humor and juxtaposition are employed by giving a beloved, intelligent children's character a tragically mundane and undignified end."),
    CoT(target='Corporate influence and potential corruption within the judiciary, and the perception that powerful entities can sway legal decisions through trivial means.', satirical_angle='Absurdist exaggeration directly links a brand mention in a high court ruling to a petty corporate kickback, implying a transactional relationship.'),
    CoT(target='Human pettiness, resistance to minor administrative changes, and the tendency to personalize impersonal decisions.', satirical_angle='Understatement and exaggeration of minor issues highlight the absurd degree to which people can react emotionally to trivial changes.'),
    CoT(target="The pharmaceutical industry's profit motives, the potential for medical quackery, and the public's desire for quick fixes.", satirical_angle='Absurd irony is used by legalizing the sale of something explicitly designed to have no medicinal effect, exposing the commercialization of healthcare.'),
    CoT(target='The often-forced communal joy of childhood activities, the varying definitions of humor, and the awkwardness of trying to recreate past shared experiences.', satirical_angle='Anti-climax and relatability are employed to show that cherished childhood memories or inside jokes often do not translate or resonate with others later in life.'),
    CoT(target="Racial stereotypes, the exoticization of non-Western cultures, and the superficial judgment of perceived 'otherness'.", satirical_angle='Microaggressive exoticism implies that an accent can negate the seriousness or authenticity of a conflict, revealing a condescending viewpoint.'),
    CoT(target='Government overreach, lack of transparency, and the erosion of civil liberties under national security legislation.', satirical_angle='Paradoxical absurdity highlights that a law designed to protect the nation is so controlling that it forbids its own scrutiny, exposing its anti-democratic nature.'),
    CoT(target='U.S. foreign policy towards Cuba, particularly the long-standing and often criticized travel embargo.', satirical_angle="Understatement and sarcasm are used by presenting a highly contentious political issue as a simple, almost trivial fact, highlighting the policy's stubbornness."),
    CoT(target='Toxic masculinity, the suppression of male emotion, and the societal expectation that men stoically endure hardship without complaint.', satirical_angle="Understatement and common trope are used to satirize the cliché of the weary man whose 'bad day' becomes a badge of honor, hinting at emotional toll."),
    CoT(target='Complex family dynamics, particularly mother-daughter relationships, and the commonality of using a third party as a bonding topic.', satirical_angle="Relatable realism and anti-climax portray the truth that deeply emotional conversations can quickly shift to more mundane or gossipy topics, often involving a shared 'adversary'."),
    CoT(target="Religious belief, the concept of a 'normal' lifespan versus historical and mythical figures, and subtle commentary on the perceived value of life.", satirical_angle='Blasphemous absurdity and dark humor are used in a casual, almost boastful statement about surpassing the lifespan of a central religious figure, challenging deeply held beliefs.'),
    CoT(target='The persistent issue of urban violence in major cities, the inadequacy of political responses, and the perverse pride some officials might take in negative statistics.', satirical_angle="Ironic detachment and morbid humor are used by portraying a mayor 'bragging' about a city's status as a 'murder capital', highlighting a severe disconnect from reality.")
]

def load_prompt():
    """Read the CoT-extraction prompt file and compile it into a Jinja2 template.

    Returns:
        The template object produced by ``from_string`` on the contents of
        './prompts/CoT-extraction.txt'.

    Raises:
        OSError: if the prompt file cannot be opened.
    """
    # Decode explicitly as UTF-8 so the prompt reads the same on every
    # platform instead of depending on the locale's default encoding.
    with open('./prompts/CoT-extraction.txt', encoding='utf-8') as f:
        prompt_string = f.read()

    # No `env` is defined in this cell, so on a fresh kernel the bare
    # reference would raise NameError. Use a notebook-level `env` when one
    # exists (keeping any custom configuration it carries) and otherwise fall
    # back to a default Jinja2 environment.
    try:
        environment = env
    except NameError:
        environment = jinja2.Environment()
    return environment.from_string(prompt_string)
# Module-level template, built as soon as this cell runs. main() does not read
# this variable: it calls load_prompt() again and uses its own local copy.
prompt_template = load_prompt()

def load_headlines(count=20):
    """Return headlines extracted from the last ``count`` rows of the train split.

    Args:
        count: Number of trailing rows to take. Values larger than the split
            return every row; zero or negative values return an empty list.

    Returns:
        A list with ``get_headline(row['text'])`` for each selected row, in
        dataset order.
    """
    ds = load_dataset("Biddls/Onion_News")
    train = ds['train']
    total = len(train)

    # The row count must come from the split itself. `ds` is indexed by split
    # name, so len(ds) is the number of splits, not rows; using it produced a
    # range running from a negative index up to 0, i.e. the last count-1 rows
    # followed by the very first row.
    start = max(0, total - count)
    return [get_headline(train[i]['text']) for i in range(start, total)]

def main(
    model='gemini-2.5-flash',
    headline_count=240,
    batch_size=16,
    output_csv='training/synthetic_CoT.csv',
    dry_run=True,
    req_per_min=9.5,
):
    """Generate target / satirical-angle annotations for headlines and save them as CSV.

    Headlines are sent to the model in batches; each batch is expected to come
    back as a list of ``CoT`` objects with one entry per headline. Batches that
    fail or return the wrong number of entries are skipped, not retried.

    Args:
        model: Name of the model passed to ``generate_content``.
        headline_count: Number of headlines requested from ``load_headlines``.
        batch_size: Number of headlines rendered into a single prompt.
        output_csv: Path of the CSV file to write (overwritten if it exists).
            Its parent directory is created when missing.
        dry_run: When true, no API client is created and no request is made;
            every batch receives ``example_output`` instead. That list has a
            fixed length (16), so batches of a different size are skipped by
            the length check below.
        req_per_min: Upper bound on requests per minute. A value of zero,
            ``None`` or a negative number disables throttling.

    Returns:
        None. Rows are written to ``output_csv`` with the header
        ``headline, target, satirical_angle``.
    """
    # 1. Load prompts and data
    prompt_template = load_prompt()
    headlines = load_headlines(count=headline_count)

    # 2. Initialize the generative AI client.
    # A dry run never talks to the API, so it should not need a client (or the
    # credentials that constructing one may require).
    client = None if dry_run else genai.Client()

    # 3. Create batches of headlines
    headline_batches = [
        headlines[i:i + batch_size] for i in range(0, len(headlines), batch_size)
    ]

    # Minimum spacing between consecutive requests; 0 means "do not throttle".
    min_seconds = 60 / req_per_min if req_per_min and req_per_min > 0 else 0.0
    last_call = None
    skipped_batches = 0

    # Make sure the destination directory exists before opening the file.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # 4. Process batches and write to CSV
    print(f"Processing {len(headlines)} headlines in {len(headline_batches)} batches...")
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['headline', 'target', 'satirical_angle'])

        for batch in tqdm(headline_batches, desc="Generating CoT Data"):
            try:
                prompt = prompt_template.render(headlines='\n'.join(batch))

                # Sleep just long enough to keep at least min_seconds between
                # the end of the previous call and the start of this one.
                now = time.monotonic()
                if last_call is not None and now < last_call + min_seconds:
                    time.sleep(last_call + min_seconds - now)

                if dry_run:
                    output = example_output
                else:
                    response = client.models.generate_content(
                        # Use the caller-supplied model name; the previous
                        # reference to an undefined MODEL_NAME made every live
                        # batch fail with NameError.
                        model=model,
                        contents=prompt,
                        config={
                            "response_mime_type": "application/json",
                            "response_schema": list[CoT],
                            "thinking_config": {"thinking_budget": 0},
                        }
                    )
                    output = response.parsed

                last_call = time.monotonic()

                # No parsed payload: report it explicitly instead of letting
                # len(None) surface as an opaque TypeError.
                if output is None:
                    print("Warning: Response could not be parsed into the expected schema. Skipping batch.")
                    skipped_batches += 1
                    continue

                # Ensure the number of results matches the number of headlines sent
                if len(output) == len(batch):
                    for headline, cot_data in zip(batch, output):
                        writer.writerow([headline, cot_data.target, cot_data.satirical_angle])
                else:
                    print(f"Warning: Mismatch in batch size. Input: {len(batch)}, Output: {len(output)}. Skipping batch.")
                    skipped_batches += 1

            except Exception as e:
                # Deliberately best-effort: one bad batch must not abort the run.
                print(f"\nAn error occurred while processing a batch: {e}")
                print("Skipping this batch and continuing...")
                skipped_batches += 1

    print(f"\nProcessing complete. Dataset saved to '{output_csv}'")
    if skipped_batches:
        # Surface silent data loss so an almost-empty CSV is not mistaken for success.
        print(f"Warning: {skipped_batches} of {len(headline_batches)} batches were skipped.")

In [54]:
# Live run: with dry_run=False, main() sends each batch to the API instead of
# substituting the canned example_output rows. All other arguments keep their
# defaults.
main(dry_run=False)

Processing 240 headlines in 15 batches...


Generating CoT Data:   0%|          | 0/15 [00:00<?, ?it/s]


Processing complete. Dataset saved to 'synthetic_CoT.csv'
