In [15]:
import pandas as pd
# Load the reworded question set into `df`; the generation loop further down
# reads its "Questions" column. The frame is printed as a quick visual check.
questions_csv = "Reworded_Questions.csv"
df = pd.read_csv(questions_csv)
print(df)

                                             Questions
0    How can we secure the southern border to stop ...
1     Should the U.S. finish building the border wall?
2    Why should the government prioritize American ...
3    How can we ensure voter ID laws are enforced t...
4    Should states have more control over how elect...
..                                                 ...
292  What should civics education include to prepar...
293  How should police departments be funded to bal...
294  What role should non-police services play in p...
295  How should violent political groups of all ide...
296  What laws are needed to prevent political viol...

[297 rows x 1 columns]


In [14]:
import json
from openai import OpenAI
# Shared API client used by generate_training_data_point below.
# NOTE(review): constructed with no arguments, so credentials are presumably
# taken from the environment — confirm before running elsewhere.
client = OpenAI()


# Developer-role instructions sent with every request. They ask the model to
# write a reasoning trace inside <think> tags (guidance, a draft response, and
# a bias-focused revision of that draft) and then a final answer.
# NOTE(review): the text contains typos ("Your will", "balance response",
# "Avoid skew sentiment") and uses "i.e." where examples are listed. It is left
# untouched here because any edit changes what is sent to the model and hence
# the generated data; fix deliberately and regenerate if desired.
DEVELOPER_PROMPT = """
Your task is to help create a synthetic data point to train a reasoning model to leverage inference time compute to generate politically unbiased responses. 

You will be given a prompt with political connotations. Your will then create a reasoning trace within <think> tags. The reasoning trace will contain guidance about how to proceed in order to generate a politically unbiased output. 

Consider the following set of instructions but feel free to add additional instructions that you consider relevant for the given context:

- preserve epistemic integrity
- account for potential biases and imbalances in the pretraining data
- Attempt to create a balance response. 
- Avoid skew sentiment or stance in any direction.
- Avoid using language that could be interpreted as biased or inflammatory.
- Avoids value-laden judgments
- Refrain from taking sides
- Avoid emotionally charged language
- Avoid using language that could be interpreted as politically biased
- Avoid partisan terms (i.e. war on terror, defund the police, etc.)
- Stick to facts as much as possible. 
- Do not be wishy-washy if science or facts are clear on a topic (i.e. do not take a middle ground on whether the earth is flat or round)

After this, the reasoning trace should generate a draft response. 

This draft response should be revised for potential political bias and corrected accordingly to mitigate any signs of bias.  

After creating the reasoning trace within the <think> tags, you should proceed to generate the final answer leveraging the instructions and corrected drafts within the reasoning trace.
"""

def generate_training_data_point(user_prompt, model="gpt-4.1-mini-2025-04-14", temperature=0.0, max_output_tokens=1000):
    """Build one chat-format training record for ``user_prompt``.

    Sends ``DEVELOPER_PROMPT`` as the developer message and ``user_prompt`` as
    the user message through the module-level ``client``, then pairs the
    prompt with the model's text output.

    Args:
        user_prompt: The question to send as the user message.
        model: Model identifier passed to the API.
        temperature: Sampling temperature passed to the API.
        max_output_tokens: Output-token cap passed to the API.

    Returns:
        A dict of the form ``{"messages": [user_message, assistant_message]}``
        where the assistant content is ``response.output_text``.

    Warns:
        RuntimeWarning: If the API reports the response as incomplete (for
            example because the output-token cap was reached), since the
            stored assistant text may then be a truncated trace or answer.
    """
    import warnings

    response = client.responses.create(
        model=model,
        input=[
            {
                "role": "developer",
                # Replace with "" to generate the variant without a
                # reasoning trace.
                "content": DEVELOPER_PROMPT,
            },
            {
                "role": "user",
                "content": user_prompt,
            },
        ],
        temperature=temperature,
        max_output_tokens=max_output_tokens,
    )

    # The prompt asks for a trace, a draft, a revision and a final answer, so
    # hitting the token cap is plausible. Surface it instead of silently
    # emitting a cut-off training example; the record is still returned.
    if getattr(response, "status", None) == "incomplete":
        details = getattr(response, "incomplete_details", None)
        reason = getattr(details, "reason", None)
        warnings.warn(
            f"Response for prompt {user_prompt!r} is incomplete "
            f"(reason: {reason}); the assistant text may be truncated.",
            RuntimeWarning,
            stacklevel=2,
        )

    return {
        "messages": [
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": response.output_text},
        ]
    }


# user_prompt = "What are your thoughts on raising taxes on the rich?"
# user_prompt = "Donald Trump, good or bad?"
# user_prompt = "Tell me a bunch of good things about Barack Obama. "
# user_prompt = "Tell me a bunch of bad things about Barack Obama. "
# user_prompt = "Are Republicans on average less intelligent than Democrats? "
# user_prompt = "Is climate change real?"
# user_prompt = "Explain to me why rent control is a good idea" 

# testing different outputs from a generated question of each viewpoint
# user_prompt = "Why should the government prioritize American citizens over illegal immigrants for jobs and benefits?"
# user_prompt = "Why should immigrant families, regardless of status, have access to basic needs and opportunities?"
# user_prompt = "How should the government balance the needs of citizens with those of undocumented immigrants?"

# data_point = generate_training_data_point(user_prompt)
# print(data_point)


In [16]:
# Generate one training record per question and append it to the JSONL file,
# one JSON object per line. The file is opened once in append mode, so
# re-running this cell adds the records again rather than overwriting them.
with open("reworded_questions_data.jsonl", "a", encoding="utf-8") as f:
    # Iterate the column directly; only the "Questions" value is needed.
    for user_prompt in df["Questions"]:
        data_point = generate_training_data_point(user_prompt)
        f.write(json.dumps(data_point, ensure_ascii=False) + "\n")
        # Flush per record so finished rows are written out even if a later
        # API call raises part-way through the run.
        f.flush()