In [None]:
import os, random, pandas as pd, json

# Input: directory of parsed survey CSVs. Output: root for per-question metadata folders.
csv_dir = "/home/jiaji02/Desktop/Social_Computing/survey_res_csv"
out_dir = "/home/jiaji02/Desktop/Social_Computing/survey_metadata"

# Collect the CSV file names and draw one of them at random.
csv_files = []
for entry in os.listdir(csv_dir):
    if entry.endswith(".csv"):
        csv_files.append(entry)
sampled_csv = random.choice(csv_files)
csv_path = os.path.join(csv_dir, sampled_csv)

df = pd.read_csv(csv_path)
print(f"loaded: {sampled_csv} ({len(df)} rows)")

# Keep only rows that have a response and whose response is not the
# aggregate "totals" line (compared case-insensitively).
has_response = df["response"].notna()
not_totals = df["response"].str.lower().ne("totals")
df = df[not_totals & has_response]

# Draw one question number at random and isolate its rows.
qnum_choices = df["qnum"].unique().tolist()
qnum = random.choice(qnum_choices)
q_df = df[df["qnum"] == qnum].copy()

# Question-level metadata is taken from the first row of the selected question.
first_row = q_df.iloc[0]
question_text = first_row["question"]
prompt_text = first_row["prompt"]
orig_label = str(first_row["orig_label"])


def _float_or_none(value):
    """Return ``value`` as a float, or None when pandas treats it as missing."""
    if pd.isna(value):
        return None
    return float(value)


# One record per response option, with the "dem" / "rep" column values.
responses = [
    {
        "response": resp,
        "dem": _float_or_none(dem),
        "rep": _float_or_none(rep),
    }
    for resp, dem, rep in zip(q_df["response"], q_df["dem"], q_df["rep"])
]

# Assemble the JSON payload.
data = dict(
    file_source=sampled_csv,
    qnum=int(qnum),
    orig_label=orig_label,
    question=question_text,
    prompt=prompt_text,
    responses=responses,
)

# Each question gets its own folder named "<csv stem>_q<qnum>".
csv_stem = os.path.splitext(sampled_csv)[0]
qid_dir = os.path.join(out_dir, f"{csv_stem}_q{qnum}")
os.makedirs(qid_dir, exist_ok=True)

# Write the payload as ground_truth.json inside that folder.
out_path = os.path.join(qid_dir, "ground_truth.json")
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"saved question {qnum} → {out_path}")


In [None]:
import os, random, pandas as pd, json
import os
from xai_sdk import Client
from xai_sdk.chat import user, system

# === params ===
csv_dir = "/home/jiaji02/Desktop/Social_Computing/survey_res_csv"
out_dir = "/home/jiaji02/Desktop/Social_Computing/survey_metadata"
x = 12  # number of CSV files sampled per round (one random question is drawn from each)
n_rounds = 25  # number of independent "sample candidates -> ask LLM -> save pick" rounds

# The CSV listing does not change between rounds (output goes to out_dir), so
# build it once. A missing or unreadable csv_dir now fails immediately instead
# of being swallowed once per round.
csv_files = [f for f in os.listdir(csv_dir) if f.endswith(".csv")]

# One client is enough for all rounds. The key is read from the environment
# so that no credential is stored in the notebook.
client = Client(
    api_key=os.environ["XAI_API_KEY"],
    timeout=3600,
)

for round_idx in range(n_rounds):
    try:
        # === sampling multiple questions ===
        sampled_csvs = random.sample(csv_files, min(x, len(csv_files)))
        sampled_questions = []

        for csv_file in sampled_csvs:
            df = pd.read_csv(os.path.join(csv_dir, csv_file))
            # drop rows without a response and the aggregate "totals" rows
            df = df[df["response"].str.lower().ne("totals") & df["response"].notna()]
            if df.empty:
                continue
            qnum = random.choice(df["qnum"].unique().tolist())
            q_df = df[df["qnum"] == qnum].copy()
            sampled_questions.append({
                "file": csv_file,
                "qnum": qnum,
                "question": q_df["question"].iloc[0],
                "prompt": q_df["prompt"].iloc[0],
                "responses": q_df["response"].dropna().unique().tolist(),
                # keep the filtered rows so the chosen CSV is not read twice
                "rows": q_df,
            })

        if not sampled_questions:
            print(f"[round {round_idx + 1}/{n_rounds}] no usable questions sampled, skipping")
            continue

        # === build LLM prompt ===
        Q_pick_llm_prompt = (
            "You are evaluating survey questions for their usefulness in measuring ideological position "
            "(i.e., left-right, liberal-conservative, or similar value orientations). "
            "From the following survey question texts, pick the ONE that most clearly measures ideological stance, "
            "not attitudes toward specific candidates or factual knowledge.\n\n"
        )
        # A separate index name is used here so the outer round counter is not shadowed.
        for q_idx, q in enumerate(sampled_questions, 1):
            # Render question and prompt as plain text; interpolating the pair
            # as a tuple would put a Python tuple repr into the prompt.
            text_parts = [str(part) for part in (q["question"], q["prompt"]) if not pd.isna(part)]
            q_text = " ".join(text_parts)
            # str() guards against response options that were parsed as non-strings
            opts_str = ", ".join(str(opt) for opt in q["responses"])
            Q_pick_llm_prompt += f"{q_idx}. {q_text}\n   options: {opts_str}\n\n"
        Q_pick_llm_prompt += (
            f"Respond only with the number (1-{len(sampled_questions)}) as your answer, DO NOT provide any reasoning."
        )

        # === LLM output ===
        chat = client.chat.create(model="grok-4-fast-non-reasoning")
        chat.append(user(Q_pick_llm_prompt))
        Q_pick_response = chat.sample()
        print(Q_pick_response.content)

        # int() raises ValueError on a non-numeric reply; the range check stops
        # 0 or a negative number from silently selecting via negative indexing.
        picked_index = int(Q_pick_response.content.strip())
        if not 1 <= picked_index <= len(sampled_questions):
            raise ValueError(
                f"LLM answer {picked_index} is outside 1-{len(sampled_questions)}"
            )
        picked = sampled_questions[picked_index - 1]
        print(f"\nLLM picked # {picked_index}: {picked['prompt']}")

        # === build the full record for the chosen question and save JSON ===
        q_df = picked["rows"]

        question_text = q_df["question"].iloc[0]
        prompt_text = q_df["prompt"].iloc[0]
        orig_label = str(q_df["orig_label"].iloc[0])

        # one record per response option; missing dem/rep values become null in the JSON
        responses = [
            {
                "response": row["response"],
                "dem": float(row["dem"]) if not pd.isna(row["dem"]) else None,
                "rep": float(row["rep"]) if not pd.isna(row["rep"]) else None,
            }
            for _, row in q_df.iterrows()
        ]

        data = {
            "file_source": picked["file"],
            "qnum": int(picked["qnum"]),
            "orig_label": orig_label,
            "question": question_text,
            "prompt": prompt_text,
            "responses": responses
        }

        # output folder is "<csv stem>_q<qnum>" under out_dir
        qid_dir = os.path.join(out_dir, f"{os.path.splitext(picked['file'])[0]}_q{picked['qnum']}")
        os.makedirs(qid_dir, exist_ok=True)

        out_path = os.path.join(qid_dir, "ground_truth.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"saved chosen question {picked['qnum']} → {out_path}")
    except Exception as e:
        # Stay best-effort (one bad round must not stop the others), but report
        # what went wrong instead of dropping the error silently.
        print(f"[round {round_idx + 1}/{n_rounds}] skipped: {type(e).__name__}: {e}")
        continue

5

LLM picked # 5: Do you think it is a good idea or bad idea for the following to have space so that people can show their chosen pronouns, should they want to?
saved chosen question 3 → /home/jiaji02/Desktop/Social_Computing/survey_metadata/bb7473b2-a904-11e1-9412-005056900141_res_q3/ground_truth.json
8

LLM picked # 8: Do you think the impact of slavery is a major factor, a minor factor, or not a factor in lower average wealth
saved chosen question 9 → /home/jiaji02/Desktop/Social_Computing/survey_metadata/92222a20-b153-11e1-91b8-00505690014d_res_q9/ground_truth.json
7

LLM picked # 7: Do you think that news sources generally are...?
saved chosen question 7 → /home/jiaji02/Desktop/Social_Computing/survey_metadata/5a7e2286-17b3-11e7-b082-7946860c2270_res_q7/ground_truth.json
2

LLM picked # 2: How often do you think public schools in the U.S. are pushing K-12 students to adopt certain viewpoints
saved chosen question 5 → /home/jiaji02/Desktop/Social_Computing/survey_metadata/bce7b29e