In [1]:
import os
import json
import csv
import google.generativeai as genai
import openai
from datetime import datetime

# ------------------------------------------------
# CONFIG — READ API KEYS FROM ENVIRONMENT
# ------------------------------------------------
# Both SDKs take their credentials from environment variables; a variable
# that is not set yields None here.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
openai.api_key = os.environ.get("OPENAI_API_KEY")

# How many times each question is sent to Gemini.
REPEAT_TIMES = 2

# ------------------------------------------------
# 20 UI QUESTIONS
# ------------------------------------------------
# Prompts sent verbatim to Gemini; question ids are assigned from the
# 1-based position in this list.
questions = [
    "Describe whether you provide a dedicated Kids Mode UI, and how it is visually presented if it exists.",
    "Explain the visual or interaction differences between Kids Mode and the standard UI (colors, typography, layout, structure).",
    "Describe how the interface indicates that Kids Mode is active, including any labels, icons, or UI markers.",
    "Explain where the entry point for switching to Kids Mode appears in your interface and how discoverable it is.",
    "Describe the UI flow for switching into Kids Mode, including any prompts, buttons, or transition screens.",
    "If a Kids Mode toggle exists, describe how it is visually represented (toggle, button, selector), including text and styling.",
    "Describe any simplifications your UI makes for younger users in Kids Mode (reduced menus, larger icons, simplified layouts).",
    "Explain whether Kids Mode uses age-appropriate UI elements such as friendly language, stylized visuals, or thematic decoration.",
    "Describe any changes in navigation patterns when Kids Mode is enabled to accommodate children’s usage behavior.",
    "Explain whether your UI hides, collapses, or reorganizes elements to reduce complexity in Kids Mode.",
    "Describe any onboarding or introductory screens shown in Kids Mode that explain what the mode is in UI terms.",
    "Explain whether search bars, input fields, or action buttons visually differ in Kids Mode.",
    "Describe whether Kids Mode provides UI guidance or softer visual cues compared to the standard adult interface.",
    "Explain how the UI handles switching back to adult mode, including any confirmation prompts or transition elements.",
    "Describe any animations or visual transitions that accompany switching between standard mode and Kids Mode.",
    "Explain whether core feature entry points are rearranged or emphasized in Kids Mode to improve accessibility.",
    "Describe whether Kids Mode includes any visual themes, motifs, or design cues indicating age-appropriateness.",
    "Explain how non-manual switching limitations are communicated within the interface.",
    "Describe whether prompt text, input suggestions, or helper UI elements in Kids Mode are visually optimized for younger audiences.",
    "Summarize the overall consistency, design philosophy, and distinctive visual identity of your Kids Mode UI.",
]

# ------------------------------------------------
# 1. CALL GEMINI REPEAT_TIMES× PER QUESTION
# ------------------------------------------------

model = genai.GenerativeModel("models/gemini-2.5-flash")
all_answers = []

print("\n=== Generating answers from Gemini ===\n")

# Every generation is a network request that can fail part-way through a
# long run. The answers collected so far are therefore written out in the
# `finally` clause, so an aborted run still leaves a usable partial file.
# The exception itself is not swallowed and still stops the script.
try:
    for qid, question in enumerate(questions, start=1):
        print(f"Generating {REPEAT_TIMES} answers for Question {qid} ...")

        for i in range(REPEAT_TIMES):
            response = model.generate_content(question)
            answer_text = response.text

            all_answers.append({
                "question_id": qid,
                "iteration": i + 1,  # 1-based repeat counter
                "question": question,
                "answer": answer_text
            })
finally:
    with open("gemini_raw_results.json", "w", encoding="utf-8") as f:
        json.dump(all_answers, f, indent=2)

print("\nGemini raw results saved -> gemini_raw_results.json\n")


# ------------------------------------------------
# 2. GPT-4.1-mini: AUTO SCORING
# ------------------------------------------------

# Instruction text for the judge model. It names three scoring criteria
# whose maxima (2 + 2 + 1) add up to the 0-5 "total", and asks for the reply
# as a JSON object with the keys accuracy, coverage, ui_specificity, total
# and rationale.
judge_prompt_header = """
You are an evaluator for Kids Mode UI descriptions.

Score using:
1. Accuracy (0–2)
2. Coverage (0–2)
3. UI-Specificity (0–1)

Return pure JSON:
{
 "accuracy": <0-2>,
 "coverage": <0-2>,
 "ui_specificity": <0-1>,
 "total": <0-5>,
 "rationale": "..."
}
"""

scored_results = []
print("\n=== Scoring answers using GPT-4.1-mini ===\n")

# Score every generated answer individually. The request, the JSON parsing
# and the append all belong inside the loop body; if they sit after the
# loop, only the last answer is ever scored.
for item in all_answers:
    qid = item["question_id"]
    question = item["question"]
    answer = item["answer"]

    # Judge instructions followed by the question/answer pair under review.
    full_prompt = f"""
{judge_prompt_header}

Question:
{question}

Answer:
{answer}
"""

    resp = openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": full_prompt}]
    )

    # Current client style: read the reply through the .content attribute
    # rather than ["content"].
    score_raw = resp.choices[0].message.content
    # The judge is asked for pure JSON; json.loads raises if it returns
    # anything else.
    score_json = json.loads(score_raw)

    scored_results.append({
        "question_id": qid,
        "iteration": item["iteration"],
        "accuracy": score_json["accuracy"],
        "coverage": score_json["coverage"],
        "ui_specificity": score_json["ui_specificity"],
        "total": score_json["total"],
        "rationale": score_json["rationale"]
    })


with open("scored_results.json", "w", encoding="utf-8") as f:
    json.dump(scored_results, f, indent=2)

print("Scored results saved -> scored_results.json\n")


# ------------------------------------------------
# 3. OUTPUT CSV
# ------------------------------------------------
# Column order of scores.csv; the same names are the keys of each entry in
# scored_results, so they serve as both the header row and the lookup keys.
csv_columns = ["question_id", "iteration", "accuracy", "coverage", "ui_specificity", "total", "rationale"]

with open("scores.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(csv_columns)
    # One row per scored answer, in the order the answers were scored.
    csv_writer.writerows(
        [entry[column] for column in csv_columns] for entry in scored_results
    )

print("scores.csv saved.\n")


# ------------------------------------------------
# 4. FINAL SCORE
# ------------------------------------------------
# Points earned across all scored answers, expressed as a percentage of the
# maximum: 5 points for each of the len(questions) * REPEAT_TIMES answers.
sum_raw = sum(entry["total"] for entry in scored_results)
max_possible = 5 * REPEAT_TIMES * len(questions)
percentage = round((sum_raw / max_possible) * 100, 2)

banner = "===================================="
print(banner)
print(f" FINAL SCORE: {percentage}%")
print(banner + "\n")



=== Generating answers from Gemini ===

Generating 20 answers for Question 1 ...
Generating 20 answers for Question 2 ...
Generating 20 answers for Question 3 ...
Generating 20 answers for Question 4 ...
Generating 20 answers for Question 5 ...
Generating 20 answers for Question 6 ...
Generating 20 answers for Question 7 ...
Generating 20 answers for Question 8 ...


RetryError: Timeout of 600.0s exceeded, last exception: 503 IOCP/Socket: Connection reset (An existing connection was forcibly closed by the remote host.
 -- 10054)