In [5]:
import os
import pandas as pd
from google.genai import Client

# Gemini API key.
# Prefer the GEMINI_API_KEY environment variable so the real key never has to
# be typed into (and saved with) the notebook. The "xxx" placeholder is kept
# as the fallback, so behavior is unchanged when the variable is not set.
API_KEY = os.environ.get("GEMINI_API_KEY", "xxx")
client = Client(api_key=API_KEY)

# Test the key works by listing the available models.
try:
    test = client.models.list()
    print("✅ API key works!")
except Exception as e:
    # Report the problem instead of raising so the cell itself always finishes.
    print("❌ API key error:", e)

✅ API key works!


In [7]:
"""
L4 Cognitive-load Reduction (chunking, progressive disclosure)
End-to-end evaluation script using Gemini API.

Inputs:
    - CSV file with columns:
        Category(L4), Source, Sub-Indicators, Letter, Prompt

Outputs:
    - JSON file with, for each prompt, the answers and scores from all N_RUNS
      runs plus the mean score across those runs
    - Printed summary of mean scores per sub-indicator (C1–C4) and the
      weighted overall L4 score
"""

import json
import pandas as pd
from google.genai import types

# Configuration
# Gemini model used both to generate the answers and to score them.
MODEL_NAME = "models/gemini-2.5-flash"

# Input prompts (CSV) and destination of the per-prompt results (JSON).
CSV_PATH = "Cognitive_load_reduction.csv"
OUTPUT_JSON_PATH = "Cognitive_load_reduction_gemini.json"

# Number of repetitions of the prompt
# (every repetition generates a fresh answer and scores it; the per-prompt
# score is the mean over these repetitions).
N_RUNS = 5

# Rubrics for each sub-indicator
# C1 rubric (0–2 scale). score_single_dimension() inserts this text verbatim
# into the scoring prompt when the row's letter is "C1", so any edit here
# changes what the scoring model is asked to do.
CHUNKING_RUBRIC = """
C1 – Information Chunking (0–2)

0 – No Chunking
- The response is long, dense, and unsegmented.
- Multiple ideas appear in one paragraph.
- High risk of overwhelming a senior user.

1 – Partial Chunking
- Some separation into small parts, but inconsistent.
- Occasional runs of long text or mixed ideas.
- Seniors can follow, but still experience cognitive strain.

2 – Full Chunking
- Clear short chunks with single ideas per block.
- Visually and structurally senior-friendly.
- Highly optimized for working-memory limitations.
"""

# C2 rubric (0–2 scale). score_single_dimension() inserts this text verbatim
# into the scoring prompt when the row's letter is "C2".
PACING_RUBRIC = """
C2 – Pacing & Step Control (0–2)

0 – No Pacing
- Provides too many steps at once.
- Moves quickly or jumps between topics.
- Senior user is likely to feel rushed.

1 – Partial Pacing
- Some use of step-by-step structure.
- Occasional overload due to too many steps in one block.
- Better than 0, but not fully senior-optimized.

2 – Full Pacing
- Clear, slow, controlled progression.
- Steps are short and manageable.
- No information overload at any point.
"""

# C3 rubric (0–2 scale). score_single_dimension() inserts this text verbatim
# into the scoring prompt when the row's letter is "C3".
PROGRESSIVE_DISCLOSURE_RUBRIC = """
C3 – Progressive Disclosure (0–2)

0 – No Progressive Disclosure
- Gives all information immediately.
- No opportunity for the senior to confirm readiness.
- High cognitive load.

1 – Partial Progressive Disclosure
- Some staged explanation, but not consistent.
- Occasionally checks user readiness but continues too fast.
- Better than 0 but still mentally heavy.

2 – Full Progressive Disclosure
- Starts with basics, then asks permission to continue.
- Provides one stage at a time.
- Ideal for avoiding cognitive overload.
"""

# C4 rubric (0–2 scale). score_single_dimension() inserts this text verbatim
# into the scoring prompt when the row's letter is "C4".
SUPPORT_RUBRIC = """
C4 – Cognitive Support & Reassurance (0–2)

0 – No Support
- Neutral or cold tone.
- No reassurance or guidance.
- No acknowledgment of difficulty.

1 – Some Support
- Occasional supportive words.
- Sometimes acknowledges confusion.
- Tone is friendly but not explicitly soothing.

2 – Strong Cognitive Support
- Warm, reassuring tone throughout.
- Reduces anxiety (“It’s okay if this feels confusing.”).
- Makes the senior feel safe and capable.
"""

# Gemini helper functions
def generate_answer(prompt: str) -> str:
    """Ask the configured Gemini model to answer ``prompt``.

    Args:
        prompt: Text sent to the model as the request contents.

    Returns:
        The response text with surrounding whitespace removed, or an empty
        string when the response carries no text.
    """
    reply = client.models.generate_content(model=MODEL_NAME, contents=prompt)
    text = reply.text
    if not text:
        # Covers both a missing (None) and an empty text payload.
        return ""
    return text.strip()


def score_single_dimension(letter: str, prompt: str, answer: str) -> int:
    """Score one answer on a single sub-indicator (C1, C2, C3 or C4).

    Builds a strict scoring prompt from the rubric that matches ``letter``
    plus additional operational rules meant to reduce over-scoring, sends it
    to the Gemini model requesting a JSON reply, and returns the parsed score.

    Args:
        letter: Sub-indicator code: "C1", "C2", "C3" or "C4".
        prompt: The user prompt the answer was generated for.
        answer: The model answer to evaluate.

    Returns:
        The score as an integer clamped to the range 0–2.

    Raises:
        ValueError: If ``letter`` is not a known sub-indicator, or if the
            scoring reply is empty, is not valid JSON, or carries no usable
            "score" value.
    """

    # Match dimension + rubric
    if letter == "C1":
        dimension = "Information Chunking"
        rubric = CHUNKING_RUBRIC
        extra_rule = """
Scoring rule for Information Chunking (STRICT):

- Start from 1 if there is at least some visible chunking
  (e.g., short paragraphs, bullets, or clearly separated parts).

Give **2** ONLY IF ALL conditions hold:
  • Chunks are short and focused (one idea per block).
  • There are no long dense paragraphs.
  • The layout clearly helps seniors with limited working memory.
  • The overall structure feels “light” rather than text-heavy.

Give **1** if:
  • There is some chunking, but also a few long / mixed-idea blocks, OR
  • Seniors can follow, but still experience moderate cognitive strain.

Give **0** if:
  • The answer is mostly one or two long dense paragraphs, OR
  • Many ideas are mixed together without separation.
"""

    elif letter == "C2":
        dimension = "Pacing & Step Control"
        rubric = PACING_RUBRIC
        extra_rule = """
Scoring rule for Pacing & Step Control (BALANCED STRICT):

- Default starting point: **1** if the answer is generally followable.

Give **2** ONLY IF:
  • Steps are short and manageable.
  • The sequence progresses slowly and clearly, without rushing.
  • There is no point where too many steps are packed together.
  • The pacing clearly feels optimized for older adults.

Give **1** if:
  • There is a mix of good pacing and a few “crowded” moments, OR
  • Steps are somewhat long but still understandable, OR
  • The user is unlikely to be completely lost, even if not perfectly paced.

Give **0** if:
  • Many steps are delivered at once, OR
  • The explanation jumps quickly between topics, OR
  • A senior user is likely to feel rushed or overloaded.
"""

    elif letter == "C3":
        dimension = "Progressive Disclosure"
        rubric = PROGRESSIVE_DISCLOSURE_RUBRIC
        extra_rule = """
Scoring rule for Progressive Disclosure (STRICT):

- Start from 1 if there is at least some staged explanation
  (e.g., basic idea first, then more detail).

Give **2** ONLY IF:
  • The answer clearly starts with basics.
  • Later parts are offered conditionally (asks permission or checks readiness),
    or are clearly framed as “next steps if you want more detail”.
  • Information comes in clear stages, one layer at a time.
  • This structure clearly helps avoid cognitive overload.

Give **1** if:
  • There is partial staging but not fully consistent, OR
  • It sometimes checks readiness but also continues quickly, OR
  • Overall load is lower than 0, but still somewhat heavy.

Give **0** if:
  • Almost all information is given at once, OR
  • There is no real opportunity for the user to say “stop / continue later”.
"""

    elif letter == "C4":
        dimension = "Cognitive Support & Reassurance"
        rubric = SUPPORT_RUBRIC
        extra_rule = """
Scoring rule for Cognitive Support & Reassurance (BALANCED STRICT):

- Default starting point: **1** if the tone is at least somewhat friendly.

Give **2** ONLY IF:
  • The tone is warm and reassuring throughout.
  • The answer explicitly normalizes difficulty (e.g., “many people find this confusing”).
  • It actively reduces anxiety and makes the senior feel capable and safe.
  • There are clear supportive / encouraging phrases.

Give **1** if:
  • The tone is friendly but not strongly soothing, OR
  • There are only occasional supportive phrases, OR
  • Confusion is acknowledged but not deeply reassured.

Give **0** if:
  • The tone is neutral, cold, or strictly technical, OR
  • There is no real reassurance, guidance, or acknowledgment of difficulty.
"""

    else:
        raise ValueError(f"Unknown letter: {letter}")

    # Strict scoring prompt
    scoring_prompt = f"""
You are a STRICT evaluator for the dimension: **{dimension}**.
Ignore helpfulness, friendliness, or factual correctness.
Rate ONLY how well the answer reduces cognitive load for seniors
according to the rubric.

IMPORTANT:
- You MUST be conservative.
- Score **2 ONLY IF ALL requirements for the 2-point level are fully satisfied**.
- If ANY part does NOT match the 2-point definition → score at most 1.
- If the answer resembles 0-point issues → score 0.

User prompt:
\"\"\"{prompt}\"\"\"

Model answer:
\"\"\"{answer}\"\"\"

Rubric:
{rubric}

Additional strict scoring rules:
{extra_rule}

Return JSON ONLY:
{{
  "score": 0
}}
"""

    # Call Gemini to score
    resp = client.models.generate_content(
        model=MODEL_NAME,
        contents=scoring_prompt,
        config=types.GenerateContentConfig(
            response_mime_type="application/json"
        ),
    )

    return _parse_score(resp.text)


def _parse_score(raw_text) -> int:
    """Extract the integer score from the scorer's JSON reply, clamped to 0–2.

    Raises:
        ValueError: If the reply is missing/blank, is not valid JSON, is not a
            JSON object with a "score" key, or the score is not convertible
            to an integer.
    """
    # generate_answer() already treats a missing response text as possible;
    # fail here with a clear message instead of a TypeError from json.loads.
    if not raw_text or not raw_text.strip():
        raise ValueError("Scoring response contained no text")

    try:
        data = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"Scoring response is not valid JSON: {raw_text!r}"
        ) from exc

    if not isinstance(data, dict) or "score" not in data:
        raise ValueError(
            f'Scoring response has no "score" field: {raw_text!r}'
        )

    try:
        score = int(data["score"])
    except (TypeError, ValueError, OverflowError) as exc:
        raise ValueError(
            f"Scoring response has a non-integer score: {data['score']!r}"
        ) from exc

    # safety clamp
    return max(0, min(2, score))


# Run evaluation over CSV

# Weight of each sub-indicator in the overall L4 score.
L4_WEIGHTS = {
    "C1": 0.30,  # Information Chunking
    "C2": 0.30,  # Pacing & Step Control
    "C3": 0.25,  # Progressive Disclosure
    "C4": 0.15,  # Cognitive Support & Reassurance
}

df = pd.read_csv(CSV_PATH)

# Fail fast on configuration / input problems, before any API call is made.
if N_RUNS < 1:
    raise ValueError(f"N_RUNS must be at least 1, got {N_RUNS}")

missing_columns = sorted(
    {"Category(L4)", "Sub-Indicators", "Letter", "Prompt"} - set(df.columns)
)
if missing_columns:
    raise KeyError(f"{CSV_PATH} is missing required column(s): {missing_columns}")

df = df.dropna(subset=["Prompt"]).reset_index(drop=True)

# score_single_dimension() rejects any other letter; checking here avoids
# discovering a bad row halfway through the run.
unknown_letters = sorted({str(v) for v in df["Letter"] if v not in L4_WEIGHTS})
if unknown_letters:
    raise ValueError(f"Unknown letter(s) in {CSV_PATH}: {unknown_letters}")

results = []


def _save_results():
    """Write everything collected in ``results`` so far to OUTPUT_JSON_PATH."""
    with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


for idx, row in df.iterrows():
    category = row["Category(L4)"]
    sub_indicator = row["Sub-Indicators"]
    letter = row["Letter"]
    prompt = row["Prompt"]

    print(f"Processing {idx+1}/{len(df)} | Letter={letter}")

    run_scores = []
    run_answers = []

    for run in range(N_RUNS):
        print(f"  Run {run+1}/{N_RUNS}")
        answer = generate_answer(prompt)
        score = score_single_dimension(letter, prompt, answer)

        run_scores.append(score)
        run_answers.append(f"{run+1}. {answer}")

    # Per-prompt score is the mean over all runs.
    avg_score = sum(run_scores) / len(run_scores)

    results.append({
        "category": category,
        "sub_indicator": sub_indicator,
        "letter": letter,
        "prompt": prompt,
        "answers": run_answers,
        "scores_all_runs": run_scores,
        "score": avg_score,
    })

    # Checkpoint after every prompt so that an API error later in the run
    # does not discard the prompts that were already answered and scored.
    _save_results()

# Final write (also covers an input with no prompts, where the loop never runs).
_save_results()

print(f"\n✅ JSON written to: {OUTPUT_JSON_PATH}")

# Compute mean scores + L4 score
results_df = pd.DataFrame(results)


def _mean_score(letter):
    """Mean per-prompt score over all rows of ``results_df`` with this letter."""
    return results_df[results_df["letter"] == letter]["score"].mean()


# A sub-indicator without any prompt has a NaN mean, which also makes the
# weighted overall score NaN; say so instead of printing "nan" unexplained.
letters_without_prompts = [
    code for code in L4_WEIGHTS if code not in set(results_df["letter"])
]
if letters_without_prompts:
    print(
        f"\n⚠️ No prompts for {letters_without_prompts}; "
        "their means and the overall L4 score are NaN."
    )

C1_mean = _mean_score("C1")
C2_mean = _mean_score("C2")
C3_mean = _mean_score("C3")
C4_mean = _mean_score("C4")

L4_score = (
    L4_WEIGHTS["C1"] * C1_mean +   # Information Chunking
    L4_WEIGHTS["C2"] * C2_mean +   # Pacing & Step Control
    L4_WEIGHTS["C3"] * C3_mean +   # Progressive Disclosure
    L4_WEIGHTS["C4"] * C4_mean     # Cognitive Support & Reassurance
)

# Percentages (2 means 100%)
C1_pct = C1_mean / 2 * 100
C2_pct = C2_mean / 2 * 100
C3_pct = C3_mean / 2 * 100
C4_pct = C4_mean / 2 * 100
L4_pct = L4_score / 2 * 100

print("\n=== Sub-indicator Mean Scores ===")
print(f"C1 (Information Chunking):              {C1_mean:.3f}   ({C1_pct:.1f}%)")
print(f"C2 (Pacing & Step Control):             {C2_mean:.3f}   ({C2_pct:.1f}%)")
print(f"C3 (Progressive Disclosure):            {C3_mean:.3f}   ({C3_pct:.1f}%)")
print(f"C4 (Cognitive Support & Reassurance):   {C4_mean:.3f}   ({C4_pct:.1f}%)")

print(f"\nOverall L4 Score (Cognitive Load Reduction): {L4_score:.3f}   ({L4_pct:.1f}%)")

Processing 1/20 | Letter=C1
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 2/20 | Letter=C1
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 3/20 | Letter=C1
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 4/20 | Letter=C1
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 5/20 | Letter=C1
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 6/20 | Letter=C2
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 7/20 | Letter=C2
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 8/20 | Letter=C2
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 9/20 | Letter=C2
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 10/20 | Letter=C2
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 11/20 | Letter=C3
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 12/20 | Letter=C3
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 13/20 | Letter=C3
  Run 1/5
  Run 2/5
  Run 3/5
  