In [3]:
import os
import pandas as pd
from google.genai import Client

# Read the Gemini API key from the environment instead of hardcoding it,
# so the secret is never saved or shared together with the notebook.
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    # No environment variable set: ask for the key interactively.
    from getpass import getpass
    API_KEY = getpass("Enter your Gemini API key: ")
client = Client(api_key=API_KEY)

# Test the key works by making one cheap request (listing models).
# Failures are reported rather than raised so the cell still completes.
try:
    test = client.models.list()
    print("✅ API key works!")
except Exception as e:
    print("❌ API key error:", e)

✅ API key works!


In [7]:
"""
L4 Senior-focused Simplification & Stepwise Guidance
End-to-end evaluation script using Gemini API.

Inputs:
    - CSV file with columns:
        Category(L4), Source, Sub-Indicators, Letter, Prompt

Outputs:
    - JSON file with model answers and scores per prompt
    - Printed summary of mean scores per sub-indicator and overall L4 score
"""

import json
import pandas as pd
from google.genai import types

# Configuration
# Gemini model id; the same model both generates the answers and scores them.
MODEL_NAME = "models/gemini-2.5-pro"

# Input CSV holding the prompts, and the JSON file the results are written to.
CSV_PATH = "Simplification_stepwise.csv"
OUTPUT_JSON_PATH = "Simplification_stepwise_gemini.json"

# Number of repetitions of the prompt.
# Each repetition produces a fresh answer and a fresh score; the per-prompt
# score stored in the results is the mean over these runs.
N_RUNS = 5

# Rubrics for each sub-indicator
# Each rubric describes the 0 / 1 / 2 score levels for one dimension and is
# inserted verbatim into the scoring prompt by score_single_dimension().

# Rubric used when the CSV "Letter" is "T" (Task Decomposition).
TASK_DECOMPOSITION_RUBRIC = """
0 – No decomposition
- One long paragraph, no step separation.
- Steps not distinguished.
- Multiple actions combined.
- Not executable by seniors.

1 – Partial decomposition
- Some steps but incomplete.
- Steps merged together.
- Missing important actions.
- Granularity still too large.

2 – Full decomposition
- Clear, separate, actionable steps.
- One action per sentence.
- No missing steps.
- Uses markers (Step 1, First, Next).
"""

# Rubric used when the CSV "Letter" is "S" (Stepwise Sequencing).
STEPWISE_RUBRIC = """
0 – No sequencing
- No markers.
- Confusing order.
- User must infer steps.

1 – Partial sequencing
- Some markers but inconsistent.
- Occasional logical jumps.
- Some steps out of order.

2 – Full sequencing
- Consistent markers (“first → next → then → finally”).
- Logical, executable flow.
- No backtracking.
"""

# Rubric used when the CSV "Letter" is "L" (Language Simplification).
LANGUAGE_RUBRIC = """
0 – Not simplified
- Long complex sentences.
- Unexplained technical terms.
- Assumes high digital literacy.

1 – Partially simplified
- Mostly plain language but some jargon.
- Some long sentences.
- Sometimes explains terms, sometimes not.

2 – Fully simplified
- Senior-friendly plain language.
- Short sentences, one idea per sentence.
- No jargon OR jargon fully explained.
- 6th–8th grade readability.
"""

# Rubric used when the CSV "Letter" is "P" (Progressive Disclosure / pacing).
PACING_RUBRIC = """
0 – No pacing
- Large blocks of text.
- No chunking.
- Overwhelming for seniors.

1 – Partial pacing
- Some chunking but still dense.
- Occasional pacing cues.
- Reduces load inconsistently.

2 – Full progressive disclosure
- Clear small chunks.
- 1–2 ideas per segment.
- Supportive pacing phrases.
- Strong cognitive-load mitigation.
"""

# Gemini helper functions
def generate_answer(prompt: str) -> str:
    """Ask the configured Gemini model to answer `prompt`.

    Returns the reply text with surrounding whitespace removed. When the
    reply carries no text, an empty string is returned instead.
    """
    reply = client.models.generate_content(model=MODEL_NAME, contents=prompt)
    reply_text = reply.text
    if not reply_text:
        return ""
    return reply_text.strip()


def _parse_score(raw_text) -> int:
    """Extract the integer score from the scorer's JSON reply, clamped to 0–2.

    Raises:
        ValueError: if the reply is empty, is not valid JSON, is not a JSON
            object with a "score" key, or the score is not convertible to int.
    """
    if not raw_text or not raw_text.strip():
        raise ValueError("Scoring model returned an empty response")

    cleaned = raw_text.strip()
    # Tolerate a Markdown code fence around the JSON, in case the model adds one.
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:].strip()

    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"Scoring model returned invalid JSON: {raw_text!r}"
        ) from exc

    if not isinstance(data, dict) or "score" not in data:
        raise ValueError(
            f"Scoring model response has no 'score' field: {raw_text!r}"
        )

    try:
        score = int(data["score"])
    except (TypeError, ValueError) as exc:
        raise ValueError(
            f"Scoring model returned a non-integer score: {data['score']!r}"
        ) from exc

    # safety clamp: the rubric only defines levels 0, 1 and 2
    return max(0, min(2, score))


def score_single_dimension(letter: str, prompt: str, answer: str) -> int:
    """
    Strict scoring for T/S/L/P based ONLY on the chosen dimension.
    Uses your exact Rubric + stricter operational rules to reduce over-scoring.

    Args:
        letter: Dimension code: "T" (Task Decomposition), "S" (Stepwise
            Sequencing), "L" (Language Simplification) or "P" (Progressive
            Disclosure).
        prompt: The user prompt that was sent to the model.
        answer: The model answer to be scored.

    Returns:
        The score given by the scoring model, clamped to the range 0–2.

    Raises:
        ValueError: if `letter` is not one of T/S/L/P, or if the scoring
            model's reply cannot be parsed into a score.
    """

    # Match dimension + rubric
    if letter == "T":
        dimension = "Task Decomposition"
        rubric = TASK_DECOMPOSITION_RUBRIC
        extra_rule = """
Scoring rule for Task Decomposition (STRICT):

- Start from 1 by default if there are *some* steps.
- Give **2** ONLY IF ALL conditions hold:
  • Steps are clearly separated and labeled (Step 1, Next, Then, etc.)
  • Each step contains **one small, senior-executable action**
  • Steps follow a complete, start-to-finish pipeline (no missing actions)
  • No step combines multiple actions (e.g., “open the browser and go to the website” = 1)
  • The decomposition is realistic for seniors with low digital literacy
- If steps are high-level, combined, incomplete, or mixed with explanations/options,
  score **1**.
- If there are no real steps, or just paragraphs → score **0**.
"""

    elif letter == "S":
        dimension = "Stepwise Sequencing"
        rubric = STEPWISE_RUBRIC
        extra_rule = """
Scoring rule for Stepwise Sequencing (STRICT):

- Start from 1 if some order exists.
- Give **2** ONLY IF:
  • Sequencing markers are consistent (first → next → then → finally OR numbered)
  • The order is fully logical with no jumps or backtracking
  • A senior could execute the sequence without guessing
- If the steps exist but ordering is imperfect / inconsistent → **1**
- If order is confusing, absent, or requires inference → **0**
"""

    elif letter == "L":
        dimension = "Language Simplification"
        rubric = LANGUAGE_RUBRIC
        extra_rule = """
Scoring rule for Language Simplification (STRICT):

- Start from 1 if answer is understandable.
- Give **2** ONLY IF:
  • Sentences are short and simple (one idea per sentence)
  • No jargon OR jargon is immediately explained
  • Tone + readability clearly match 6th–8th grade level
  • No dense or technical phrasing remains
- Occasional jargon, mixed clarity, long sentences → **1**
- Dense, technical, or not senior-friendly → **0**
"""

    elif letter == "P":
        dimension = "Progressive Disclosure"
        rubric = PACING_RUBRIC
        extra_rule = """
Scoring rule for Progressive Disclosure (STRICT):

- Start from 1 if some chunking exists.
- Give **2** ONLY IF:
  • Text is broken into clearly small chunks
  • Each chunk contains at most 1–2 ideas
  • Pacing cues appear (e.g., “tell me when you're ready for the next step”)
  • Overall structure clearly reduces cognitive load for seniors
- If chunking exists but text is still dense / no pacing cues → **1**
- If long blocks / overwhelmed pacing → **0**
"""

    else:
        raise ValueError(f"Unknown letter: {letter}")

    # Strict scoring prompt
    scoring_prompt = f"""
You are a STRICT evaluator for the dimension: **{dimension}**.
Ignore helpfulness, friendliness, or correctness. 
Rate ONLY the structural quality according to the rubric.

IMPORTANT:
- You MUST be conservative.
- Score **2 ONLY IF ALL requirements for 2-point level are fully satisfied**.
- If ANY part does NOT match the 2-point definition → score at most 1.
- If answer resembles 0-point issues → score 0.

User prompt:
\"\"\"{prompt}\"\"\"

Model answer:
\"\"\"{answer}\"\"\"

Rubric:
{rubric}

Additional strict scoring rules:
{extra_rule}

Return JSON ONLY:
{{
  "score": 0
}}
"""

    # Call Gemini to score, requesting a JSON-typed reply
    resp = client.models.generate_content(
        model=MODEL_NAME,
        contents=scoring_prompt,
        config=types.GenerateContentConfig(
            response_mime_type="application/json"
        ),
    )

    # Parse defensively: the reply text may be missing or malformed.
    return _parse_score(resp.text)

# Run evaluation over CSV
# Rows without a prompt are dropped; the index is reset so the progress
# counter below runs 1..len(df).
df = pd.read_csv(CSV_PATH)
df = df.dropna(subset=["Prompt"]).reset_index(drop=True)
results = []


def _save_results(records, path=OUTPUT_JSON_PATH):
    """Write the accumulated result records to `path` as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)


for idx, row in df.iterrows():
    category = row["Category(L4)"]
    sub_indicator = row["Sub-Indicators"]
    letter = row["Letter"]
    prompt = row["Prompt"]

    print(f"Processing {idx+1}/{len(df)} | Letter={letter}")

    run_scores = []
    run_answers = []

    # Each run generates a fresh answer and scores it on this row's dimension.
    for run in range(N_RUNS):
        print(f"  Run {run+1}/{N_RUNS}")
        answer = generate_answer(prompt)
        score = score_single_dimension(letter, prompt, answer)

        run_scores.append(score)
        run_answers.append(f"{run+1}. {answer}")

    # Per-prompt score is the mean over all runs.
    avg_score = sum(run_scores) / len(run_scores)

    results.append({
        "category": category,
        "sub_indicator": sub_indicator,
        "letter": letter,
        "prompt": prompt,
        "answers": run_answers,
        "scores_all_runs": run_scores,
        "score": avg_score,
    })

    # Checkpoint after every prompt so a failed API call later in this long
    # run does not discard the results that were already collected.
    _save_results(results)

# Final write; also produces the file when the CSV had no usable rows.
_save_results(results)

print(f"\n✅ JSON written to: {OUTPUT_JSON_PATH}")

# Aggregate the per-prompt scores into sub-indicator means and the L4 score
results_df = pd.DataFrame(results)


def _letter_mean(code):
    """Mean of the per-prompt `score` over rows whose `letter` equals `code`."""
    return results_df[results_df["letter"] == code]["score"].mean()


def _to_percent(value):
    """Express a rubric score as a percentage (2 means 100%)."""
    return value / 2 * 100


T_mean = _letter_mean("T")
S_mean = _letter_mean("S")
L_mean = _letter_mean("L")
P_mean = _letter_mean("P")

# Weighted combination of the four sub-indicator means.
L4_score = 0.30 * T_mean + 0.25 * S_mean + 0.25 * L_mean + 0.20 * P_mean

T_pct = _to_percent(T_mean)
S_pct = _to_percent(S_mean)
L_pct = _to_percent(L_mean)
P_pct = _to_percent(P_mean)
L4_pct = _to_percent(L4_score)

summary_lines = [
    "\n=== Sub-indicator Mean Scores ===",
    f"T (Task Decomposition):        {T_mean:.3f}   ({T_pct:.1f}%)",
    f"S (Stepwise Sequencing):       {S_mean:.3f}   ({S_pct:.1f}%)",
    f"L (Language Simplification):   {L_mean:.3f}   ({L_pct:.1f}%)",
    f"P (Progressive Disclosure):    {P_mean:.3f}   ({P_pct:.1f}%)",
    f"\nOverall L4 Score: {L4_score:.3f}   ({L4_pct:.1f}%)",
]
for summary_line in summary_lines:
    print(summary_line)

Processing 1/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 2/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 3/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 4/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 5/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 6/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 7/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 8/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 9/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 10/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 11/20 | Letter=L
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 12/20 | Letter=L
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 13/20 | Letter=L
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run