In [1]:
import os
import pandas as pd
from google.genai import Client

# Read the Gemini API key from the environment instead of pasting it here, so
# the secret never ends up in the notebook source or in version control.
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment that starts "
        "the Jupyter kernel before running this notebook."
    )
client = Client(api_key=API_KEY)

# Test the key works
# Best-effort smoke test: a failure is reported but does not stop the notebook.
try:
    test = client.models.list()
    print("✅ API key works!")
except Exception as e:
    print("❌ API key error:", e)

✅ API key works!


In [3]:
"""
L4 Plain-language & Readability Thresholds (senior-appropriate)
End-to-end evaluation script using Gemini API.

Inputs:
    - CSV file with columns:
        Category(L4), Source, Sub-Indicators, Letter, Prompt

Outputs:
    - JSON file with model answers and scores per prompt
    - Printed summary of mean scores per sub-indicator and overall L4 score
"""

import json
import pandas as pd
from google.genai import types

# Configuration
# Model identifier passed to every generate_content call; the same model both
# answers the prompts and scores the answers.
MODEL_NAME = "models/gemini-2.5-flash"

# Input CSV of prompts (read with pandas) and output JSON of per-prompt results.
CSV_PATH = "Plain_language_readability.csv"
OUTPUT_JSON_PATH = "Plain_language_readability_gemini.json"

# Number of repetitions of the prompt
# Each repetition generates a fresh answer and scores it; the per-prompt score
# is the mean over these runs.
N_RUNS = 5

# Rubrics for each sub-indicator
# Each rubric is inserted verbatim into the scoring prompt built by
# score_single_dimension, so any edit here changes what the scoring model sees.
# The weights named in the rubric headings (0.30 / 0.30 / 0.20 / 0.20) are the
# same ones applied when the overall L4 score is computed.

# Rubric for letter "L".
LEXICAL_RUBRIC = """
Dimension 1 — L: Lexical Simplicity (Weight 0.30)

Measures whether vocabulary is simple, familiar, and free of jargon.

0 – Not simple
- Uses difficult or technical terms without explanation.
- Includes jargon (e.g., “authentication,” “optimize,” “synchronize”).
- Vocabulary level clearly above senior-friendly range.

1 – Partially simple
- Mostly simple words but still includes complex terms.
- Inconsistent explanation of unfamiliar words.
- Some jargon remains.

2 – Fully simple
- All vocabulary is clear, common, and everyday.
- Any necessary technical term is immediately explained.
- No jargon, no abstract or academic vocabulary.
"""

# Rubric for letter "S".
SENTENCE_RUBRIC = """
Dimension 2 — S: Sentence Brevity & Structure (Weight 0.30)

Measures whether sentences are short, direct, and structurally simple.

0 – Too complex / long
- Long sentences.
- Multiple clauses or nested structures.
- Hard for seniors to track.

1 – Moderately simple
- Some short sentences, but also a few long or compound ones.
- Occasional structural complexity.

2 – Very simple
- Mostly short sentences.
- Each sentence expresses only one action or idea.
- No unnecessary clauses.
"""

# Rubric for letter "C".
CONCEPT_RUBRIC = """
Dimension 3 — C: Conceptual Clarity & Cognitive Load (Weight 0.20)

Measures whether the explanation minimizes mental effort and avoids overloading seniors.

0 – High cognitive load
- Gives too many ideas at once.
- Explanations are abstract or overly conceptual.
- No chunking or stepwise support.

1 – Moderate
- Clear overall, but some steps combine multiple ideas.
- Occasional conceptual density.

2 – Very clear, low load
- One idea at a time.
- Clear chunking of information.
- No abstract concepts, no overload.
- Easy for seniors with low working memory.
"""

# Rubric for letter "R".
READABILITY_RUBRIC = """
Dimension 4 — R: Readability Level (Weight 0.20)

Measures whether the text naturally meets senior readability thresholds (approx. 6th–8th grade).

0 – High reading level (10th+ grade)
- Complex vocabulary and sentence structure.
- Clearly difficult for seniors.

1 – Medium reading level (8th–10th grade)
- Mostly simple, but some elements still too advanced.

2 – Meets senior-friendly readability (6th–8th grade)
- Very easy to read.
- Matches plain-language standards recommended for aging adults.
"""

# Gemini helper functions
def generate_answer(prompt: str) -> str:
    """Send ``prompt`` to the configured Gemini model and return its reply text.

    The reply is stripped of leading/trailing whitespace. When the response
    carries no text, an empty string is returned instead.
    """
    reply = client.models.generate_content(model=MODEL_NAME, contents=prompt)
    text = reply.text
    if not text:
        return ""
    return text.strip()


def _parse_score(raw_text) -> int:
    """Extract the integer score from the scoring model's JSON reply, clamped to 0..2.

    Raises:
        ValueError: if the reply is empty, is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), is not an
            object with a ``"score"`` field, or the score is not convertible
            to ``int``.
    """
    # The response text can be missing; generate_answer guards for the same case.
    if not raw_text or not raw_text.strip():
        raise ValueError("Scoring model returned an empty response; no score to parse.")

    data = json.loads(raw_text)
    if not isinstance(data, dict) or "score" not in data:
        raise ValueError(f"Scoring response has no 'score' field: {raw_text!r}")

    try:
        score = int(data["score"])
    except TypeError as exc:
        # e.g. {"score": null} or a nested structure
        raise ValueError(f"Scoring response has a non-numeric score: {raw_text!r}") from exc

    # safety clamp
    return max(0, min(2, score))


def score_single_dimension(letter: str, prompt: str, answer: str) -> int:
    """
    Strict scoring for L/S/C/R based ONLY on the chosen dimension.
    Uses your exact Rubric + stricter operational rules to reduce over-scoring.

    Args:
        letter: Dimension code, one of "L", "S", "C", "R".
        prompt: The user prompt the answer was generated for.
        answer: The model answer to be rated.

    Returns:
        The score reported by the scoring model, clamped to the range 0..2.

    Raises:
        ValueError: if ``letter`` is not a known dimension code, or if the
            scoring model's reply is empty or does not contain a usable
            ``"score"`` value.
    """

    # Match dimension + rubric
    if letter == "L":
        dimension = "Lexical Simplicity"
        rubric = LEXICAL_RUBRIC
        extra_rule = """
Scoring rule for Lexical Simplicity (STRICT):

- Start from 1 if the answer is generally understandable.
- Give **2** ONLY IF ALL conditions hold:
  • Vocabulary is entirely common, everyday, and senior-friendly.
  • Any necessary technical term is immediately explained in plain words.
  • No remaining jargon, abstract academic words, or unexplained acronyms.
- If there are complex terms, mixed with simpler wording, or inconsistent explanations → **1**.
- If vocabulary is clearly too technical, jargon-heavy, or hard for seniors → **0**.
"""
    elif letter == "S":
        dimension = "Sentence Brevity & Structure"
        rubric = SENTENCE_RUBRIC
        extra_rule = """
Scoring rule for Sentence Brevity & Structure (BALANCED STRICT):

- Default starting point: **1** if the answer is generally followable.

Give **2** ONLY IF ALL conditions hold:
  • Most sentences are short.
  • Each sentence expresses only one clear idea or action.
  • Very few (or no) unnecessary clauses or nested structures.
  • The structure is consistently clean and easy for seniors to track.

Give **1** if:
  • There is a mix of short and somewhat long sentences, OR
  • Some sentences contain clauses but remain understandable, OR
  • The structure is mostly clear even if not perfectly simple.

Give **0** ONLY IF:
  • Many sentences are very long, OR
  • Multiple nested clauses significantly increase complexity, OR
  • Ideas are merged together and structure becomes hard to follow for seniors.
"""
    elif letter == "C":
        dimension = "Conceptual Clarity & Cognitive Load"
        rubric = CONCEPT_RUBRIC
        extra_rule = """
Scoring rule for Conceptual Clarity & Cognitive Load (STRICT):

- Start from 1 if the explanation is mostly understandable.
- Give **2** ONLY IF:
  • The answer introduces one idea at a time.
  • Information is clearly chunked (e.g., separated steps or short paragraphs).
  • There are no abstract or overly conceptual explanations.
  • Overall mental effort required for seniors is low.
- If some steps combine multiple ideas or there is occasional conceptual density → **1**.
- If many ideas are given at once, explanations are abstract, or there is no chunking → **0**.
"""
    elif letter == "R":
        dimension = "Readability Level Compliance"
        rubric = READABILITY_RUBRIC
        extra_rule = """
Scoring rule for Readability Level (STRICT):

- Base your judgment on overall sentence length, vocabulary difficulty,
  and how it would score on typical readability indices (e.g., FKGL/SMOG),
  even if you estimate informally.

- Give **2** ONLY IF:
  • The text clearly feels like 6th–8th grade reading level.
  • It is very easy to read for aging adults with average literacy.

- If it feels around 8th–10th grade (mostly simple but with some advanced elements) → **1**.
- If it clearly requires high-school level (10th+ grade) or above → **0**.
"""
    else:
        raise ValueError(f"Unknown letter: {letter}")

    # Strict scoring prompt
    scoring_prompt = f"""
You are a STRICT evaluator for the dimension: **{dimension}**.
Ignore helpfulness, friendliness, or factual correctness. 
Rate ONLY the plain-language and readability quality according to the rubric.

IMPORTANT:
- You MUST be conservative.
- Score **2 ONLY IF ALL requirements for the 2-point level are fully satisfied**.
- If ANY part does NOT match the 2-point definition → score at most 1.
- If the answer resembles 0-point issues → score 0.

User prompt:
\"\"\"{prompt}\"\"\"

Model answer:
\"\"\"{answer}\"\"\"

Rubric:
{rubric}

Additional strict scoring rules:
{extra_rule}

Return JSON ONLY:
{{
  "score": 0
}}
"""

    # Call Gemini to score; the JSON mime type asks for a machine-parseable reply.
    resp = client.models.generate_content(
        model=MODEL_NAME,
        contents=scoring_prompt,
        config=types.GenerateContentConfig(
            response_mime_type="application/json"
        ),
    )

    return _parse_score(resp.text)


# Run evaluation over CSV
df = pd.read_csv(CSV_PATH)

# Fail fast on a malformed CSV: every generate/score pair costs two API calls,
# so column and letter problems are reported before any request is sent.
required_columns = ["Category(L4)", "Sub-Indicators", "Letter", "Prompt"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"CSV {CSV_PATH} is missing required column(s): {missing_columns}")

df = df.dropna(subset=["Prompt"]).reset_index(drop=True)

# Letters must be one of the codes score_single_dimension understands; a
# missing (NaN) letter is reported here as well.
bad_letters = df.loc[~df["Letter"].isin(["L", "S", "C", "R"]), "Letter"]
if not bad_letters.empty:
    raise ValueError(
        f"Unknown letter(s) {bad_letters.unique().tolist()} "
        f"in CSV rows {bad_letters.index.tolist()}"
    )

results = []

for idx, row in df.iterrows():
    category = row["Category(L4)"]
    sub_indicator = row["Sub-Indicators"]
    letter = row["Letter"]
    prompt = row["Prompt"]

    # idx is 0-based and contiguous because the index was reset above.
    print(f"Processing {idx+1}/{len(df)} | Letter={letter}")

    run_scores = []
    run_answers = []

    # Each run generates a fresh answer and scores it on this row's dimension.
    for run in range(N_RUNS):
        print(f"  Run {run+1}/{N_RUNS}")
        answer = generate_answer(prompt)
        score = score_single_dimension(letter, prompt, answer)

        run_scores.append(score)
        run_answers.append(f"{run+1}. {answer}")

    # Per-prompt score is the mean over all runs.
    avg_score = sum(run_scores) / len(run_scores)


    results.append({
        "category": category,
        "sub_indicator": sub_indicator,
        "letter": letter,
        "prompt": prompt,
        "answers": run_answers,
        "scores_all_runs": run_scores,
        "score": avg_score, 
    })

# ensure_ascii=False keeps non-ASCII characters in the answers readable in the file.
with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\n✅ JSON written to: {OUTPUT_JSON_PATH}")

# Aggregate per-prompt scores into per-letter means and the weighted L4 score
results_df = pd.DataFrame(results)

# Mean of the per-prompt average scores for each dimension letter.
L_mean = results_df.loc[results_df["letter"] == "L", "score"].mean()
S_mean = results_df.loc[results_df["letter"] == "S", "score"].mean()
C_mean = results_df.loc[results_df["letter"] == "C", "score"].mean()
R_mean = results_df.loc[results_df["letter"] == "R", "score"].mean()

# Weighted combination: L and S carry 0.30 each, C and R carry 0.20 each.
L4_score = 0.30 * L_mean + 0.30 * S_mean + 0.20 * C_mean + 0.20 * R_mean

# Percentages (2 means 100%)
L_pct, S_pct, C_pct, R_pct, L4_pct = (
    value / 2 * 100 for value in (L_mean, S_mean, C_mean, R_mean, L4_score)
)

# One print call; sep="\n" puts each report line on its own row.
print(
    "\n=== Sub-indicator Mean Scores ===",
    f"L (Lexical Simplicity):                {L_mean:.3f}   ({L_pct:.1f}%)",
    f"S (Sentence Brevity & Structure):      {S_mean:.3f}   ({S_pct:.1f}%)",
    f"C (Conceptual Clarity & Cog. Load):    {C_mean:.3f}   ({C_pct:.1f}%)",
    f"R (Readability Level Compliance):      {R_mean:.3f}   ({R_pct:.1f}%)",
    f"\nOverall L4 Score (LSCR): {L4_score:.3f}   ({L4_pct:.1f}%)",
    sep="\n",
)

Processing 1/20 | Letter=L
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 2/20 | Letter=L
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 3/20 | Letter=L
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 4/20 | Letter=L
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 5/20 | Letter=L
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 6/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 7/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 8/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 9/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 10/20 | Letter=S
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 11/20 | Letter=C
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 12/20 | Letter=C
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 13/20 | Letter=C
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run