In [9]:
import os
import pandas as pd
from google.genai import Client

# Load the Gemini API key without ever storing it in the notebook source:
# prefer the GEMINI_API_KEY environment variable, otherwise prompt for it
# interactively (getpass does not echo the typed value).
from getpass import getpass

API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
client = Client(api_key=API_KEY)

# Test the key works: listing models is a cheap authenticated call. A failure
# is reported rather than raised so the message is shown right under this cell.
try:
    test = client.models.list()
    print("✅ API key works!")
except Exception as e:
    print("❌ API key error:", e)

✅ API key works!


In [11]:
"""
L4 Senior motor and voice accessibility options
(voice input, large targets, reduced precision)

End-to-end evaluation script using Gemini API.

Inputs:
    - CSV file with columns:
        Category(L4), Source, Sub-Indicators, Letter, Prompt

Outputs:
    - JSON file with model answers and scores per prompt
    - Printed summary of mean scores per sub-indicator and overall L4 score

Sub-indicators & weights:
    V = Voice Recognition Robustness         (0.30)
    M = Motor-Ease & Large-Target Support    (0.25)
    E = Error-Tolerance & Recovery           (0.25)
    A = Accessibility Personalization & Alternatives (0.20)
"""

import json
import pandas as pd
from google.genai import types

# Configuration
# Gemini model used both to answer each prompt and to score the answers.
MODEL_NAME = "models/gemini-2.5-flash"

# Input CSV holding the prompts, and the JSON file the per-prompt results are written to.
CSV_PATH = "Motor_voice_accessibility.csv"
OUTPUT_JSON_PATH = "Motor_voice_accessibility_gemini.json"

# Number of times each prompt is answered and scored; the per-prompt score is
# the mean over these runs.
N_RUNS = 5

# Rubrics for each sub-indicator
# Each rubric describes the 0 / 1 / 2 score levels for one dimension and is
# inserted verbatim into the judge prompt built by score_single_dimension,
# so any edit to this text changes what the scoring model is told.

# V — Voice Recognition Robustness
VOICE_RUBRIC = """
0 – No robustness
- The model does not handle unclear, fragmented, or imprecise voice-style input.
- Completely literal interpretation; no attempt to repair recognition errors.
- No confirmation, no clarification questions.
- The model fails if the input resembles real senior speech patterns (pauses, hesitations, short fragments).

1 – Partial robustness
- The model makes some effort to interpret unclear voice input.
- Occasionally asks for clarification or repeats what it understood.
- Can infer intent in simple cases but fails with more challenging senior-like utterances.
- Limited correction of potential ASR (speech-to-text) errors.

2 – Full robustness
- Actively interprets fragmented, hesitant, or low-precision voice-style commands.
- Proactively clarifies uncertainty (“Did you mean…?”).
- Repeats back the understood action for confirmation.
- Shows high tolerance for ASR-like errors, restarts gracefully, and accurately infers intended meaning.
"""

# M — Motor-Ease & Large-Target Support
MOTOR_RUBRIC = """
0 – No motor support
- The model gives instructions requiring fine motor precision (e.g., small icons, tiny menu items).
- No acknowledgment of senior motor limitations.
- No simplification of physical interactions.

1 – Partial motor support
- The model avoids very small targets but still includes some precise actions.
- Occasionally suggests easier taps or larger UI elements.
- Some awareness of reduced precision but inconsistent application.

2 – Full motor support
- The model consistently recommends large buttons, simple gestures, and low-precision actions.
- Provides alternative interaction paths that minimize fine-motor difficulty.
- Instructions are clearly designed for tremors, slow tapping, or unsteady hands (e.g., “tap the big blue button at the bottom”).
"""

# E — Error-Tolerance & Recovery
ERROR_RUBRIC = """
0 – No error tolerance
- Assumes perfect execution; fails if user mis-taps or mis-speaks.
- No recovery steps, no backtracking, no reassurance.
- The model becomes unusable after one mistake.

1 – Partial error tolerance
- Offers basic recovery instructions (“go back” / “try again”).
- Handles some missteps but not consistently.
- Sometimes provides reassurance or simple fallback strategies.

2 – Full error tolerance
- Anticipates mis-taps, mis-swipes, or mis-recognition errors.
- Provides explicit, calm recovery paths (“If that happens, don’t worry—here’s how to fix it”).
- Maintains a safe, senior-friendly interaction that prevents frustration.
- Encourages retries without blame or anxiety.
"""

# A — Accessibility Personalization & Alternatives
ACCESS_RUBRIC = """
0 – No accessibility adaptation
- Ignores senior needs entirely.
- No suggestions for alternative modes (voice, large targets, simplified controls).
- Does not acknowledge physical limitations.

1 – Partial adaptation
- Suggests one alternative input mode (e.g., “you can also use voice”).
- Occasionally mentions accessibility settings but not systematically.
- Does not personalize options to the senior’s own limitations.

2 – Full adaptation
- Proactively recommends multiple alternative inputs: voice input, large buttons, high contrast, simplified menus, etc.
- Tailors suggestions to senior limitations (“If tapping is hard, you can use the microphone icon to speak instead”).
- Encourages enabling accessibility features based on context.
"""

# Gemini helper functions
def generate_answer(prompt: str) -> str:
    """Send ``prompt`` to the configured Gemini model and return its reply text.

    Args:
        prompt: The user prompt, passed unchanged as the request contents.

    Returns:
        The response's ``text`` with surrounding whitespace removed, or ``""``
        when the response carries no text.
    """
    reply = client.models.generate_content(model=MODEL_NAME, contents=prompt)
    reply_text = reply.text
    if not reply_text:
        # A missing (None) or empty text is reported as an empty answer.
        return ""
    return reply_text.strip()


def _dimension_spec(letter: str):
    """Return ``(dimension, rubric, extra_rule)`` for a sub-indicator letter.

    Raises:
        ValueError: if ``letter`` is not one of "V", "M", "E", "A".
    """
    if letter == "V":
        dimension = "Voice Recognition Robustness"
        rubric = VOICE_RUBRIC
        extra_rule = """
Scoring rule for Voice Recognition Robustness (STRICT):

- Start from 1 if the model shows some attempt to understand voice-style / noisy input.
- Give **2** ONLY IF ALL conditions hold:
  • The answer explicitly handles unclear, fragmented, or hesitant speech.
  • The model uses clarification questions or repeats back what it understood.
  • It shows clear tolerance for ASR-like errors and repairs them.
- If it only partially infers intent or rarely clarifies → score **1**.
- If it just treats the text literally with no robustness → **0**.
"""

    elif letter == "M":
        dimension = "Motor-Ease & Large-Target Support"
        rubric = MOTOR_RUBRIC
        extra_rule = """
Scoring rule for Motor-Ease & Large-Target Support (STRICT):

- Start from 1 if the model shows some awareness of motor difficulty.
- Give **2** ONLY IF:
  • The answer consistently avoids fine-precision actions.
  • It clearly recommends large buttons, simple gestures, or low-precision interactions.
  • It offers alternative interaction paths that reduce small, difficult taps.
- If there is only occasional mention of easier taps / large targets → **1**.
- If instructions still rely on precise, tiny actions and ignore motor limits → **0**.
"""

    elif letter == "E":
        dimension = "Error-Tolerance & Recovery"
        rubric = ERROR_RUBRIC
        extra_rule = """
Scoring rule for Error-Tolerance & Recovery (STRICT):

- Start from 1 if the model gives some recovery advice after mistakes.
- Give **2** ONLY IF:
  • It anticipates likely mis-taps / mis-swipes / mis-recognition.
  • It gives explicit, calm, step-by-step recovery paths.
  • Tone is reassuring and prevents frustration.
- If it only says “try again” / “go back” without detailed recovery → **1**.
- If there is no real recovery strategy or reassurance → **0**.
"""

    elif letter == "A":
        dimension = "Accessibility Personalization & Alternatives"
        rubric = ACCESS_RUBRIC
        extra_rule = """
Scoring rule for Accessibility Personalization & Alternatives (STRICT):

- Start from 1 if the model mentions at least one alternative input mode.
- Give **2** ONLY IF:
  • It proactively recommends multiple alternative modes (voice, large buttons, high contrast, etc.).
  • Suggestions are clearly tailored to the senior’s specific limitations.
  • It encourages enabling accessibility features based on context.
- If it mentions only one simple alternative without personalization → **1**.
- If it ignores accessibility adaptations entirely → **0**.
"""

    else:
        raise ValueError(f"Unknown letter: {letter}")

    return dimension, rubric, extra_rule


def _parse_score(raw_text) -> int:
    """Extract the integer score from the judge's JSON reply and clamp it to 0..2.

    Raises:
        ValueError: if the reply is empty, is not valid JSON, is not an object
            with a "score" entry, or the score cannot be converted to ``int``.
    """
    if not raw_text:
        # The response text can be missing; json.loads(None) would otherwise
        # fail with an unhelpful TypeError.
        raise ValueError("Scoring model returned an empty response")

    try:
        data = json.loads(raw_text)
        score = int(data["score"])
    except (KeyError, TypeError, ValueError, OverflowError) as exc:
        # json.JSONDecodeError is a ValueError subclass, so malformed JSON lands here too.
        raise ValueError(
            f"Could not parse a score from the scoring response: {raw_text!r}"
        ) from exc

    # Clamp out-of-range values into the rubric's 0..2 scale.
    return max(0, min(2, score))


def score_single_dimension(letter: str, prompt: str, answer: str) -> int:
    """
    Strict scoring for V/M/E/A based ONLY on the chosen dimension.
    Uses your Rubric + stricter operational rules to reduce over-scoring.

    Args:
        letter: Sub-indicator code: "V", "M", "E" or "A".
        prompt: The user prompt that was given to the model.
        answer: The model answer to be judged.

    Returns:
        The judge's score as an int, clamped to the range 0..2.

    Raises:
        ValueError: if ``letter`` is unknown (raised before any API call), or
            if the judge's reply does not contain a parsable score.
    """
    dimension, rubric, extra_rule = _dimension_spec(letter)

    # Strict scoring prompt
    scoring_prompt = f"""
You are a STRICT evaluator for the dimension: **{dimension}**.
Ignore helpfulness, friendliness, or correctness.
Rate ONLY the behavioral quality according to the rubric.

IMPORTANT:
- You MUST be conservative.
- Score **2 ONLY IF ALL requirements for 2-point level are fully satisfied**.
- If ANY part does NOT match the 2-point definition → score at most 1.
- If answer resembles 0-point issues → score 0.

User prompt:
\"\"\"{prompt}\"\"\"

Model answer:
\"\"\"{answer}\"\"\"

Rubric:
{rubric}

Additional strict scoring rules:
{extra_rule}

Return JSON ONLY:
{{
  "score": 0
}}
"""

    # Request a JSON response so the reply can be parsed directly.
    resp = client.models.generate_content(
        model=MODEL_NAME,
        contents=scoring_prompt,
        config=types.GenerateContentConfig(
            response_mime_type="application/json"
        ),
    )

    return _parse_score(resp.text)


# Run evaluation over CSV
if N_RUNS < 1:
    # The per-prompt score is a mean over runs, so at least one run is required.
    raise ValueError(f"N_RUNS must be at least 1, got {N_RUNS}")

df = pd.read_csv(CSV_PATH)
# Rows without a prompt cannot be evaluated; renumber so idx is 0..len(df)-1.
df = df.dropna(subset=["Prompt"]).reset_index(drop=True)
results = []


def _write_results(records, path=OUTPUT_JSON_PATH):
    """Overwrite ``path`` with ``records`` serialised as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)


for idx, row in df.iterrows():
    category = row["Category(L4)"]
    sub_indicator = row["Sub-Indicators"]
    # Normalise the dimension code (V / M / E / A) so stray whitespace or
    # lowercase in the CSV still matches the scoring branches and the
    # per-letter filters used for the summary.
    letter = str(row["Letter"]).strip().upper()
    prompt = row["Prompt"]

    print(f"Processing {idx+1}/{len(df)} | Letter={letter}")

    run_scores = []
    run_answers = []

    # Each run generates a fresh answer and scores it independently.
    for run in range(N_RUNS):
        print(f"  Run {run+1}/{N_RUNS}")
        answer = generate_answer(prompt)
        score = score_single_dimension(letter, prompt, answer)

        run_scores.append(score)
        run_answers.append(f"{run+1}. {answer}")

    avg_score = sum(run_scores) / len(run_scores)

    results.append({
        "category": category,
        "sub_indicator": sub_indicator,
        "letter": letter,
        "prompt": prompt,
        "answers": run_answers,
        "scores_all_runs": run_scores,
        "score": avg_score,
    })

    # Checkpoint after every prompt: each prompt costs 2 * N_RUNS API calls,
    # and a failure later in the run would otherwise discard everything
    # completed so far.
    _write_results(results)

# Final write so the output file exists even when the CSV had no usable rows.
_write_results(results)

print(f"\n✅ JSON written to: {OUTPUT_JSON_PATH}")


# Compute mean scores + overall L4
results_df = pd.DataFrame(results)

# Mean of the per-prompt average scores for each sub-indicator letter.
letter_col = results_df["letter"]
V_mean, M_mean, E_mean, A_mean = (
    results_df[letter_col == code]["score"].mean() for code in ("V", "M", "E", "A")
)

# Weighted overall score: V 0.30, M 0.25, E 0.25, A 0.20.
L4_score = 0.30 * V_mean + 0.25 * M_mean + 0.25 * E_mean + 0.20 * A_mean

# Percentages (2 means 100%)
V_pct, M_pct, E_pct, A_pct, L4_pct = (
    value / 2 * 100 for value in (V_mean, M_mean, E_mean, A_mean, L4_score)
)

print("\n=== Sub-indicator Mean Scores ===")
# Labels carry their own trailing padding so the printed columns line up.
summary_rows = (
    ("V (Voice Recognition Robustness):               ", V_mean, V_pct),
    ("M (Motor-Ease & Large-Target Support):          ", M_mean, M_pct),
    ("E (Error-Tolerance & Recovery):                 ", E_mean, E_pct),
    ("A (Accessibility Personalization & Alternatives): ", A_mean, A_pct),
)
for label, mean_value, pct_value in summary_rows:
    print(f"{label}{mean_value:.3f}   ({pct_value:.1f}%)")

print(f"\nOverall L4 Score (VMEA): {L4_score:.3f}   ({L4_pct:.1f}%)")

Processing 1/20 | Letter=V
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 2/20 | Letter=V
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 3/20 | Letter=V
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 4/20 | Letter=V
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 5/20 | Letter=V
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 6/20 | Letter=M
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 7/20 | Letter=M
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 8/20 | Letter=M
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 9/20 | Letter=M
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 10/20 | Letter=M
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 11/20 | Letter=E
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 12/20 | Letter=E
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 13/20 | Letter=E
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run