In [11]:
import os

from openai import OpenAI

# Prefer a key supplied through the DEEPSEEK_API_KEY environment variable so the
# real secret never has to be written into (or committed with) the notebook.
# The literal "xxx" placeholder is kept only as a fallback so the cell behaves
# as before when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "xxx"),
    base_url="https://api.deepseek.com"
)

# Smoke test: one tiny completion to confirm the client can reach the API.
try:
    resp = client.chat.completions.create(
        model="deepseek-chat",
        max_tokens=50,
        messages=[
            {"role": "user", "content": "Say hi in one short sentence."}
        ]
    )

    print("DeepSeek V3.2 works!")
    print(resp.choices[0].message.content)

except Exception as e:
    # Every kind of failure lands here (bad key, network problem, rate limit,
    # ...), so report the exception type instead of always blaming the key.
    print(f"DeepSeek API call failed ({type(e).__name__}):", e)

DeepSeek V3.2 works!
Hello there!


In [15]:
"""
L4 User Education Scaffolds (tutorials, explainers)
End-to-end evaluation script using the DeepSeek API (via the OpenAI-compatible client).

Inputs:
    - CSV file with columns:
        Category(L4), Source, Sub-Indicators, Letter, Prompt

Outputs:
    - JSON file with model answers and scores per prompt
    - Printed summary of mean scores per sub-indicator and overall L4 score

Sub-indicators & weights:
    T = Tutorial Structure             (0.30)
    E = Explanation Quality            (0.30)
    P = Progressive Skill-Building     (0.20)
    I = Interactive Clarification      (0.20)
"""

import re
import json
import pandas as pd

# Model identifier sent with every request made by call_deepseek(); the same
# model therefore both writes the answers and scores them.
MODEL_NAME = "deepseek-chat"

# Input sheet read with pd.read_csv; the evaluation loop reads the columns
# "Category(L4)", "Sub-Indicators", "Letter" and "Prompt".
CSV_PATH = "L4 user education.csv"
# Destination of the per-prompt results, written once with json.dump after
# every prompt has been processed.
OUTPUT_JSON_PATH = "L4 user education deepseek.json"

# Number of repetitions of the prompt: each prompt is answered and scored
# N_RUNS times and the stored per-prompt score is the mean over those runs.
N_RUNS = 5

# Rubrics
# Rubric text for sub-indicator T. score_single_dimension() inserts it verbatim
# into the scoring prompt when the letter is "T".
TUTORIAL_RUBRIC = """
1. T — Tutorial Structure (0–2)
What it measures: Whether the model organizes instructions into a clear, tutorial-like structure.

0 – No structure
• Response is a long, unorganized paragraph.
• No steps, sections, or headings.
• User cannot identify a clear sequence.

1 – Partial structure
• Some structure appears but is incomplete or inconsistent.
• Steps are merged, unclear, or missing.
• Parts of the explanation remain disorganized.

2 – Full structure
• Clear step-by-step or section-based organization.
• Steps are labeled and logically ordered.
• Each step contains a single actionable idea.
"""

# Rubric text for sub-indicator E. score_single_dimension() inserts it verbatim
# into the scoring prompt when the letter is "E".
EXPLANATION_RUBRIC = """
2. E — Explanation Quality (0–2)
What it measures: Whether the explanation is accurate, simple, and accessible.

0 – Poor explanation
• Vague, overly technical, or incorrect.
• Uses jargon without explanation.
• Difficult for beginners to understand.

1 – Adequate explanation
• Mostly correct and understandable.
• Occasional jargon or unclear phrasing.
• Not fully optimized for novices.

2 – High-quality explanation
• Clear, accurate, beginner-friendly wording.
• Uses plain language and helpful examples.
• Avoids unnecessary complexity.
"""

# Rubric text for sub-indicator P. score_single_dimension() inserts it verbatim
# into the scoring prompt when the letter is "P".
PROGRESSION_RUBRIC = """
3. P — Progressive Skill-Building (0–2)
What it measures: Whether the model gradually increases complexity.

0 – No progression
• Provides all information at once.
• No simple-to-complex staging.
• Likely to overwhelm beginners.

1 – Partial progression
• Some progression appears but inconsistently.
• Transitions exist but are not well controlled.
• Some sections contain excessive information.

2 – Strong progression
• Begins with basics, adds complexity gradually.
• Builds skills step-by-step.
• Reduces cognitive load effectively.
"""

# Rubric text for sub-indicator I. score_single_dimension() inserts it verbatim
# into the scoring prompt when the letter is "I".
INTERACTIVE_RUBRIC = """
4. I — Interactive Clarification (0–2)
What it measures: Whether the model checks understanding or adapts interactively.

0 – No interaction
• No clarifying questions.
• No checks for understanding.
• One-directional explanation.

1 – Limited interaction
• Occasional clarification or check-in.
• Some responsiveness but inconsistent.
• Interaction does not fully guide learning.

2 – Full interactive scaffolding
• Actively checks user understanding.
• Offers choices or alternate paths.
• Adjusts explanations like a tutor.
"""

# DeepSeek helper functions
def call_deepseek(
    user_prompt: str,
    system_prompt: str | None = None,
    max_tokens: int = 1024,
    temperature: float = 0.0,
) -> str:
    """
    General-purpose DeepSeek call wrapper.
    Returns plain text output.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_prompt})

    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return resp.choices[0].message.content.strip()

def generate_answer(prompt: str) -> str:
    """
    Ask the model for its ordinary answer to a user-education prompt.

    Only a generic "be helpful" system message is supplied, so the reply is not
    nudged toward any of the scored dimensions (T/E/P/I). The answer is sampled
    with temperature 0.5 and capped at 1024 tokens.
    """
    request_options = {
        "system_prompt": (
            "You are a helpful AI assistant. "
            "Provide a clear and useful response to help the user learn "
            "or understand the topic."
        ),
        "max_tokens": 1024,
        "temperature": 0.5,
    }
    return call_deepseek(prompt, **request_options)

def extract_json_object(text: str) -> dict:
    """
    Return the first JSON object found in a model reply.

    DeepSeek may wrap the JSON in prose or a Markdown code fence. The whole
    text is tried first; if that does not yield an object, every ``{`` is
    tried in order as a possible start and the first position that decodes to
    a complete JSON object wins. Text after that object is ignored, so trailing
    commentary (even commentary containing braces) no longer breaks parsing.

    Args:
        text: Raw reply text; None is treated as an empty reply.

    Returns:
        The decoded object as a dict.

    Raises:
        ValueError: If no decodable JSON object is present in ``text``.
    """
    text = "" if text is None else str(text)

    # Fast path: the reply is exactly one JSON document.
    try:
        parsed = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Slow path: scan for the first "{" that starts a valid object.
    # raw_decode stops at the end of that object instead of demanding that
    # the rest of the string be valid JSON too.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except ValueError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)

    raise ValueError(f"Could not find JSON in DeepSeek response: {text[:200]}")


# Scoring per sub-indicator
def score_single_dimension(letter: str, prompt: str, answer: str) -> int:
    """
    Score one answer on a single sub-indicator (T, E, P or I).

    The scoring prompt combines the dimension's rubric with balanced (slightly
    generous) operational rules, to avoid ultra-strict under-scoring, and asks
    the model to reply with ``{"score": <n>}`` only.

    Args:
        letter: Dimension code. Surrounding whitespace and letter case are
            ignored, so " t" is treated as "T".
        prompt: The user prompt that was given to the model.
        answer: The model answer being evaluated.

    Returns:
        The integer score clamped to the rubric range 0..2.

    Raises:
        ValueError: If ``letter`` is not one of T/E/P/I (raised before any API
            call is made), or if the scoring reply carries no usable numeric
            ``score`` field.
    """
    # Sheet cells often carry stray spaces or lower-case letters; normalise
    # once so the comparisons below are not needlessly strict.
    dimension_code = str(letter).strip().upper()

    if dimension_code == "T":
        dimension = "Tutorial Structure"
        rubric = TUTORIAL_RUBRIC
        extra_rule = """
Scoring guide (balanced, slightly generous):

• Start from score 1 if there is ANY visible structure (lists, steps, headings).
• Give **2** when the structure is clearly tutorial-like:
  - Most steps are labeled or clearly separated.
  - Order roughly follows a logical sequence.
  - Many steps are actionable, even if a few contain 2 small actions.
• Give **1** when structure exists but is messy, incomplete, or partly merged.
• Give **0** only when it is basically one big block with no clear sequence.
"""
    elif dimension_code == "E":
        dimension = "Explanation Quality"
        rubric = EXPLANATION_RUBRIC
        extra_rule = """
Scoring guide (balanced, slightly generous):

• Start from score 1 if the explanation is mostly understandable.
• Give **2** when:
  - The explanation is generally accurate,
  - Uses mostly plain, beginner-friendly language,
  - Provides at least one simple example or analogy OR clearly avoids heavy jargon.
  Small bits of mild jargon or one slightly long sentence are acceptable.
• Give **1** when it is understandable but has noticeable jargon, complexity,
  or missing examples that would help a beginner.
• Give **0** only if it is confusing, very technical, or likely to mislead beginners.
"""
    elif dimension_code == "P":
        dimension = "Progressive Skill-Building"
        rubric = PROGRESSION_RUBRIC
        extra_rule = """
Scoring guide (balanced, slightly generous):

• Start from score 1 if there is at least some sense of "start here, then go deeper".
• Give **2** when:
  - The answer clearly starts with basics,
  - Then introduces more advanced ideas or extra steps,
  - And the stages feel manageable for a novice (not all dumped at once).
  It does not need to be perfect; minor information overload is acceptable.
• Give **1** when there is some progression, but stages are uneven, or a lot of detail
  appears too early.
• Give **0** when almost everything is delivered at once with no visible staging.
"""
    elif dimension_code == "I":
        dimension = "Interactive Clarification"
        rubric = INTERACTIVE_RUBRIC
        extra_rule = """
Scoring guide (balanced, slightly generous):

• Start from score 0 and look specifically for INTERACTION.
• Give **2** when:
  - The answer clearly asks one or more questions to the user,
  - OR offers explicit choices / branches (e.g., “If you are using a phone, do X; if a computer, do Y”),
  - AND uses these questions/branches to adapt the explanation or next steps.
  It does not need to be a full conversation, but it should feel like tutoring, not a monologue.
• Give **1** when there is some interaction (one or two check-in questions,
  invitations like “let me know if…”, or mild branching) but it is limited or not
  used strongly to adapt the explanation.
• Give **0** when the answer is purely one-way explanation with no questions,
  no checks for understanding, and no real choice offered to the user.
"""
    else:
        raise ValueError(f"Unknown letter: {letter}")

    scoring_prompt = f"""
You are evaluating ONLY the dimension: **{dimension}**.

Task:
- Ignore overall helpfulness, friendliness, or correctness outside this dimension.
- Use the rubric and scoring guide below.
- Be fair and slightly generous: if an answer clearly matches MOST of the 2-point
  description and only has minor issues, give **2** instead of **1**.

User prompt:
\"\"\"{prompt}\"\"\"

Model answer:
\"\"\"{answer}\"\"\"

Rubric:
{rubric}

Operational scoring guide:
{extra_rule}

Return JSON ONLY with this schema:
{{
  "score": 0
}}
"""

    # Temperature 0 keeps the judge as repeatable as the API allows.
    raw = call_deepseek(scoring_prompt, max_tokens=256, temperature=0.0)
    data = extract_json_object(raw)

    # Fail with a message that shows what the judge actually said instead of a
    # bare KeyError/TypeError from the conversion below.
    if not isinstance(data, dict) or "score" not in data:
        raise ValueError(f"Scoring response has no 'score' field: {raw[:200]}")
    try:
        score = int(data["score"])
    except (TypeError, ValueError, OverflowError) as exc:
        raise ValueError(
            f"Non-numeric score in scoring response: {data['score']!r}"
        ) from exc

    # The rubric only defines 0, 1 and 2; pull out-of-range values to the
    # nearest valid score.
    return max(0, min(2, score))


# Main evaluation loop
# Letters the scorer understands; anything else in the sheet is a data error.
VALID_LETTERS = {"T", "E", "P", "I"}

# With zero runs the per-prompt average below would divide by zero only after
# the whole sheet has been loaded, so reject it up front.
if N_RUNS < 1:
    raise ValueError(f"N_RUNS must be at least 1, got {N_RUNS}")

# Rows without a prompt cannot be evaluated; drop them and renumber so that
# idx runs 0..len(df)-1 for the progress display.
df = pd.read_csv(CSV_PATH).dropna(subset=["Prompt"]).reset_index(drop=True)

# Normalise the dimension letters once (strip spaces, upper-case) so that the
# scorer and the later per-letter aggregation both see exactly "T"/"E"/"P"/"I",
# and fail before any API call is made if the sheet contains something else.
df["Letter"] = df["Letter"].astype(str).str.strip().str.upper()
unknown_letters = sorted(set(df["Letter"]) - VALID_LETTERS)
if unknown_letters:
    raise ValueError(f"Unknown letter(s) in {CSV_PATH}: {unknown_letters}")

results = []

for idx, row in df.iterrows():
    category = row["Category(L4)"]
    sub_indicator = row["Sub-Indicators"]
    letter = row["Letter"]
    prompt = row["Prompt"]

    print(f"Processing {idx+1}/{len(df)} | Letter={letter}")

    run_scores = []
    run_answers = []

    # Answer and score the same prompt N_RUNS times; answers are sampled with a
    # non-zero temperature, so repeated runs can differ.
    for run in range(N_RUNS):
        print(f"  Run {run+1}/{N_RUNS}")
        answer = generate_answer(prompt)
        score = score_single_dimension(letter, prompt, answer)

        run_scores.append(score)
        # Prefix each stored answer with its run number for readability.
        run_answers.append(f"{run+1}. {answer}")

    # The per-prompt score is the mean over all runs (0..2 scale).
    avg_score = sum(run_scores) / len(run_scores)

    results.append({
        "category": category,
        "sub_indicator": sub_indicator,
        "letter": letter,
        "prompt": prompt,
        "answers": run_answers,
        "scores_all_runs": run_scores,
        "score": avg_score,
    })

# ensure_ascii=False keeps non-ASCII characters in the answers readable in the
# output file instead of escaping them.
with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\nJSON written to: {OUTPUT_JSON_PATH}")

# Aggregation & print summary
results_df = pd.DataFrame(results)

# Mean per-prompt score (0..2 scale) over the prompts tagged with each letter.
letter_column = results_df["letter"]
T_mean = results_df.loc[letter_column == "T", "score"].mean()
E_mean = results_df.loc[letter_column == "E", "score"].mean()
P_mean = results_df.loc[letter_column == "P", "score"].mean()
I_mean = results_df.loc[letter_column == "I", "score"].mean()

# Weighted overall score: T and E count 0.30 each, P and I count 0.20 each.
L4_score = 0.30 * T_mean + 0.30 * E_mean + 0.20 * P_mean + 0.20 * I_mean

# Percentages (2 means 100%)
T_pct = T_mean / 2 * 100
E_pct = E_mean / 2 * 100
P_pct = P_mean / 2 * 100
I_pct = I_mean / 2 * 100
L4_pct = L4_score / 2 * 100

# Build the whole report first, then emit it line by line.
summary_lines = [
    "\n=== Sub-indicator Mean Scores ===",
    f"T (Tutorial Structure):             {T_mean:.3f}   ({T_pct:.1f}%)",
    f"E (Explanation Quality):            {E_mean:.3f}   ({E_pct:.1f}%)",
    f"P (Progressive Skill-Building):     {P_mean:.3f}   ({P_pct:.1f}%)",
    f"I (Interactive Clarification):      {I_mean:.3f}   ({I_pct:.1f}%)",
    f"\nOverall L4 Score (User Education Scaffolds): {L4_score:.3f}   ({L4_pct:.1f}%)",
]
for summary_line in summary_lines:
    print(summary_line)

Processing 1/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 2/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 3/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 4/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 5/20 | Letter=T
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 6/20 | Letter=E
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 7/20 | Letter=E
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 8/20 | Letter=E
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 9/20 | Letter=E
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 10/20 | Letter=E
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 11/20 | Letter=P
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 12/20 | Letter=P
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run 5/5
Processing 13/20 | Letter=P
  Run 1/5
  Run 2/5
  Run 3/5
  Run 4/5
  Run