In [2]:
# ==== 0) Setup (Colab-friendly) ====
!pip -q install requests

import os, json, re, textwrap, requests
from typing import List, Dict, Any
from getpass import getpass
from datetime import datetime

# Ask for the OpenRouter API key interactively when the environment does not
# already hold a non-empty one (keys look like: sk-or-v1-xxxxxxxx).
if not os.environ.get("OPENROUTER_API_KEY"):
    os.environ["OPENROUTER_API_KEY"] = getpass("Enter your OPENROUTER_API_KEY: ")

OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"]

# Chat-completions endpoint of OpenRouter (OpenAI-compatible schema).
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
# Default model id; swap it here if needed (e.g., anthropic/claude-3.5-sonnet).
MODEL_ID = "anthropic/claude-3.5-haiku"

# -------------------------
# 1) File paths (uploaded)
# -------------------------
# Dataset files processed by run_all(), in this order. The paths are relative,
# so they are resolved against the current working directory. run_all() reads
# the keys "domain", "prompt", "rubric" and "submissions" from each file.
PATHS = [
    "teaching.json",
    "psychology.json",
    "it.json",
    "engineering.json",
    "accounting.json",
]

def load_json(path: str):
    """Read the UTF-8 encoded file at *path* and return its parsed JSON content."""
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# -------------------------
# 2) Rubric -> readable text
# -------------------------
def rubric_to_text(rubric: Dict[str, Any]) -> str:
    """
    Render a rubric dict as an indented plain-text outline for the model.

    The outline starts with the rubric id, followed by one "- id: name" entry
    per criterion. A criterion's description and its performance descriptors
    are listed beneath it only when they are present and non-empty.
    """
    out = [
        f"Rubric ID: {rubric.get('rubric_id', 'rubric_id_unknown')}",
        "Criteria:",
    ]
    for criterion in rubric.get("criteria", []):
        out.append(f"- {criterion.get('criterion_id', '')}: {criterion.get('name', '')}")

        description = criterion.get("description", "")
        if description:
            out.append(f"  Description: {description}")

        descriptors = criterion.get("performance_descriptors", {})
        if descriptors:
            out.append("  Performance descriptors:")
            out.extend(f"    - {level}: {text}" for level, text in descriptors.items())
    return "\n".join(out)

# -------------------------
# 3) Base prompts (YOUR VERSION: plain text only)
# -------------------------
SYSTEM_PROMPT = "You are a careful academic assistant. Be precise and give clear structured output (not JSON, not CSV, no files)."

def build_detection_prompt(submission: str, few_shots: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """
    Build the chat messages for the academic-integrity detector (plain text).

    Args:
        submission: Text of the student submission to classify.
        few_shots: Labelled example records; each is read through its
            "final_submission" and "label_type" keys.

    Returns:
        A two-item message list: the shared system prompt followed by the
        user prompt that carries the guidelines, examples and submission.

    Expected model output:
        Label: Human | AI | Hybrid
        Rationale:
        - short bullet point 1
        - short bullet point 2
        Flags: style_inconsistency / high_verbatim / generic_phrasing / none
    """
    shot_texts = []
    for s in few_shots or []:
        example_text = s.get("final_submission", "")
        example_label = s.get("label_type", "")
        # An example without text or without a label demonstrates nothing.
        if not example_text or not example_label:
            continue
        # Never show the submission under review as its own labelled example:
        # that would hand the expected answer to the model.
        if example_text == submission:
            continue
        # Examples carry only what is actually known (text + gold label); no
        # literal placeholder text is shown for the model to imitate.
        shot_texts.append(
            f'Submission: """{example_text}"""\n'
            f'Label: {example_label}\n'
        )
    examples_block = "\n\n".join(shot_texts) if shot_texts else "/* no examples available */"

    user = f"""
You are an AI text-source classifier for academic integrity.
Decide whether the student submission is Human, AI, or Hybrid (AI-assisted).

Guidelines:
- Consider discourse features (specificity, subjectivity, personal context), style consistency, local/global coherence, repetitiveness, and cliché patterns.
- Hybrid = meaningful human writing with some AI assistance, or explicit admission of mixed use.

Examples:
{examples_block}

Now analyze the NEW submission and respond in plain text with the following structure:
Label: ...
Rationale:
- point 1
- point 2
Flags: ...

NEW submission:
\"\"\"{submission}\"\"\"\n
"""
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user},
    ]

def build_feedback_prompt(domain: str, assignment_prompt: str, rubric_text: str, submission: str) -> List[Dict[str, str]]:
    """
    Build the chat messages for rubric-aligned feedback (plain text).

    Args:
        domain: Subject area label placed in the prompt context.
        assignment_prompt: The task the student was responding to.
        rubric_text: Rubric already flattened to readable text.
        submission: Text of the student submission to assess.

    Returns:
        A two-item message list: the shared system prompt followed by the
        user prompt carrying the instructions, context, rubric and submission.

    Expected model output:
        Overall Summary:
        <2–4 sentence overview>

        Criteria Feedback:
        Criterion: <criterion_id>
        Rating: Excellent | Good | Average | Needs Improvement | Poor
        Evidence:
        - point 1
        - point 2
        Improvement Tip: one concrete suggestion

        Overall Rating: Excellent | Good | Average | Needs Improvement | Poor
    """
    # The per-criterion rating scale is spelled exactly like the overall
    # rating scale so the model is given a single vocabulary to answer with.
    user = f"""
You are a supportive assessor. Provide actionable feedback aligned to the rubric.
Return plain structured text only (no JSON, no files).

Sections to include:
1) Overall Summary: 2–4 sentences on strengths and priorities.
2) Criteria Feedback: for each rubric criterion include:
   - Criterion
   - Rating (Excellent | Good | Average | Needs Improvement | Poor)
   - Evidence (1–3 bullet points citing excerpts or behaviors)
   - Improvement Tip (one concrete step)
3) Overall Rating: Excellent | Good | Average | Needs Improvement | Poor

Context:
- Domain: {domain}
- Assignment prompt: {assignment_prompt}

Rubric (verbatim):
{rubric_text}

Student submission:
\"\"\"{submission}\"\"\"\n
"""
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user},
    ]

# ------------------------------------------------
# 4) Claude (OpenRouter) call helper
# ------------------------------------------------
def call_claude_openrouter(messages: List[Dict[str, str]],
                           model_id: str = MODEL_ID,
                           temperature: float = 0.2,
                           max_tokens: int = 1536,
                           top_p: float = 0.95) -> str:
    """
    Call OpenRouter's OpenAI-compatible /chat/completions endpoint.

    Args:
        messages: Chat messages, each {"role": ..., "content": ...}.
        model_id: OpenRouter model identifier to query.
        temperature: Sampling temperature sent with the request.
        max_tokens: Upper bound on generated tokens sent with the request.
        top_p: Nucleus-sampling value sent with the request.

    Returns:
        The assistant's text content with surrounding whitespace stripped.
        An empty string is returned when the response carries no content, so
        callers can apply their own `or "<fallback>"` handling.

    Raises:
        RuntimeError: On an HTTP error status, a body that is not valid JSON,
            or a JSON body without choices[0].message.content.

    Notes:
    - OpenRouter supports 'system' role with this endpoint; we'll pass it through as-is.
    - We keep outputs plain text (no JSON).
    """
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        # Optional but recommended headers:
        "HTTP-Referer": "https://colab.research.google.com/",
        "X-Title": "AAIE-PlainText-Evaluator",
    }

    payload = {
        "model": model_id,
        "messages": messages,      # [{"role": "system/user/assistant", "content": "..."}]
        "temperature": temperature,
        "max_tokens": max_tokens,
        "top_p": top_p,
    }

    resp = requests.post(OPENROUTER_URL, headers=headers, data=json.dumps(payload), timeout=120)
    try:
        resp.raise_for_status()
    except requests.HTTPError as e:
        # Include server error text to help debug
        raise RuntimeError(f"OpenRouter API error {resp.status_code}: {resp.text}") from e

    try:
        data = resp.json()
    except ValueError as e:
        # A success status with a non-JSON body is reported like other failures.
        raise RuntimeError(f"OpenRouter returned a non-JSON body: {resp.text[:800]}") from e

    # OpenAI-compatible shape: choices[0].message.content
    try:
        content = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as e:
        raise RuntimeError(f"Unexpected response format: {json.dumps(data)[:800]}") from e

    if content is None:
        # str(None) would yield the literal text "None"; report "no output" instead.
        return ""
    if isinstance(content, list):
        # Some providers can return a list of segments; join as text
        content = "".join(
            str(seg.get("text") or "") if isinstance(seg, dict) else str(seg)
            for seg in content
        )
    return str(content).strip()

# -------------------------
# 5) Runner
# -------------------------
def run_all() -> List[Dict[str, Any]]:
    """
    Run detection and feedback for every submission of every file in PATHS.

    For each dataset file the rubric is flattened once, then each submission
    is sent to the model twice: once with the detection prompt and once with
    the feedback prompt.

    Returns:
        One dict per dataset with the keys "path", "domain",
        "assignment_prompt", "n_submissions" and "items"; each item holds the
        1-based "index" plus the raw "detection" and "feedback" texts.

    Raises:
        Whatever load_json or call_claude_openrouter raise; the first failure
        aborts the run and no partial results are returned.
    """
    all_outputs: List[Dict[str, Any]] = []

    for path in PATHS:
        data = load_json(path)
        domain = data.get("domain", "Unknown")
        assign_prompt = data.get("prompt", "")
        # `or` also covers an explicit JSON null for these keys.
        rubric = data.get("rubric") or {}
        submissions = data.get("submissions") or []

        rubric_text = rubric_to_text(rubric)
        # Keep one spare candidate so two examples remain available after the
        # submission under review is removed from the pool.
        shot_pool = submissions[:3]

        dataset_block = {
            "path": path,
            "domain": domain,
            "assignment_prompt": assign_prompt,
            "n_submissions": len(submissions),
            "items": []
        }

        for idx, sub in enumerate(submissions, start=1):
            submission_text = sub.get("final_submission", "")

            # A submission must never be its own few-shot example: its gold
            # label would otherwise be visible in the prompt that classifies it.
            few_shots = [s for s in shot_pool if s is not sub][:2]

            # A) Detection
            det_msgs = build_detection_prompt(submission_text, few_shots=few_shots)
            det_text = call_claude_openrouter(det_msgs, max_tokens=800)

            # B) Feedback
            fb_msgs = build_feedback_prompt(domain, assign_prompt, rubric_text, submission_text)
            fb_text = call_claude_openrouter(fb_msgs, max_tokens=1600)

            dataset_block["items"].append({
                "index": idx,
                "detection": det_text,
                "feedback": fb_text
            })

        all_outputs.append(dataset_block)

    return all_outputs

# -------------------------
# 6) Execute & print
# -------------------------
if __name__ == "__main__":
    # Collect every model response first, then print a single report.
    results = run_all()

    print("\n===== CLAUDE (OpenRouter) EVALUATION RUN =====")
    print(f"Timestamp: {datetime.now().isoformat(timespec='seconds')}")

    # (section title, result key) pairs printed for each submission, in order.
    sections = (("Detection", "detection"), ("Feedback", "feedback"))

    for dataset in results:
        header_lines = [
            "\n--------------------------------------------",
            f"Source : {dataset['path']}",
            f"Domain : {dataset['domain']}",
            f"Prompt : {dataset['assignment_prompt']}",
            f"#Subs  : {dataset['n_submissions']}",
        ]
        print("\n".join(header_lines))

        for entry in dataset["items"]:
            print(f"\n  Submission #{entry['index']}:")
            for title, key in sections:
                print(f"  --- {title} ---")
                # Empty model output is replaced by a visible marker.
                print(textwrap.indent(entry[key] or "[No output]", "    "))


RuntimeError: OpenRouter API error 402: {"error":{"message":"This request requires more credits, or fewer max_tokens. You requested up to 1600 tokens, but can only afford 1531. To increase, visit https://openrouter.ai/settings/credits and upgrade to a paid account","code":402,"metadata":{"provider_name":null}},"user_id":"user_31aZdvvNI5alVnWWz4yzn3CHVWq"}