In [4]:
# ==== 0) Setup (Colab-friendly) ====
!pip -q install google-generativeai>=0.7.2

import os, json, re, textwrap
from typing import List, Dict, Any
from getpass import getpass
from datetime import datetime
import google.generativeai as genai

# Secure API key (Colab / local): prompt interactively only when the
# environment variable is missing or empty, then hand the key to the SDK.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Enter your GEMINI_API_KEY: ")
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# -------------------------
# 1) File paths (uploaded)
# -------------------------
# One dataset file per domain; the relative paths are opened as-is by load_json.
PATHS = [
    f"{domain}.json"
    for domain in ("teaching", "psychology", "it", "engineering", "accounting")
]

def load_json(path: str):
    """Read the UTF-8 encoded file at *path* and return its parsed JSON content."""
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

# -------------------------
# 2) Rubric -> readable text
# -------------------------
def rubric_to_text(rubric: Dict[str, Any]) -> str:
    """
    Render a rubric dict as an indented plain-text outline for the model.

    The output starts with the rubric ID (or a placeholder when absent) and a
    "Criteria:" heading, followed by one "- id: name" entry per criterion.
    A criterion's description and performance descriptors are listed beneath
    it only when they are present and non-empty.
    """
    out = [
        f"Rubric ID: {rubric.get('rubric_id', 'rubric_id_unknown')}",
        "Criteria:",
    ]
    for criterion in rubric.get("criteria", []):
        out.append(f"- {criterion.get('criterion_id', '')}: {criterion.get('name', '')}")

        description = criterion.get("description", "")
        if description:
            out.append(f"  Description: {description}")

        descriptors = criterion.get("performance_descriptors", {})
        if descriptors:
            out.append("  Performance descriptors:")
            out.extend(f"    - {level}: {text}" for level, text in descriptors.items())
    return "\n".join(out)

# -------------------------
# 3) Base prompts (YOUR VERSION: plain text only)
# -------------------------
# Shared system instruction sent with every request built in this file.
SYSTEM_PROMPT = "You are a careful academic assistant. Be precise and give clear structured output (not JSON, not CSV, no files)."

def build_detection_prompt(submission: str, few_shots: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """
    Build the chat messages for the Human / AI / Hybrid source classifier.

    Each entry of *few_shots* is rendered as a labelled example from its
    "final_submission" and "label_type" keys (missing keys render as empty
    text); when there are no examples a placeholder line is used instead.

    The prompt asks the model for plain text shaped like:
        Label: ...
        Rationale:
        - point 1
        - point 2
        Flags: ...

    Returns a two-element list: the system message, then the user message.
    """
    rendered_shots = [
        (
            f'Submission: """{shot.get("final_submission","")}"""\n'
            f'Your analysis (2–4 bullet points): <analysis>\n'
            f'Label: {shot.get("label_type","")}\n'
        )
        for shot in few_shots
    ]
    if rendered_shots:
        examples_block = "\n\n".join(rendered_shots)
    else:
        examples_block = "/* no examples available */"

    user_prompt = f"""
You are an AI text-source classifier for academic integrity.
Decide whether the student submission is Human, AI, or Hybrid (AI-assisted).

Guidelines:
- Consider discourse features (specificity, subjectivity, personal context), style consistency, local/global coherence, repetitiveness, and cliché patterns.
- Hybrid = meaningful human writing with some AI assistance, or explicit admission of mixed use.

Examples:
{examples_block}

Now analyze the NEW submission and respond in plain text with the following structure:
Label: ...
Rationale:
- point 1
- point 2
Flags: ...
NEW submission:
\"\"\"{submission}\"\"\"\n
"""
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]

def build_feedback_prompt(domain: str, assignment_prompt: str, rubric_text: str, submission: str) -> List[Dict[str, str]]:
    """
    Build the chat messages for rubric-aligned, plain-text feedback.

    The user message embeds the domain, the assignment prompt, the rubric
    text and the student's submission, and asks the model for three
    sections: an overall summary, per-criterion feedback (criterion, rating,
    evidence bullets, one improvement tip) and an overall rating.

    Returns a two-element list: the system message, then the user message.
    """
    user_prompt = f"""
You are a supportive assessor. Provide actionable feedback aligned to the rubric.
Return plain structured text only (no JSON, no files).

Sections to include:
1) Overall Summary: 2–4 sentences on strengths and priorities.
2) Criteria Feedback: for each rubric criterion include:
   - Criterion
   - Rating (excellent, good, average, needs_improvement, poor)
   - Evidence (1–3 bullet points citing excerpts or behaviors)
   - Improvement Tip (one concrete step)
3) Overall Rating: Excellent | Good | Average | Needs Improvement | Poor

Context:
- Domain: {domain}
- Assignment prompt: {assignment_prompt}

Rubric (verbatim):
{rubric_text}

Student submission:
\"\"\"{submission}\"\"\"\n
"""
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]

# ------------------------------------------------
# 4) Gemini call helper (no 'system' role allowed)
# ------------------------------------------------
def _content_as_text(content):
    """Coerce a message's content to a string (lists are joined line by line, None becomes "")."""
    if content is None:
        return ""
    if isinstance(content, list):
        return "\n".join(str(x) for x in content)
    return str(content)


def _to_gemini_contents(messages):
    """Convert chat-style messages to Gemini `contents`, folding system text into the first turn."""
    system_buf = []
    turns = []
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        if role == "system":
            system_buf.append(_content_as_text(content))
        else:
            # Gemini calls the assistant side of a conversation "model".
            if role == "assistant":
                role = "model"
            turns.append({"role": role, "content": content})

    # Always send at least one turn, even if only system messages were given.
    if not turns:
        turns = [{"role": "user", "content": ""}]

    if system_buf:
        preface = "\n".join(system_buf).strip()
        # Coerce before concatenating so a non-string first message cannot
        # raise TypeError while the preface is prepended.
        first_text = _content_as_text(turns[0].get("content", ""))
        turns[0]["content"] = (preface + "\n\n" + first_text).strip()

    return [{"role": t["role"], "parts": [t["content"]]} for t in turns]


def _extract_text(resp):
    """Return the response's text, or "" when it carries none (e.g. a blocked reply)."""
    # The SDK's `text` quick accessor raises ValueError when the response has
    # no usable part; `hasattr` would let that propagate, so guard explicitly.
    try:
        text = resp.text
    except (ValueError, AttributeError, IndexError):
        text = None
    if isinstance(text, str) and text.strip():
        return text.strip()

    # Fallback: stitch the text of the first candidate's parts together.
    try:
        cand0 = resp.candidates[0]
        parts = getattr(cand0, "content", cand0).parts
        return "".join(getattr(p, "text", "") for p in parts).strip()
    except Exception:
        # Deliberate best-effort: an unreadable response is reported as "no output".
        return ""


def call_gemini(messages, model_name="gemini-1.5-pro", temperature=0.2, max_output_tokens=1536):
    """
    Send chat-style messages to a Gemini model and return its plain-text reply.

    google-generativeai >= 0.7.x
    - Merges any 'system' content into the first non-system message
    - Maps the 'assistant' role to Gemini's 'model' role
    - Converts to 'parts' format
    - Requests plain text (no JSON)

    Args:
        messages: list of dicts with "role" and "content" keys.
        model_name: Gemini model identifier passed to `genai.GenerativeModel`.
        temperature: sampling temperature for generation.
        max_output_tokens: upper bound on the length of the reply.

    Returns:
        The stripped reply text, or "" when no text could be read from the
        response. Errors raised by the API call itself are not caught here.
    """
    # 1) Normalise messages into Gemini's contents format
    norm_msgs = _to_gemini_contents(messages)

    # 2) Create model & generate
    model = genai.GenerativeModel(model_name)
    resp = model.generate_content(
        norm_msgs,
        generation_config={
            "temperature": temperature,
            "max_output_tokens": max_output_tokens,
            # no response_mime_type -> plain text
        }
    )

    # 3) Return text robustly
    return _extract_text(resp)

# -------------------------
# 5) Runner
# -------------------------
def run_all(paths=None):
    """
    Run source detection and rubric feedback for every submission in every dataset.

    Args:
        paths: optional iterable of dataset file paths; defaults to the
            module-level PATHS when omitted.

    Returns:
        A list with one dict per dataset holding "path", "domain",
        "assignment_prompt", "n_submissions" and "items". Each item holds the
        1-based "index" of the submission plus the raw "detection" and
        "feedback" text returned by the model.
    """
    all_outputs: List[Dict[str, Any]] = []

    for path in (PATHS if paths is None else paths):
        data = load_json(path)
        domain = data.get("domain", "Unknown")
        assign_prompt = data.get("prompt", "")
        rubric = data.get("rubric", {})
        submissions = data.get("submissions", [])

        rubric_text = rubric_to_text(rubric)
        # Up to two labelled examples are shown per request. Three candidates
        # are kept so one can be dropped when it is the submission under test.
        shot_pool = submissions[:3]

        dataset_block = {
            "path": path,
            "domain": domain,
            "assignment_prompt": assign_prompt,
            "n_submissions": len(submissions),
            "items": []
        }

        for pos, sub in enumerate(submissions):
            submission_text = sub.get("final_submission", "")

            # Never show a submission as a labelled example in its own prompt:
            # that would hand the classifier the answer it is asked to produce.
            few_shots = [s for j, s in enumerate(shot_pool) if j != pos][:2]

            # A) Detection
            det_msgs = build_detection_prompt(submission_text, few_shots=few_shots)
            det_text = call_gemini(det_msgs, max_output_tokens=1024)

            # B) Feedback
            fb_msgs = build_feedback_prompt(domain, assign_prompt, rubric_text, submission_text)
            fb_text = call_gemini(fb_msgs, max_output_tokens=2048)

            dataset_block["items"].append({
                "index": pos + 1,
                "detection": det_text,
                "feedback": fb_text
            })

        all_outputs.append(dataset_block)

    return all_outputs

# -------------------------
# 6) Execute & print
# -------------------------
if __name__ == "__main__":
    # All model calls finish before anything is printed.
    results = run_all()

    print("\n===== EVALUATION RUN =====")
    print(f"Timestamp: {datetime.now().isoformat(timespec='seconds')}")
    for ds in results:
        # Per-dataset header
        print("\n--------------------------------------------")
        print(f"Source : {ds['path']}")
        print(f"Domain : {ds['domain']}")
        print(f"Prompt : {ds['assignment_prompt']}")
        print(f"#Subs  : {ds['n_submissions']}")
        for item in ds["items"]:
            print(f"\n  Submission #{item['index']}:")
            # Empty model output is shown as a placeholder instead of a blank block.
            detection_text = item["detection"] or "[No output]"
            feedback_text = item["feedback"] or "[No output]"
            print("  --- Detection ---")
            print(textwrap.indent(detection_text, "    "))
            print("  --- Feedback ---")
            print(textwrap.indent(feedback_text, "    "))



===== EVALUATION RUN =====
Timestamp: 2025-09-01T13:19:27

--------------------------------------------
Source : teaching.json
Domain : Teaching
Prompt : Examine current research on early literacy development and evaluate evidence-based approaches for supporting pre-reading skills in diverse learners aged 3 to 6 years.
#Subs  : 6

  Submission #1:
  --- Detection ---
    Label: AI

    Rationale:
    - The text exhibits a high level of academic precision and fluency, characteristic of AI-generated text trained on academic corpora.  The vocabulary and sentence structures are sophisticated, but lack a distinct authorial voice.
    - The paragraph reads as a general overview of the topic, lacking specific examples or deeper critical analysis that would typically be present in human-written academic work.  While it cites sources, it doesn't engage with them deeply.
    - The concluding sentence, "The argument presented is grounded in current evidence, framed within a structured and academ