In [1]:
import json
import re
import time
from statistics import mean, stdev
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

### 🔧 Added by assistant: JSON dataset loader & runner
These cells load `Llama3_CodeQA_llm_as_judge.json` and provide a runner to iterate examples.
If your notebook defines a function like `run_example(example, studentLLM_score)`, the runner will call it for each record.


In [2]:

# === Added: JSON loader (Llama3_CodeQA_llm_as_judge.json) ===
from pathlib import Path as _Path
import json as _json

_JSON_PATH = _Path("Llama3_CodeQA_llm_as_judge.json")

def _extract_scores(_rec):
    def _get_score(_k):
        _v = _rec.get(_k, {})
        if isinstance(_v, dict):
            return int(_v.get("score", 0))
        if isinstance(_v, (int, float)):
            return int(_v)
        return 0
    return {
        "accuracy": _get_score("accuracy"),
        "completeness": _get_score("completeness"),
        "relevance": _get_score("relevance"),
        "clarity": _get_score("clarity"),
    }

with _JSON_PATH.open("r", encoding="utf-8") as _f:
    _data = _json.load(_f)

if not isinstance(_data, list):
    raise ValueError("Expected top-level list in Llama3_CodeQA_llm_as_judge.json")

# DATASET: list of (example, studentLLM_score) tuples
DATASET = []
for _rec in _data:
    _example = {
        "id": _rec.get("id"),
        "code": _rec.get("code"),
        "question": _rec.get("question"),
        "reference": _rec.get("reference"),
        "prediction": _rec.get("prediction"),
    }
    _scores = _extract_scores(_rec)
    DATASET.append((_example, _scores))

print(f"Loaded {len(DATASET)} examples from {_JSON_PATH}")


Loaded 21 examples from Llama3_CodeQA_llm_as_judge.json


In [3]:
!pip install langchain langchain-community

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from langchain_community.chat_models import ChatOllama
from langchain.schema import SystemMessage, HumanMessage

In [5]:
def should_teacher_intervene(scores, threshold=0.8):
    """
    Determine if the teacher LLM should intervene based on scores.
    
    Args:
        scores (dict): Dictionary with keys 'accuracy', 'completeness', 'relevance', 'clarity',
                       each with integer score 1 to 3.
        threshold (float): Penalty threshold for intervention.
        
    Returns:
        bool: True if teacher should intervene, False otherwise.
    """
    a = scores['accuracy']
    c = scores['completeness']
    r = scores['relevance']
    l = scores['clarity']

    # Compute weighted penalty
    penalty = 0.5 * (3 - a) + 0.2 * (3 - c) + 0.15 * (3 - r) + 0.15 * (3 - l)

    # Suspicious patterns
    suspicious = (
        (a == 1 and l == 3) or                          # Fluent but inaccurate
        (a >= 2 and r == 1) or                          # Correct but off-topic
        (a == 2 and r == 2 and l == 2) or               # Ambiguous scores
        (a + r + l <= 5) or                             # Generally low scores
        (abs(a - r) >= 2) or                            # Large inconsistency
        (a == 3 and (r <= 1 or l <= 1))                 # Overconfident accuracy
    )

    # Decide intervention
    return penalty > threshold or suspicious


In [6]:
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_community.chat_models import ChatOllama

STUDENT_MODEL = "llama3.1:8b"
TEACHER_MODEL = "llama3.1:8b"
SCORE_MAX = 3


def call_llm_LangChain_correct_v2(prompt, temperature=0.0):
    llm = ChatOllama(
        model="llama3.1:8b",
        temperature=temperature
    )

    system_prompt = """

You are a large language model acting as a judge for assessing the performance of a Teaching Assistant (TA) in an introductory Python programming course.

The TA is an LLM that answers student questions about Python code. Your job is to evaluate the quality of the TA's answer.

You will receive:
- A Python code snippet
- A student question about that code
- A reference (correct) answer
- A TA LLM-generated answer (called the prediction)

Your task is to evaluate how well the TA's prediction answers the student's question, using the following four dimensions. For each, provide:
- An integer score from 1 to 3



### Accuracy
Compare the prediction with the reference to assess factual correctness and understanding of the code’s behavior and intent.
You must judge whether the prediction reflects accurate behavior and matches core facts from the reference. 
You need to consider semantic meaning of code comprehension: understanding the structure, functionality, and intent behind the code.\n"

Score meanings:
- 1: Completely incorrect or irrelevant; does not address the reference answer.
- 2: Partially correct; some key facts are accurate, but major details are wrong or missing.
- 3: Fully correct; matches the reference answer in meaning and factual content.

### Completeness
Check if the prediction covers all important parts of the reference answer, including key concepts or conditions.

Score meanings:
- 1: Omits most key information or contains only a tiny fragment of relevant content.
- 2: Covers some elements but misses important parts.
- 3: Fully covers all essential information from the reference.

### Relevance
Assess whether the prediction directly addresses the question and stays on-topic.

Score meanings:
- 1: Completely irrelevant or mostly unrelated.
- 2: Partially related but misses the main point.
- 3: Fully focused and directly answers the question.

### Clarity
Evaluate how clearly and logically the prediction is expressed, ensuring it is easy to understand.

Score meanings:
- 1: Confusing, vague, or incoherent.
- 2: Understandable but awkwardly phrased or slightly unclear.
- 3: Clear, concise, and easy to follow.


Example:

Code:
```python
def count_even(nums):
    total = 0
    for x in nums:
        if x % 2 == 0:
            total += 1
    return total
Question: What does this function return when given a list of integers?
Reference Answer: It returns the count of even numbers in the list.
Prediction: It returns the count of odd numbers in the list.

Evaluation Output:
{

"accuracy": { "score": 1 },
"completeness": { "score": 1 },
"relevance": { "score": 2 },
"clarity": { "score": 3 }

}

Final Instructions:
For the given input (code, question, reference answer, and prediction), evaluate the prediction on the four metrics defined above.
Base your evaluation strictly on the content provided. Do not hallucinate missing information. Be consistent and objective.
Do not include reasoning or explanations.

Respond only with a JSON object in the exact format:
{
"accuracy": { "score": 1-3},
"completeness": {"score": 1-3},
"relevance": {"score": 1-3},
"clarity": {"score": 1-3}
}
"""

  

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=prompt)
    ]

    response = llm.invoke(messages)
    return response.content


In [7]:
def call_teacher_single_turn(code, question, reference, prediction, scores):
    # Build teacher system prompt
    system_prompt = """
You are a 70B teacher LLM model reviewing a student LLM-as-judge's evaluation of a Teaching Assistant's (TA) answer.

You will receive:
- Python code snippet
- Student's question
- Reference (correct) answer
- TA's predicted answer
- Scores assigned by the student LLM-as-judge

Your task:
- Examine the TA's predicted answer in context of the code, question, and reference.
- For any dimension (Accuracy, Completeness, Relevance, Clarity) where the score is less than 3,
  provide clear, concise feedback (2–4 sentences) explaining what could be improved.
- If a dimension has no issues, do not include it in your response.

Respond ONLY with a JSON object where keys are the dimension names (lowercase)
and values are the feedback strings.

Example output:
{
  "accuracy": "The prediction misrepresents the function’s return value.",
  "clarity": "The explanation lacks structure and is hard to follow."
}

Rubric:

### Accuracy
- 1: Completely incorrect or irrelevant.
- 2: Partially correct but with major mistakes or omissions.
- 3: Fully correct and matches the reference.

### Completeness
- 1: Omits most key information.
- 2: Covers some but misses important parts.
- 3: Fully covers all essential information.

### Relevance
- 1: Irrelevant or mostly unrelated.
- 2: Partially related but misses main point.
- 3: Fully focused and directly addresses the question.

### Clarity
- 1: Confusing, vague, or incoherent.
- 2: Understandable but awkwardly phrased or unclear.
- 3: Clear, concise, and easy to understand.
"""

    # Format input for the teacher
    user_prompt = f"""
```python
{code}
Question:
{question}

Reference Answer:
{reference}

TA's Predicted Answer:
{prediction}
"""

    try:
        llm = ChatOllama(model=TEACHER_MODEL, temperature=0.0)
        messages = [
            SystemMessage(content=system_prompt.strip()),
            HumanMessage(content=user_prompt.strip())
        ]
        response = llm.invoke(messages)
        critiques = json.loads(response.content.strip())
        return critiques
    except Exception as e:
        print(f"Error: {str(e)}")
        return {}


In [8]:
def student_reflect_and_revise(code, question, reference, prediction, old_score, critiques):
    print("\n Student reflecting on teacher feedback...\n")

    critique_text = "\n".join(
        f"{dim.upper()} Feedback: {critique.strip()}" for dim, critique in critiques.items()
    )

    prompt = f"""
Code:
```python
{code}
Question:
{question}

Reference Answer:
{reference}

TA's Predicted Answer:
{prediction}

Teacher's Feedback:
{critique_text}

Old prediction:
{old_score}

You have to update the old prediction by considering Teacher's Feedback. Please re-evaluate the TA's answer using:

Accuracy

Completeness

Relevance

Clarity

Only return JSON:
{{
"accuracy": {{ "score": 1-3 }},
"completeness": {{ "score": 1-3 }},
"relevance": {{ "score": 1-3 }},
"clarity": {{ "score": 1-3 }}
}}
"""

    try:
        response = call_llm_LangChain_correct_v2(prompt, temperature=0.0)
        revised_scores = json.loads(response)
        print(" Revised Scores:\n", revised_scores)
    except Exception as e:
        print(" Failed to parse revised scores:", str(e))
        revised_scores = None

    return revised_scores



In [9]:
print("### SINGLE-TURN PIPELINE ###")


### SINGLE-TURN PIPELINE ###


In [10]:
# Step 1: Student judge model outputs scores
# The output is from CS1QA dataset on CodeLlama
#)


# example = {"id": "q132",
#    "code": "from cs1robots import* load_world(\"worlds/harvest1.wld\") #load world(\"worlds/harvest3.wld\") #load world(\"worlds/harvest4.wld\") hubo=Robot() hubo.set_trace('blue') def harvest(): if hubo.on_beeper(): hubo.pick_beeper() else: hubo.move() def right(): for i in range(3): hubo.turn_left() def right_up(): if hubo.on_beeper(): hubo.pick_beeper() hubo.turn_left() hubo.move() hubo.turn_left() def left_up(): right() hubo.move() right() def loop(): while hubo.front_is_clear(): harvest() right_up() while hubo.front_is_clear(): harvest() for i in range(2): loop() left_up() loop()",
#     "question": "In task4, do I have to pass only the beeper?",
#     "reference": "No, you don't need to, just pick up all the beepers!",
#     "prediction": "\nHello! I understand that you are asking about a specific task related to programming in Python. However, your question is quite broad and doesn't provide enough context for me to fully understand what you need help with. Could you please provide more details or clarify your question so that I can better assist you?",
#   }
# studentLLM_score = {"accuracy": 1, "completeness": 2, "relevance": 1, "clarity": 3}

In [13]:


# Step 2 (batch): teacher checks & feedback over the whole JSON dataset, with compact output

from pathlib import Path
import json

JSON_PATH = Path("Llama3_CodeQA_llm_as_judge.json")

def _extract_scores(rec):
    def _get(k):
        v = rec.get(k, {})
        if isinstance(v, dict):
            return int(v.get("score", 0))
        if isinstance(v, (int, float)):
            return int(v)
        return 0
    return {
        "accuracy": _get("accuracy"),
        "completeness": _get("completeness"),
        "relevance": _get("relevance"),
        "clarity": _get("clarity"),
    }

# Load the dataset (top-level list of records)
with JSON_PATH.open("r", encoding="utf-8") as f:
    records = json.load(f)
if not isinstance(records, list):
    raise ValueError("Expected a list at the top level of the JSON file.")

results = []  # collect a minimal summary if you want to use it later

for idx, rec in enumerate(records, 1):
    # Build the per-example payload expected by your teacher functions
    example_local = {
        "id": rec.get("id"),
        "code": rec.get("code"),
        "question": rec.get("question"),
        "reference": rec.get("reference"),
        "prediction": rec.get("prediction"),
    }
    studentLLM_score_local = _extract_scores(rec)

    # --- Your existing teacher logic ---
    do_intervene = should_teacher_intervene(studentLLM_score_local, threshold=0.8)

    teacher_feedbacks = {}
    if do_intervene:
        teacher_feedbacks = call_teacher_single_turn(
            example_local["code"],
            example_local["question"],
            example_local["reference"],
            example_local["prediction"],
            studentLLM_score_local
        )

    # --- Compact printout (no “example” spam) ---
    ex_id = example_local.get("id", f"#{idx}")
    if teacher_feedbacks:
        print(f"\n[{ex_id}] feedback:")
        for dim, fb in teacher_feedbacks.items():
            print(f" - {dim}: {fb}")
    else:
        print(f"\n[{ex_id}] no intervention." if not do_intervene else f"\n[{ex_id}] intervention, but no feedback returned.")

    # Save a tiny summary (optional)
    results.append({
        "id": ex_id,
        "intervene": bool(do_intervene),
        "num_feedback_dims": len(teacher_feedbacks),
    })

print(f"\nProcessed {len(records)} examples.")



  llm = ChatOllama(model=TEACHER_MODEL, temperature=0.0)



[q1] feedback:
 - completeness: The predicted answer is missing the crucial detail that it should be a 'set' of test cases, not just any test cases.
 - relevance: Although the predicted answer is related to the question, it doesn't directly address what the code makes. It's more about what the function returns or generates.

[q2] no intervention.

[q3] no intervention.

[q4] feedback:
 - completeness: The TA's answer is partially correct but misses important details. It should specify that the received messages are from a pull subscription.
 - relevance: The TA's answer is mostly relevant to the question but could be more direct and focused on the specific context of a pull subscription.

[q5] feedback:
 - completeness: The TA's answer lacks essential information about how the explicit budget is used, specifically in creating a campaign.
 - relevance: The TA's answer partially addresses the question but misses the main point of how an explicit budget is utilized.

[q6] no intervention

In [16]:
# Step 3: Student reflects and revises scores for each example

for example_local, studentLLM_score_local in DATASET:
    ex_id = example_local.get("id")
    print(f"\n=== Student reflection for example {ex_id} ===")

    # Make sure teacher_feedbacks were computed in Step 2 (or set default)
    teacher_feedbacks_local = globals().get("teacher_feedbacks", {}) or {}

    revised_scores = student_reflect_and_revise(
        example_local["code"],
        example_local["question"],
        example_local["reference"],
        example_local["prediction"],
        studentLLM_score_local,
        teacher_feedbacks_local
    )

    print("Final revised scores (Single Turn):", revised_scores)
    print("-" * 60)



=== Student reflection for example q1 ===

 Student reflecting on teacher feedback...

 Revised Scores:
 {'accuracy': {'score': 2}, 'completeness': {'score': 2}, 'relevance': {'score': 3}, 'clarity': {'score': 2}}
Final revised scores (Single Turn): {'accuracy': {'score': 2}, 'completeness': {'score': 2}, 'relevance': {'score': 3}, 'clarity': {'score': 2}}
------------------------------------------------------------

=== Student reflection for example q2 ===

 Student reflecting on teacher feedback...

 Revised Scores:
 {'accuracy': {'score': 2}, 'completeness': {'score': 2}, 'relevance': {'score': 3}, 'clarity': {'score': 3}}
Final revised scores (Single Turn): {'accuracy': {'score': 2}, 'completeness': {'score': 2}, 'relevance': {'score': 3}, 'clarity': {'score': 3}}
------------------------------------------------------------

=== Student reflection for example q3 ===

 Student reflecting on teacher feedback...

 Revised Scores:
 {'accuracy': {'score': 2}, 'completeness': {'score':

In [17]:

# === Added: Runner over DATASET ===
def _default_run_example(_example, _scores):
    # Fallback if your notebook doesn't define `run_example`
    print(f"[{_example.get('id')}] Q: {_example.get('question')}")
    print(f"   prediction: {_example.get('prediction')}")
    print(f"   reference : {_example.get('reference')}")
    print(f"   scores    : {_scores}")

# Use notebook's `run_example` if it exists, otherwise fallback.
_run_fn = globals().get("run_example", _default_run_example)

for _i, (_example, _scores) in enumerate(DATASET, 1):
    _run_fn(_example, _scores)
    if _i < len(DATASET):
        print("-" * 60)


[q1] Q: What does the code make ?
   prediction: Test Suite
   reference : a suite
   scores    : {'accuracy': 2, 'completeness': 1, 'relevance': 3, 'clarity': 2}
------------------------------------------------------------
[q2] Q: Does the code make a suite ?
   prediction: Yes
   reference : Yes
   scores    : {'accuracy': 3, 'completeness': 3, 'relevance': 3, 'clarity': 3}
------------------------------------------------------------
[q3] Q: Does the code receive a message from a pull subscription ?
   prediction: Yes, it receives messages from a pull subscription.
   reference : Yes
   scores    : {'accuracy': 3, 'completeness': 2, 'relevance': 3, 'clarity': 3}
------------------------------------------------------------
[q4] Q: What does the code receive from a pull subscription ?
   prediction: Yes, messages
   reference : a message
   scores    : {'accuracy': 2, 'completeness': 1, 'relevance': 3, 'clarity': 2}
------------------------------------------------------------
[q5] Q: W

### Final runner (auto-sets `example` & `studentLLM_score` for each record)
This cell loops over `DATASET`, sets `example` and `studentLLM_score` **as globals** (so legacy cells/functions work),
and then calls your pipeline if available:

- If `should_teacher_intervene` exists, it runs your teacher flow.
- Else if `run_example` exists, it calls that.
- Else, it prints a minimal fallback.


In [19]:

# === FINAL DATASET RUNNER ===
from pathlib import Path as _Path
import json as _json

def _extract_scores(_rec):
    def _get_score(_k):
        _v = _rec.get(_k, {})
        if isinstance(_v, dict):
            return int(_v.get("score", 0))
        if isinstance(_v, (int, float)):
            return int(_v)
        return 0
    return {
        "accuracy": _get_score("accuracy"),
        "completeness": _get_score("completeness"),
        "relevance": _get_score("relevance"),
        "clarity": _get_score("clarity"),
    }

# Ensure DATASET exists; if not, build it from JSON
try:
    DATASET  # noqa: F821
except NameError:
    _JSON_PATH = _Path("/mnt/data/Llama3_CodeQA_llm_as_judge.json")
    with _JSON_PATH.open("r", encoding="utf-8") as _f:
        _data = _json.load(_f)
    if not isinstance(_data, list):
        raise ValueError("Expected top-level list in Llama3_CodeQA_llm_as_judge.json")
    DATASET = []
    for _rec in _data:
        _example = {
            "id": _rec.get("id"),
            "code": _rec.get("code"),
            "question": _rec.get("question"),
            "reference": _rec.get("reference"),
            "prediction": _rec.get("prediction"),
        }
        _scores = _extract_scores(_rec)
        DATASET.append((_example, _scores))
    print(f"[Final runner] Built DATASET with {len(DATASET)} examples.")

# Choose behavior based on what's defined in the notebook
_has_teacher = "should_teacher_intervene" in globals()
_has_feedback = "provide_teacher_feedback" in globals()
_has_run_example = "run_example" in globals()

for _idx, (_example, _scores) in enumerate(DATASET, 1):
    # Set globals for legacy code expecting these names
    globals()["example"] = _example
    globals()["studentLLM_score"] = _scores

    _ex_id = _example.get("id", f"#{_idx}")
    print(f"\n=== Running evaluation for example {_ex_id} ({_idx}/{len(DATASET)}) ===")

    try:
        if _has_teacher:
            # Execute your teacher pipeline
            _do_intervene = should_teacher_intervene(_scores, threshold=0.8)
            _teacher_feedbacks = 0
            if _do_intervene and _has_feedback:
                _teacher_feedbacks = provide_teacher_feedback(_example, _scores)

            print(f"Intervention: {_do_intervene}")
            print(f"Feedbacks  : {_teacher_feedbacks}")
            print(f"Scores     : {_scores}")

        elif _has_run_example:
            # Fallback to your per-example function
            run_example(_example, _scores)

        else:
            # Minimal fallback
            print(f"[{_ex_id}] Q: {_example.get('question')}")
            print(f"   prediction: {_example.get('prediction')}")
            print(f"   reference : {_example.get('reference')}")
            print(f"   scores    : {_scores}")
    except Exception as e:
        print(f"[ERROR] Example {_ex_id}: {e}")

    if _idx < len(DATASET):
        print("-" * 60)



=== Running evaluation for example q1 (1/21) ===
Intervention: True
Feedbacks  : 0
Scores     : {'accuracy': 2, 'completeness': 1, 'relevance': 3, 'clarity': 2}
------------------------------------------------------------

=== Running evaluation for example q2 (2/21) ===
Intervention: False
Feedbacks  : 0
Scores     : {'accuracy': 3, 'completeness': 3, 'relevance': 3, 'clarity': 3}
------------------------------------------------------------

=== Running evaluation for example q3 (3/21) ===
Intervention: False
Feedbacks  : 0
Scores     : {'accuracy': 3, 'completeness': 2, 'relevance': 3, 'clarity': 3}
------------------------------------------------------------

=== Running evaluation for example q4 (4/21) ===
Intervention: True
Feedbacks  : 0
Scores     : {'accuracy': 2, 'completeness': 1, 'relevance': 3, 'clarity': 2}
------------------------------------------------------------

=== Running evaluation for example q5 (5/21) ===
Intervention: True
Feedbacks  : 0
Scores     : {'accurac