# Notebook 2 — Baseline grader + Golden set evaluation + pick threshold

**Goal:** implement a simple baseline grader, evaluate it on the golden set (`data/golden/golden.csv`), and choose `GRADE_THRESHOLD`.

In [2]:
import os
from pathlib import Path

# Make the project root the working directory so later cells can build paths
# from it. The root is the parent of the first path component named
# "notebooks". Comparing components (instead of splitting the cwd string on a
# hard-coded '\\notebooks') works with both Windows and POSIX separators and
# does not match sibling names such as "notebooks_old".
_cwd_parts = Path.cwd().parts
if "notebooks" in _cwd_parts:
    os.chdir(Path(*_cwd_parts[:_cwd_parts.index("notebooks")]))
# Project root as a plain string; later cells concatenate onto it.
main = os.getcwd()

In [None]:
# 1) Setup
import re, numpy as np, pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

# Locations of the exam database and the golden CSV, resolved against the
# project root held in `main`. Built with pathlib so the separators are correct
# on every OS and so Path methods such as `.exists()` can be called on them.
DB_PATH = Path(main) / "data" / "temporal" / "exams.db"
GOLDEN_PATH = Path(main) / "data" / "golden" / "golden.csv"
import sqlite3
def qsol(ex_id):
    """Return the stored solution for one exercise.

    Reads the ``solution`` column of the ``questions`` table in the SQLite
    database at ``DB_PATH`` for the row whose ``exercise_id`` equals *ex_id*.

    Returns:
        The ``solution`` value of the first matching row, or ``None`` when no
        row matches.
    """
    con = sqlite3.connect(DB_PATH)
    try:
        con.row_factory = sqlite3.Row
        row = con.execute(
            "SELECT solution FROM questions WHERE exercise_id=?", (ex_id,)
        ).fetchone()
    finally:
        # Close even when the query raises (e.g. missing table) so repeated
        # calls do not leave connections to the database file open.
        con.close()
    return row["solution"] if row is not None else None

In [None]:
# 2) Baseline grader (simple, transparent, fast)
# One or more Unicode letters: any word character that is neither a digit nor
# "_". An ASCII-only class would split accented words ("definición" would
# become "definici" + "n"), which distorts every overlap metric built on norm().
_word = re.compile(r"[^\W\d_]+")
def norm(s):
    """Lower-case *s* and reduce it to its alphabetic words joined by spaces.

    Digits, punctuation, underscores and whitespace act as separators and are
    dropped. ``None`` and any other non-string value (for example the NaN that
    pandas uses for a missing cell) normalise to the empty string.

    Returns:
        A string of lower-case words separated by single spaces.
    """
    if not isinstance(s, str):
        return ""
    return " ".join(_word.findall(s.lower()))

def cosine_tfidf(a, b):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    Both texts are passed through ``norm`` and vectorised together over
    unigrams and bigrams; the vectoriser is fitted on just these two documents.

    Returns:
        A non-negative float; 0.0 when the two texts share no terms or when no
        vocabulary can be built from them.
    """
    docs = [norm(a), norm(b)]
    # No max_df cut-off here. With only two documents, a term present in both
    # has a document frequency of 100%, so any max_df below 1.0 discards exactly
    # the shared terms and forces the similarity to 0 (and raises when the two
    # texts are identical, because nothing is left after pruning).
    vec = TfidfVectorizer(min_df=1, ngram_range=(1, 2))
    try:
        X = vec.fit_transform(docs).toarray()
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty, e.g.
        # when both normalised texts are blank.
        return 0.0
    num = (X[0] * X[1]).sum()
    # Small epsilon keeps the division defined when one vector is all zeros.
    den = np.linalg.norm(X[0]) * np.linalg.norm(X[1]) + 1e-9
    return float(num / den)

def jaccard(a, b):
    """Return the Jaccard index of the word sets of two texts.

    Each text is normalised with ``norm`` and split on whitespace; the result
    is |intersection| / |union| of the two word sets, or 0.0 when both sets
    are empty.
    """
    words_a = set(norm(a).split())
    words_b = set(norm(b).split())
    union = words_a | words_b
    # The union is empty exactly when both word sets are empty.
    if not union:
        return 0.0
    shared = words_a & words_b
    return float(len(shared) / len(union))

# Common English function words that never count as keywords.
STOP = {
    "the", "and", "or", "to", "of", "in", "on", "for", "with",
    "a", "an", "is", "are", "be", "that", "this",
}
def keyword_overlap(solution, student, k=8):
    """Return the fraction of the solution's top keywords found in the answer.

    Keywords are the *k* most frequent normalised words of *solution* that are
    at least four characters long and not in ``STOP``. The result is the share
    of those keywords that appear among the normalised words of *student*;
    0.0 when the solution yields no candidate keywords.
    """
    from collections import Counter

    candidates = [
        tok for tok in norm(solution).split()
        if tok not in STOP and len(tok) >= 4
    ]
    if not candidates:
        return 0.0
    keywords = [word for word, _count in Counter(candidates).most_common(k)]
    student_words = set(norm(student).split())
    hits = sum(1 for word in keywords if word in student_words)
    # max(1, ...) guards the division when k selects no keywords at all.
    return float(hits / max(1, len(keywords)))

def baseline_grade(solution, student):
    """Grade a student answer against the reference solution.

    Combines three similarity signals into one score:
    0.6 * TF-IDF cosine + 0.25 * Jaccard + 0.15 * keyword overlap (top 8).

    Returns:
        A dict with ``score`` (the weighted float), ``reasons`` (the three
        component values formatted to two decimals) and ``hint`` (a fixed
        feedback message).
    """
    cos_sim = cosine_tfidf(solution, student)
    jac_sim = jaccard(solution, student)
    kw_sim = keyword_overlap(solution, student, k=8)
    weighted = 0.6*cos_sim + 0.25*jac_sim + 0.15*kw_sim
    return {
        "score": float(weighted),
        "reasons": f"Cosine={cos_sim:.2f}, Jaccard={jac_sim:.2f}, Keywords={kw_sim:.2f}",
        "hint": "Revisa los pasos clave y la definición usada.",
    }

In [None]:
# 3) Load golden set, attach solutions
# GOLDEN_PATH may be a plain string, so wrap it before calling Path methods.
golden_file = Path(GOLDEN_PATH)
if not golden_file.exists():
    # Explicit exception rather than assert: asserts are stripped under -O.
    raise FileNotFoundError(f"Golden set not found; put your golden.csv at {golden_file}")
df = pd.read_csv(golden_file)
# Attach the reference solution of each exercise; rows whose exercise has no
# stored solution cannot be graded and are dropped.
df["solution"] = df["exercise_id"].map(qsol)
df = df.dropna(subset=["solution"]).reset_index(drop=True)
# Binary target: only "correct" is positive; "partial" counts as not correct.
label_map = {"correct":1, "partial":0, "incorrect":0}
y_true = df["label"].map(label_map)
if y_true.isna().any():
    # Name the offending labels instead of failing inside the integer cast.
    unknown = sorted(df.loc[y_true.isna(), "label"].astype(str).unique())
    raise ValueError(f"Unexpected labels in golden set: {unknown}")
df["y_true"] = y_true.astype(int)
print("Golden rows ready:", len(df))
df.head(2)

In [None]:
# 4) Score with baseline
# Grade every student answer against its reference solution, keeping only the
# numeric score from the grader's result.
df["score"] = [
    baseline_grade(reference, answer)["score"]
    for reference, answer in zip(df["solution"], df["student_answer"])
]
# Mean score per label, highest first.
df.groupby("label")["score"].mean().sort_values(ascending=False)

In [None]:
# 5) Threshold sweep → pick best F1
# 23 candidate thresholds evenly spaced over [0.30, 0.85], rounded to 2 decimals.
THRS = np.round(np.linspace(0.30, 0.85, 23), 2)
def eval_thr(t):
    """Evaluate the golden set at decision threshold *t*.

    A row is predicted "correct" (1) when its ``score`` is >= *t*. Predictions
    are compared with ``df["y_true"]``.

    Returns:
        A dict with the threshold ``t`` and the resulting ``acc``, ``prec``,
        ``rec`` and ``f1`` (binary averaging; undefined ratios reported as 0).
    """
    y_pred = (df["score"].values >= t).astype(int)
    acc = accuracy_score(df["y_true"], y_pred)
    pr, rc, f1, _ = precision_recall_fscore_support(
        df["y_true"], y_pred, average="binary", zero_division=0
    )
    return dict(t=t, acc=acc, prec=pr, rec=rc, f1=f1)

# Stable sort: thresholds that tie on F1 keep their sweep order, so the pick is
# reproducible (the lowest tied threshold comes first) instead of depending on
# the default, non-stable sort algorithm.
sweep = pd.DataFrame([eval_thr(t) for t in THRS]).sort_values("f1", ascending=False, kind="mergesort")
best = sweep.iloc[0]
best

In [None]:
# 6) Report @ best threshold
t = float(best["t"])
pred = (df["score"] >= t).astype(int)
print("Best threshold:", t)
# Pin the label set to [0, 1]. Without it, a run in which only one class occurs
# in y_true/pred makes classification_report reject the two target_names and
# collapses the confusion matrix to 1x1.
print(classification_report(df["y_true"], pred, labels=[0, 1], target_names=["not-correct","correct"], zero_division=0))
from sklearn.metrics import confusion_matrix
# Rows are true classes, columns are predicted classes, both ordered [0, 1].
print("Confusion matrix:\n", confusion_matrix(df["y_true"], pred, labels=[0, 1]))

In [None]:
# 7) Print the suggested value to copy into .env (added later in the app)
suggested_threshold = round(t, 2)
print(">>> Suggested GRADE_THRESHOLD =", suggested_threshold)