In [1]:
import re, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import re
from datetime import datetime
import os

os.chdir(os.getcwd().split('\\notebooks')[0])
main = os.getcwd()

In [2]:
ruta = main + '\\data\\processed\\exams_flattened.parquet'

In [3]:
df = pd.read_parquet(ruta)

In [4]:
df.head()

Unnamed: 0,exam_id,exam_type,date,year,month,day,exercise_id,question,solution,q_clean,topic_pred,topic_top2
0,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 1,Consider a vector space V and a subset W of V....,. Before starting recall that spanW is the sma...,consider a vector space v and a subset w of v....,vector_spaces,"[vector_spaces, normed_vector_spaces]"
1,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 2,Consider a metric space (X;d). 1. Give the deÖ...,. See the lecture notes. !,consider a metric space (x;d). 1. give the deö...,metric_spaces,"[metric_spaces, banach_contraction_theorem]"
2,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 3,Consider a metric space (X;d) where X has at l...,. 1. See the lecture notes. 2. SinceX contains...,consider a metric space (x;d) where x has at l...,banach_contraction_theorem,"[banach_contraction_theorem, brouwer_fixed_poi..."
3,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 4,This exercise has two parts. 1. Referring to t...,1. (a) The steady state p =(x ;y ) such that f...,this exercise has two parts. 1. referring to t...,linear_functionals_and_operators,"[linear_functionals_and_operators, isomorphisms]"
4,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 5,State and prove Blackwellís theorem.,. See the lecture notes. !,state and prove blackwellís theorem.,banach_contraction_theorem,"[banach_contraction_theorem, brouwer_fixed_poi..."


# Baseline grading

In [5]:
if "topic_pred" not in df.columns:
    df["topic_pred"] = "unknown"

# Simple cleaner + keyword extractor
STOP = set("""the a an and or of to for with from in on at is are be was were by as that this these those into over under if then else such
we you they he she it there here whose which when where how what why can could should would may might not no yes""".split())

def clean(s:str) -> str:
    s = (s or "").lower()
    s = re.sub(r"\(cid:\d+\)", " ", s)
    s = re.sub(r"[^a-z0-9\-\+\*/\^\=\(\)\[\]\{\}\., ]+", " ", s)
    return re.sub(r"\s+"," ", s).strip()

def keywords(s:str):
    toks = re.findall(r"[a-z0-9\^\+\-\*/=]+", clean(s))
    return {t for t in toks if len(t)>=2 and t not in STOP}

def grade_answer(solution:str, student:str):
    sol = clean(solution); ans = clean(student)
    # char-level cosine (robust to typos)
    vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=1)
    X = vec.fit_transform([sol, ans])
    X = normalize(X)
    cos = float((X[0] @ X[1].T).A[0,0]) if X.shape[1] else 0.0
    # keyword Jaccard
    K_sol, K_ans = keywords(sol), keywords(ans)
    jac = len(K_sol & K_ans) / max(1, len(K_sol | K_ans))
    # final score
    score = 0.6*cos + 0.4*jac
    missing = list((K_sol - K_ans))[:8]
    return {
        "score": round(float(score), 4),
        "correct": bool(score >= 0.3),           # tune threshold
        "cosine": round(cos,4),
        "jaccard": round(jac,4),
        "missing_keywords": missing
    }

# Example usage:
# grade_answer(df.loc[0,"solution"], "my attempt ...")


In [6]:
grade_answer(df.loc[0,"solution"], "smallest vector spanw ith the given vectors =w recall")

{'score': 0.3007,
 'correct': True,
 'cosine': 0.3562,
 'jaccard': 0.2174,
 'missing_keywords': ['also',
  'follows',
  'rsubspace',
  'starting',
  'have',
  'contains',
  'wiisav',
  'before']}

Record attemp

In [8]:
# Create an "attempts" table for one user (extend with user_id later)
attempts = []

def submit_answer(exercise_id:str, student_answer:str):
    row = df.loc[df["exercise_id"]==exercise_id].iloc[0]
    g = grade_answer(row["solution"], student_answer)
    rec = {
        "exercise_id": exercise_id,
        "topic": row["topic_pred"],
        "score": g["score"],
        "correct": g["correct"],
        "cosine": g["cosine"],
        "jaccard": g["jaccard"],
        "missing_keywords": g["missing_keywords"],
        "student_answer": student_answer
    }
    attempts.append(rec)
    return rec

# Example (toy):
# submit_answer("Exercise 1", "Linear functional represented by inner product ...")
# pd.DataFrame(attempts)


In [9]:
submit_answer("Exercise 1", "Before starting recall that spanW is the smallest vector subspace that contains W. More- over, we also have that spanW = W (1) i Wiisav W ec \\i t!o W rsubspace If. If W is a vector subspace, it follows that W W and W is a vector subspace. By (1), we ! have that spanW W: Since spanW is the smallest vector subspace that contains W, we have that spanW W: ! We can conclude that spanW =W. Only if. Since spanW is a vector subspace, if W =spanW, then W is a vector subspace.")
pd.DataFrame(attempts)

Unnamed: 0,exercise_id,topic,score,correct,cosine,jaccard,missing_keywords,student_answer
0,Exercise 1,vector_spaces,0.9949,True,0.9915,1.0,[],Before starting recall that spanW is the small...


3) Recommend next questions (focus on weak topics)

In [10]:
import random

def recommend_next(df_questions: pd.DataFrame, attempts_df: pd.DataFrame, k:int=6, mix=0.7):
    if attempts_df.empty:
        # cold start: pick k from diverse topics
        return (df_questions.sample(min(k, len(df_questions)))["exercise_id"].tolist())

    # topic performance
    perf = (attempts_df.groupby("topic")["score"].mean()
            .sort_values(ascending=True))
    weak_topics = perf.index.tolist()

    attempted = set(attempts_df["exercise_id"])
    pool_weak, pool_other = [], []
    for t in weak_topics:
        pool_weak += df_questions[(df_questions["topic_pred"]==t) & (~df_questions["exercise_id"].isin(attempted))]["exercise_id"].tolist()
    pool_other = df_questions[(~df_questions["exercise_id"].isin(attempted)) &
                              (~df_questions["topic_pred"].isin(weak_topics[:1]))]["exercise_id"].tolist()

    n_weak = max(1, int(k*mix))
    choose_weak = random.sample(pool_weak, min(n_weak, len(pool_weak))) if pool_weak else []
    choose_other = random.sample(pool_other, max(0, k-len(choose_weak))) if pool_other else []
    return choose_weak + choose_other

# After a few submit_answer(...) calls:
# recs = recommend_next(df, pd.DataFrame(attempts), k=5)
# recs


# Probar

In [11]:
df[["exercise_id","exam_type","date","topic_pred"]].head(10)

Unnamed: 0,exercise_id,exam_type,date,topic_pred
0,Exercise 1,General,2013-01-31,vector_spaces
1,Exercise 2,General,2013-01-31,metric_spaces
2,Exercise 3,General,2013-01-31,banach_contraction_theorem
3,Exercise 4,General,2013-01-31,linear_functionals_and_operators
4,Exercise 5,General,2013-01-31,banach_contraction_theorem
5,Exercise 6,General,2013-01-31,abstract_equations
6,Exercise 1,General,2013-02-01,linear_functionals_and_operators
7,Exercise 2,General,2013-02-01,metric_spaces
8,Exercise 3,General,2013-02-01,normed_vector_spaces
9,Exercise 4,General,2013-02-01,linear_functionals_and_operators


In [14]:
from IPython.display import display, Markdown
import pandas as pd

# storage for attempts (if you haven't created it yet)
try:
    attempts
except NameError:
    attempts = []

def ask(exercise_id: str):
    row = df.loc[df["exercise_id"]==exercise_id].iloc[0]
    display(Markdown(f"### {exercise_id} · {row['topic_pred']} · {row['date']}\n\n**Question**: {row['question']}"))
    ans = input("Your answer: ")
    res = submit_answer(exercise_id, ans)   # uses grade_answer under the hood
    display(Markdown(f"**Score:** {res['score']:.2f} · {'✅ Correct' if res['correct'] else '❌ Not yet'}\n\n"
                     f"**Missing keywords:** {', '.join(res['missing_keywords']) or '—'}"))
    return res

# example: pick one by id
# ask("Exercise 1")


In [15]:
ask("Exercise 1")

### Exercise 1 · vector_spaces · 2013-01-31

**Question**: Consider a vector space V and a subset W of V. 1. Prove that spanW =W if and only if W is a vector subspace of V.

**Score:** 0.07 · ❌ Not yet

**Missing keywords:** before, rsubspace, recall, more-, starting, =w, only, =spanw

{'exercise_id': 'Exercise 1',
 'topic': 'vector_spaces',
 'score': 0.0659,
 'correct': False,
 'cosine': 0.1099,
 'jaccard': 0.0,
 'missing_keywords': ['before',
  'rsubspace',
  'recall',
  'more-',
  'starting',
  '=w',
  'only',
  '=spanw'],
 'student_answer': 'Consider a metric space (X;d). 1. Give the deÖnition of open set. 2. Give the deÖnition of closed set. 3. Give an example of a set that is both open and closed.'}

In [16]:
def session(k_start=1, k_next=3):
    # start with a random question
    start_qid = df.sample(1)["exercise_id"].iloc[0]
    ask(start_qid)

    # then recommend next ones based on weak topics
    adf = pd.DataFrame(attempts)
    recs = recommend_next(df, adf, k=k_next)
    print("Next recommended:", recs)
    for qid in recs:
        ask(qid)

# session()


In [37]:
session()

### Exercise 6 · abstract_equations · 2013-01-31

**Question**: Consider the Hotelling problem in reduced form max 1 (tlog(9x x ) sub 0<x <9x , x >0 given t t+1 t+1 t 0 x ( t=0 X Write its Bellman equation and solve it by using as a guess the function v : (0; ) R deÖned by 1 ! v(x)=A+Blog(x).

**Score:** 0.00 · ❌ Not yet

**Missing keywords:** see, session, ta

Next recommended: ['Exercise 5', 'Exercise 4', 'Exercise 2']


### Exercise 5 · banach_contraction_theorem · 2013-01-31

**Question**: State and prove Blackwellís theorem.

**Score:** 0.00 · ❌ Not yet

**Missing keywords:** see, lecture, notes

### Exercise 4 · linear_functionals_and_operators · 2013-01-31

**Question**: This exercise has two parts. 1. Referring to the non linear autonomous system x_ =f(x;y) (3) ( y_ =g(x;y) (a) give the deÖnition of asymptotically stable equilibrium point; (b) then, state a stability criterion for the equilibrium point. 2. Consider the non linear system depending on the real parameter a R 2 x =(1+a)x2+y 0 y = ax+(1+a)y 0 # ( (a) Classify with respect to the parameter a, the trajectories of the linearized system in the neighborhood of the origin. (b) For a = 1, represent the horizontal- and vertical-slope isoclines and the direction of the ( orbits in the phase-plane; then, write the solution of the Cauchy problem x(0)=1, y(0)= 1=2 and represent it. (

**Score:** 0.00 · ❌ Not yet

**Missing keywords:** every, system, order, stabilityvialinearization, exercise, 1=2, any, solution

### Exercise 2 · metric_spaces · 2013-01-31

**Question**: Consider a metric space (X;d). 1. Give the deÖnition of open set. 2. Give the deÖnition of closed set. 3. Give an example of a set that is both open and closed.

**Score:** 0.88 · ✅ Correct

**Missing keywords:** —

In [17]:
def progress():
    adf = pd.DataFrame(attempts)
    if adf.empty:
        print("No attempts yet."); return
    print("\nAvg score by topic:")
    print(adf.groupby("topic")["score"].mean().sort_values())
    print("\nOverall:")
    print(adf["score"].describe()[["count","mean","min","max"]])

progress()



Avg score by topic:
topic
vector_spaces    0.685233
Name: score, dtype: float64

Overall:
count    3.000000
mean     0.685233
min      0.065900
max      0.994900
Name: score, dtype: float64


In [41]:
# Save
pd.DataFrame(attempts).to_csv(main+"\\data\\temporal\\attempts.csv", index=False)

# Persistance

## Phase A

In [18]:
import sqlite3, json, pandas as pd, re

In [19]:
DB_PATH = main + "\\data\\temporal\\exams.db"

In [None]:
# --- fallback baseline grader if you don't have one already ---
try:
    grade_answer
except NameError:
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.preprocessing import normalize
    def _clean(s:str) -> str:
        s = (s or "").lower()
        s = re.sub(r"\(cid:\d+\)", " ", s)
        s = re.sub(r"[^a-z0-9\-\+\*/\^\=\(\)\[\]\{\}\., ]+", " ", s)
        return re.sub(r"\s+"," ", s).strip()
    STOP = set("the a an and or of to for with from in on at is are be was were by as that this these those into over under if then else such".split())
    def _keywords(s:str):
        toks = re.findall(r"[a-z0-9\^\+\-\*/=]+", _clean(s))
        return {t for t in toks if len(t)>=2 and t not in STOP}
    def grade_answer(solution:str, student:str):
        sol = _clean(solution); ans = _clean(student)
        vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=1)
        X = vec.fit_transform([sol, ans]); X = normalize(X)
        cos = float((X[0] @ X[1].T).A[0,0]) if X.shape[1] else 0.0
        Ks, Ka = _keywords(sol), _keywords(ans)
        jac = len(Ks & Ka)/max(1,len(Ks|Ka))
        score = 0.6*cos + 0.4*jac
        return {"score": round(score,4), "correct": score>=0.6, "cosine": round(cos,4), "jaccard": round(jac,4),
                "missing_keywords": list((Ks-Ka))[:8]}

# --- 1) init schema ---
def init_db(db_path=DB_PATH):
    con = sqlite3.connect(db_path)
    cur = con.cursor()
    cur.executescript("""
    PRAGMA journal_mode=WAL;
    CREATE TABLE IF NOT EXISTS exams(
      exam_id TEXT PRIMARY KEY,
      exam_type TEXT,
      date TEXT,
      year INTEGER,
      month INTEGER,
      day INTEGER
    );

    CREATE TABLE IF NOT EXISTS questions(
      exercise_id TEXT PRIMARY KEY,
      exam_id TEXT,
      question TEXT,
      solution TEXT,
      topic_pred TEXT,
      topic_score REAL,
      difficulty INTEGER,
      FOREIGN KEY(exam_id) REFERENCES exams(exam_id)
    );

    CREATE TABLE IF NOT EXISTS users(
      user_id INTEGER PRIMARY KEY AUTOINCREMENT,
      username TEXT UNIQUE
    );

    CREATE TABLE IF NOT EXISTS attempts(
      attempt_id INTEGER PRIMARY KEY AUTOINCREMENT,
      user_id INTEGER,
      exercise_id TEXT,
      ts TEXT DEFAULT (datetime('now')),
      score REAL,
      correct INTEGER,
      cosine REAL,
      jaccard REAL,
      missing_keywords TEXT,
      student_answer TEXT,
      FOREIGN KEY(user_id) REFERENCES users(user_id),
      FOREIGN KEY(exercise_id) REFERENCES questions(exercise_id)
    );

    CREATE INDEX IF NOT EXISTS idx_attempts_user ON attempts(user_id);
    CREATE INDEX IF NOT EXISTS idx_attempts_ex ON attempts(exercise_id);
    """)
    con.commit()
    con.close()

init_db()
print("DB initialized at", DB_PATH)

# --- 2) load df -> DB (upsert) ---
def upsert_questions_from_df(df: pd.DataFrame, db_path=DB_PATH):
    required_cols = {"exam_id","exam_type","date","year","month","day","exercise_id","question","solution"}
    missing = required_cols - set(df.columns)
    if missing:
        raise ValueError(f"df is missing columns: {missing}")

    con = sqlite3.connect(db_path)
    cur = con.cursor()
    for _, r in df.iterrows():
        # exams
        cur.execute("""
            INSERT INTO exams(exam_id, exam_type, date, year, month, day)
            VALUES(?,?,?,?,?,?)
            ON CONFLICT(exam_id) DO UPDATE SET
              exam_type=excluded.exam_type, date=excluded.date,
              year=excluded.year, month=excluded.month, day=excluded.day
        """, (r["exam_id"], r["exam_type"], r["date"], int(r["year"]) if pd.notna(r["year"]) else None,
              int(r["month"]) if pd.notna(r["month"]) else None, int(r["day"]) if pd.notna(r["day"]) else None))
        # questions
        cur.execute("""
            INSERT INTO questions(exercise_id, exam_id, question, solution, topic_pred, topic_score, difficulty)
            VALUES(?,?,?,?,?,?,?)
            ON CONFLICT(exercise_id) DO UPDATE SET
              exam_id=excluded.exam_id, question=excluded.question, solution=excluded.solution,
              topic_pred=excluded.topic_pred, topic_score=excluded.topic_score, difficulty=excluded.difficulty
        """, (r["exercise_id"], r["exam_id"], r["question"], r["solution"],
              r.get("topic_pred"), float(r.get("topic_score")) if pd.notna(r.get("topic_score", None)) else None,
              int(r.get("difficulty")) if pd.notna(r.get("difficulty", None)) else None))
    con.commit()
    con.close()
    print(f"Upserted {len(df)} questions.")

upsert_questions_from_df(df)


DB initialized at d:\ESS\ocr_math_q\data\temporal\exams.db
Upserted 195 questions.


In [21]:
# --- 3) DB helpers: users, fetch, submit, summary ---
def get_user_id(username:str, db_path=DB_PATH) -> int:
    con = sqlite3.connect(db_path); cur = con.cursor()
    cur.execute("INSERT OR IGNORE INTO users(username) VALUES(?)", (username,))
    con.commit()
    cur.execute("SELECT user_id FROM users WHERE username=?", (username,))
    # uid = cur.fetchone()[0]
    con.close()
    return uid

def fetch_question(exercise_id:str, db_path=DB_PATH) -> dict:
    con = sqlite3.connect(db_path); cur = con.cursor()
    cur.execute("""
      SELECT q.exercise_id, q.question, q.solution, q.topic_pred, e.date, e.exam_type
      FROM questions q JOIN exams e ON q.exam_id=e.exam_id
      WHERE q.exercise_id=?""", (exercise_id,))
    row = cur.fetchone()
    con.close()
    if not row: raise KeyError("exercise_id not found")
    k = ["exercise_id","question","solution","topic","date","exam_type"]
    return dict(zip(k,row))

def submit_answer_db(username:str, exercise_id:str, student_answer:str, db_path=DB_PATH) -> dict:
    uid = get_user_id(username, db_path)
    q = fetch_question(exercise_id, db_path)
    g = grade_answer(q["solution"], student_answer)
    con = sqlite3.connect(db_path); cur = con.cursor()
    cur.execute("""
      INSERT INTO attempts(user_id, exercise_id, score, correct, cosine, jaccard, missing_keywords, student_answer)
      VALUES(?,?,?,?,?,?,?,?)
    """, (uid, exercise_id, g["score"], int(g["correct"]), g.get("cosine"), g.get("jaccard"),
          json.dumps(g.get("missing_keywords",[])), student_answer))
    con.commit(); con.close()
    return {"exercise_id": exercise_id, "topic": q["topic"], **g}

def topic_summary(username:str, db_path=DB_PATH) -> pd.DataFrame:
    uid = get_user_id(username, db_path)
    con = sqlite3.connect(db_path); cur = con.cursor()
    cur.execute("""
      SELECT q.topic_pred as topic, AVG(a.score) as avg_score, COUNT(*) as n
      FROM attempts a JOIN questions q ON a.exercise_id=q.exercise_id
      WHERE a.user_id=?
      GROUP BY q.topic_pred
      ORDER BY avg_score ASC
    """, (uid,))
    rows = cur.fetchall(); con.close()
    return pd.DataFrame(rows, columns=["topic","avg_score","n"])
