In [1]:
import pdfplumber, json
import re
from datetime import datetime
import os

In [2]:
# Change the working directory to the part of the current path that precedes
# the "notebooks" folder, and remember it as the base for all data paths.
# os.sep (instead of a hard-coded backslash) keeps the split working on
# platforms whose path separator is not "\"; on Windows the result is unchanged.
os.chdir(os.getcwd().split(os.sep + 'notebooks')[0])
main = os.getcwd()

In [3]:
# Location of the source PDF. Built with os.path.join (as the save cell further
# down already does) instead of hard-coded Windows separators.
data = os.path.join(main, 'data', 'raw', 'exams.pdf')

In [4]:
# Extract the text of every PDF page and cache the list as JSON.
interim_path = os.path.join(main, 'data', 'interim', 'exams.json')
os.makedirs(os.path.dirname(interim_path), exist_ok=True)

with pdfplumber.open(data) as pdf:
    # NOTE(review): some pdfplumber releases return None for a page without
    # extractable text; fall back to '' so later regex calls always get a str.
    exams = [page.extract_text() or '' for page in pdf.pages]

# The output file is opened only after extraction succeeded, so a failing
# extraction no longer leaves a truncated/empty JSON file behind.
with open(interim_path, 'w', encoding='utf-8') as f:
    json.dump(exams, f, ensure_ascii=False, indent=4)

In [5]:
# Reload the cached page texts from the interim JSON file. The path is built
# with os.path.join rather than hard-coded Windows separators.
with open(os.path.join(main, 'data', 'interim', 'exams.json'), 'r', encoding='utf-8') as f:
    exams = json.load(f)

In [6]:
# Display the loaded list of page texts.
exams

['Exam structure\nAbstract\nIn this (cid:133)le we explain how to navigate the collection of previous exams.\n1 How are the exams structured?\nFirst midterm exams October Those are the exams which are administered in\nOctober. They consist of four exercises. Exercises 1 and 2 are about vector spaces.\nExercises 3 and 4 are about metric spaces.\nGeneral exams January/February Exercise 1 is about vector spaces. Exercise 2 is\nabout metric spaces. Exercises 3 and 4 are typically about normed vector spaces and\nBanach contraction principle. Exercise 5 is typically about either convex analysis or\nBlackwell. Exercise 6 is about dynamic programming. In this case, the last 4 exercises\nform the second midterm.\nGeneral exams August/September Exercise 1 is about vector spaces. Exercise 2\nis about metric spaces. Exercises 3 and 4 are typically about normed vector spaces and\nBanach contraction principle. Exercise 5 is typically about either convex analysis or\nBlackwell. Exercise 6 is about dy

In [7]:
# Exam header: "General" or "Midterm", whitespace, "Exam", then (lazily) up to
# a date such as "August 29, 2025". Group 1 is the exam type, group 2 the date.
exam_pattern = r'(General|Midterm)\s+Exam.*?([A-Za-z]+\s+\d{1,2},\s+\d{4})'
#exam_pattern = r'(Advanced Mathematics)\s+Exam.*?([A-Za-z]+\s+\d{1,2},\s+\d{4})'
# NOTE(review): the two patterns below require the literal word "Exam" right
# after "Exercise" / "Solution". The page text shown in this notebook reads
# "Exercise 1 Consider ..." and "Solution See ...", which they would not
# match - confirm whether these patterns are still needed.
question_pattern = r'(Exercise)\s+Exam.*?([A-Za-z])'
solution_pattern = r'(Solution)\s+Exam.*?([A-Za-z]+\s+\d{1,2},\s+\d{4})'

In [8]:
# Group the extracted pages by exam.
#
# A page whose text matches `exam_pattern` starts (or re-enters) the exam keyed
# "<type>_<YYYY-MM-DD>"; a page without a header is treated as a continuation
# of the most recently started exam.
classified_exams = {}

# Key of the exam that continuation pages belong to. It deliberately survives
# across iterations: the previous loop reset its "current" markers on every
# page and relied on `exam_key`/`exam_type`/`date_obj` leaking from an earlier
# iteration, which failed when the first page had no header and attached pages
# to the wrong exam after an unparseable date.
current_key = None

for exam_text in exams[1:]:  # the first entry of `exams` is not classified
    if exam_text is None:
        # Defensive: a page cached as JSON null has no text to classify.
        continue

    match = re.search(exam_pattern, exam_text)
    if match:
        exam_type = match.group(1)
        date_str = match.group(2)
        try:
            date_obj = datetime.strptime(date_str, '%B %d, %Y')
        except ValueError:
            # Header found but its date cannot be parsed: skip the page and
            # forget the previous exam so the following continuation pages are
            # not glued onto an unrelated exam.
            current_key = None
            continue

        formatted_date = date_obj.strftime('%Y-%m-%d')
        current_key = f"{exam_type}_{formatted_date}"
        if current_key not in classified_exams:
            classified_exams[current_key] = {
                'type': exam_type,
                'date': formatted_date,
                'year': date_obj.year,
                'month': date_obj.month,
                'day': date_obj.day,
                'content': []
            }
    elif current_key is None:
        # Continuation page seen before any exam header: nothing to attach to.
        continue

    # Header pages and continuation pages are both stored, in page order.
    classified_exams[current_key]['content'].append(exam_text)

In [9]:
# Drop the 'General_2013-01-23' entry. The membership check keeps this cell
# re-runnable: a bare `del` raises KeyError once the key is already gone.
if 'General_2013-01-23' in classified_exams:
    del classified_exams['General_2013-01-23']

In [10]:
# Re-assigns `solution_pattern` with the same pattern string it was given in
# the earlier pattern cell.
solution_pattern = r'(Solution)\s+Exam.*?([A-Za-z]+\s+\d{1,2},\s+\d{4})'

In [11]:
# Probe each page of one exam: split it on "Exercise" and search it with
# `question_pattern`.
for question in classified_exams['General_2025-08-29']['content']:
    # Split the page being visited. The previous code split `exam_text`, a
    # variable left over from the classification loop, so every iteration
    # operated on the same unrelated page.
    lines = question.split('Exercise')
    match = re.search(question_pattern, question)

In [12]:
# Page texts stored for the exam keyed 'General_2025-08-29'.
temp = classified_exams['General_2025-08-29']['content']

In [13]:
# Concatenate the page texts into one string, separated by single spaces.
text_ = ' '.join(temp)

In [14]:
# Look for the first occurrence of the literal word "Exercise" in the joined text.
match = re.search('Exercise', text_)

In [15]:
# Show the matched text (this raises AttributeError when the search returned None).
match.group()

'Exercise'

In [16]:
# Text following each "Exercise " marker, trimmed of surrounding whitespace;
# whatever precedes the first marker is discarded.
list(map(str.strip, text_.split('Exercise ')[1:]))

['1 Consider a functional f : R n R. State and prove the Riesz representation theorem\n!\n(that is, the theorem that provides a characterization for linear functionals).\nSolution See the lecture notes. (cid:4)',
 '2 Consider R n endowed with the d 1 distance.1 Consider a nonempty subset C of X. We\nsay that x in an algebraic interior point of C if and only if for each y R n there exists (cid:14) > 0 such\n2\nthat x+(cid:14)y C. We denote the set of all algebraic interior points of C by algC.\n2\n1. Prove that intC algC.\n(cid:18)\n2. Prove that if C is convex and x algC, then x C.\n2 2\n3. Prove that if C is convex, then intC = algC.\nSolution. 1. If intC = , then trivially we have that intC algC. If intC = , consider x intC.\n; (cid:18) 6 ; 2\nIt follows that there exists " > 0 such that B " (x) (cid:18) C. Consider y 2 R n. Set (cid:14) = k y k " 1 +1 > 0. Note\nthat d (x;x+(cid:14)y) = x+(cid:14)y x = (cid:14) y < ", proving that x+(cid:14)y B (x) C. Since x and y\n1 k (cid:0) k1 k

In [17]:
# For each chunk following an "Exercise " marker, print the pieces obtained by
# splitting the trimmed chunk on "Solution".
for x in text_.split('Exercise ')[1:]:
    x = x.strip().split('Solution')
    print(x)

['1 Consider a functional f : R n R. State and prove the Riesz representation theorem\n!\n(that is, the theorem that provides a characterization for linear functionals).\n', ' See the lecture notes. (cid:4)']
['2 Consider R n endowed with the d 1 distance.1 Consider a nonempty subset C of X. We\nsay that x in an algebraic interior point of C if and only if for each y R n there exists (cid:14) > 0 such\n2\nthat x+(cid:14)y C. We denote the set of all algebraic interior points of C by algC.\n2\n1. Prove that intC algC.\n(cid:18)\n2. Prove that if C is convex and x algC, then x C.\n2 2\n3. Prove that if C is convex, then intC = algC.\n', '. 1. If intC = , then trivially we have that intC algC. If intC = , consider x intC.\n; (cid:18) 6 ; 2\nIt follows that there exists " > 0 such that B " (x) (cid:18) C. Consider y 2 R n. Set (cid:14) = k y k " 1 +1 > 0. Note\nthat d (x;x+(cid:14)y) = x+(cid:14)y x = (cid:14) y < ", proving that x+(cid:14)y B (x) C. Since x and y\n1 k (cid:0) k1 k k1 2 " 

In [18]:
# Split every exam into exercises and separate each exercise's statement from
# its solution. Results are stored under classified_exams[<key>]['questions']
# as {'Exercise <i>': {'question': ..., 'solution': ...}}, numbered by order of
# appearance.
for exam_key in classified_exams:
    full_text = ' '.join(classified_exams[exam_key]['content'])

    # Text following each "Exercise <n>" heading; whatever precedes the first
    # heading is dropped.
    exercise_chunks = [x.strip() for x in re.split(r'Exercise\s*\d+', full_text)[1:]]

    exam_questions = {}
    for i, chunk in enumerate(exercise_chunks, start=1):
        # Split at the FIRST "Solution" only. Splitting on every occurrence and
        # keeping piece [1] silently dropped the rest of any solution whose
        # text contains the word "Solution" again.
        question, marker, solution = chunk.partition('Solution')
        entry = {'question': question.strip()}
        if marker:  # the 'solution' key exists only when a marker was found
            entry['solution'] = solution.strip()
        exam_questions['Exercise ' + str(i)] = entry

    classified_exams[exam_key]['questions'] = exam_questions

In [19]:
# Display the whole classification dict, including the page texts under 'content'.
classified_exams

{'General_2025-08-29': {'type': 'General',
  'date': '2025-08-29',
  'year': 2025,
  'month': 8,
  'day': 29,
  'content': ['Advanced Mathematics (Cod. 20136)\nGeneral Exam (cid:150)August 29, 2025\nSurname Student ID\nName Exam Code 20136\nRules of conduct during exams or other tests:\nI hereby undertake to respect the regulations described in the Honor Code and undersign my presence\nat the exam.\nSignature:\nExercise 1 Consider a functional f : R n R. State and prove the Riesz representation theorem\n!\n(that is, the theorem that provides a characterization for linear functionals).\nSolution See the lecture notes. (cid:4)',
   'Exercise 2 Consider R n endowed with the d 1 distance.1 Consider a nonempty subset C of X. We\nsay that x in an algebraic interior point of C if and only if for each y R n there exists (cid:14) > 0 such\n2\nthat x+(cid:14)y C. We denote the set of all algebraic interior points of C by algC.\n2\n1. Prove that intC algC.\n(cid:18)\n2. Prove that if C is convex 

In [20]:
# Helper used by the splitting loop below to tidy extracted text
def clean_text(text):
    """Return `text` without "EXTRA PAGE" markers and with whitespace collapsed.

    Each "EXTRA PAGE" marker, together with the whitespace around it, becomes a
    single space; every remaining run of whitespace is then reduced to one
    space and the result is stripped at both ends.
    """
    without_marker = re.sub(r'\s*EXTRA PAGE\s*', ' ', text)
    return re.sub(r'\s+', ' ', without_marker).strip()

# Rebuild classified_exams[<key>]['questions'] from cleaned text: every exam is
# split into exercises, and each exercise into its statement and its solution.
for exam_key in classified_exams:
    # Join the pages and clean the result before looking for exercise headings.
    full_text = clean_text(' '.join(classified_exams[exam_key]['content']))

    # Text following each "Exercise <n>" heading; whatever precedes the first
    # heading is dropped.
    exercise_chunks = [x.strip() for x in re.split(r'Exercise\s*\d+', full_text)[1:]]

    exam_questions = {}
    for i, chunk in enumerate(exercise_chunks, start=1):
        # Split at the FIRST "Solution" only. Splitting on every occurrence and
        # keeping piece [1] silently dropped the rest of any solution whose
        # text contains the word "Solution" again.
        question, marker, solution = chunk.partition('Solution')
        entry = {'question': clean_text(question)}
        if marker:  # the 'solution' key exists only when a marker was found
            entry['solution'] = clean_text(solution)
        exam_questions['Exercise ' + str(i)] = entry

    classified_exams[exam_key]['questions'] = exam_questions

In [21]:
# Display the whole classification dict, including the page texts under 'content'.
classified_exams

{'General_2025-08-29': {'type': 'General',
  'date': '2025-08-29',
  'year': 2025,
  'month': 8,
  'day': 29,
  'content': ['Advanced Mathematics (Cod. 20136)\nGeneral Exam (cid:150)August 29, 2025\nSurname Student ID\nName Exam Code 20136\nRules of conduct during exams or other tests:\nI hereby undertake to respect the regulations described in the Honor Code and undersign my presence\nat the exam.\nSignature:\nExercise 1 Consider a functional f : R n R. State and prove the Riesz representation theorem\n!\n(that is, the theorem that provides a characterization for linear functionals).\nSolution See the lecture notes. (cid:4)',
   'Exercise 2 Consider R n endowed with the d 1 distance.1 Consider a nonempty subset C of X. We\nsay that x in an algebraic interior point of C if and only if for each y R n there exists (cid:14) > 0 such\n2\nthat x+(cid:14)y C. We denote the set of all algebraic interior points of C by algC.\n2\n1. Prove that intC algC.\n(cid:18)\n2. Prove that if C is convex 

In [22]:
# Number of exercises extracted for the exam keyed 'General_2025-08-29'.
len(classified_exams['General_2025-08-29']['questions'])

6

In [23]:
# Start from an empty dict; it is filled with the exams that pass the
# exercise-count filter.
final_exams = {}

In [25]:
# Keep only exams with at most 6 extracted exercises, copying the 'type',
# 'date', 'year' and 'questions' fields (month, day and raw content are left out).
for exam, record in classified_exams.items():
    if len(record['questions']) > 6:
        continue
    final_exams[exam] = {
        field: record[field] for field in ('type', 'date', 'year', 'questions')
    }
        


In [26]:
# Persist the filtered exams as <main>/data/processed/classified_exams.json
output_path = os.path.join(main, 'data', 'processed', 'classified_exams.json')

# Make sure the target folder exists before writing
target_dir = os.path.dirname(output_path)
os.makedirs(target_dir, exist_ok=True)

# Serialise as UTF-8 JSON, keeping non-ASCII characters and 4-space indentation
serialized = json.dumps(final_exams, ensure_ascii=False, indent=4)
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(serialized)

print(f"Saved classified exams to: {output_path}")

Saved classified exams to: d:\ESS\ocr_math_q\data\processed\classified_exams.json


In [4]:
# Reload the processed exams from disk. The path is built with os.path.join,
# matching the cell that wrote the file, instead of hard-coded Windows separators.
with open(os.path.join(main, 'data', 'processed', 'classified_exams.json'), 'r', encoding='utf-8') as f:
    final_exams = json.load(f)

# Flatten the exams into a DataFrame (one row per exercise)

In [5]:
# 1) Flatten your dict -> rows: one per Exercise
import re, pandas as pd
from datetime import datetime

# NOTE: this rebinds `exams` (earlier in the notebook it held the list of page
# texts) to the dict of processed exams.
exams = final_exams  # <-- put your dict variable here

def clean_text(s:str) -> str:
    """Normalise extracted text for the flattened table.

    Falsy input (None or "") yields "". Otherwise " (cid:150) " is turned into
    " - ", every remaining "(cid:<digits>)" artifact is replaced by a space,
    whitespace runs are collapsed to one space and the result is stripped.
    """
    if not s:
        return ""
    # This replacement has to run BEFORE the generic cid removal below;
    # in the opposite order every "(cid:150)" is already gone and the
    # replacement can never match.
    s = s.replace(" (cid:150) ", " - ")
    s = re.sub(r"\(cid:\d+\)", " ", s)     # drop the remaining cid artifacts
    return re.sub(r"\s+", " ", s).strip()

# Build one record per exercise, then assemble and sort the DataFrame.
rows = []
for exam_key, meta in exams.items():
    date_str = meta.get("date")            # looks like 'YYYY-MM-DD'
    try:
        dt = datetime.strptime(date_str, "%Y-%m-%d")
    except Exception:
        # Missing or malformed date: month/day stay empty for this exam.
        dt = None
    month, day = (dt.month, dt.day) if dt else (None, None)

    # Fields shared by every exercise of this exam (column order matters).
    exam_fields = {
        "exam_id": exam_key,
        "exam_type": meta.get("type"),
        "date": date_str,
        "year": meta.get("year"),
        "month": month,
        "day": day,
    }
    rows.extend(
        {
            **exam_fields,
            "exercise_id": qid,
            "question": clean_text(qa.get("question","")),
            "solution": clean_text(qa.get("solution","")),
        }
        for qid, qa in meta.get("questions", {}).items()
    )

df = pd.DataFrame(rows)
df = df.sort_values(["date","exam_type","exercise_id"]).reset_index(drop=True)
df.head()


Unnamed: 0,exam_id,exam_type,date,year,month,day,exercise_id,question,solution
0,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 1,Consider a vector space V and a subset W of V....,. Before starting recall that spanW is the sma...
1,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 2,Consider a metric space (X;d). 1. Give the deÖ...,. See the lecture notes. !
2,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 3,Consider a metric space (X;d) where X has at l...,. 1. See the lecture notes. 2. SinceX contains...
3,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 4,This exercise has two parts. 1. Referring to t...,1. (a) The steady state p =(x ;y ) such that f...
4,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 5,State and prove Blackwellís theorem.,. See the lecture notes. !


In [8]:
# Display the flattened frame.
df

Unnamed: 0,exam_id,exam_type,date,year,month,day,exercise_id,question,solution
0,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 1,Consider a vector space V and a subset W of V....,. Before starting recall that spanW is the sma...
1,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 2,Consider a metric space (X;d). 1. Give the deÖ...,. See the lecture notes. !
2,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 3,Consider a metric space (X;d) where X has at l...,. 1. See the lecture notes. 2. SinceX contains...
3,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 4,This exercise has two parts. 1. Referring to t...,1. (a) The steady state p =(x ;y ) such that f...
4,General_2013-01-31,General,2013-01-31,2013,1,31,Exercise 5,State and prove Blackwellís theorem.,. See the lecture notes. !
...,...,...,...,...,...,...,...,...,...
190,General_2025-08-29,General,2025-08-29,2025,8,29,Exercise 2,Consider R n endowed with the d 1 distance.1 C...,". 1. If intC = , then trivially we have that i..."
191,General_2025-08-29,General,2025-08-29,2025,8,29,Exercise 3,Prove that a linear operator T : V V between n...,See the lecture notes.
192,General_2025-08-29,General,2025-08-29,2025,8,29,Exercise 4,Consider R n endowed with the supnorm .3 We sa...,1. By the Banach contraction principle and sin...
193,General_2025-08-29,General,2025-08-29,2025,8,29,Exercise 5,Consider a metric space (X;d) and a continuous...,No! ConsiderX = Nendowedwiththediscretedistanc...


# Topic tagging

In [9]:
pip install 'git+https://github.com/facebookresearch/detectron2.git@a59f05630a8f205756064244bf5beb8661f96180'

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: "'git+https://github.com/facebookresearch/detectron2.git@a59f05630a8f205756064244bf5beb8661f96180'"

[notice] A new release of pip available: 22.2.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# pip install scikit-learn pandas numpy scipy
import re, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from scipy.sparse import hstack

# df must have a 'question' column from your previous step
# df = ...

def clean(s):
    """Return a lower-cased, whitespace-normalised copy of `s`.

    Falsy input (None or "") yields "". "(cid:<digits>)" artifacts are replaced
    by a space before whitespace runs are collapsed and the ends are stripped.
    """
    if not s:
        return ""
    without_cid = re.sub(r"\(cid:\d+\)", " ", s)
    collapsed = re.sub(r"\s+", " ", without_cid)
    return collapsed.strip().lower()

# Adds a 'q_clean' column to df holding the normalised (cleaned, lower-cased)
# question text.
df["q_clean"] = df["question"].map(clean)

# Candidate topics, each mapped to a list of seed phrases (edit freely).
# The keys are the labels later written to the frame; the phrases of a topic
# are joined into one reference document for that topic.
topic_seeds = {
    "vector_spaces": [
        "vector space, basis, dimension, linear independence, subspace, span"
    ],
    "linear_functionals_and_operators": [
        "linear functional, bounded operator, operator norm, riesz representation"
    ],
    "isomorphisms": [
        "linear isomorphism, invertible linear map, isomorphic vector spaces"
    ],
    "finance_illustration": [
        "present value, discount factor, asset pricing, arbitrage, consumption savings"
    ],
    "metric_spaces": [
        "metric space, open set, closed set, cauchy sequence, convergence"
    ],
    "normed_vector_spaces": [
        "normed vector space, norm equivalence, banach space, complete norm"
    ],
    "abstract_equations": [
        "abstract equation, operator equation, functional equation, fixed point equation"
    ],
    "banach_contraction_theorem": [
        "banach contraction theorem, contraction mapping, unique fixed point, complete metric space"
    ],
    "brouwer_fixed_point_theorem": [
        "brouwer fixed point theorem, continuous map on compact convex set, fixed point exists"
    ],
    "optimization": [
        "optimization problem, argmax, argmin, lagrangian, first order conditions"
    ],
    "convexity_and_concavity": [
        "convex function, concavity, jensen inequality, subgradient, convex set"
    ],
    "dynamic_optimization": [
        "bellman equation, dynamic programming, value function, optimal policy, intertemporal"
    ],
}

# One label and one cleaned reference document per topic (same order).
topics = list(topic_seeds.keys())
topic_docs = [clean(" ; ".join(v)) for v in topic_seeds.values()]

# --- Vectorization: word 1-2 grams + character 3-5 grams (robust to math/typos) ---
word_vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
char_vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=1)

# Fit both vocabularies on the questions AND the topic documents so that the
# two sets of vectors share the same feature space.
corpus = df["q_clean"].tolist() + topic_docs
word_vec.fit(corpus)
char_vec.fit(corpus)

Xq = hstack([
    word_vec.transform(df["q_clean"]),
    char_vec.transform(df["q_clean"])
])
Xt = hstack([
    word_vec.transform(topic_docs),
    char_vec.transform(topic_docs)
])

# Cosine similarity via the dot product of row-normalised vectors.
Xq = normalize(Xq)
Xt = normalize(Xt)
# Densify once: the result is only (num_questions, num_topics). This replaces
# the sparse-matrix `.A` shortcut, which SciPy removed in 1.14, and lets the
# reductions below return plain 1-D arrays (np.asarray() of the sparse
# `.max(axis=1)` result does not produce a numeric score vector).
S = (Xq @ Xt.T).toarray()

best = S.argmax(axis=1)   # index of the best-scoring topic per question
conf = S.max(axis=1)      # similarity of that best topic

df["topic_pred"]  = [topics[i] for i in best]
#df["topic_score"] = conf

# Optional: mark low confidence as 'uncertain' and keep top-2 for review
#THRESH = 0.28
#df.loc[df["topic_score"] < THRESH, "topic_pred"] = "uncertain"

# Two highest-scoring topics per question, best first.
top2 = np.argsort(-S, axis=1)[:, :2]
df["topic_top2"] = [[topics[i0], topics[i1]] for i0, i1 in top2]

#df[["exercise_id","topic_pred","topic_score","topic_top2"]].head()


In [7]:
# Write the tagged frame to <main>/data/processed/exams_flattened.parquet.
# The path is built with os.path.join, matching the JSON save cell, instead of
# hard-coded Windows separators.
df.to_parquet(os.path.join(main, 'data', 'processed', 'exams_flattened.parquet'), index=False)