In [None]:
!pip install -q transformers accelerate sentence-transformers scikit-learn pymupdf tqdm
!pip install -q datasets
!pip install -q transformers==4.46.3 sentence-transformers==3.0.1

In [None]:
# ===========================================
# Exam Question Generator (Ultra-Stable v3)
# Qwen2.5-3B-Instruct - PURE QUESTIONS ONLY
# ===========================================

import os, re, json
import torch
import numpy as np
import fitz
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans

# -----------------------------
# CONFIG
# -----------------------------
# Hugging Face model ids: the sentence-embedding model used for chunk/question
# similarity and clustering, and the causal LLM used to write the questions.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L12-v2"
LLM = "Qwen/Qwen2.5-3B-Instruct"

CHUNK_SIZE = 4             # sentences joined into one chunk (see chunk_text)
NUM_CLUSTERS = 6           # KMeans cluster count used to label chunks
NUM_QUESTIONS = 4          # questions requested from the LLM per chunk
MAX_NEW_TOKENS = 120       # generation budget passed to the pipeline per chunk
TEMPERATURE = 0.3          # lower = more stable
TOP_P = 0.85               # nucleus-sampling threshold passed to the pipeline
OUTPUT_JSON = "exam_questions.json"  # path the final results are written to

# Prefer the GPU when one is visible to torch; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using:", device)

# -----------------------------
# LOAD MODELS
# -----------------------------
# Sentence embedder, placed explicitly on the selected device.
embed_model = SentenceTransformer(EMBED_MODEL, device=device)

# Tokenizer and causal LM for question generation. device_map="auto" leaves
# weight placement to the library instead of using `device` directly.
# NOTE(review): float16 is requested even when `device` is "cpu" — confirm
# that half precision is acceptable on the CPU fallback path.
tok = AutoTokenizer.from_pretrained(LLM)
model = AutoModelForCausalLM.from_pretrained(
    LLM,
    device_map="auto",
    torch_dtype=torch.float16
)

# Text-generation pipeline wrapping the already-loaded model and tokenizer;
# generate_questions() calls it as `gen(prompt, ...)`.
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    torch_dtype=torch.float16
)

print("Models loaded.")

# -----------------------------
# HELPERS
# -----------------------------
def split_sent(text):
    """Split *text* into sentences, discarding very short fragments.

    Newlines are flattened to spaces, the text is cut at whitespace that
    follows '.', '!' or '?', and each piece is stripped. Only pieces longer
    than 10 characters are returned.
    """
    flattened = text.replace("\n", " ")
    sentences = []
    for piece in re.split(r'(?<=[.!?])\s+', flattened):
        piece = piece.strip()
        if len(piece) > 10:
            sentences.append(piece)
    return sentences

def chunk_text(text, size=CHUNK_SIZE):
    """Group the sentences of *text* into chunks of up to *size* sentences.

    Sentences come from split_sent(); consecutive runs of *size* sentences
    are joined with single spaces. Chunks of 50 characters or fewer are
    dropped.
    """
    sentences = split_sent(text)
    joined = (
        " ".join(sentences[start:start + size])
        for start in range(0, len(sentences), size)
    )
    return [candidate for candidate in joined if len(candidate) > 50]

def read_pdf(path):
    """Return the text of the PDF at *path*, pages separated by newlines.

    The document is opened as a context manager so the underlying file
    handle is released as soon as the text has been extracted, instead of
    staying open for the rest of the session.
    """
    with fitz.open(path) as doc:
        # join() consumes the generator before the document is closed.
        return "\n".join(page.get_text() for page in doc)

def top_keywords(text, k=5):
    """Return up to *k* of the most frequent long words in *text*.

    The text is lower-cased, punctuation is replaced by spaces, and only
    words longer than 5 characters are counted. Words are ordered by
    descending count; equal counts keep first-appearance order.
    """
    normalized = re.sub(r"[^\w\s]", " ", text.lower())
    counts = {}
    for token in normalized.split():
        if len(token) <= 5:
            continue
        counts[token] = counts.get(token, 0) + 1
    # sorted() is stable even with reverse=True, so ties keep insertion order.
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:k]

# -----------------------------
# PURE QUESTION GENERATOR (STRICT)
# -----------------------------
def generate_questions(chunk):
    """Ask the LLM for NUM_QUESTIONS exam-style questions about *chunk*.

    Args:
        chunk: Passage text inserted into the prompt.

    Returns:
        A list of exactly NUM_QUESTIONS strings. If the model yields fewer
        usable lines, the list is padded with "Cannot generate question.".
    """
    prompt = f"""
Generate EXACTLY {NUM_QUESTIONS} exam-style questions.

Rules:
- Do NOT number the questions.
- Do NOT use bullets.
- No markdown.
- No explanations.
- No repeating the rules.
- No intro or outro.
- Output ONLY questions.
- Each question must be in a NEW LINE.
- Each line must be a complete question.

PASSAGE:
{chunk}

Write the questions now:
"""

    # return_full_text=False asks the pipeline for the continuation only, so
    # prompt removal no longer depends on an exact string match.
    out = gen(
        prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        do_sample=True,
        return_full_text=False,
    )[0]["generated_text"]

    # Defensive: drop a prompt echo if one is still present.
    text = out.replace(prompt, "").strip()

    clean = []
    for raw in text.split("\n"):
        # Models often number or bullet their output despite the rules.
        # Strip the marker and keep the question; discarding the whole line
        # would leave only placeholders. Only "N." / "N)" followed by
        # whitespace counts as numbering, so "3.5 million ..." is untouched.
        line = re.sub(r"^(?:[-*]+\s*|\d+[\).]\s+)", "", raw.strip()).strip()
        if len(line) < 10:
            continue
        if "Rules:" in line:
            continue
        clean.append(line)
        if len(clean) == NUM_QUESTIONS:
            break

    # Pad so callers always receive exactly NUM_QUESTIONS entries.
    while len(clean) < NUM_QUESTIONS:
        clean.append("Cannot generate question.")

    return clean

# -----------------------------
# semantic best pick
# -----------------------------
def pick_best(chunk, qs):
    """Return the entry of *qs* whose embedding is most similar to *chunk*.

    Both the chunk and the candidate questions are embedded with
    embed_model; the question with the highest cosine similarity to the
    chunk is returned.
    """
    chunk_vec = embed_model.encode(chunk, convert_to_tensor=True)
    question_vecs = embed_model.encode(qs, convert_to_tensor=True)
    similarity = util.cos_sim(chunk_vec, question_vecs)
    # Row 0 holds the chunk's similarity to every question.
    best_idx = int(np.argmax(similarity[0].cpu().numpy()))
    return qs[best_idx]

# -----------------------------
# PROCESS SAMPLE PDF
# -----------------------------
# PDFs to process; each one is read, split into chunks, and appended to the
# shared `chunks` list.
pdf_paths = ["sample_data/The_Wonders_Hidden_in_Everyday_Life.pdf"]

chunks = []   # chunk texts across all PDFs, in reading order
src = []      # parallel to `chunks`: originating file and per-file chunk index
for p in pdf_paths:
    text = read_pdf(p)
    c = chunk_text(text)
    for i, block in enumerate(c):
        chunks.append(block)
        src.append({"file": p, "chunk_index": i})

print("Total chunks:", len(chunks))

# embeddings: one vector per chunk, returned as a tensor
emb = embed_model.encode(chunks, convert_to_tensor=True)

# clustering: KMeans runs only when there are more chunks than clusters;
# otherwise every chunk is assigned to cluster 0. random_state is fixed so
# repeated runs on the same embeddings give the same labels.
if len(chunks) > NUM_CLUSTERS:
    cls = KMeans(n_clusters=NUM_CLUSTERS, random_state=42).fit_predict(emb.cpu().numpy())
else:
    cls = np.zeros(len(chunks), dtype=int)

# -----------------------------
# MAIN LOOP
# -----------------------------
# One record per chunk: the generated questions, the question most similar
# to the chunk, the chunk's cluster label and its source location.
results = []

for i, chunk in enumerate(tqdm(chunks)):
    qlist = generate_questions(chunk)
    best = pick_best(chunk, qlist)
    results.append({
        "chunk": chunk,
        "questions": qlist,
        "best_question": best,
        "cluster": int(cls[i]),  # cast the NumPy integer so json can serialize it
        "source": src[i]
    })

# save: ensure_ascii=False keeps non-ASCII characters readable in the file
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print("Saved as", OUTPUT_JSON)


Using: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Models loaded.
Total chunks: 7


  0%|          | 0/7 [00:00<?, ?it/s]

Saved as exam_questions.json
