# 🧠 Generador IA de bancos Moodle XML + Equilibrado y PDF

Este cuaderno:
1) **Sube un documento** base (PDF/DOCX/TXT).
2) La IA **genera preguntas MCQ** (autocontenidas, 1 correcta + 2 distractores plausibles).
3) **Equilibra** longitudes de opciones (±4 palabras) y **baraja** opciones.
4) Exporta **Moodle XML** listo para importar y un **PDF** de revisión (✅ en la correcta).

**Uso:** ejecuta las celdas en orden. Cuando pida *subir archivo*, selecciona tu documento fuente.


In [None]:
!pip install openai==1.* beautifulsoup4 lxml reportlab pypdf python-docx tqdm --quiet

In [None]:
# 🔑 Set your OpenAI API key securely (it is not saved in the notebook)
import os
from getpass import getpass

# Ask for the key with a hidden prompt only when the environment does not
# already provide a non-empty one; the value is kept in os.environ.
_key_is_missing = not os.environ.get("OPENAI_API_KEY")
if _key_is_missing:
    os.environ["OPENAI_API_KEY"] = getpass("Pega tu OPENAI_API_KEY y pulsa Enter: ")
print("‚úÖ API key configurada en el entorno de ejecuci√≥n.")

In [None]:
# =========================
#  Generate items with AI + balance + export XML + PDF
# =========================
import os, json, random, re, traceback
from tqdm import tqdm
from pypdf import PdfReader
from docx import Document
from google.colab import files
from bs4 import BeautifulSoup
from bs4.element import Tag
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from lxml import etree

# SDK OpenAI 1.x
from openai import OpenAI

# ===== Parameters =====
# Maximum word-count gap allowed between the correct option and a distractor.
THRESH_DIFF = 4
# Seed for the random module; use None for non-deterministic randomness.
RANDOM_SEED = 42
# Number of questions requested from the model per text block.
ITEMS_PER_BLOCK = 6
# Approximate size, in characters, of each block of the document.
CHUNK_MAX_CHARS = 6000
# Model used for generation.
OPENAI_MODEL = "gpt-4o-mini"
# Set to True to print how the model's JSON reply is cleaned up.
DEBUG_JSON = False

# Seed the global generator only when a seed was requested.
if RANDOM_SEED is not None:
    random.seed(RANDOM_SEED)

# ===== Load the source document =====
print("üìÅ Sube tu documento base (PDF/DOCX/TXT)")
up = files.upload()
if not up:
    # Nothing came back from the upload (presumably the dialog was cancelled);
    # fail with a clear message instead of an IndexError on an empty list.
    raise RuntimeError("No se ha seleccionado un archivo para subir.")
# Only the first uploaded file is used as the source document.
SRC = next(iter(up))
print(f"‚úÖ Cargado: {SRC}")

def load_text(path):
    """Return the plain text of the document at ``path``.

    The extension (case-insensitive) selects the reader: ``.pdf`` goes
    through ``PdfReader`` and ``.docx`` through ``Document``; anything else
    is read as UTF-8 text with undecodable bytes ignored. PDF pages and DOCX
    paragraphs are joined with newlines.
    """
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        pdf_pages = PdfReader(path).pages
        # extract_text() may yield nothing for a page; fall back to "".
        page_texts = [(pg.extract_text() or "") for pg in pdf_pages]
        return "\n".join(page_texts)

    if lowered.endswith(".docx"):
        paragraph_texts = [par.text for par in Document(path).paragraphs]
        return "\n".join(paragraph_texts)

    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents

raw_text = load_text(SRC)
# Explicit check rather than `assert`: assertions are skipped when Python runs
# with -O, which would let an empty document through to the generation step.
if not raw_text.strip():
    raise ValueError("El documento parece vac√≠o o no se pudo extraer texto.")

def chunk_text(text, max_chars=10000):
    """Group the non-empty lines of ``text`` into chunks of about ``max_chars``.

    Lines are stripped, blank ones are dropped, and consecutive lines are
    joined with a newline while the chunk stays within ``max_chars``.

    Args:
        text: Full document text.
        max_chars: Maximum length of a chunk built from several lines.

    Returns:
        A list of non-empty chunk strings. A single line longer than
        ``max_chars`` is not split; it becomes a chunk of its own.
    """
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, cur = [], ""
    for p in paragraphs:
        if not cur:
            # Start a new chunk. Never flush an empty accumulator: doing so
            # used to emit "" chunks in front of oversized paragraphs.
            cur = p
        elif len(cur) + 1 + len(p) <= max_chars:
            # +1 accounts for the newline separator.
            cur += "\n" + p
        else:
            chunks.append(cur)
            cur = p
    if cur:
        chunks.append(cur)
    return chunks

# Split the extracted text into blocks bounded by CHUNK_MAX_CHARS and report how many were made.
chunks = chunk_text(raw_text, max_chars=CHUNK_MAX_CHARS)
print(f"üß© Bloques de texto creados: {len(chunks)} (‚âà{CHUNK_MAX_CHARS} chars c/u)")

# ===== OpenAI client =====
# Reuse the key from the environment; if it is absent or empty, prompt for it
# and store it in the environment before building the client.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    from getpass import getpass
    api_key = os.environ["OPENAI_API_KEY"] = getpass("Pega tu OPENAI_API_KEY y pulsa Enter: ")
client = OpenAI(api_key=api_key)

# System message for the chat request: asks for precise university-level items, in Spanish, as valid JSON.
SYSTEM_PROMPT = (
    "Eres un generador de √≠tems universitarios, preciso, en espa√±ol, y devuelves JSON v√°lido."
)

# IMPORTANT: the literal JSON braces are escaped as {{ }} so the template can carry
# the {n} (number of questions) and {content} (source text) placeholders.
# NOTE(review): the example shows "correct_index": 1 without stating the index base — confirm the model is expected to answer 0-based.
USER_PROMPT_TMPL = """
Eres un generador de preguntas universitarias en psicolog√≠a del lenguaje/lectura.
Crea {n} preguntas tipo test (MCQ) AUTOCONTENIDAS a partir del CONTENIDO. Nivel: universitario.
- 1 correcta + 2 distractores plausibles (sin 'todas las anteriores' / 'ninguna').
- Redacci√≥n clara, sin ambig√ºedad; NO dependas de "seg√∫n el texto".
- Incluye justificaci√≥n breve (1‚Äì2 frases) para la correcta.
- RESPONDE √öNICAMENTE con JSON. NO incluyas explicaciones; NO uses ```json ni fences.

Estructura EXACTA:
{{
  "items": [
    {{
      "id": "BLOQUE1-Q1",
      "stem": "ENUNCIADO AUTOCONTENIDO...",
      "options": ["A...", "B...", "C..."],
      "correct_index": 1,
      "justification": "Por qu√© es correcta...",
      "difficulty": "media",
      "tags": ["efectos de priming","l√©xico"]
    }}
  ]
}}

CONTENIDO:
{content}
"""

def wc(s):
    """Count the whitespace-separated words in ``s``; ``None`` or "" count as 0."""
    # Every maximal run of non-whitespace characters is one word.
    words = re.findall(r"\S+", s or "")
    return len(words)

# --- Robust helpers for cleaning up the model's JSON output ---
def _strip_code_fences(s: str) -> str:
    """Remove a surrounding Markdown code fence from ``s``, if there is one.

    The text is stripped first. When the whole of it is wrapped in a triple
    backtick fence (optionally tagged ``json``, any letter case), the stripped
    inner content is returned; otherwise the stripped text itself is returned.
    """
    text = s.strip()
    fenced = re.match(r"^```(?:json)?\s*([\s\S]*?)\s*```$", text, flags=re.IGNORECASE)
    if fenced is None:
        return text
    return fenced.group(1).strip()

def _extract_json_object(s: str) -> str:
    """Return the first balanced ``{...}`` span found in ``s``.

    Scanning starts at the first ``{``. Braces that appear inside
    double-quoted JSON strings (including after backslash escapes) are not
    counted, so a ``}`` inside a string value no longer cuts the object short.

    Args:
        s: Text that may contain a JSON object surrounded by other text.

    Returns:
        The substring from the first ``{`` to its matching ``}``, or ``s``
        unchanged when there is no ``{`` or the braces never balance.
    """
    start = s.find("{")
    if start == -1:
        return s
    depth = 0
    in_string = False
    escaped = False
    for i in range(start, len(s)):
        ch = s[i]
        if in_string:
            if escaped:
                # The character right after a backslash is always literal.
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return s[start:i + 1]
    return s

def llm_items_from_text(content, block_id="B1", n=6, debug=DEBUG_JSON):
    """Ask the chat model for ``n`` MCQ items about ``content`` and normalize them.

    Args:
        content: Source text inserted into ``USER_PROMPT_TMPL``.
        block_id: Prefix used to build an id (``{block_id}-Q{i}``) for items
            that come back without one.
        n: Number of questions requested in the prompt.
        debug: When true, print the first 400 characters of the raw and the
            cleaned reply whenever the reply has to be cleaned before parsing.

    Returns:
        A list of dicts with the keys ``id``, ``stem``, ``options`` (three
        strings), ``correct_index`` (int in 0..2), ``justification``,
        ``difficulty`` and ``tags``. Items that are not objects, do not have
        exactly three options, or whose ``correct_index`` is not an integer
        in range are skipped, so one bad item does not discard the block.

    Raises:
        ValueError: ``json.JSONDecodeError`` when the reply contains no
            parseable JSON even after cleaning.
        Exception: Whatever the client raises if the second request fails too.
    """
    prompt = USER_PROMPT_TMPL.format(content=content, n=n)
    request = {
        "model": OPENAI_MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.4,
    }

    # Attempt 1: with response_format=json_object (if available for the account).
    try:
        resp = client.chat.completions.create(
            response_format={"type": "json_object"}, **request
        )
        raw = resp.choices[0].message.content
    except Exception:
        # Deliberate best-effort fallback: retry once without response_format.
        # An error unrelated to that option will surface from this second call.
        resp = client.chat.completions.create(**request)
        raw = resp.choices[0].message.content
    # Treat a missing message body as empty text so the parsing below applies.
    raw = raw or ""

    # Tolerant parsing: first as-is, then after removing fences and extra text.
    try:
        data = json.loads(raw)
    except ValueError:
        cleaned = _extract_json_object(_strip_code_fences(raw))
        if debug:
            print("DEBUG raw[:400]:", raw[:400])
            print("DEBUG cleaned[:400]:", cleaned[:400])
        data = json.loads(cleaned)

    if isinstance(data, dict):
        # Tolerate odd keys such as '\n  "items"'.
        if "items" not in data:
            for k in list(data):
                if "items" in k.replace("\n", "").replace(" ", ""):
                    data["items"] = data.pop(k)
                    break
        items = data.get("items", [])
    else:
        # A bare JSON array is taken to be the item list itself.
        items = data
    if not isinstance(items, list):
        items = []

    norm = []
    for i, it in enumerate(items, start=1):
        if not isinstance(it, dict):
            continue
        options = it.get("options")
        # Require a real sequence: a 3-character string also has length 3.
        if not isinstance(options, (list, tuple)) or len(options) != 3:
            continue
        try:
            ci = int(it.get("correct_index", 0))
        except (TypeError, ValueError):
            continue
        if not 0 <= ci < 3:
            continue
        norm.append({
            "id": it.get("id") or f"{block_id}-Q{i}",
            "stem": str(it.get("stem") or "").strip(),
            "options": [str(o).strip() for o in options],
            "correct_index": ci,
            "justification": str(it.get("justification") or "").strip(),
            "difficulty": it.get("difficulty", "media"),
            "tags": it.get("tags", []),
        })
    return norm

def balance_and_shuffle(item, diff_threshold=4, seed=None):
    """Pad short distractors and shuffle the options of ``item`` in place.

    A distractor whose word count is more than ``diff_threshold`` below that
    of the correct option gets one randomly chosen filler sentence appended.
    The options are then shuffled and ``item["correct_index"]`` is updated to
    the new position of the correct option.

    Args:
        item: Dict with ``options`` (list of strings) and ``correct_index``.
        diff_threshold: Largest tolerated word-count deficit of a distractor.
        seed: Seed for the private ``random.Random`` used for both the filler
            choice and the shuffle.

    Returns:
        The same ``item`` object, modified.
    """
    fillers = [
        " Este patr√≥n se ha descrito en estudios de priming y decisi√≥n l√©xica.",
        " La literatura lo vincula con activaci√≥n competitiva y control inhibitorio.",
        " Se replica en lectores con distintos niveles de proficiencia."
    ]
    rng = random.Random(seed)
    source_options = item["options"]
    right = item["correct_index"]
    target_words = wc(source_options[right])

    padded = []
    for pos, text in enumerate(source_options):
        needs_padding = pos != right and (target_words - wc(text)) > diff_threshold
        if needs_padding:
            text = text.strip() + rng.choice(fillers)
        padded.append(text)

    # Carry an "is the correct one" flag with each option through the shuffle.
    flagged = [(text, pos == right) for pos, text in enumerate(padded)]
    rng.shuffle(flagged)

    item["options"] = [text for text, _ in flagged]
    item["correct_index"] = next(
        pos for pos, (_, is_right) in enumerate(flagged) if is_right
    )
    return item

def validate_item(it):
    """Check the structure of one item and report why it fails, if it does.

    Checks: exactly three options; ``correct_index`` within 0..2; options
    pairwise distinct ignoring case and surrounding whitespace (only checked
    when the first two checks pass); stem of at least six words.

    Returns:
        ``(ok, reasons)``: a bool and the list of failure messages, which is
        empty when ``ok`` is true.
    """
    problems = []

    if len(it.get("options", [])) != 3:
        problems.append("No hay 3 opciones.")

    index = it.get("correct_index", -1)
    if not (0 <= index < 3):
        problems.append("√çndice de correcta inv√°lido.")

    if not problems:
        distinct = {choice.strip().lower() for choice in it["options"]}
        if len(distinct) < 3:
            problems.append("Opciones duplicadas o id√©nticas.")

    if wc(it.get("stem", "")) < 6:
        problems.append("Enunciado demasiado corto.")

    return (not problems), problems

def to_moodle_xml(items, xml_path="equilibrado_IA.xml"):
    """Write ``items`` as a Moodle XML multichoice question bank.

    Each question gets a ``<name>`` (the item id, or ``Q<n>`` when it has
    none), its ``<questiontext>``, a ``<generalfeedback>`` holding the
    justification when there is one, and one ``<answer>`` per option with
    fraction 100 for the correct option and 0 for the others.

    Args:
        items: Dicts with ``stem``, ``options`` and ``correct_index``;
            ``id`` and ``justification`` are optional.
        xml_path: Destination file path.

    Returns:
        ``xml_path``.
    """
    soup = BeautifulSoup('<?xml version="1.0" encoding="UTF-8"?><quiz></quiz>', "xml")
    quiz = soup.find("quiz")

    def text_element(tag_name, content, **attrs):
        # Build <tag_name ...><text>content</text></tag_name>.
        outer = soup.new_tag(tag_name, **attrs)
        inner = soup.new_tag("text")
        inner.string = content
        outer.append(inner)
        return outer

    for n, it in enumerate(items, start=1):
        q = soup.new_tag("question", type="multichoice")
        # Moodle's XML format gives every question a <name>; use the item id.
        q.append(text_element("name", str(it.get("id") or f"Q{n}")))
        q.append(text_element("questiontext", it["stem"], format="html"))
        if it.get("justification"):
            q.append(text_element("generalfeedback", it["justification"], format="html"))
        for i, opt in enumerate(it["options"]):
            fraction = "100" if i == it["correct_index"] else "0"
            q.append(text_element("answer", opt, fraction=fraction))
        quiz.append(q)

    # Re-parse with lxml (recover mode) to emit a declared, pretty-printed document.
    xml_str = str(soup)
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(xml_str.encode("utf-8"), parser=parser)
    xml_bytes = etree.tostring(root, encoding="utf-8", xml_declaration=True, pretty_print=True)
    with open(xml_path, "wb") as f:
        f.write(xml_bytes)
    return xml_path

def to_pdf(items, pdf_path="equilibrado_IA.pdf"):
    """Build a review PDF listing every item, its options and its justification.

    Stems are numbered from 1 and printed in bold, options are lettered from
    ``a)``, the correct option carries a trailing mark, and the justification
    (when present) follows the options.

    Args:
        items: Dicts with ``stem``, ``options``, ``correct_index`` and an
            optional ``justification``.
        pdf_path: Destination file path.

    Returns:
        ``pdf_path``.
    """
    from xml.sax.saxutils import escape

    def safe(value):
        # Paragraph text is parsed as markup, so model-written text must have
        # &, < and > escaped to be shown literally.
        return escape(str(value))

    doc = SimpleDocTemplate(pdf_path, pagesize=A4)
    styles = getSampleStyleSheet()
    story = [Paragraph("<b>Banco de preguntas (IA)</b>", styles["Title"]), Spacer(1, 10)]
    for i, it in enumerate(items, start=1):
        story.append(Paragraph(f"<b>{i}. {safe(it['stem'])}</b>", styles["Normal"]))
        for j, opt in enumerate(it["options"]):
            mark = " ‚úÖ" if j == it["correct_index"] else ""
            story.append(Paragraph(f"{chr(97+j)}) {safe(opt)}{mark}", styles["Normal"]))
        if it.get("justification"):
            story.append(Paragraph(f"<i>Justificaci√≥n:</i> {safe(it['justification'])}", styles["Normal"]))
        story.append(Spacer(1, 8))
    doc.build(story)
    return pdf_path

# ===== Block-by-block generation =====
all_items = []
discarded = 0
# One seeded source hands out a different seed per item. Passing RANDOM_SEED
# itself to every call made every item go through the identical shuffle.
seed_source = random.Random(RANDOM_SEED)
for bi, ch in enumerate(tqdm(chunks, desc="Generando √≠tems IA"), start=1):
    try:
        items = llm_items_from_text(ch, block_id=f"B{bi}", n=ITEMS_PER_BLOCK)
        for it in items:
            # Validate before balancing: an out-of-range correct_index would
            # otherwise raise inside balance_and_shuffle and drop the rest of
            # the block.
            ok, _ = validate_item(it)
            if not ok:
                discarded += 1
                continue
            it = balance_and_shuffle(
                it, diff_threshold=THRESH_DIFF, seed=seed_source.getrandbits(32)
            )
            all_items.append(it)
    except Exception as e:
        # Best effort per block: report the failure and go on with the next one.
        print(f"‚ö†Ô∏è Bloque {bi}: error de generaci√≥n ‚Üí", e)
        traceback.print_exc()

print(f"‚úÖ √çtems v√°lidos totales: {len(all_items)}")
if discarded:
    print(f"Preguntas descartadas: {discarded}")

# ===== Export =====
# Write both output files, then report where each one was saved.
xml_path = to_moodle_xml(all_items, xml_path="equilibrado_IA.xml")
pdf_path = to_pdf(all_items, pdf_path="equilibrado_IA.pdf")
for _label, _produced in (("üì¶ XML:", xml_path), ("üìÑ PDF:", pdf_path)):
    print(_label, _produced)

# Downloads: XML first, then PDF.
for _produced in (xml_path, pdf_path):
    files.download(_produced)

