# 🧠 Generador IA de bancos Moodle XML + Equilibrado y PDF

Este cuaderno:
1) **Sube un documento** base (PDF/DOCX/TXT).
2) La IA **genera preguntas MCQ** (autocontenidas, 1 correcta + 2 distractores plausibles).
3) **Equilibra** longitudes de opciones (±4 palabras) y **baraja** opciones.
4) Exporta **Moodle XML** listo para importar y un **PDF** de revisión (✅ en la correcta).

**Uso:** ejecuta las celdas en orden. Cuando pida *subir archivo*, selecciona tu documento fuente.


In [None]:
!pip install openai==1.* beautifulsoup4 lxml reportlab pypdf python-docx tqdm --quiet

In [None]:
# Set the OpenAI API key for this runtime without storing it in the notebook:
# the key is requested through getpass only when OPENAI_API_KEY is not already
# present in the environment, and is kept in os.environ.
import os
from getpass import getpass

if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Pega tu OPENAI_API_KEY y pulsa Enter: ")
print("‚úÖ API key configurada en el entorno de ejecuci√≥n.")

‚úÖ API key configurada en el entorno de ejecuci√≥n.


In [None]:
# =========================
#  N√∫cleo: carga de documento, helpers IA, equilibrado, exportadores
# =========================
import os, json, random, re, time, traceback
from tqdm import tqdm
from pypdf import PdfReader
from docx import Document
from google.colab import files
from bs4 import BeautifulSoup
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from lxml import etree

# SDK OpenAI 1.x
from openai import OpenAI, RateLimitError

# ===== General parameters =====
THRESH_DIFF = 4            # max. word-count difference between the correct option and a distractor
RANDOM_SEED = 42           # use None for non-deterministic randomness
CHUNK_MAX_CHARS = 6000     # approximate size of each document chunk, in characters
OPENAI_MODEL = "gpt-4o-mini"  # model name passed to client.chat.completions.create
DEBUG_JSON = False         # default for the `debug` flag of llm_items_from_text

# Retries/backoff for calls to the model: _request_openai sleeps
# BACKOFF_BASE ** attempt seconds between attempts, up to MAX_RETRIES retries.
MAX_RETRIES = 4
BACKOFF_BASE = 2.0

# Seed the module-level RNG only when a seed was requested.
if RANDOM_SEED is not None:
    random.seed(RANDOM_SEED)

# ===== Load the source document =====
print("üìÅ Sube tu documento base (PDF/DOCX/TXT)")
up = files.upload()
# The first key of the upload result is used as the document path; this line
# raises IndexError when the result has no keys.
SRC = list(up.keys())[0]
print(f"‚úÖ Cargado: {SRC}")

def load_text(path):
    """Return the text of the document at ``path``.

    The reader is chosen from the (case-insensitive) file extension:
    ``.pdf`` files go through ``PdfReader`` and ``.docx`` files through
    ``Document``; anything else is read as UTF-8 text, ignoring bytes that
    cannot be decoded. Pages/paragraphs are joined with newlines.
    """
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        page_texts = []
        for page in PdfReader(path).pages:
            # extract_text() may yield nothing for a page; treat that as "".
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)

    if lowered.endswith(".docx"):
        paragraphs = Document(path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        content = handle.read()
    return content

# Extract the whole document text once; it is reused later (chunking and the
# offline generator both read raw_text). Stop early when only whitespace
# could be extracted.
raw_text = load_text(SRC)
assert raw_text.strip(), "El documento parece vac√≠o o no se pudo extraer texto."

def chunk_text(text, max_chars=10000):
    """Group the non-empty lines of ``text`` into chunks of about ``max_chars``.

    Lines are stripped, blank lines are dropped, and consecutive lines are
    joined with ``"\\n"`` while the chunk stays within ``max_chars``
    characters. A single line longer than ``max_chars`` is kept whole as its
    own chunk (it is never split).

    Args:
        text: Full document text.
        max_chars: Soft upper bound for the length of each chunk.

    Returns:
        A list of non-empty chunk strings, in document order.
    """
    chunks = []
    current = ""
    for raw_line in text.split("\n"):
        para = raw_line.strip()
        if not para:
            continue
        if not current:
            # Starting a new chunk: never flush an empty one, even when the
            # paragraph alone already exceeds the limit.
            current = para
        elif len(current) + 1 + len(para) <= max_chars:
            current = f"{current}\n{para}"
        else:
            chunks.append(current)
            current = para
    if current:
        chunks.append(current)
    return chunks

# Split the document into chunks of roughly CHUNK_MAX_CHARS characters and
# report how many were produced.
chunks = chunk_text(raw_text, max_chars=CHUNK_MAX_CHARS)
print(f"üß© Bloques de texto creados: {len(chunks)} (‚âà{CHUNK_MAX_CHARS} chars c/u)")

# ===== OpenAI client =====
# Reuse the key from the environment; prompt for it (and store it in
# os.environ) only when it is missing.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    from getpass import getpass
    api_key = getpass("Pega tu OPENAI_API_KEY y pulsa Enter: ")
    os.environ["OPENAI_API_KEY"] = api_key
client = OpenAI(api_key=api_key)

# System message sent with every request made by _request_openai.
SYSTEM_PROMPT = "Eres un generador de √≠tems universitarios, preciso, en espa√±ol, y devuelves JSON v√°lido."

# IMPORTANT: literal JSON braces are doubled ({{ }}) so that str.format only
# substitutes the {n} and {content} placeholders (see llm_items_from_text).
USER_PROMPT_TMPL = """
Eres un generador de preguntas universitarias en psicolog√≠a del lenguaje/lectura.
Crea {n} preguntas tipo test (MCQ) AUTOCONTENIDAS a partir del CONTENIDO. Nivel: universitario.
- 1 correcta + 2 distractores plausibles (sin 'todas las anteriores' / 'ninguna').
- Redacci√≥n clara, sin ambig√ºedad; NO dependas de "seg√∫n el texto".
- Incluye justificaci√≥n breve (1‚Äì2 frases) para la correcta.
- RESPONDE √öNICAMENTE con JSON. NO incluyas explicaciones; NO uses ```json ni fences.

Estructura EXACTA:
{{
  "items": [
    {{
      "id": "BLOQUE1-Q1",
      "stem": "ENUNCIADO AUTOCONTENIDO...",
      "options": ["A...", "B...", "C..."],
      "correct_index": 1,
      "justification": "Por qu√© es correcta...",
      "difficulty": "media",
      "tags": ["efectos de priming","l√©xico"]
    }}
  ]
}}

CONTENIDO:
{content}
"""

def wc(s):
    """Count the whitespace-separated words in ``s`` (``None``/empty -> 0)."""
    normalized = re.sub(r"\s+", " ", s or "").strip()
    return sum(1 for token in normalized.split(" ") if token)

# --- Limpieza de JSON devuelto por el modelo ---
def _strip_code_fences(s: str) -> str:
    s = s.strip()
    m = re.match(r"^```(?:json)?\s*([\s\S]*?)\s*```$", s, flags=re.I)
    return m.group(1).strip() if m else s

def _extract_json_object(s: str) -> str:
    start = s.find("{")
    if start == -1: return s
    depth = 0
    for i, ch in enumerate(s[start:], start=start):
        if ch == "{": depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0: return s[start:i+1]
    return s

def _request_openai(prompt: str, retries=MAX_RETRIES):
    """Send ``prompt`` to the chat completions API with retries and backoff.

    Each loop iteration first requests a JSON-object response. Handling:

    * ``RateLimitError``: a message containing ``insufficient_quota`` aborts
      with ``RuntimeError``; otherwise the call is retried after sleeping
      ``BACKOFF_BASE ** attempt`` seconds, and the error is re-raised once
      ``attempt`` reaches ``retries``.
    * Any other exception: the same request is immediately repeated without
      ``response_format``. If that also fails, the same quota check and
      backoff apply (this inner handler also receives a ``RateLimitError``
      raised by the second request).

    Args:
        prompt: User message content.
        retries: Number of backoff retries allowed before re-raising.

    Returns:
        The object returned by ``client.chat.completions.create``.

    Raises:
        RuntimeError: When the error message reports ``insufficient_quota``.
        Exception: The last API error once the retries are exhausted.
    """
    attempt = 0
    while True:
        try:
            # Attempt 1: with response_format=json_object
            return client.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[{"role":"system","content":SYSTEM_PROMPT},
                          {"role":"user","content":prompt}],
                temperature=0.4,
                response_format={"type":"json_object"}
            )
        except RateLimitError as e:
            msg = str(e)
            if "insufficient_quota" in msg:
                raise RuntimeError("‚ùå Sin cr√©dito en la API (insufficient_quota). Revisa plan y billing.") from e
            if attempt >= retries:
                raise
            # Exponential backoff: BACKOFF_BASE ** 0, ** 1, ** 2, ... seconds.
            sleep_s = BACKOFF_BASE ** attempt
            print(f"‚è≥ Rate limit. Reintentando en {sleep_s:.1f}s (intento {attempt+1}/{retries})‚Ä¶")
            time.sleep(sleep_s); attempt += 1
        except Exception as e:
            # Attempt 2: same request without response_format
            try:
                return client.chat.completions.create(
                    model=OPENAI_MODEL,
                    messages=[{"role":"system","content":SYSTEM_PROMPT},
                              {"role":"user","content":prompt}],
                    temperature=0.4
                )
            except Exception as e2:
                msg = str(e2)
                if "insufficient_quota" in msg:
                    raise RuntimeError("‚ùå Sin cr√©dito en la API (insufficient_quota). Revisa plan y billing.") from e2
                if attempt >= retries:
                    raise
                sleep_s = BACKOFF_BASE ** attempt
                print(f"‚è≥ Reintentando en {sleep_s:.1f}s (intento {attempt+1}/{retries})‚Ä¶")
                time.sleep(sleep_s); attempt += 1

def llm_items_from_text(content, block_id="B1", n=6, debug=DEBUG_JSON):
    """Ask the model for ``n`` MCQ items about ``content`` and normalise them.

    The reply is parsed as JSON; if that fails, code fences are stripped and
    the first JSON object is extracted before parsing again. Items that are
    malformed (not a dict, not exactly three options, or a ``correct_index``
    that cannot be converted to ``int``) are skipped instead of aborting the
    whole block.

    Args:
        content: Source text for the questions.
        block_id: Prefix for generated ids when the model gives none.
        n: Number of questions requested.
        debug: Print the raw/cleaned reply when the first parse fails.

    Returns:
        A list of dicts with keys ``id``, ``stem``, ``options``,
        ``correct_index``, ``justification``, ``difficulty`` and ``tags``.

    Raises:
        json.JSONDecodeError: When the reply cannot be parsed even after
            clean-up.
    """
    prompt = USER_PROMPT_TMPL.format(content=content, n=n)
    resp = _request_openai(prompt)
    # The message content may be None; treat that as an empty reply so the
    # failure surfaces as a JSON error rather than an AttributeError.
    raw = resp.choices[0].message.content or ""

    # Tolerant parsing
    try:
        data = json.loads(raw)
    except ValueError:
        cleaned = _extract_json_object(_strip_code_fences(raw))
        if debug:
            print("DEBUG raw[:400]:", raw[:400])
            print("DEBUG cleaned[:400]:", cleaned[:400])
        data = json.loads(cleaned)

    # A bare array is accepted as the list of items; any other non-object
    # payload carries no items.
    if isinstance(data, list):
        data = {"items": data}
    if not isinstance(data, dict):
        return []

    # Tolerate keys polluted with stray newlines/spaces
    if "items" not in data:
        for k in list(data.keys()):
            if "items" in str(k).replace("\n", "").replace(" ", ""):
                data["items"] = data.pop(k)
                break

    items = data.get("items", [])
    if not isinstance(items, list):
        return []

    norm = []
    for i, it in enumerate(items, start=1):
        if not isinstance(it, dict):
            continue
        opts = it.get("options", [])
        if not isinstance(opts, (list, tuple)) or len(opts) != 3:
            continue
        try:
            ci = int(it.get("correct_index", 0))
        except (TypeError, ValueError):
            # Answer key is unusable (null, text, ...): drop the item.
            continue
        norm.append({
            "id": it.get("id") or f"{block_id}-Q{i}",
            "stem": str(it.get("stem", "")).strip(),
            "options": [str(o).strip() for o in opts],
            # Out-of-range indices fall back to the first option.
            "correct_index": ci if 0 <= ci < 3 else 0,
            "justification": str(it.get("justification", "")).strip(),
            "difficulty": it.get("difficulty", "media"),
            "tags": it.get("tags", []),
        })
    return norm

def balance_and_shuffle(item, diff_threshold=THRESH_DIFF, seed=RANDOM_SEED):
    """Pad short distractors and shuffle the options of ``item`` in place.

    A distractor is padded with one filler sentence when the correct option
    is longer than it by more than ``diff_threshold`` words. The options are
    then shuffled and ``correct_index`` is updated to follow the correct one.

    The random generator is derived from ``seed`` together with the item's
    ``id`` and ``stem``: results stay reproducible for a given seed, but
    different items get different permutations. (Seeding a new generator
    with the bare ``seed`` on every call would give every item the same
    permutation.)

    Args:
        item: Dict with ``options`` (list of str) and ``correct_index``.
        diff_threshold: Max. tolerated word-count gap before padding.
        seed: Base seed; ``None`` uses a non-deterministic generator.

    Returns:
        The same ``item`` dict, mutated.
    """
    if seed is None:
        rnd = random.Random()
    else:
        # String seeds are hashed deterministically by random.Random.
        rnd = random.Random(f"{seed}|{item.get('id', '')}|{item.get('stem', '')}")

    opts = item["options"]
    ci = item["correct_index"]
    correct_len = wc(opts[ci])
    new_opts = list(opts)
    for i, opt in enumerate(new_opts):
        if i == ci:
            continue
        if (correct_len - wc(opt)) > diff_threshold:
            extra = rnd.choice([
                " Este patr√≥n se ha descrito en estudios de priming y decisi√≥n l√©xica.",
                " La literatura lo vincula con activaci√≥n competitiva y control inhibitorio.",
                " Se replica en lectores con distintos niveles de proficiencia."
            ])
            new_opts[i] = opt.strip() + extra

    # Carry an "is correct" flag through the shuffle to recover the index.
    pairs = [(o, i == ci) for i, o in enumerate(new_opts)]
    rnd.shuffle(pairs)
    item["options"] = [text for text, _ in pairs]
    item["correct_index"] = next(i for i, (_, is_correct) in enumerate(pairs) if is_correct)
    return item

def validate_item(it):
    """Return True when ``it`` looks like a usable three-option question.

    Requirements: exactly three options, ``correct_index`` within 0..2,
    three distinct options (compared trimmed and lower-cased) and a stem of
    at least six words.
    """
    has_three_options = len(it.get("options", [])) == 3
    index_in_range = 0 <= it.get("correct_index", -1) < 3
    well_formed = has_three_options and index_in_range

    if well_formed:
        distinct = {option.strip().lower() for option in it["options"]}
        well_formed = len(distinct) >= 3

    stem_long_enough = wc(it.get("stem", "")) >= 6
    return well_formed and stem_long_enough

def to_moodle_xml(items, xml_path="equilibrado_IA.xml"):
    """Write ``items`` as a Moodle XML quiz of ``multichoice`` questions.

    Every question gets a ``<name>`` (the item ``id``, or ``Q<position>``
    when it has none), the stem as ``<questiontext format="html">`` and one
    ``<answer>`` per option with ``fraction="100"`` on the correct one and
    ``"0"`` on the others.

    Args:
        items: Iterable of item dicts with ``stem``, ``options`` and
            ``correct_index`` (``id`` optional).
        xml_path: Destination file path.

    Returns:
        ``xml_path``.
    """
    soup = BeautifulSoup('<?xml version="1.0" encoding="UTF-8"?><quiz></quiz>', "xml")
    quiz = soup.find("quiz")
    for pos, it in enumerate(items, start=1):
        q = soup.new_tag("question", type="multichoice")

        # The Moodle XML format gives each question a <name><text> element.
        name = soup.new_tag("name")
        name_text = soup.new_tag("text")
        name_text.string = str(it.get("id") or f"Q{pos}")
        name.append(name_text)
        q.append(name)

        qt = soup.new_tag("questiontext", format="html")
        qt_text = soup.new_tag("text")
        qt_text.string = it["stem"]
        qt.append(qt_text)
        q.append(qt)

        for i, opt in enumerate(it["options"]):
            ans = soup.new_tag("answer", fraction="100" if i == it["correct_index"] else "0")
            at = soup.new_tag("text")
            at.string = opt
            ans.append(at)
            q.append(ans)
        quiz.append(q)

    # Re-parse with lxml (recover mode) to pretty-print the final document.
    xml_str = str(soup)
    root = etree.fromstring(xml_str.encode("utf-8"), parser=etree.XMLParser(recover=True))
    with open(xml_path, "wb") as f:
        f.write(etree.tostring(root, encoding="utf-8", xml_declaration=True, pretty_print=True))
    return xml_path

def to_pdf(items, pdf_path="equilibrado_IA.pdf"):
    """Render ``items`` to a review PDF, marking the correct option.

    Stems, options and justifications are XML-escaped before being placed in
    ``Paragraph`` markup, so characters such as ``<``, ``>`` or ``&`` in the
    generated text are shown literally instead of being parsed as tags.

    Args:
        items: Iterable of item dicts with ``stem``, ``options``,
            ``correct_index`` and optionally ``justification``.
        pdf_path: Destination file path.

    Returns:
        ``pdf_path``.
    """
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(pdf_path, pagesize=A4)
    styles = getSampleStyleSheet()
    normal = styles["Normal"]
    story = [Paragraph("<b>Banco de preguntas (IA)</b>", styles["Title"]), Spacer(1, 10)]
    for i, it in enumerate(items, start=1):
        stem = escape(str(it["stem"]))
        story.append(Paragraph(f"<b>{i}. {stem}</b>", normal))
        for j, opt in enumerate(it["options"]):
            mark = " ‚úÖ" if j == it["correct_index"] else ""
            # chr(97 + j) yields the option letters a, b, c, ...
            story.append(Paragraph(f"{chr(97+j)}) {escape(str(opt))}{mark}", normal))
        if it.get("justification"):
            justification = escape(str(it["justification"]))
            story.append(Paragraph(f"<i>Justificaci√≥n:</i> {justification}", normal))
        story.append(Spacer(1, 8))
    doc.build(story)
    return pdf_path


üìÅ Sube tu documento base (PDF/DOCX/TXT)


Saving Neurociencia-del-lenguaje.pdf to Neurociencia-del-lenguaje (11).pdf
‚úÖ Cargado: Neurociencia-del-lenguaje (11).pdf
üß© Bloques de texto creados: 14 (‚âà6000 chars c/u)


In [None]:
# =========================
#  Celda 4 PRO (compatible con Colab, sin versiones fijas)
#  - Instala paquetes si faltan
#  - Descarga el modelo de spaCy espa√±ol (md o sm) autom√°ticamente
#  - Fallback si no hay YAKE o modelo spaCy
#  - Genera items mejores sin usar API y exporta XML+PDF
# =========================
import sys, subprocess, pkgutil, os, re, random
from collections import Counter
from datetime import datetime

def ensure(pkg, module=None):
    """Install ``pkg`` with pip unless its module can already be imported.

    The import name is not always the pip name (``scikit-learn`` is imported
    as ``sklearn``), so the check uses ``module`` when given, else a small
    table of known mismatches, else ``pkg`` with ``-`` replaced by ``_``.

    Args:
        pkg: Distribution name passed to ``pip install``.
        module: Optional import name to look for instead of deriving it.

    Raises:
        subprocess.CalledProcessError: When ``pip install`` fails.
    """
    import importlib.util

    known_import_names = {
        "scikit-learn": "sklearn",
        "beautifulsoup4": "bs4",
        "python-docx": "docx",
        "pyyaml": "yaml",
        "pillow": "PIL",
    }
    import_name = module or known_import_names.get(pkg.lower(), pkg.replace("-", "_"))
    try:
        found = importlib.util.find_spec(import_name) is not None
    except (ImportError, ValueError):
        # find_spec raises for some unusual module states; treat as missing.
        found = False
    if found:
        return
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()

# 1) Make sure the dependencies are available
ensure("spacy")
ensure("scikit-learn")
# YAKE is optional: any failure while installing or importing it just turns
# the YAKE-based keyword extraction off (YAKE_OK = False).
try:
    ensure("yake")
    import yake
    YAKE_OK = True
except Exception:
    YAKE_OK = False
    yake = None

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 2) Cargar modelo de spaCy (md -> sm -> multiling√ºe) sin fijar versiones
def load_es_model():
    """Load the first available spaCy model, downloading it if needed.

    Candidates are tried in order: ``es_core_news_md``, ``es_core_news_sm``,
    ``xx_sent_ud_sm``. When loading fails with ``OSError`` the model is
    downloaded and loaded again; if that also fails the next candidate is
    tried.

    Returns:
        ``(nlp, model_name)`` for the first model that loads, or
        ``(None, None)`` when none could be loaded.
    """
    for model in ("es_core_news_md", "es_core_news_sm", "xx_sent_ud_sm"):
        try:
            return spacy.load(model), model
        except OSError:
            try:
                # Import the downloader under its own name. A function-level
                # ``import spacy.cli`` would bind ``spacy`` as a local
                # variable and make the ``spacy.load`` call above fail with
                # UnboundLocalError.
                from spacy.cli import download as _download_model
                _download_model(model)
                return spacy.load(model), model
            except (Exception, SystemExit):
                # NOTE(review): SystemExit is caught on the assumption that
                # the spaCy downloader may exit the interpreter on failure;
                # confirm against the installed spaCy version.
                continue
    return None, None

# Load the spaCy pipeline; model_name is recorded later in each item's tags.
nlp, model_name = load_es_model()
if nlp is None:
    raise RuntimeError("No pude cargar ning√∫n modelo de spaCy. ¬øBloqueado internet del runtime?")

# Fixed seed for the module-level RNG used by the offline generator below.
random.seed(42)

# ---- Texto base viene de la Celda 3 ----
def clean_spaces(s):
    """Collapse every run of whitespace into one space and trim the ends."""
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
# Whitespace-normalised copy of the document text loaded by the earlier cell.
doc_text = clean_spaces(raw_text)

# ---- Segmentaci√≥n en oraciones con spaCy (fallback por regex si hiciera falta) ----
def split_sentences(text, min_words=10, max_words=55):
    """Split ``text`` into sentences of ``min_words``..``max_words`` words.

    spaCy segmentation is tried first. When it fails, or yields fewer than
    40 sentences, a regex split on ``.``, ``!`` or ``?`` followed by
    whitespace is added and the results are merged without duplicates,
    keeping order. Both sources apply the same word-count bounds.

    Args:
        text: Document text.
        min_words: Minimum number of words per sentence.
        max_words: Maximum number of words per sentence.

    Returns:
        A list of whitespace-normalised sentences.
    """
    def _within_bounds(candidate):
        return min_words <= len(candidate.split()) <= max_words

    sents = []
    try:
        doc = nlp(text)
        for span in doc.sents:
            sentence = clean_spaces(span.text)
            if _within_bounds(sentence):
                sents.append(sentence)
    except Exception:
        # Best effort: keep whatever was collected and rely on the regex
        # fallback below.
        pass

    if len(sents) < 40:
        sents_alt = []
        for piece in re.split(r"[\.!?]\s+", text):
            candidate = clean_spaces(piece)
            if _within_bounds(candidate):
                sents_alt.append(candidate)
        # dict.fromkeys removes duplicates while preserving order.
        sents = list(dict.fromkeys(sents + sents_alt))
    return sents

# Candidate sentences for item generation; stop when none qualifies.
sents = split_sentences(doc_text)
assert sents, "No se encontraron oraciones adecuadas. Baja min_words o usa otro documento."

# ---- Global keywords ----
# Stop words excluded by the regex/Counter keyword fallbacks and by the noun
# fallback of extract_head_span.
STOP = set("""
de la que el los un una y o en a para por con sin sobre entre como m√°s menos muy
este esta estos estas ese esa esos esas aquel aquella aquellos aquellas lo al del
""".split())

def extract_global_kws(text, top=60):
    """Return up to ``top`` keywords for the whole document.

    With YAKE available, single-word keywords are extracted and those of
    3 to 30 characters are kept. If YAKE is unavailable or fails, the most
    frequent lower-cased alphabetic tokens are returned instead, excluding
    stop words and tokens of one or two characters.
    """
    if YAKE_OK:
        try:
            extractor = yake.KeywordExtractor(lan="es", n=1, top=top)
            candidates = [keyword for keyword, _score in extractor.extract_keywords(text)]
            return [keyword for keyword in candidates if 3 <= len(keyword) <= 30]
        except Exception:
            pass

    # Fallback: plain frequency ranking over the whole document
    frequencies = Counter(
        token
        for token in re.findall(r"[A-Za-z√Å√â√ç√ì√ö√ú√ë√°√©√≠√≥√∫√º√±]+", text.lower())
        if token not in STOP and len(token) > 2
    )
    return [token for token, _count in frequencies.most_common(top)]

# Document-wide keyword pool used by keywords_for and item_cloze.
global_kws = extract_global_kws(doc_text, top=60)

def wc(s):
    """Count the whitespace-separated words in ``s``.

    ``None`` and the empty string count as 0, matching the earlier ``wc``
    definition in this notebook that this one replaces.
    """
    return len([w for w in re.sub(r"\s+", " ", s or "").split() if w])

def pad_balance(base_text, target_len):
    """Append random filler sentences until the text has ``target_len`` words.

    The text is trimmed first and returned untouched when it is already long
    enough. Fillers are drawn with the module-level ``random`` generator.
    """
    fillers = [
        " Este matiz se ha observado en estudios revisados por pares.",
        " Se describe en la literatura especializada con resultados consistentes.",
        " El fen√≥meno puede modularse por la tarea y el nivel de pericia.",
        " El patr√≥n se replica en contextos metodol√≥gicos diversos."
    ]
    padded = base_text.strip()
    while True:
        if wc(padded) >= target_len:
            return padded
        padded = padded + random.choice(fillers)

# ---- Distractors and templates ----
# Word pairs swapped by flip_polarity.
ANTONYM_MAP = [
    ("aumenta","disminuye"), ("incrementa","reduce"),
    ("facilita","dificulta"), ("mejora","empeora"),
    ("r√°pido","lento"), ("eficiente","ineficiente"),
    ("preciso","impreciso"), ("automatizado","manual")
]
# Context phrases appended to distractors by mk_distractors_from_sentence.
DOMAIN_HINTS = [
    "en tareas de decisi√≥n l√©xica",
    "seg√∫n la evidencia de potenciales evocados",
    "en paradigmas de priming sem√°ntico",
    "en lectores con distinto nivel de proficiencia",
    "seg√∫n meta-an√°lisis recientes",
]

def flip_polarity(txt, pairs=None):
    """Swap each word of an antonym pair for its counterpart in ``txt``.

    All replacements happen in a single pass, so a word that was just
    substituted is never swapped back (applying ``a -> b`` and then
    ``b -> a`` in sequence would undo the first replacement). Matching is
    case-insensitive on whole words; the replacement is inserted as written
    in the pair.

    Args:
        txt: Text to transform.
        pairs: Iterable of ``(word, antonym)`` tuples; defaults to
            ``ANTONYM_MAP``.

    Returns:
        The transformed text, or ``txt`` unchanged when nothing matches.
    """
    if pairs is None:
        pairs = ANTONYM_MAP
    swap = {}
    for a, b in pairs:
        swap[a.lower()] = b
        swap[b.lower()] = a
    if not swap:
        return txt
    # Longest alternatives first so a shorter word cannot pre-empt a longer one.
    alternatives = "|".join(re.escape(w) for w in sorted(swap, key=len, reverse=True))
    pattern = re.compile(rf"\b(?:{alternatives})\b", flags=re.I)
    return pattern.sub(lambda m: swap.get(m.group(0).lower(), m.group(0)), txt)

def mk_distractors_from_sentence(sent, kws):
    """Build two distractor statements from ``sent``.

    The first is the polarity-flipped sentence (or a generic
    over-generalisation statement when flipping changes nothing); the second
    asserts a causal link between the first two keywords (or a generic causal
    claim when fewer than two are given). Each gets a random domain hint
    appended; the second hint avoids any hint text already present in the
    first distractor when possible.
    """
    flipped = flip_polarity(sent)
    if flipped == sent:
        first = "Generaliza el enunciado omitiendo condiciones y l√≠mites del contexto descrito."
    else:
        first = flipped
    first = first + " " + random.choice(DOMAIN_HINTS)

    if len(kws) < 2:
        second = "Asume una relaci√≥n causal donde el texto solo indica covariaci√≥n limitada."
    else:
        second = f"Plantea causalidad directa entre ¬´{kws[0]}¬ª y ¬´{kws[1]}¬ª, aunque el texto sugiere asociaci√≥n parcial."
    unused_hints = [hint for hint in DOMAIN_HINTS if hint not in first]
    second = second + " " + random.choice(unused_hints or DOMAIN_HINTS)

    return first.strip(), second.strip()

def item_definicion(span_text, span_head):
    """Build a "definition" item for the concept ``span_head``.

    Returns a ``(stem, correct, distractor_1, distractor_2)`` tuple in which
    every option starts with the capitalised concept followed by a colon.
    """
    label = span_head.capitalize()
    stem = f"¬øCu√°l opci√≥n define con mayor precisi√≥n el concepto ¬´{span_head}¬ª?"
    options = (
        f"{label}: {span_text}",
        f"{label}: confunde el fen√≥meno con un correlato metodol√≥gico.",
        f"{label}: exagera la generalidad sin apoyo emp√≠rico.",
    )
    return (stem, *options)

def item_cloze(sentence, key, candidates=None):
    """Build a fill-in-the-blank item by masking ``key`` in ``sentence``.

    When ``key`` does not occur in the sentence, the first candidate keyword
    that does occur is masked instead, so the stem contains a blank whenever
    any candidate allows it. If no candidate occurs either, the first
    candidate different from ``key`` is used as the answer and the sentence
    is left unmasked.

    Distractors are drawn at random from the remaining candidates, compared
    case-insensitively with the answer because masking is case-insensitive.

    Args:
        sentence: Source sentence.
        key: Preferred term to blank out.
        candidates: Keyword pool for fallbacks and distractors; defaults to
            the module-level ``global_kws``.

    Returns:
        ``(stem, correct, distractor_1, distractor_2)``.
    """
    pool_source = global_kws if candidates is None else list(candidates)

    def _mask(term):
        return re.sub(rf"\b{re.escape(term)}\b", "_____", sentence, flags=re.I)

    masked = _mask(key)
    if masked == sentence:
        # Key not found: look for a candidate that is present in the sentence.
        fallback = None
        for cand in pool_source:
            if cand == key:
                continue
            if fallback is None:
                fallback = cand
            attempt = _mask(cand)
            if attempt != sentence:
                fallback, masked = cand, attempt
                break
        key = fallback if fallback is not None else "t√©rmino"

    stem = f"Complete el enunciado con el t√©rmino que mejor mantiene su sentido: {masked}"
    correct = key
    pool = [k for k in pool_source if k.lower() != key.lower()]
    d1 = random.choice(pool or ["concepto"])
    d2 = random.choice([k for k in pool if k != d1] or ["procedimiento"])
    return stem, correct, d1, d2

def item_excepcion(sentence, topic):
    """Build a "which option is NOT coherent" item around ``sentence``.

    Returns ``(stem, correct, distractor_1, distractor_2)``; the stem quotes
    the sentence and the second distractor mentions ``topic``.
    """
    question = f"¬øCu√°l opci√≥n NO es coherente con el alcance del siguiente enunciado?: {sentence}"
    answer = "Introduce una excepci√≥n que el texto no contempla, alterando el alcance interpretativo."
    keeps_meaning = "Mantiene el significado y los l√≠mites propuestos en el enunciado original."
    rephrases = f"Reformula el contenido manteniendo condiciones sobre {topic}."
    return question, answer, keeps_meaning, rephrases

def item_causal(sentence, kws):
    """Build a "relation between elements" item from the keywords ``kws``.

    With at least two keywords the options mention the first two; otherwise
    generic wording is used. ``sentence`` is accepted for signature parity
    with the other templates and is not used.

    Returns ``(stem, correct, distractor_1, distractor_2)``.
    """
    stem = "¬øCu√°l opci√≥n formula adecuadamente la relaci√≥n entre los elementos descritos?"

    if len(kws) < 2:
        return (
            stem,
            "Formula una relaci√≥n condicional moderada, acorde con la evidencia.",
            "Atribuye causalidad absoluta sin soporte emp√≠rico.",
            "Descarta toda relaci√≥n pese a coincidencias reportadas.",
        )

    a = kws[0]
    b = kws[1]
    return (
        stem,
        f"Asocia ¬´{a}¬ª y ¬´{b}¬ª de modo condicionado, sin asumir causalidad fuerte.",
        f"Afirma causalidad directa entre ¬´{a}¬ª y ¬´{b}¬ª sin evidencia.",
        f"Niega toda relaci√≥n entre ¬´{a}¬ª y ¬´{b}¬ª ignorando coincidencias observadas.",
    )

def extract_head_span(doc):
    """Pick a head phrase for the parsed sentence ``doc``.

    Preference order: the noun chunk with the longest text (whitespace
    normalised); otherwise the most frequent NOUN/PROPN token that is not a
    stop word; otherwise the literal ``"concepto"``.
    """
    chunks = list(doc.noun_chunks)
    if not chunks:
        nouns = [
            token.text
            for token in doc
            if token.pos_ in {"NOUN","PROPN"} and token.text.lower() not in STOP
        ]
        if not nouns:
            return "concepto"
        return Counter(nouns).most_common(1)[0][0]

    longest = max(chunks, key=lambda chunk: len(chunk.text))
    return clean_spaces(longest.text)

def keywords_for(sentence, topn=5):
    """Return keywords for ``sentence``, topped up from the global pool.

    Sentence-level keywords (YAKE when available, otherwise the most frequent
    non-stop-word tokens) come first, followed by ``global_kws``; duplicates
    are removed keeping order and the list is cut to ``max(3, topn)``.
    """
    limit = max(3, topn)

    def _merge_with_global(local_kws):
        return list(dict.fromkeys(local_kws + global_kws))[:limit]

    if YAKE_OK:
        try:
            extractor = yake.KeywordExtractor(lan="es", n=1, top=topn)
            local = [keyword for keyword, _score in extractor.extract_keywords(sentence)]
            return _merge_with_global(local)
        except Exception:
            pass

    # Simple fallback
    tokens = [
        word.lower()
        for word in re.findall(r"[A-Za-z√Å√â√ç√ì√ö√ú√ë√°√©√≠√≥√∫√º√±]+", sentence)
    ]
    tokens = [tok for tok in tokens if tok not in STOP and len(tok) > 2]
    frequent = [tok for tok, _count in Counter(tokens).most_common(topn)]
    return _merge_with_global(frequent)

def build_item(sentence, template_cycle=0):
    """Create one offline item from ``sentence``.

    ``template_cycle % 4`` selects the template (definition, cloze,
    exception, causal). Both distractors are padded up to the word count of
    the longest option, the three options are shuffled with the module-level
    RNG, and the index of the correct one is recorded.

    Returns:
        Dict with ``stem``, ``options``, ``correct_index``, ``justification``,
        ``difficulty`` and ``tags``.
    """
    try:
        parsed = nlp(sentence)
        head = extract_head_span(parsed)
    except Exception:
        head = "concepto"
    kws = keywords_for(sentence)

    # One lazy builder per template so that only the selected one runs.
    builders = (
        lambda: item_definicion(sentence, head),
        lambda: item_cloze(sentence, kws[0] if kws else head),
        lambda: item_excepcion(sentence, head),
        lambda: item_causal(sentence, kws),
    )
    stem, correct, first, second = builders[template_cycle % 4]()

    longest = max(wc(correct), wc(first), wc(second))
    first = pad_balance(first, longest)
    second = pad_balance(second, longest)

    options = [correct, first, second]
    random.shuffle(options)

    item = {}
    item["stem"] = stem
    item["options"] = options
    item["correct_index"] = options.index(correct)
    item["justification"] = "La opci√≥n correcta conserva el sentido original; los distractores alteran matices o causalidad."
    item["difficulty"] = "media"
    item["tags"] = ["offline-pro", f"spacy:{model_name}", f"yake:{YAKE_OK}"]
    return item

def deduplicate_items(items, max_sim=0.92):
    """Drop items whose stem is too similar to an earlier kept stem.

    Stems are vectorised once with TF-IDF (unigrams and bigrams). Items are
    visited in order, and an item is kept only when its cosine similarity to
    every previously kept stem is at most ``max_sim``.

    Args:
        items: List of item dicts with a ``stem`` key.
        max_sim: Similarity above which an item counts as a duplicate.

    Returns:
        The kept items in their original order (``items`` itself when empty).
    """
    if not items:
        return items
    stems = [it["stem"] for it in items]
    # Vectorise all stems in one call instead of re-transforming each stem
    # for every pairwise comparison.
    matrix = TfidfVectorizer(min_df=1, ngram_range=(1, 2)).fit_transform(stems)
    keep = []
    for i in range(len(items)):
        # List indexing keeps the row two-dimensional.
        if keep and cosine_similarity(matrix[[i]], matrix[keep]).max() > max_sim:
            continue
        keep.append(i)
    return [items[i] for i in keep]

# ---------- Generate the bank ----------
TARGET_ITEMS = 200  # ADJUST HERE
items = []
# Build up to twice the target so that de-duplication has spare items; the
# enumerate index doubles as the template selector and the id suffix.
for i,s in enumerate(sents):
    it = build_item(s, template_cycle=i)
    it["id"] = f"OFFPRO-{i+1}"
    items.append(it)
    if len(items) >= TARGET_ITEMS * 2:
        break

# Remove near-duplicate stems, then cut down to the target size.
items = deduplicate_items(items, max_sim=0.92)
if len(items) > TARGET_ITEMS:
    items = items[:TARGET_ITEMS]

print(f"‚úÖ √çtems OFFLINE PRO generados: {len(items)}")

# ---------- Export with the functions defined in the earlier cell ----------
# File names carry the item count and a timestamp.
stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
xml_name = f"banco_OFFPRO_{len(items)}_{stamp}.xml"
pdf_name = f"banco_OFFPRO_{len(items)}_{stamp}.pdf"

xml_path = to_moodle_xml(items, xml_path=xml_name)
pdf_path = to_pdf(items, pdf_path=pdf_name)
print("üì¶ XML:", xml_path)
print("üìÑ PDF:", pdf_path)

# Hand both generated files to the browser through the Colab helper.
from google.colab import files
files.download(xml_path)
files.download(pdf_path)

