PASAR TEXTO A FORMATO CoNLL-U

In [43]:
# Install the dependencies listed in requirements.txt into the kernel's environment.
%pip install -r requirements.txt

Collecting en_core_web_sm@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85 (from -r requirements.txt (line 35))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------ --------------------------------- 2.1/12.8 MB 29.6 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 32.6 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 27.7 MB/s  0:00:00
Collecting es_core_news_sm@ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl#sha256=e451a83d6df79b87e9eed0cb553f03e99e36a3bab18a7b79f0dcfd1fdf875e12 (from -r requirements.txt (line 36))
  Downloading https://github.com/e

In [44]:
# Snapshot the current environment into requirements.txt.
# NOTE(review): this overwrites the same file the install cell reads from — confirm
# that is intended before re-running the notebook top to bottom.
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


IMPORTS

In [45]:
import re
import stanza
from typing import List, Dict, Tuple

INICIALIZAR STANZA (ejecutarlo solo una vez)

In [46]:
# Language and annotation stages used by the Spanish pipeline.
STANZA_LANG = "es"
STANZA_PROCESSORS = ("tokenize", "mwt", "pos", "lemma", "depparse")

# Fetch the Spanish models, then build the shared pipeline on CPU.
stanza.download(STANZA_LANG)

nlp = stanza.Pipeline(
    lang=STANZA_LANG,
    processors=",".join(STANZA_PROCESSORS),
    tokenize_pretokenized=False,  # raw text in; Stanza does the tokenisation
    use_gpu=False,
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 5.33MB/s]                    
2025-12-30 19:16:28 INFO: Downloaded file to C:\Users\ivire\stanza_resources\resources.json
2025-12-30 19:16:28 INFO: Downloading default packages for language: es (Spanish) ...
2025-12-30 19:16:30 INFO: File exists: C:\Users\ivire\stanza_resources\es\default.zip
2025-12-30 19:16:46 INFO: Finished downloading models and saved to C:\Users\ivire\stanza_resources
2025-12-30 19:16:49 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 1.78MB/s]                    
2025-12-30 19:16:50 INFO: Downloaded file to C:\Users\ivire\stanza_resources\resources.json
2025-12-30 19:16:53 INFO: Loading the

LISTA BLANCA DE IMPERATIVOS CON CLÍTICOS

In [66]:
# Whitelist of imperative forms with attached clitics.
#
# Each key is a regular expression for one verb; each value describes how the
# verb part is rewritten and annotated:
#   "base"  - surface form emitted in place of the verb part
#   "lemma" - lemma forced onto that form in the CoNLL-U output
#   "upos"  - universal POS tag forced onto that form
#
# The clitics MUST sit inside capture groups: the preprocessor builds the clitic
# list from the captured groups only, and a match without captured clitics is
# left untouched.
IMPERATIVE_MAP = {
    r"(?:C|c)óme((?:me|te|se|nos|os)?)((?:lo|la|los|las)?)": {
        "base": "come",
        "lemma": "comer",
        "upos": "VERB"
    },
    r"(?:v|V)íste((?:me|te|se|nos|os)?)((?:lo|la|los|las)?)": {
        "base": "viste",
        "lemma": "vestir",
        "upos": "VERB"
    },
    r"(?:a|A)cués(?:ta|te)((?:me|te|se|nos|os)?)((?:lo|la|le|los|las|les)?)": {
        "base": "acuesta",
        "lemma": "acostar",
        "upos": "VERB"
    },
    # "os" is captured so this rule actually fires; without a capture group the
    # preprocessor found no clitics and never rewrote "acostaos".
    # NOTE(review): confirm "acosta" is the intended base for this form.
    r"(?:a|A)costa(os)": {
        "base": "acosta",
        "lemma": "acostar",
        "upos": "VERB"
    },
    r"(?:d|D)[íi]((?:me|te|se|nos|os)?)((?:lo|la|los|las)?)": {
        "base": "di",
        "lemma": "decir",
        "upos": "VERB"
    },
    r"(?:m|M)uéstra((?:me|te|se|nos|os)?)((?:lo|la|los|las)?)": {
        "base": "muestra",
        "lemma": "mostrar",
        "upos": "VERB"
    },
    # Add more verbs as needed
}

PREPROCESADOR DE TEXTO (reescribe imperativos problemáticos)

In [56]:
# Enclitic pronouns recognised after an imperative: person/reflexive forms
# first, then third-person object forms.
CLITICS = [
    "me", "te", "se", "nos", "os",
    "lo", "la", "le", "los", "las", "les",
]

In [57]:
def split_clitics(clitic_string: str, inventory=None):
    """Split a run of concatenated enclitic pronouns into individual clitics.

    Args:
        clitic_string: Clitics glued together as they were attached to the
            verb, e.g. ``"melo"`` or ``"selos"``.
        inventory: Optional iterable of clitic forms to recognise. When
            omitted, the module-level ``CLITICS`` list is used.

    Returns:
        The clitics in the order they appear, e.g. ``["me", "lo"]``. Scanning
        stops at the first position where no known clitic starts, so an
        unrecognised tail is ignored.
    """
    known = CLITICS if inventory is None else inventory
    # Longest form first: "los"/"las"/"les" must win over their prefixes
    # "lo"/"la"/"le", otherwise the plural "s" is silently dropped.
    candidates = sorted((cl for cl in known if cl), key=len, reverse=True)

    clitics = []
    remaining = clitic_string

    while remaining:
        found = next((cl for cl in candidates if remaining.startswith(cl)), None)
        if found is None:
            break
        clitics.append(found)
        remaining = remaining[len(found):]

    return clitics


In [71]:
def preprocess_imperatives(text: str):
    """Rewrite whitelisted imperative+clitic forms as separate words.

    Every pattern in ``IMPERATIVE_MAP`` is applied to ``text`` in turn. A match
    that carries at least one clitic is replaced by the rule's base form
    followed by the clitics, space-separated (e.g. ``"dilo"`` -> ``"di lo"``).
    Matches without clitics are left exactly as written.

    Args:
        text: Raw input text.

    Returns:
        A ``(text, imperative_meta)`` tuple: the rewritten text and one dict
        (``base``, ``lemma``, ``upos``) per rewritten occurrence. The entries
        carry no position information.
    """
    imperative_meta = []

    def make_replacer(pattern: str, rule: Dict):
        # Anchor on word boundaries so a pattern only fires on a whole word;
        # unanchored, "dinosaurio" or "dilatar" would be rewritten as
        # "di nos aurio" / "di la tar". The wrapper is non-capturing, so the
        # pattern's own group numbering is unchanged.
        regex = re.compile(r"\b(?:" + pattern + r")\b", flags=re.IGNORECASE)

        def replacer(match):
            captured = [g for g in match.groups() if g]
            clitic_string = "".join(captured)
            clitics = split_clitics(clitic_string)

            # No clitics: leave the original word untouched
            if not clitics:
                return match.group(0)

            # The replacement must account for every matched character. If the
            # splitter did not consume the whole clitic string, fall back to
            # the regex groups, which already delimit the clitics.
            if "".join(clitics) != clitic_string:
                clitics = [g.lower() for g in captured]

            replacement = " ".join([rule["base"]] + clitics)

            imperative_meta.append({
                "base": rule["base"],
                "lemma": rule["lemma"],
                "upos": rule["upos"]
            })

            # Keep the initial capital of the original word
            if match.group(0)[0].isupper():
                replacement = replacement.capitalize()

            return replacement

        return regex, replacer

    for pattern, rule in IMPERATIVE_MAP.items():
        regex, replacer = make_replacer(pattern, rule)
        text = regex.sub(replacer, text)

    return text, imperative_meta

TEXTO BRUTO → DOCUMENTO STANZA

In [59]:
def parse_text(text: str):
    """Run the imperative rewrite on ``text`` and parse the result with Stanza.

    Returns:
        A ``(doc, imperative_meta)`` tuple: the document produced by the
        module-level ``nlp`` pipeline for the rewritten text, and the metadata
        list returned by ``preprocess_imperatives``.
    """
    rewritten_text, whitelist_hits = preprocess_imperatives(text)
    return nlp(rewritten_text), whitelist_hits

DOCUMENTO STANZA → FORMATO CoNLL-U

In [60]:
def stanza_doc_to_conllu(doc, imperative_meta) -> str:
    """Serialise a parsed document as CoNLL-U text.

    Args:
        doc: Parsed document exposing ``sentences``; each sentence exposes
            ``text`` and ``words``, and each word the attributes read below.
        imperative_meta: Whitelist metadata (dicts with ``base``, ``lemma``,
            ``upos``). Any word whose lower-cased form equals a ``base`` gets
            that lemma and UPOS. The match is by surface form only, so every
            such word in the document is overridden, not just the rewritten one.

    Returns:
        One block per sentence (``# sent_id``, ``# text``, one 10-column line
        per word), each followed by an empty line, joined with ``"\\n"``.

    Note:
        Only ``sent.words`` is iterated, so no multi-word-token range lines are
        written. NOTE(review): confirm that is acceptable for the consumer.
    """
    # Built once instead of rescanning the list per word; later entries win,
    # exactly as a full scan without an early exit would.
    overrides = {
        meta["base"]: (meta["lemma"], meta["upos"])
        for meta in (imperative_meta or [])
    }

    lines = []

    for sent_id, sent in enumerate(doc.sentences, start=1):
        lines.append(f"# sent_id = {sent_id}")
        # A comment must stay on one physical line: a sentence spanning a line
        # break in the input would otherwise spill into an invalid token line.
        single_line_text = re.sub(r"[\r\n]+", " ", sent.text)
        lines.append(f"# text = {single_line_text}")

        for word in sent.words:
            # Manual lemma/UPOS correction for whitelisted forms
            lemma, upos = overrides.get(word.text.lower(), (word.lemma, word.upos))

            feats = word.feats if word.feats else "_"

            # Offsets are only written when the word carries them
            if word.start_char is not None and word.end_char is not None:
                misc = f"CharOffset={word.start_char}:{word.end_char}"
            else:
                misc = "_"

            # A missing head is written as "_" rather than the literal "None"
            head = "_" if word.head is None else str(word.head)

            lines.append("\t".join([
                str(word.id),        # ID
                word.text,           # FORM
                lemma or "_",        # LEMMA (possibly corrected)
                upos or "_",         # UPOS
                word.xpos or "_",    # XPOS
                feats,               # FEATS
                head,                # HEAD
                word.deprel or "_",  # DEPREL
                "_",                 # DEPS
                misc                 # MISC
            ]))

        lines.append("")

    return "\n".join(lines)

FUNCIÓN PRINCIPAL: TEXTO(S) → CoNLL-U

In [61]:
def texts_to_conllu(texts: List[str]) -> str:
    """Convert several raw texts into a single CoNLL-U string.

    Each text is parsed with ``parse_text`` and serialised with
    ``stanza_doc_to_conllu``; its block is introduced by a
    ``# newdoc id = doc_<i>`` comment, with ``i`` counting from 1.

    Args:
        texts: Raw texts, one document each.

    Returns:
        The concatenated CoNLL-U text. Documents are separated by one empty
        line and the result ends with an empty line, so the last sentence is
        terminated like every other one. Returns ``""`` for an empty list.
    """
    chunks = []

    for i, text in enumerate(texts, start=1):
        doc, imperative_meta = parse_text(text)
        # Trailing newlines are stripped so the blank-line layout is decided
        # here, in one place, whatever the serialiser appended.
        body = stanza_doc_to_conllu(doc, imperative_meta).rstrip("\n")

        chunk = f"# newdoc id = doc_{i}"
        if body:
            chunk += "\n" + body
        chunks.append(chunk)

    if not chunks:
        return ""

    # "\n\n" between documents gives the separating empty line; the final
    # "\n\n" terminates the last token line and adds the closing empty line.
    return "\n\n".join(chunks) + "\n\n"

EJEMPLO DE USO

In [72]:
# Two one-sentence documents exercising the ambiguous form "di": the first is
# left as written, the second is split by the whitelist into "di lo".
corpus = [
    'le di el libro',
    'dilo'
]


# Convert the whole corpus and show the resulting CoNLL-U text.
conllu_output = texts_to_conllu(corpus)
print(conllu_output)

# newdoc id = doc_1
# sent_id = 1
# text = le di el libro
1	le	él	PRON	pp3csd00	Case=Dat|Number=Sing|Person=3|PronType=Prs	2	obl:arg	_	CharOffset=0:2
2	di	dar	VERB	vmis1s0	Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin	0	root	_	CharOffset=3:5
3	el	el	DET	da0ms0	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	4	det	_	CharOffset=6:8
4	libro	libro	NOUN	ncms000	Gender=Masc|Number=Sing	2	obj	_	CharOffset=9:14

# newdoc id = doc_2
# sent_id = 1
# text = di lo
1	di	decir	VERB	vmip1s0	VerbForm=Fin	0	root	_	CharOffset=0:2
2	lo	él	PRON	_	Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs	1	obj	_	CharOffset=3:5



In [73]:
file_name = "entrada.conllu"

# CoNLL-U files use LF as the only line break. newline="\n" stops text mode
# from translating "\n" into "\r\n" when the notebook runs on Windows.
with open(file_name, "w", encoding="utf-8", newline="\n") as f:
    f.write(conllu_output)

print(f"Archivo guardado como {file_name}")

Archivo guardado como entrada.conllu
