PASAR TEXTO A FORMATO CoNLL-U

In [1]:
%pip install -r requirements.txt

Collecting es_core_news_sm@ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl#sha256=e451a83d6df79b87e9eed0cb553f03e99e36a3bab18a7b79f0dcfd1fdf875e12 (from -r requirements.txt (line 35))
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
     ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
     ------------- -------------------------- 4.2/12.9 MB 31.5 MB/s eta 0:00:01
     ------------------------------- ------- 10.5/12.9 MB 29.7 MB/s eta 0:00:01
     ---------------------------------------- 12.9/12.9 MB 26.9 MB/s  0:00:00
Collecting es_dep_news_trf@ https://github.com/explosion/spacy-models/releases/download/es_dep_news_trf-3.8.0/es_dep_news_trf-3.8.0-py3-none-any.whl#sha256=afe76019ca75827db18f14f282f4678b1544277b29dcba8011b400837b1

In [12]:
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


IMPORTS

In [4]:
import re
import stanza
from typing import List

LISTA BLANCA DE IMPERATIVOS CON CLÍTICOS

In [5]:
# Whitelist of accented imperative forms that carry enclitic pronouns.
# Each key is a regex for the fused written form; each value is the bare
# imperative that replaces the verb stem.
#
# Every pattern exposes two capture groups -- first the me/te/se/nos/os
# clitic, then the lo/la/los/las (le/les) clitic -- so that both can be
# re-emitted as separate words (e.g. "cómetelo" -> "come" + "te" + "lo").
# In the second group the longer alternatives come first, so "los"/"las"
# are not cut short to "lo"/"la" when a pattern is used in an unanchored search.
IMPERATIVE_MAP = {
    r"cóme((?:me|te|se|nos|os)?)((?:los|las|lo|la)?)": "come",
    r"víste((?:me|te|se|nos|os)?)((?:los|las|lo|la)?)": "viste",
    # The written accent sits on the "e" of the diphthong ("acuésta..."),
    # matching forms such as "Acuéstanos".
    r"acués(?:ta|te)((?:me|te|se|nos|os)?)((?:los|las|les|lo|la|le)?)": "acuesta",
    r"dí((?:me|te|se|nos|os)?)((?:los|las|lo|la)?)": "di",
    # Same accent placement here ("muéstra..."), matching "Muéstrame".
    r"muéstra((?:me|te|se|nos|os)?)((?:los|las|lo|la)?)": "muestra",
    # add more imperatives as needed
}

PREPROCESADOR DE TEXTO (reescribe imperativos problemáticos)

In [6]:
def preprocess_imperatives(text: str, imperative_map: dict) -> str:
    """
    Rewrite whitelisted imperative+clitic forms as "<base> <clitic> ...".

    Args:
        text: Raw input text.
        imperative_map: Mapping from a regex describing a fused imperative
            form to the bare imperative that replaces it. The non-empty
            capture groups of the regex are treated as clitics and are
            re-emitted, in order, as separate space-delimited words after
            the base.

    Returns:
        The text with every whole-word match rewritten; everything else is
        left untouched. An empty map returns ``text`` unchanged.
    """
    if not imperative_map:
        return text

    # Compile each pattern once per call rather than once per match.
    compiled = [
        (re.compile(pattern, flags=re.IGNORECASE), base)
        for pattern, base in imperative_map.items()
    ]

    def replace(match: re.Match) -> str:
        word = match.group(0)
        for regex, base in compiled:
            m = regex.fullmatch(word)
            if m:
                clitics = [g for g in m.groups() if g]  # only the captured clitics
                replacement = " ".join([base, *clitics])
                # Keep an initial capital (slice avoids indexing an empty match).
                if word[:1].isupper():
                    replacement = replacement.capitalize()
                return replacement
        return word

    # Each alternative is wrapped in its own group so a "|" inside one
    # pattern cannot leak into its neighbours. The lookarounds restrict
    # matches to whole words: without them "dí" would fire inside "días"
    # and "cómelos" could be matched only up to "cómelo".
    alternatives = "|".join(f"(?:{pattern})" for pattern in imperative_map)
    combined_pattern = re.compile(
        rf"(?<!\w)(?:{alternatives})(?!\w)", flags=re.IGNORECASE
    )
    return combined_pattern.sub(replace, text)

INICIALIZAR STANZA (ejecutarlo solo una vez)

In [7]:
# Download the Spanish ("es") Stanza models before building the pipeline.
stanza.download("es")

# Pipeline settings: processor chain, raw (not pre-tokenised) input, CPU only.
_PIPELINE_OPTIONS = {
    "lang": "es",
    "processors": "tokenize,mwt,pos,lemma,depparse",
    "tokenize_pretokenized": False,
    "use_gpu": False,
}

# Shared pipeline instance used by the parsing helpers below.
nlp = stanza.Pipeline(**_PIPELINE_OPTIONS)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 3.04MB/s]                    
2025-12-30 15:16:35 INFO: Downloaded file to C:\Users\ivire\stanza_resources\resources.json
2025-12-30 15:16:35 INFO: Downloading default packages for language: es (Spanish) ...
2025-12-30 15:16:39 INFO: File exists: C:\Users\ivire\stanza_resources\es\default.zip
2025-12-30 15:16:47 INFO: Finished downloading models and saved to C:\Users\ivire\stanza_resources
2025-12-30 15:16:47 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 2.69MB/s]                    
2025-12-30 15:16:48 INFO: Downloaded file to C:\Users\ivire\stanza_resources\resources.json
2025-12-30 15:16:49 INFO: Loading the

TEXTO BRUTO → DOCUMENTO STANZA

In [8]:
def parse_text(text: str):
    """
    Parse raw text with the shared pipeline.

    The text is first passed through ``preprocess_imperatives`` using
    ``IMPERATIVE_MAP``; the rewritten text is then handed to ``nlp`` and
    the pipeline's result is returned as is.
    """
    return nlp(preprocess_imperatives(text, IMPERATIVE_MAP))

DOCUMENTO STANZA → FORMATO CoNLL-U

In [9]:
def stanza_doc_to_conllu(doc) -> str:
    """
    Serialize a parsed document as CoNLL-U text.

    Each sentence is written as a ``# sent_id`` comment (numbered from 1
    within this document), a ``# text`` comment, one tab-separated
    ten-column line per entry of ``sent.words``, and a closing blank line.

    Args:
        doc: Object exposing ``sentences``; each sentence exposes ``text``
            and ``words``; each word exposes ``id``, ``text``, ``lemma``,
            ``upos``, ``xpos``, ``feats``, ``head``, ``deprel``,
            ``start_char`` and ``end_char``.

    Returns:
        The CoNLL-U string, or ``""`` when the document has no sentences.
    """
    lines = []
    for sent_id, sent in enumerate(doc.sentences, start=1):
        lines.append(f"# sent_id = {sent_id}")
        lines.append(f"# text = {sent.text}")
        for word in sent.words:
            # Character offsets go in MISC only when the word carries them.
            if word.start_char is not None:
                misc = f"CharOffset={word.start_char}:{word.end_char}"
            else:
                misc = "_"
            columns = [
                str(word.id),
                word.text,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                # A head of 0 (root) is kept; only a missing head becomes
                # "_" instead of the literal string "None".
                "_" if word.head is None else str(word.head),
                word.deprel or "_",
                "_",  # DEPS column is always left unspecified
                misc,
            ]
            lines.append("\t".join(columns))
        # Blank line terminating the sentence.
        lines.append("")
    return "\n".join(lines)

FUNCIÓN PRINCIPAL: TEXTO(S) → CoNLL-U

In [10]:
def texts_to_conllu(texts: List[str]) -> str:
    """
    Convert several raw texts into one CoNLL-U string.

    Each text is parsed with ``parse_text`` and serialized with
    ``stanza_doc_to_conllu``. Its output is preceded by a
    ``# newdoc id = doc_<n>`` comment, with ``n`` counting from 1 in input
    order. All pieces are joined with newlines; an empty input yields ``""``.
    """
    chunks = []
    for doc_number, raw_text in enumerate(texts, start=1):
        serialized = stanza_doc_to_conllu(parse_text(raw_text))
        chunks += [f"# newdoc id = doc_{doc_number}", serialized]
    return "\n".join(chunks)

EJEMPLO DE USO

In [11]:
# Sample sentences containing imperative forms with attached clitic
# pronouns, used to exercise the preprocessing + parsing chain end to end.
corpus = [
    "Juan cómete la manzana.",
    "Vístete rápido, que ya nos vamos.",
    "Acuéstanos temprano, por favor.",
    "Acostaos ya, chicos.",
    "Dímelo ahora mismo.",
    "Muéstrame tu chaqueta nueva.",
    "Mi madre me dijo: cómetelo.",
    "No olvides cómelos todos."
]


# Convert the whole corpus (one document per sentence string) and print
# the resulting CoNLL-U text.
conllu_output = texts_to_conllu(corpus)
print(conllu_output)

# newdoc id = doc_1
# sent_id = 1
# text = Juan come te la manzana.
1	Juan	Juan	PROPN	np00000	_	2	nsubj	_	CharOffset=0:4
2	come	comir	VERB	vmip3s0	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	CharOffset=5:9
3	te	tú	PRON	_	Case=Dat|Number=Sing|Person=2|PrepCase=Npr|PronType=Prs	2	obl:arg	_	CharOffset=10:12
4	la	el	DET	da0	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	5	det	_	CharOffset=13:15
5	manzana	manzana	NOUN	ncfs000	Gender=Fem|Number=Sing	2	obj	_	CharOffset=16:23
6	.	.	PUNCT	fp	PunctType=Peri	2	punct	_	CharOffset=23:24

# newdoc id = doc_2
# sent_id = 1
# text = Viste te rápido, que ya nos vamos.
1	Viste	ver	VERB	vmip3s0	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	CharOffset=0:5
2	te	tú	PRON	_	Case=Dat|Number=Sing|Person=2|PrepCase=Npr|PronType=Prs	1	obl:arg	_	CharOffset=6:8
3	rápido	rápido	ADJ	aq0ms0	Gender=Masc|Number=Sing	1	advmod	_	CharOffset=9:15
4	,	,	PUNCT	fc	PunctType=Comm	8	punct	_	CharOffset=15:16
5	que	que	PRON	pr0cn000	PronType=R