In [11]:
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install spacy


Defaulting to user installation because normal site-packages is not writeable
Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 1.8/1.8 MB 32.3 MB/s eta 0:00:00
Installing collected packages: pip
Successfully installed pip-25.3
Defaulting to user installation because normal site-packages is not writeable


In [17]:
!{sys.executable} -m spacy download es_core_news_md


Defaulting to user installation because normal site-packages is not writeable
Collecting es-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.8.0/es_core_news_md-3.8.0-py3-none-any.whl (42.3 MB)
     ---------------------------------------- 0.0/42.3 MB ? eta -:--:--
      --------------------------------------- 0.8/42.3 MB 16.9 MB/s eta 0:00:03
     --------- ----------------------------- 10.5/42.3 MB 40.9 MB/s eta 0:00:01
     ------------------ -------------------- 20.2/42.3 MB 44.0 MB/s eta 0:00:01
     --------------------- ----------------- 23.3/42.3 MB 34.3 MB/s eta 0:00:01
     ------------------------ -------------- 26.7/42.3 MB 30.3 MB/s eta 0:00:01
     ---------------------------- ---------- 31.2/42.3 MB 28.3 MB/s eta 0:00:01
     --------------------------------- ----- 36.7/42.3 MB 27.8 MB/s eta 0:00:01
     --------------------------------------  42.2/42.3 MB 28.3 MB/s eta 0:00:01
     --------------------------

Ignorar las celdas de arriba si no se va a ejecutar en local; ejecutarlas solo en caso de que no se reconozcan los modelos de spaCy.

In [1]:
import os
import re
import json
from pathlib import Path
from typing import List, Dict, Tuple

import fitz  # PyMuPDF
import pandas as pd
from tqdm import tqdm

import spacy

### CONFIGURACIÓN 

In [2]:
# Folder with the PDFs to pre-process; change it to whichever folder you want
# to use (it must contain the PDFs you intend to process).
INPUT_DIR = Path("./documentos_prueba")
# Folder where every pipeline output is written.
OUTPUT_DIR = Path("./salida_con_reduccion_simple")
# True / False: whether named entities are replaced by placeholders.
ANONYMIZE = True
# spaCy model size (sm / md / lg). The original author recommends md (it gave
# better results, although that depends on how much pre-processing is applied).
SPACY_MODEL = "es_core_news_md"
# Entity labels to anonymize; more can be added. The Spanish es_core_news_*
# models label entities as PER / LOC / ORG / MISC, so "PER" is required for
# person names to be replaced. "PERSON" and "GPE" never match with those
# models and are kept only for pipelines that use OntoNotes-style labels.
ENTITIES = ["PER", "PERSON", "LOC", "GPE", "ORG"]

### Regex y patrones

In [3]:
# Compiled regexes used to clean and normalize the extracted text.
# The first three match page furniture; the remaining ones match legal
# citations that have a placeholder in REPLACEMENTS.
PATTERNS = {
    # Court name printed as a running header.
    'tc_header': re.compile(r'TRIBUNAL CONSTITUCIONAL( DEL PERÚ)?', re.I),
    # "Página 3" / "Pagina 3 de 10".
    'page_num': re.compile(r'P(á|a)gina\s*\d+\s*(de\s*\d+)?', re.I),
    # Lines that contain nothing but a number (re.M so ^/$ work per line).
    'line_only_numbers': re.compile(r'^\s*\d+\s*$', re.M),
    # Case-file numbers such as "EXP. N.° 01126-2011-PHC/TC". The case-type
    # code accepts 1 to 3 letters so PHC, PHD and Q match as well as the
    # two-letter codes, and both "°" and "º" are accepted after the "N.".
    'expediente': re.compile(r'Exp\.?\s*N\.?\s*[°º]?\s*\d{1,7}-\d{4}-[A-Z]{1,3}/TC', re.I),
    # Rulings cited as "STC N.° ..." or "Sentencia N.° ...".
    'stc': re.compile(r'(STC|Sentencia)\s*N\.?\s*[°º]?\s*\d{1,7}-\d{4}-[A-Z]{1,3}/TC', re.I),
    # Statutes cited as "Ley N.° 28237".
    'ley': re.compile(r'Ley\s*N\.?\s*[°º]?\s*\d{1,6}', re.I),
    # Articles cited as "Art. 2" / "Artículo 139".
    'articulo': re.compile(r'(Art\.?|Artículo)\s*\d+[A-Za-z]?\b', re.I),
    # Folio references such as "f. 12" / "fs. 12-15".
    'folio': re.compile(r'\b(f\.?|fs\.?)\s*\d+(-\d+)?\b', re.I),
}

# Placeholder written in place of each citation pattern. Keys must exist in
# PATTERNS; substitutions are applied in this insertion order.
REPLACEMENTS = {
    'expediente': 'EXP_REF',
    'stc': 'STC_REF',
    'ley': 'LEY_REF',
    'articulo': 'ARTICULO_REF',
    'folio': 'FOLIO_REF',
}

### Stopwords

In [4]:
# Connective phrases that are stripped from the text. Each phrase is written
# as plain words and turned into a regex that is anchored on word boundaries
# and tolerates any whitespace run between the words.
LEGAL_STOP_PHRASES = [
    r"\b" + r"\s+".join(phrase.split()) + r"\b"
    for phrase in (
        "por tanto",
        "en consecuencia",
        "de conformidad con",
        "en ese sentido",
        "así las cosas",
        "conforme a",
        "de acuerdo con",
        "se advierte que",
        "corresponde señalar",
        "resulta evidente que",
    )
]


### Funciones

In [5]:
def remove_legal_stop_phrases(text: str) -> str:
    """Strip every phrase listed in LEGAL_STOP_PHRASES from ``text``.

    Each pattern is applied case-insensitively, in list order, and replaced by
    a single space. Afterwards every run of two or more whitespace characters
    (newlines included) is collapsed into one space and the result is trimmed.
    """
    stripped = text
    for phrase_rx in LEGAL_STOP_PHRASES:
        stripped = re.compile(phrase_rx, re.I).sub(" ", stripped)

    collapsed = re.sub(r'\s{2,}', ' ', stripped)
    return collapsed.strip()

def reduce_tokens(text: str, nlp) -> str:
    """Reduce ``text`` to a space-separated string of informative lemmas.

    The text is run through ``nlp`` and a token is kept only when all of the
    following hold:

    * it is not a stopword, punctuation or number-like, and it is alphabetic;
    * its lower-cased, stripped lemma is longer than two characters;
    * that lemma is not one of the generic leftovers "ref" / "url";
    * its part of speech is NOUN, VERB or ADJ.

    Returns the surviving lemmas joined by single spaces, in document order.
    """
    informative_pos = {"NOUN", "VERB", "ADJ"}
    generic_leftovers = {"ref", "url"}

    kept = []
    for tok in nlp(text):
        # Standard stopwords and common noise are dropped first.
        if tok.is_stop or tok.is_punct or tok.like_num or not tok.is_alpha:
            continue

        normalized = tok.lemma_.lower().strip()
        is_informative = (
            len(normalized) > 2
            and normalized not in generic_leftovers
            and tok.pos_ in informative_pos
        )
        if is_informative:
            kept.append(normalized)

    return " ".join(kept)



def segment_text_spacy(
    text: str,
    nlp,
    max_tokens: int = 480,
    overlap: int = 50
) -> List[str]:
    """Split ``text`` into overlapping windows of spaCy tokens.

    Args:
        text: Text to segment.
        nlp: Callable that tokenizes ``text``; iterating its result must yield
            objects with a ``.text`` attribute.
        max_tokens: Maximum number of tokens per segment (must be positive).
        overlap: Number of tokens shared by consecutive segments (must be
            smaller than ``max_tokens``).

    Returns:
        The segments, each one the window's token texts joined by a single
        space. An empty token list yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``overlap`` is not
            smaller than ``max_tokens``; with such values the window would
            never advance.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")

    tokens = [t.text for t in nlp(text)]
    step = max_tokens - overlap

    segments = []
    for start in range(0, len(tokens), step):
        end = start + max_tokens
        segments.append(" ".join(tokens[start:end]))
        # Stop once a window reaches the end of the text; continuing would
        # emit a tail made only of tokens already in this segment.
        if end >= len(tokens):
            break

    return segments

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Return the plain text of every page of the PDF at ``pdf_path``.

    Pages are extracted with PyMuPDF's "text" mode and joined with a
    form-feed separator ("\\n\\f\\n"). The document is opened as a context
    manager so it is closed even when extracting a page raises.
    """
    with fitz.open(pdf_path) as doc:
        pages = [page.get_text("text") for page in doc]
    return "\n\f\n".join(pages)


def normalize_citations(text: str) -> str:
    """Remove page furniture and replace legal citations with placeholders.

    Steps, in order:
    1. Blank out the court header, page numbers and number-only lines.
    2. Replace each citation pattern listed in REPLACEMENTS by its placeholder.
    3. Replace URLs by ``URL_REF``, collapse three or more newlines into two,
       and collapse runs of spaces/tabs into one space.

    Returns the resulting text with leading and trailing whitespace removed.
    """
    for noise_key in ('tc_header', 'page_num', 'line_only_numbers'):
        text = PATTERNS[noise_key].sub(' ', text)

    for pattern_key, placeholder in REPLACEMENTS.items():
        text = PATTERNS[pattern_key].sub(placeholder, text)

    cleanup_steps = (
        (r'http[s]?://\S+', 'URL_REF'),
        (r'\n{3,}', '\n\n'),
        (r'[ \t]{2,}', ' '),
    )
    for regex, replacement in cleanup_steps:
        text = re.sub(regex, replacement, text)

    return text.strip()


def remove_structural_noise(text: str, min_length: int = 30) -> str:
    """Drop long all-uppercase lines from ``text``.

    Args:
        text: Text to filter; it is split with ``str.splitlines``.
        min_length: A line is dropped only when it is longer than this many
            characters and ``str.isupper`` is true for it. Defaults to 30.

    Returns:
        The remaining lines joined with "\\n".
    """
    return "\n".join(
        line
        for line in text.splitlines()
        if not (len(line) > min_length and line.isupper())
    )


class Anonymizer:
    """Replace selected named entities in a text with ``<LABEL>_REF`` tags."""

    def __init__(self, model_name, entities):
        """Load the spaCy pipeline ``model_name`` and store the labels to mask.

        ``entities`` is an iterable of entity labels; it is kept as a set.
        """
        self.nlp = spacy.load(model_name)
        self.entities = set(entities)

    def anonymize_text(self, text):
        """Mask every entity of ``text`` whose label is in ``self.entities``.

        Returns a ``(anonymized_text, replaced)`` tuple. ``replaced`` holds the
        original text of each masked entity, listed from the last entity in
        the document to the first.
        """
        parsed = self.nlp(text)
        masked = text
        removed = []

        # Walk the entities back-to-front so the character offsets of the
        # entities still to be processed remain valid after each substitution.
        for span in reversed(parsed.ents):
            if span.label_ not in self.entities:
                continue
            placeholder = f"{span.label_}_REF"
            masked = masked[:span.start_char] + placeholder + masked[span.end_char:]
            removed.append(span.text)

        return masked, removed


def save_text(path: Path, content: str):
    """Write ``content`` to ``path`` as UTF-8, creating missing parent folders."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)

### PIPELINE

In [None]:
def run_pipeline():
    """Pre-process every PDF found under INPUT_DIR and write results to OUTPUT_DIR.

    For each PDF (searched recursively) the pipeline extracts the text, drops
    long uppercase lines, normalizes citations, removes legal stop phrases and
    then writes:

    * ``reduced/<name>.txt``        - lemma-reduced text;
    * ``anonymized/<name>.txt``     - cleaned text, entity-masked when
      ANONYMIZE is true;
    * ``segments/<name>_seg_<i>.txt`` - windows of the reduced text;
    * ``annotations/<name>.json``   - entity texts that were masked.

    Outputs mirror the PDF's sub-folder relative to INPUT_DIR, so PDFs with
    the same file name in different sub-folders do not overwrite each other.
    A ``summary.csv`` with one row per PDF is written at the end.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    anonymizer = Anonymizer(SPACY_MODEL, ENTITIES) if ANONYMIZE else None
    # The anonymizer already loaded SPACY_MODEL; reuse that pipeline instead
    # of loading a second copy of the same model.
    nlp = anonymizer.nlp if anonymizer else spacy.load(SPACY_MODEL)

    # Sorted so the processing order and summary.csv row order are deterministic.
    pdfs = sorted(INPUT_DIR.rglob("*.pdf"))
    summary = []

    for pdf in tqdm(pdfs, desc="Procesando PDFs"):
        rel_pdf = pdf.relative_to(INPUT_DIR)
        # Sub-folder of the PDF inside INPUT_DIR ("." for top-level files).
        rel_dir = rel_pdf.parent

        raw = extract_text_from_pdf(pdf)

        clean = remove_structural_noise(raw)
        normalized = normalize_citations(clean)

        legal_clean = remove_legal_stop_phrases(normalized)

        reduced_text = reduce_tokens(
            legal_clean,
            nlp
        )

        save_text(
            OUTPUT_DIR / "reduced" / rel_dir / f"{pdf.stem}.txt",
            reduced_text
        )

        if anonymizer:
            anon_text, entities = anonymizer.anonymize_text(legal_clean)
        else:
            anon_text = legal_clean
            entities = []

        save_text(
            OUTPUT_DIR / "anonymized" / rel_dir / f"{pdf.stem}.txt",
            anon_text
        )

        # NOTE(review): segments are built from the reduced text, not from
        # the anonymized text - confirm this is intended.
        segments = segment_text_spacy(
            reduced_text,
            nlp,
            max_tokens=480,
            overlap=50
        )

        for i, seg in enumerate(segments):
            save_text(
                OUTPUT_DIR / "segments" / rel_dir / f"{pdf.stem}_seg_{i}.txt",
                seg
            )

        save_text(
            OUTPUT_DIR / "annotations" / rel_dir / f"{pdf.stem}.json",
            json.dumps(entities, ensure_ascii=False, indent=2)
        )

        summary.append({
            # Path relative to INPUT_DIR; equals the bare file name for
            # top-level PDFs and stays unique for nested ones.
            "file": rel_pdf.as_posix(),
            "segments": len(segments),
            "chars": len(anon_text),
            "entities": len(entities)
        })

    pd.DataFrame(summary).to_csv(
        OUTPUT_DIR / "summary.csv", index=False
    )

    print("Pipeline optimizado y listo para entrenamiento wazaaaaa")


### Ejecución del pipeline

In [7]:
run_pipeline()

Procesando PDFs: 100%|██████████| 330/330 [05:29<00:00,  1.00it/s]

Pipeline optimizado y listo para entrenamiento



