Limpieza de los art√≠culos (Eliminaci√≥n de datos innecesarios como imagenes y tablas) y transformaci√≥n de los pdf en un formato plano como .txt

In [9]:
import fitz
import re
import os
import glob

# --- Funci√≥n para limpiar el texto de un art√≠culo ---
def clean_scientific_text(text):
    """
    Limpia el texto extra√≠do de art√≠culos cient√≠ficos en ingl√©s.
    Elimina referencias, leyendas de figuras y tablas, ecuaciones y s√≠mbolos.
    """
    # Convertir a una sola l√≠nea y normalizar los espacios
    text = re.sub(r'\s+', ' ', text)

    # Eliminar los t√≠tulos de las figuras y tablas (por ejemplo, "Figura 1. Resultados de...", "La tabla 2 muestra...")
    text = re.sub(r'(Figure|Fig\.|Table)\s*\d+[.:]?.*?(?=\s[A-Z])', '', text, flags=re.IGNORECASE)

    # Remueve referencias como [1], [12, 13], (3), etc.
    text = re.sub(r'\[\d+(,\s*\d+)*\]', '', text)
    text = re.sub(r'\(\d+(,\s*\d+)*\)', '', text)

    # Eliminar los t√≠tulos de secci√≥n comunes que no son necesarios para el an√°lisis de texto
    text = re.split(r'(REFERENCES|References|Bibliography|Acknowledgments|ACKNOWLEDGEMENTS)', text, flags=re.IGNORECASE)[0]

    # Elimine las ecuaciones de estilo LaTeX o las f√≥rmulas matem√°ticas en l√≠nea.
    text = re.sub(r'\$.*?\$|\\\(.*?\\\)|\\\[.*?\\\]', '', text)

    # Eliminar caracteres especiales, s√≠mbolos y vi√±etas t√≠picos
    text = re.sub(r'[‚Ä¢‚óè‚ñ∂‚ñ∫‚ñ†‚ñ°‚ñ≥‚Üí‚Üê‚Üë‚Üì‚âà¬±‚â•‚â§‚â†√ó√∑‚àû¬µŒ©¬∞‚àë‚àö‚àÜ‚àÇ]', '', text)

    # Eliminar las menciones a figuras y tablas como "v√©ase la figura 1" o "en la tabla 3".
    text = re.sub(r'\b(see\s)?(Figure|Fig\.|Table)\s*\d+\b', '', text, flags=re.IGNORECASE)

    # Eliminar las referencias persistentes a "et al." y los DOI
    text = re.sub(r'et al\.', '', text, flags=re.IGNORECASE)
    text = re.sub(r'doi:\s*\S+', '', text, flags=re.IGNORECASE)

    # Eliminar espacios m√∫ltiples nuevamente
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [10]:
# --- Procesar m√∫ltiples art√≠culos PDF ---
def process_papers(input_folder, output_folder):
    """
    Procesa todos los archivos PDF en 'input_folder', limpia el texto,
    y guarda cada versi√≥n limpia como un archivo .txt en 'output_folder'.
    """
    os.makedirs(output_folder, exist_ok=True)
    pdf_files = glob.glob(os.path.join(input_folder, "*.pdf"))

    for pdf_path in pdf_files:
        file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_path = os.path.join(output_folder, f"{file_name}_clean.txt")

        print(f"üìÑ Processing: {file_name}...")

        text = ""
        try:
            with fitz.open(pdf_path) as doc:
                text = ""
                for page in doc:
                    text += page.get_text("text") + "\n"
        except Exception as e:
            print(f"‚ö†Ô∏è Error reading {file_name}: {e}")
            continue
        cleaned_text = clean_scientific_text(text)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

        print(f"Texto limpio y guardado en: {output_path}\n")

In [11]:
process_papers("articulos","articulos_limpios")

üìÑ Processing: 1903.10676...
Texto limpio y guardado en: articulos_limpios\1903.10676_clean.txt

üìÑ Processing: 1904.03323...
Texto limpio y guardado en: articulos_limpios\1904.03323_clean.txt

üìÑ Processing: 1_s20_S2472630325000433_main...
Texto limpio y guardado en: articulos_limpios\1_s20_S2472630325000433_main_clean.txt

üìÑ Processing: 1_s20_S2590041224000527_main...
Texto limpio y guardado en: articulos_limpios\1_s20_S2590041224000527_main_clean.txt

üìÑ Processing: 2303.18223...
Texto limpio y guardado en: articulos_limpios\2303.18223_clean.txt

üìÑ Processing: Advancing_equity_in_breast_cancer_care_Natural_language_processing_for_analysing_treatment_outcomes_in_underrepresented_...
Texto limpio y guardado en: articulos_limpios\Advancing_equity_in_breast_cancer_care_Natural_language_processing_for_analysing_treatment_outcomes_in_underrepresented__clean.txt

üìÑ Processing: articulo1...
Texto limpio y guardado en: articulos_limpios\articulo1_clean.txt

üìÑ Processing: 

Probar modelos el base y un ajustado por un autor en Hugging face

In [14]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

def run_ner(model_name, text):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    results = nlp(text)
    return results

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
text = ("Patients with diabetes often exhibit elevated levels of HbA1c and an increased risk of cardiovascular disease.")
    
model1 = "brad1141/bert-finetuned-ner"
print("=== Results with model1 ===")
print(run_ner(model1, text))
    
model2 = "dmis-lab/biobert-v1.1"
print("=== Results with model2 ===")
print(run_ner(model2, text))

=== Results with model1 ===


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at brad1141/bert-finetuned-ner were not used when initializing LongformerForTokenClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForTokenClassification fro

[{'entity_group': 'Evidence', 'score': 0.54050946, 'word': ' Patients with diabetes often exhibit elevated levels of HbA1c and an increased risk of', 'start': 0, 'end': 86}, {'entity_group': 'Lead', 'score': 0.42033133, 'word': ' cardiovascular', 'start': 87, 'end': 101}, {'entity_group': 'Evidence', 'score': 0.4821935, 'word': ' disease.', 'start': 102, 'end': 110}]
=== Results with model2 ===


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


[{'entity_group': 'LABEL_1', 'score': 0.5935918, 'word': 'Patients with diabetes often exhibit elevated levels of', 'start': 0, 'end': 55}, {'entity_group': 'LABEL_0', 'score': 0.5039037, 'word': 'H', 'start': 56, 'end': 57}, {'entity_group': 'LABEL_1', 'score': 0.5493174, 'word': '##bA1', 'start': 57, 'end': 60}, {'entity_group': 'LABEL_0', 'score': 0.51290685, 'word': '##c', 'start': 60, 'end': 61}, {'entity_group': 'LABEL_1', 'score': 0.56482446, 'word': 'and an', 'start': 62, 'end': 68}, {'entity_group': 'LABEL_0', 'score': 0.50528085, 'word': 'increased', 'start': 69, 'end': 78}, {'entity_group': 'LABEL_1', 'score': 0.5670954, 'word': 'risk of cardiovascular disease.', 'start': 79, 'end': 110}]


In [12]:
MODEL_NAME = "ghadeermobasher/BC4-Original-biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


NameError: name 'AutoTokenizer' is not defined

In [12]:
import os
import json
# --- Funci√≥n para dividir el texto en chunks (por tokens o palabras) ---
def chunk_text(text, max_tokens=512):
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield " ".join(words[i:i+max_tokens])

In [13]:
import numpy as np

def make_serializable(obj):
    """Convierte objetos numpy (float32, int64, etc.) en tipos nativos de Python."""
    if isinstance(obj, dict):
        return {k: make_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_serializable(i) for i in obj]
    elif isinstance(obj, (np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, (np.int32, np.int64)):
        return int(obj)
    else:
        return obj

In [14]:
def chunk_text(text: str, max_tokens: int = 400, overlap: int = 50, tokenizer=None):
    """
    Divide 'text' en chunks aptos para transformers.
    - Si pasas un tokenizer (AutoTokenizer), lo usar√° para contar tokens.
    - max_tokens: tokens m√°ximos por chunk (ajusta seg√∫n el modelo; 400 es conservador).
    - overlap: tokens que se solapan entre chunks para evitar cortar entidades.
    """
    if tokenizer is not None:
        # Tokenizer-based chunking (mejor precisi√≥n)
        tokens = tokenizer.tokenize(text)
        n = len(tokens)
        start = 0
        while start < n:
            end = min(start + max_tokens, n)
            chunk_tokens = tokens[start:end]
            # reconvertir tokens a texto de forma segura
            chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
            yield chunk_text
            if end == n:
                break
            start = end - overlap
    else:
        # Fallback simple: por palabras
        words = text.split()
        n = len(words)
        start = 0
        while start < n:
            end = min(start + max_tokens, n)
            yield " ".join(words[start:end])
            if end == n:
                break
            start = end - overlap

In [17]:
from tqdm import tqdm

def run_ner_folder(
    input_folder="articulos_limpios",
    output_folder="resultados_ner",
    model_name_or_pipeline=None,
    tokenizer=None,
    nlp_pipeline=None,
    max_tokens=400,
    overlap=50
):
    """
    Recorre todos los .txt en input_folder y corre NER en chunks.
    Guarda un JSON por archivo con las entidades encontradas.
    Par√°metros:
      - model_name_or_pipeline: si pasas el nombre del modelo, el caller deber√≠a haber creado 'nlp_pipeline' ya.
      - tokenizer: AutoTokenizer (opcional, usado para chunking).
      - nlp_pipeline: pipeline("ner", ...) ya creado (recomendado).
      - max_tokens, overlap: controlan tama√±o de chunks.
    """

    os.makedirs(output_folder, exist_ok=True)

    # Validaciones
    if nlp_pipeline is None:
        raise ValueError("Se requiere 'nlp_pipeline' (pipeline('ner', ...)). Pasa el pipeline al llamar la funci√≥n.")

    files = [f for f in os.listdir(input_folder) if f.lower().endswith(".txt")]
    print(f"üìÑ Encontrados {len(files)} archivos en '{input_folder}'.")

    for filename in tqdm(files):
        filepath = os.path.join(input_folder, filename)
        print(f"\nüîç Procesando: {filename}")

        with open(filepath, "r", encoding="utf-8", errors="ignore") as fh:
            text = fh.read().strip()

        if not text:
            print("  ‚ö†Ô∏è Archivo vac√≠o, se omite.")
            continue

        # Generar chunks
        chunk_iterator = chunk_text(text, max_tokens=max_tokens, overlap=overlap, tokenizer=tokenizer)
        all_entities = []

        for idx, chunk in enumerate(chunk_iterator, start=1):
            try:
                print(f"  ‚Üí Chunk {idx} ({len(chunk.split())} palabras)")
                entities = nlp_pipeline(chunk)  # pipeline devuelve lista de dicts
                # Si pipeline devuelve numpy types o objetos extra√±os, convertimos antes de guardar
                entities_serializable = make_serializable(entities)
                # Puedes agregar campo de chunk index si lo quieres
                for ent in entities_serializable:
                    ent["_chunk_index"] = idx
                all_entities.extend(entities_serializable)
            except Exception as e:
                print(f"    ‚ö†Ô∏è Error procesando chunk {idx}: {e}")

        # Serializar y guardar
        all_entities = make_serializable(all_entities)
        output_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}_ner.json")
        try:
            with open(output_path, "w", encoding="utf-8") as outfh:
                json.dump(all_entities, outfh, ensure_ascii=False, indent=2)
            print(f"‚úÖ Guardado: {output_path} (entidades: {len(all_entities)})")
        except Exception as e:
            print(f"‚ö†Ô∏è Error guardando {output_path}: {e}")

In [18]:
run_ner_folder(input_folder="articulos_limpios", output_folder="resultados_ner",
               nlp_pipeline=nlp, max_tokens=400, overlap=50)

üìÑ Encontrados 17 archivos en 'articulos_limpios'.


  0%|          | 0/17 [00:00<?, ?it/s]


üîç Procesando: 1903.10676_clean.txt
  ‚Üí Chunk 1 (400 palabras)


  6%|‚ñå         | 1/17 [00:00<00:09,  1.68it/s]

  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (103 palabras)
‚úÖ Guardado: resultados_ner\1903.10676_clean_ner.json (entidades: 0)

üîç Procesando: 1904.03323_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 12%|‚ñà‚ñè        | 2/17 [00:00<00:05,  2.74it/s]

  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (158 palabras)
‚úÖ Guardado: resultados_ner\1904.03323_clean_ner.json (entidades: 5)

üîç Procesando: 1_s20_S2472630325000433_main_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)
  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (400 palabras)
  ‚Üí Chunk 14 (400 palabras)


 18%|‚ñà‚ñä        | 3/17 [00:01<00:05,  2.64it/s]

  ‚Üí Chunk 15 (400 palabras)
  ‚Üí Chunk 16 (400 palabras)
‚úÖ Guardado: resultados_ner\1_s20_S2472630325000433_main_clean_ner.json (entidades: 117)

üîç Procesando: 1_s20_S2590041224000527_main_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)


 24%|‚ñà‚ñà‚ñé       | 4/17 [00:01<00:04,  2.77it/s]

  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)
  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (191 palabras)
‚úÖ Guardado: resultados_ner\1_s20_S2590041224000527_main_clean_ner.json (entidades: 182)

üîç Procesando: 2303.18223_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (345 palabras)


 29%|‚ñà‚ñà‚ñâ       | 5/17 [00:01<00:03,  3.03it/s]

‚úÖ Guardado: resultados_ner\2303.18223_clean_ner.json (entidades: 2)

üîç Procesando: Advancing_equity_in_breast_cancer_care_Natural_language_processing_for_analysing_treatment_outcomes_in_underrepresented__clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)


 35%|‚ñà‚ñà‚ñà‚ñå      | 6/17 [00:02<00:03,  2.88it/s]

  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)
  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (400 palabras)
  ‚Üí Chunk 14 (400 palabras)
  ‚Üí Chunk 15 (400 palabras)
  ‚Üí Chunk 16 (108 palabras)
‚úÖ Guardado: resultados_ner\Advancing_equity_in_breast_cancer_care_Natural_language_processing_for_analysing_treatment_outcomes_in_underrepresented__clean_ner.json (entidades: 54)

üîç Procesando: Automated_derivation_of_diagnostic_criteria_for_lung_cancer_using_natural_language_processing_on_electronic_health_recor_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)


 41%|‚ñà‚ñà‚ñà‚ñà      | 7/17 [00:02<00:03,  2.87it/s]

  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (400 palabras)
  ‚Üí Chunk 14 (297 palabras)
‚úÖ Guardado: resultados_ner\Automated_derivation_of_diagnostic_criteria_for_lung_cancer_using_natural_language_processing_on_electronic_health_recor_clean_ner.json (entidades: 149)

üîç Procesando: baz068_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)
  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (400 palabras)
  ‚Üí Chunk 14 (400 palabras)
  ‚Üí Chunk 15 (400 palabras)


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 8/17 [00:02<00:03,  2.64it/s]

  ‚Üí Chunk 16 (400 palabras)
  ‚Üí Chunk 17 (400 palabras)
  ‚Üí Chunk 18 (226 palabras)
‚úÖ Guardado: resultados_ner\baz068_clean_ner.json (entidades: 14)

üîç Procesando: bbaa074_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)
  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (400 palabras)
  ‚Üí Chunk 14 (400 palabras)
  ‚Üí Chunk 15 (400 palabras)
  ‚Üí Chunk 16 (400 palabras)
  ‚Üí Chunk 17 (400 palabras)
  ‚Üí Chunk 18 (400 palabras)
  ‚Üí Chunk 19 (400 palabras)
  ‚Üí Chunk 20 (400 palabras)
  ‚Üí Chunk 21 (400 palabras)
  ‚Üí Chunk 22 (400 palabras)
  ‚Üí Chunk 23 (400 palabras)
  ‚Üí Chunk 24 (400 palabras)


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 9/17 [00:03<00:03,  2.03it/s]

  ‚Üí Chunk 25 (400 palabras)
  ‚Üí Chunk 26 (122 palabras)
‚úÖ Guardado: resultados_ner\bbaa074_clean_ner.json (entidades: 12)

üîç Procesando: NSSC_a_neurosymbolic_AI_system_for_enhancing_accuracy_of_named_entity_recognition_and_linking_from_oncologic_clinical_no_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)
  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (400 palabras)
  ‚Üí Chunk 14 (400 palabras)
  ‚Üí Chunk 15 (400 palabras)
  ‚Üí Chunk 16 (400 palabras)
  ‚Üí Chunk 17 (400 palabras)
  ‚Üí Chunk 18 (400 palabras)
  ‚Üí Chunk 19 (400 palabras)
  ‚Üí Chunk 20 (400 palabras)
  ‚Üí Chunk 21 (400 palabras)
  ‚Üí Chunk 22 (400 palabras)
  ‚Üí Chunk 23 (400 palabras)
  ‚Üí Chunk 24 (400 palabras)
  ‚Üí Chunk 

 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 10/17 [00:04<00:04,  1.69it/s]

  ‚Üí Chunk 31 (400 palabras)
  ‚Üí Chunk 32 (400 palabras)
  ‚Üí Chunk 33 (400 palabras)
  ‚Üí Chunk 34 (57 palabras)
‚úÖ Guardado: resultados_ner\NSSC_a_neurosymbolic_AI_system_for_enhancing_accuracy_of_named_entity_recognition_and_linking_from_oncologic_clinical_no_clean_ner.json (entidades: 57)

üîç Procesando: OP-CBIO190693 1234..1240_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)
  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (400 palabras)
  ‚Üí Chunk 14 (242 palabras)


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 11/17 [00:04<00:03,  1.93it/s]

‚úÖ Guardado: resultados_ner\OP-CBIO190693 1234..1240_clean_ner.json (entidades: 23)

üîç Procesando: s10916_025_02167_2_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 12/17 [00:05<00:02,  2.17it/s]

  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)
  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (391 palabras)
‚úÖ Guardado: resultados_ner\s10916_025_02167_2_clean_ner.json (entidades: 70)

üîç Procesando: s12859-019-2725-5_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 13/17 [00:05<00:01,  2.49it/s]

  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (75 palabras)
‚úÖ Guardado: resultados_ner\s12859-019-2725-5_clean_ner.json (entidades: 3)

üîç Procesando: s41746-025-01533-1_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 14/17 [00:05<00:01,  2.57it/s]

  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (400 palabras)
  ‚Üí Chunk 14 (400 palabras)
  ‚Üí Chunk 15 (390 palabras)
‚úÖ Guardado: resultados_ner\s41746-025-01533-1_clean_ner.json (entidades: 2)

üîç Procesando: s41746_025_01533_1_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)
  ‚Üí Chunk 10 (400 palabras)
  ‚Üí Chunk 11 (400 palabras)
  ‚Üí Chunk 12 (400 palabras)
  ‚Üí Chunk 13 (400 palabras)
  ‚Üí Chunk 14 (400 palabras)


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 15/17 [00:06<00:00,  2.59it/s]

  ‚Üí Chunk 15 (390 palabras)
‚úÖ Guardado: resultados_ner\s41746_025_01533_1_clean_ner.json (entidades: 2)

üîç Procesando: SHTI_316_SHTI240794_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (245 palabras)
‚úÖ Guardado: resultados_ner\SHTI_316_SHTI240794_clean_ner.json (entidades: 29)


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 16/17 [00:06<00:00,  3.22it/s]


üîç Procesando: Transfer Learning in Biomedical Natural Language Processing_ An Evaluation of BERT and ELMo on Ten Benchmarking Datasets_clean.txt
  ‚Üí Chunk 1 (400 palabras)
  ‚Üí Chunk 2 (400 palabras)
  ‚Üí Chunk 3 (400 palabras)
  ‚Üí Chunk 4 (400 palabras)
  ‚Üí Chunk 5 (400 palabras)
  ‚Üí Chunk 6 (400 palabras)
  ‚Üí Chunk 7 (400 palabras)
  ‚Üí Chunk 8 (400 palabras)
  ‚Üí Chunk 9 (400 palabras)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17/17 [00:06<00:00,  2.57it/s]

  ‚Üí Chunk 10 (67 palabras)
‚úÖ Guardado: resultados_ner\Transfer Learning in Biomedical Natural Language Processing_ An Evaluation of BERT and ELMo on Ten Benchmarking Datasets_clean_ner.json (entidades: 2)





In [None]:
import os
import re
import requests
from bs4 import BeautifulSoup

# --- CONFIGURACI√ìN ---
TOPIC = "machine learning veterinary medicine"
OUTPUT_FOLDER = "articulos_limpios"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# --- FUNCIONES AUXILIARES ---
def limpiar_texto(text):
    """Limpia texto eliminando referencias, figuras, tablas y agradecimientos."""
    text = re.sub(r'\b(references?|bibliography|figures?|tables?|acknowledg(e)?ments?)\b.*', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'\s+', ' ', text)  # compactar espacios
    return text.strip()

# --- PUBMED ---
def buscar_pubmed(query, max_results=5):
    print(f"üîç Buscando en PubMed: {query}")
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {"db": "pubmed", "term": query, "retmax": max_results, "retmode": "json"}
    ids = requests.get(url, params=params).json()["esearchresult"]["idlist"]
    for pmid in ids:
        fetch_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        params = {"db": "pubmed", "id": pmid, "retmode": "xml"}
        xml = requests.get(fetch_url, params=params).text
        soup = BeautifulSoup(xml, "lxml-xml")
        title = soup.find("ArticleTitle").text if soup.find("ArticleTitle") else "No Title"
        abstract = soup.find("AbstractText").text if soup.find("AbstractText") else ""
        authors = ", ".join([a.find("LastName").text for a in soup.find_all("Author") if a.find("LastName")])

        texto = f"{title}\nAutores: {authors}\n\n{abstract}"
        texto_limpio = limpiar_texto(texto)

        with open(os.path.join(OUTPUT_FOLDER, f"PubMed_{pmid}.txt"), "w", encoding="utf-8") as f:
            f.write(texto_limpio)

    print("‚úÖ Art√≠culos guardados desde PubMed")


# --- ARXIV ---
def buscar_arxiv(query, max_results=5):
    print(f"üîç Buscando en arXiv: {query}")
    url = "http://export.arxiv.org/api/query"
    params = {"search_query": query.replace(" ", "+"), "start": 0, "max_results": max_results}
    response = requests.get(url, params=params)
    soup = BeautifulSoup(response.text, "xml")
    entries = soup.find_all("entry")
    for i, entry in enumerate(entries, start=1):
        title = entry.find("title").text.strip()
        abstract = entry.find("summary").text.strip()
        authors = ", ".join([a.text for a in entry.find_all("name")])
        texto = f"{title}\nAutores: {authors}\n\n{abstract}"
        texto_limpio = limpiar_texto(texto)
        with open(os.path.join(OUTPUT_FOLDER, f"arXiv_{i}.txt"), "w", encoding="utf-8") as f:
            f.write(texto_limpio)
    print("‚úÖ Art√≠culos guardados desde arXiv")

# --- EJECUCI√ìN ---
buscar_pubmed(TOPIC, max_results=10)
buscar_arxiv(TOPIC, max_results=10)


üîç Buscando en PubMed: machine learning veterinary medicine


FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml-xml. Do you need to install a parser library?