# Pruebas de los módulos NLP

Ejecuta cada componente de `src/nlp` contra los artículos ubicados en `data_example/` e incorpora las etiquetas y clasificaciones generadas directamente en los archivos `.json` originales, de modo que el vectorizador las tenga disponibles antes de la integración con el resto del sistema.


In [None]:
from pathlib import Path
import json
from pprint import pprint

from preprocessing import detect_language

# Location of the article corpus.
# NOTE(review): this is an absolute, machine-specific path; on any other
# machine the glob below simply matches nothing.
PROJECT_ROOT = Path("/home/lia/Escritorio/Proyectos/NLP/Report_Generator/Data/Data_articles")
DATA_DIR = PROJECT_ROOT.joinpath("Data_articles2")

# Every file named article_*.json directly inside DATA_DIR, sorted as paths
# (string order, so article_10.json sorts before article_2.json).
article_paths = list(DATA_DIR.glob("article_*.json"))
article_paths.sort()


def load_article(path: Path) -> dict:
    """Read the UTF-8 encoded JSON file at *path* and return the decoded object.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever the JSON document decodes to (a dict for the article files
        this notebook works with).
    """
    with open(path, mode="r", encoding="utf-8") as source:
        raw = source.read()
    return json.loads(raw)


def persist_article(path: Path, data: dict) -> None:
    """Write *data* to *path* as pretty-printed UTF-8 JSON without risking the original.

    The document is serialized in memory first and then written to a sibling
    temporary file that replaces *path* only once it is complete. A value that
    cannot be serialized, or an interruption mid-write, therefore leaves the
    existing file untouched instead of truncated.

    Args:
        path: Destination file; overwritten on success.
        data: JSON-serializable content to store.

    Raises:
        TypeError: If *data* contains a value ``json`` cannot serialize.
        OSError: If the temporary file cannot be written or moved into place.
    """
    target = Path(path)
    # Serialize before touching the filesystem so encoding errors cannot
    # leave a half-written file behind.
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    # The ".tmp" suffix keeps the scratch file out of "article_*.json" globs.
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(target)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up when the write or the replace failed.
        if tmp_path.exists():
            tmp_path.unlink()


def update_article(path: Path, updates: dict) -> dict:
    """Merge *updates* into the article stored at *path* and save it back.

    Top-level keys present in *updates* overwrite the stored ones; all other
    stored keys are kept.

    Args:
        path: Article JSON file to modify in place.
        updates: Top-level entries to add or overwrite.

    Returns:
        The merged article content that was written to disk.
    """
    merged = load_article(path)
    merged.update(updates)
    persist_article(path, merged)
    return merged

# Report how many articles were found, then sanity-check language detection
# on the first one. The lookup is guarded so that an empty or missing data
# directory produces a readable message instead of an IndexError.
print(f"Artículos detectados: {len(article_paths)}")
if article_paths:
    sample = load_article(article_paths[0])
    print("Idioma detectado para el primero:", detect_language(sample.get("text", "")))
else:
    sample = None
    print(f"No se encontraron artículos en {DATA_DIR}")


Artículos detectados: 4544
Idioma detectado para el primero: es


In [4]:
from preprocessing import TextPreprocessor

# Run the preprocessor over every article that has non-empty text and write
# the selected fields back into the article file under "preprocessing".
preprocessor = TextPreprocessor(use_spacy=False, remove_stopwords=True)
preprocessing_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:
        processed = preprocessor.preprocess_full(text)
        # Only these four fields of the preprocessor output are persisted.
        stored = {
            field: processed[field]
            for field in ("cleaned", "tokens", "token_count", "sentence_count")
        }
        updates = {"preprocessing": stored}
        update_article(path, updates)
        # Keep just the two counters in memory for a quick inspection.
        preprocessing_results[path.name] = {
            field: stored[field] for field in ("token_count", "sentence_count")
        }

# Show the in-memory summary for one article (None if it was skipped).
pprint(preprocessing_results.get("article_1.json"))


{'sentence_count': 11, 'token_count': 232}


In [5]:
# Fetch the NLTK resource 'averaged_perceptron_tagger_eng' before the analyzer
# is created (presumably POSAnalyzer depends on it — TODO confirm).
# NOTE(review): the resource name ends in "_eng", while the first article is
# detected as 'es' and the recorded top tags for article_1.json are dominated
# by FW; verify that an English tagger is what this corpus needs.
import nltk
nltk.download('averaged_perceptron_tagger_eng')
from pos_analyzer import POSAnalyzer

# Analyze every article that has non-empty text with the POS analyzer and
# store the frequency tables plus the top-5 patterns under "pos_analysis".
pos_analyzer = POSAnalyzer()
pos_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:
        pos_info = pos_analyzer.analyze(text)
        patterns = pos_analyzer.get_top_patterns(pos_info, n=5)
        # Copy the three frequency tables, then append the pattern summary so
        # the stored key order is tag/bigram/trigram/top_patterns.
        pos_payload = {
            key: pos_info[key]
            for key in ("tag_freq", "bigram_freq", "trigram_freq")
        }
        pos_payload["top_patterns"] = patterns
        updates = {"pos_analysis": pos_payload}
        update_article(path, updates)
        pos_results[path.name] = patterns

# Show the top patterns for one article (None if it was skipped).
pprint(pos_results.get("article_1.json"))


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/lia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


{'top_bigrams': {'FW-FW': 64,
                 'FW-NNP': 14,
                 'JJ-NN': 15,
                 'NN-NN': 36,
                 'NNS-VBP': 16},
 'top_tags': {'FW': 102, 'JJ': 35, 'NN': 72, 'NNP': 35, 'NNS': 32},
 'top_trigrams': {'FW-FW-FW': 41,
                  'FW-FW-NNP': 11,
                  'JJ-NN-NN': 11,
                  'NN-FW-FW': 9,
                  'NN-NN-NN': 18}}


In [6]:
from grammar_analyzer import GrammarAnalyzer

# Run the grammar analyzer on every article that has non-empty text and store
# its full result in the article file under "grammar_analysis".
grammar_analyzer = GrammarAnalyzer()
grammar_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:
        grammar_info = grammar_analyzer.analyze(text)
        update_article(path, {"grammar_analysis": grammar_info})
        # In memory, keep only the chunk counts and the first five rules.
        chunk_counts = grammar_info["chunk_counts"]
        leading_rules = grammar_info["top_rules"][:5]
        grammar_results[path.name] = dict(
            chunk_counts=chunk_counts,
            top_rules=leading_rules,
        )

# Show the condensed result for one article (None if it was skipped).
pprint(grammar_results.get("article_1.json"))


{'chunk_counts': {'NP': 90, 'PP': 3, 'VP': 27},
 'top_rules': [{'frequency': 3,
                'rule': "NP -> ('La', 'NNP')",
                'weight': 0.0229},
               {'frequency': 3,
                'rule': "NP -> ('Gaza', 'NNP')",
                'weight': 0.0229},
               {'frequency': 3,
                'rule': "PP -> ('de', 'IN') NP",
                'weight': 0.0229},
               {'frequency': 2,
                'rule': "NP -> ('personas', 'NNS')",
                'weight': 0.0153},
               {'frequency': 1,
                'rule': "S -> NP VP ('la', 'FW') NP ('de', 'FW') NP NP VP "
                        "('16', 'CD') NP (',', ',') ('tras', 'EX') NP ('de', "
                        "'IN') ('bombardeos', 'FW') (',', ',') NP VP VP ('de', "
                        "'IN') ('la', 'FW') ('ocupación', 'FW') NP VP ('38', "
                        "'CD') NP ('en', 'FW') ('distintos', 'FW') ('puntos', "
                        "'FW') ('de', 'FW') ('la', 'FW') ('

In [8]:
from regex_annotator import RegexAnnotator

# Annotate every article that has non-empty text and store the annotator's
# full result in the article file under "regex_annotations".
regex_annotator = RegexAnnotator()
regex_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:
        annotations = regex_annotator.annotate(text)
        update_article(path, {"regex_annotations": annotations})
        # Only the category list is kept in memory.
        regex_results[path.name] = annotations["categories"]

# Print the categories of three sample articles (None for any that was skipped).
for sample_name in ("article_1.json", "article_11.json", "article_24.json"):
    pprint(regex_results.get(sample_name))


['JURIDICO_LEGISLATIVO_CORRUPCION',
 'FECHAS_ESPECIFICAS',
 'PAISES',
 'RELACIONES_INTERNACIONALES',
 'PERIODOS_TEMPORALES',
 'CANTIDADES_NUMERICAS',
 'VIOLENCIA_CRIMEN_DERECHOS_HUMANOS',
 'LOCALIZACIONES_ESPECIFICAS',
 'INFRAESTRUCTURA_SERVICIOS',
 'CONFLICTOS_ARMADOS_OPERACIONES',
 'REGIONES_PROVINCIAS',
 'CRISIS_HUMANITARIA_SERVICIOS',
 'ECONOMIA_COMERCIO_FINANZAS',
 'GRUPOS_ESPECIFICOS']
['PAISES',
 'RELACIONES_INTERNACIONALES',
 'ANTIIMPERIALISMO_SOBERANIA',
 'JUSTICIA_SOCIAL',
 'GOBIERNOS_OFICIALES',
 'ECONOMIA_COMERCIO_FINANZAS',
 'ELECCIONES_PROCESOS_POLITICOS']
['FECHAS_ESPECIFICAS',
 'TIEMPO_RELATIVO',
 'APROBACION_RECHAZO',
 'RELACIONES_INTERNACIONALES',
 'PAISES',
 'ANTIIMPERIALISMO_SOBERANIA',
 'CANTIDADES_NUMERICAS',
 'ORGANISMOS_INTERNACIONALES',
 'CONFLICTOS_ARMADOS_OPERACIONES',
 'CIUDADES_CAPITALES',
 'LEYES_NORMATIVAS',
 'FECHAS_HORAS',
 'ACCIONES_LEGALES',
 'ECONOMIA_COMERCIO_FINANZAS',
 'ELECCIONES_PROCESOS_POLITICOS']


In [9]:
from relation_extractor import RelationExtractor

# Build the extractor once. If its constructor raises ValueError, report the
# problem and continue with a placeholder result for every article.
try:
    relation_extractor = RelationExtractor()
except ValueError as exc:
    relation_extractor = None
    print(f"No se pueden extraer relaciones: {exc}")

relation_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if not text:
        continue
    # None is the "extractor unavailable" sentinel set above; compare against
    # it explicitly so an extractor object that happens to be falsy (e.g. one
    # defining __len__ or __bool__) is still used.
    if relation_extractor is not None:
        rel_info = relation_extractor.extract(text)
    else:
        rel_info = {"entidades": [], "relaciones": [], "error": "Modelo spaCy no disponible"}
    # The result (or the placeholder) is stored under "knowledge_graph".
    update_article(path, {"knowledge_graph": rel_info})
    relation_results[path.name] = rel_info

# Show the extracted graph for one article (None if it was skipped).
pprint(relation_results.get("article_1.json"))


{'entidades': [{'nombre': 'Masoud', 'tipo': 'Lugar'},
               {'nombre': 'Autoridad Palestina', 'tipo': 'Organizacion'},
               {'nombre': 'Franja de Gaza', 'tipo': 'Lugar'},
               {'nombre': 'Ministerio de Exteriores', 'tipo': 'Lugar'},
               {'nombre': 'plaza Al-Shawa', 'tipo': 'Lugar'},
               {'nombre': 'Hospital Al-Shifa', 'tipo': 'Lugar'},
               {'nombre': 'Al-Daraj', 'tipo': 'Lugar'},
               {'nombre': 'Al-Jalaa', 'tipo': 'Persona'},
               {'nombre': 'Seguridad General', 'tipo': 'Organizacion'},
               {'nombre': 'Gaza', 'tipo': 'Lugar'},
               {'nombre': 'Hospital Al-Aqsa', 'tipo': 'Lugar'},
               {'nombre': 'Hospital Al-Ma’amoud', 'tipo': 'Lugar'},
               {'nombre': 'El Hospital Al-Shifa', 'tipo': 'Lugar'}],
 'relaciones': [{'objeto': 'Hospital Al-Shifa',
                 'relacion': 'CONFIRMAR',
                 'sujeto': 'Gaza'},
                {'objeto': 'Gaza',
           

In [10]:
from sentiment_analyzer import SentimentAnalyzer

# Score every article that has non-empty text and store the analyzer's result
# in the article file under "sentiment".
sentiment_analyzer = SentimentAnalyzer()
sentiment_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:
        sentiment = sentiment_analyzer.analyze(text)
        update_article(path, {"sentiment": sentiment})
        sentiment_results[path.name] = sentiment

# Show the scores for one article (None if it was skipped).
pprint(sentiment_results.get("article_1.json"))


{'compound': -0.296,
 'label': 'negative',
 'neg': 0.007,
 'neu': 0.993,
 'pos': 0.0}
