# Pruebas de los módulos NLP

Ejecuta cada componente de `src/nlp` contra los artículos ubicados en `data_example/`, incorporando las etiquetas y clasificaciones generadas directamente sobre los archivos `.json` originales para que el vectorizador los tenga disponibles antes de la integración con el resto del sistema.


In [None]:
from pathlib import Path
import json
from pprint import pprint

from preprocessing import detect_language

# Absolute location of the article data on the author's machine; DATA_DIR
# selects the sub-folder whose articles this notebook processes.
PROJECT_ROOT = Path("/home/lia/Escritorio/Proyectos/NLP/Report_Generator/Data/Data_articles")
DATA_DIR = PROJECT_ROOT.joinpath("Data_articles11")

# Gather the article files in a stable order. Paths sort lexicographically,
# so "article_10.json" comes before "article_2.json".
article_paths = list(DATA_DIR.glob("article_*.json"))
article_paths.sort()


def load_article(path: Path) -> dict:
    """Read the UTF-8 JSON file at ``path`` and return its decoded content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)


def persist_article(path: Path, data: dict) -> None:
    """Write ``data`` to ``path`` as indented UTF-8 JSON.

    The document is serialized to a string before the file is opened, so a
    serialization failure (for example a value ``json`` cannot encode)
    raises without truncating the file that already exists at ``path``.

    Args:
        path: Destination file; overwritten if it exists.
        data: JSON-serializable mapping to store.

    Raises:
        TypeError: If ``data`` contains a value that is not JSON serializable.
        OSError: If the file cannot be opened or written.
    """
    # Serialize first: opening with "w" truncates immediately, so encoding
    # straight into the handle would destroy the article on failure.
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(payload)


def update_article(path: Path, updates: dict) -> dict:
    """Merge ``updates`` into the article stored at ``path`` and save it.

    Top-level keys of ``updates`` replace same-named keys of the stored
    article; the file is then rewritten with the merged content.

    Returns:
        The merged dictionary that was written back to ``path``.
    """
    merged = load_article(path)
    merged.update(updates)

    persist_article(path, merged)
    return merged

# Report how many article files matched before touching any of them.
print(f"Artículos detectados: {len(article_paths)}")
if not article_paths:
    # Indexing an empty list would raise a bare IndexError; fail instead
    # with a message that names the directory that was searched.
    raise FileNotFoundError(f"No se encontraron archivos article_*.json en {DATA_DIR}")

# Smoke-test language detection on the first article; a missing "text"
# field is treated as an empty string.
sample = load_article(article_paths[0])
print("Idioma detectado para el primero:", detect_language(sample.get("text", "")))


Artículos detectados: 1190
Idioma detectado para el primero: es


In [68]:
from preprocessing import TextPreprocessor

# Preprocessor configured with use_spacy=False and remove_stopwords=True.
preprocessor = TextPreprocessor(remove_stopwords=True, use_spacy=False)

# Per-file summary (token and sentence counts), keyed by file name.
preprocessing_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:  # articles with no text are left untouched
        processed = preprocessor.preprocess_full(text)
        # Store the selected preprocessing fields under a single key of the
        # article and write the article back to disk.
        updates = {
            "preprocessing": {
                field: processed[field]
                for field in ("cleaned", "tokens", "token_count", "sentence_count")
            }
        }
        update_article(path, updates)
        preprocessing_results[path.name] = {
            field: updates["preprocessing"][field]
            for field in ("token_count", "sentence_count")
        }

# Show the summary of one article as a quick sanity check.
pprint(preprocessing_results.get("article_1.json"))


{'sentence_count': 16, 'token_count': 273}


In [69]:
import nltk
# NOTE(review): this downloads the English perceptron tagger ("_eng") while
# the first article is detected as Spanish and the sample output is
# dominated by FW tags - confirm this tagger suits the corpus.
nltk.download('averaged_perceptron_tagger_eng')
from pos_analyzer import POSAnalyzer

pos_analyzer = POSAnalyzer()

# Top POS patterns per file, keyed by file name.
pos_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:  # articles with no text are left untouched
        pos_info = pos_analyzer.analyze(text)
        patterns = pos_analyzer.get_top_patterns(pos_info, n=5)
        # Persist the frequency tables together with the top-5 patterns.
        updates = {
            "pos_analysis": dict(
                tag_freq=pos_info["tag_freq"],
                bigram_freq=pos_info["bigram_freq"],
                trigram_freq=pos_info["trigram_freq"],
                top_patterns=patterns,
            )
        }
        update_article(path, updates)
        pos_results[path.name] = patterns

# Show the patterns of one article as a quick sanity check.
pprint(pos_results.get("article_1.json"))


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/lia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


{'top_bigrams': {'FW-FW': 47,
                 'JJ-NN': 27,
                 'NN-IN': 21,
                 'NN-NN': 58,
                 'NNS-VBP': 15},
 'top_tags': {'FW': 89, 'JJ': 52, 'NN': 118, 'NNP': 38, 'NNS': 31},
 'top_trigrams': {'FW-FW-FW': 27,
                  'JJ-NN-IN': 9,
                  'JJ-NN-NN': 12,
                  'NN-IN-FW': 10,
                  'NN-NN-NN': 31}}


In [70]:
from grammar_analyzer import GrammarAnalyzer

grammar_analyzer = GrammarAnalyzer()

# Per-file digest (chunk counts and first five rules), keyed by file name.
grammar_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:  # articles with no text are left untouched
        grammar_info = grammar_analyzer.analyze(text)
        # The full analysis goes into the article file; only a digest is
        # kept in memory for display.
        update_article(path, {"grammar_analysis": grammar_info})
        grammar_results[path.name] = dict(
            chunk_counts=grammar_info["chunk_counts"],
            top_rules=grammar_info["top_rules"][:5],
        )

# Show the digest of one article as a quick sanity check.
pprint(grammar_results.get("article_1.json"))


{'chunk_counts': {'ADJP': 5, 'NP': 118, 'PP': 16, 'VP': 34},
 'top_rules': [{'frequency': 11,
                'rule': "PP -> ('de', 'IN') NP",
                'weight': 0.0582},
               {'frequency': 4,
                'rule': "PP -> ('en', 'IN') NP",
                'weight': 0.0212},
               {'frequency': 4,
                'rule': "NP -> ('Tulkarem', 'NNP')",
                'weight': 0.0212},
               {'frequency': 3,
                'rule': "ADJP -> ('los', 'JJ')",
                'weight': 0.0159},
               {'frequency': 2,
                'rule': "NP -> ('Gaza', 'NNP')",
                'weight': 0.0106}]}


In [71]:
from regex_annotator import RegexAnnotator

regex_annotator = RegexAnnotator()

# Categories matched in each file, keyed by file name.
regex_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:  # articles with no text are left untouched
        annotations = regex_annotator.annotate(text)
        # The full annotation goes into the article file; only its
        # "categories" entry is kept in memory for display.
        update_article(path, {"regex_annotations": annotations})
        regex_results[path.name] = annotations["categories"]

# Show the categories of three articles as a quick sanity check.
for preview_name in ("article_1.json", "article_11.json", "article_24.json"):
    pprint(regex_results.get(preview_name))


['DESASTRES_ACCIDENTES_EMERGENCIAS',
 'CONFLICTOS_ARMADOS_OPERACIONES',
 'CULTURA_DEPORTE_SOCIEDAD',
 'FECHAS_ESPECIFICAS',
 'INFRAESTRUCTURA_SERVICIOS',
 'TIEMPO_RELATIVO',
 'PAISES',
 'CRISIS_HUMANITARIA_SERVICIOS',
 'JURIDICO_LEGISLATIVO_CORRUPCION',
 'ACCIONES_LEGALES',
 'REGIONES_PROVINCIAS',
 'RELACIONES_INTERNACIONALES',
 'ECONOMIA_COMERCIO_FINANZAS',
 'CANTIDADES_NUMERICAS',
 'LOCALIZACIONES_ESPECIFICAS',
 'VIOLENCIA_CRIMEN_DERECHOS_HUMANOS',
 'APROBACION_RECHAZO']
['GOBIERNOS_OFICIALES',
 'PAISES',
 'ORGANISMOS_INTERNACIONALES',
 'RELACIONES_INTERNACIONALES',
 'ECONOMIA_COMERCIO_FINANZAS',
 'CANTIDADES_NUMERICAS',
 'LEYES_NORMATIVAS',
 'JUSTICIA_SOCIAL',
 'APROBACION_RECHAZO',
 'CONFLICTOS_ARMADOS_OPERACIONES',
 'JURIDICO_LEGISLATIVO_CORRUPCION',
 'VIOLENCIA_CRIMEN_DERECHOS_HUMANOS',
 'RECURSOS_NATURALES',
 'INFRAESTRUCTURA_SERVICIOS',
 'ANTIIMPERIALISMO_SOBERANIA',
 'REGIONES_PROVINCIAS',
 'INSTITUCIONES_EDUCATIVAS',
 'CRISIS_HUMANITARIA_SERVICIOS',
 'ACCIONES_LEGALES',
 'LOC

In [72]:
from relation_extractor import RelationExtractor

# Build the extractor once. If construction raises ValueError, keep going
# with no extractor so every article still receives a "knowledge_graph"
# entry (a placeholder carrying an error message).
# NOTE(review): the placeholder text suggests the ValueError signals a
# missing spaCy model - confirm against RelationExtractor.
try:
    relation_extractor = RelationExtractor()
except ValueError as exc:
    relation_extractor = None
    print(f"No se pueden extraer relaciones: {exc}")

# Extraction result (or placeholder) per file, keyed by file name.
relation_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if not text:
        # Articles with no text are left untouched.
        continue
    # Compare against None explicitly: the only "absent" value is the None
    # assigned above, and a truthiness test would wrongly skip an extractor
    # object that happens to evaluate as falsy.
    if relation_extractor is not None:
        rel_info = relation_extractor.extract(text)
    else:
        rel_info = {"entidades": [], "relaciones": [], "error": "Modelo spaCy no disponible"}
    update_article(path, {"knowledge_graph": rel_info})
    relation_results[path.name] = rel_info

# Show the result of one article as a quick sanity check.
pprint(relation_results.get("article_1.json"))


{'entidades': [{'nombre': 'Tel Aviv', 'tipo': 'Lugar'},
               {'nombre': 'Tulkarem', 'tipo': 'Lugar'},
               {'nombre': 'Nour Shams', 'tipo': 'Persona'},
               {'nombre': 'Kmail', 'tipo': 'Persona'},
               {'nombre': 'Cisjordania', 'tipo': 'Lugar'},
               {'nombre': 'al-Maslakh', 'tipo': 'Lugar'},
               {'nombre': 'Franja de Gaza', 'tipo': 'Lugar'},
               {'nombre': 'Gaza', 'tipo': 'Lugar'},
               {'nombre': 'Israel', 'tipo': 'Lugar'},
               {'nombre': 'Abdullah Kmail', 'tipo': 'Persona'}],
 'relaciones': [{'objeto': 'Israel',
                 'relacion': 'AUMENTAR_INFORMAR',
                 'sujeto': 'Franja de Gaza'},
                {'objeto': 'Israel',
                 'relacion': 'INICIAR',
                 'sujeto': 'Cisjordania'},
                {'objeto': 'Nour Shams',
                 'relacion': 'INICIAR',
                 'sujeto': 'Cisjordania'},
                {'objeto': 'Israel',
         

In [73]:
from sentiment_analyzer import SentimentAnalyzer

sentiment_analyzer = SentimentAnalyzer()

# Sentiment result per file, keyed by file name.
sentiment_results = {}

for path in article_paths:
    article = load_article(path)
    text = article.get("text", "")
    if text:  # articles with no text are left untouched
        sentiment = sentiment_analyzer.analyze(text)
        # Write the result into the article file and keep it for display.
        update_article(path, {"sentiment": sentiment})
        sentiment_results[path.name] = sentiment

# Show the result of one article as a quick sanity check.
pprint(sentiment_results.get("article_1.json"))


{'compound': -0.0258,
 'label': 'neutral',
 'neg': 0.009,
 'neu': 0.981,
 'pos': 0.011}
