
# IE Metrics — Mentions vs Triples

Este notebook calcula métricas simples para validar la extracción de información:

- Conteos por documento (entidades y relaciones).
- Distribución de etiquetas de entidades y relaciones.
- **Support@Triples**: % de relaciones en *mentions* que tienen evidencia en *triples* (mismo documento, misma oración y misma relación canónica).
- Desglose por etiqueta y por documento.


In [12]:
import os, glob, json
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Input directories: MENTIONS_DIR is scanned for "*_mentions.json" files and
# TRIPLES_DIR for "*_triples.json" files by the loader functions in this cell.
MENTIONS_DIR = "outputs_mentions"
TRIPLES_DIR  = "outputs_triples"

def _canon_rel(x: str) -> str:
    # canónica simple acorde al pipeline (minúsculas, _)
    return str(x or "").strip().lower().replace(" ", "_")

def _get_sid(obj):
    # intenta varias rutas para sentence_idx
    if isinstance(obj, dict):
        if "sentence_idx" in obj and obj["sentence_idx"] is not None:
            return obj["sentence_idx"]
        meta = obj.get("meta", {}) or {}
        if "sentence_idx" in meta and meta["sentence_idx"] is not None:
            return meta["sentence_idx"]
    return None

def load_mentions_df(mentions_dir: str) -> pd.DataFrame:
    """Load the relations of every ``*_mentions.json`` file into a DataFrame.

    Each entry of a file's ``"relations"`` list becomes one row.

    Args:
        mentions_dir: Directory searched (non-recursively) for files whose
            name matches ``*_mentions.json``.

    Returns:
        A DataFrame with the columns ``doc_id`` (the file's ``"doc_id"`` or,
        failing that, the file stem without ``_mentions``), ``sentence_idx``
        (as resolved by ``_get_sid``; may be ``None``), ``label`` (the
        ``"canonical_label"`` or ``"label"`` passed through ``_canon_rel``)
        and ``source_file`` (path of the originating file). The columns are
        present even when nothing was loaded, so callers can select them
        without a ``KeyError``.
    """
    columns = ["doc_id", "sentence_idx", "label", "source_file"]
    rows = []
    # glob order depends on the filesystem; sort for reproducible row order.
    for f in sorted(glob.glob(os.path.join(mentions_dir, "*_mentions.json"))):
        with open(f, "r", encoding="utf-8") as fh:
            data = json.load(fh)
        doc_id = str(data.get("doc_id") or Path(f).stem.replace("_mentions", ""))
        # "relations" may be absent or explicitly null in the JSON.
        for r in data.get("relations") or []:
            label = r.get("canonical_label") or r.get("label")
            rows.append({
                "doc_id": doc_id,
                "sentence_idx": _get_sid(r),
                "label": _canon_rel(label),
                "source_file": f,
            })
    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

def load_triples_df(triples_dir: str) -> pd.DataFrame:
    """Load the triples of every ``*_triples.json`` file into a DataFrame.

    Each entry of a file's ``"triples"`` list becomes one row.

    Args:
        triples_dir: Directory searched (non-recursively) for files whose
            name matches ``*_triples.json``.

    Returns:
        A DataFrame with the columns ``doc_id`` (the file's ``"doc_id"`` or,
        failing that, the file stem without ``_triples``), ``sentence_idx``
        (as resolved by ``_get_sid``; may be ``None``), ``label`` (the
        ``"canonical_relation"`` or ``"relation"`` passed through
        ``_canon_rel``) and ``source_file`` (path of the originating file).
        The columns are present even when nothing was loaded, so callers can
        select them without a ``KeyError``.
    """
    columns = ["doc_id", "sentence_idx", "label", "source_file"]
    rows = []
    # glob order depends on the filesystem; sort for reproducible row order.
    for f in sorted(glob.glob(os.path.join(triples_dir, "*_triples.json"))):
        with open(f, "r", encoding="utf-8") as fh:
            data = json.load(fh)
        doc_id = str(data.get("doc_id") or Path(f).stem.replace("_triples", ""))
        # "triples" may be absent or explicitly null in the JSON.
        for t in data.get("triples") or []:
            label = t.get("canonical_relation") or t.get("relation")
            rows.append({
                "doc_id": doc_id,
                "sentence_idx": _get_sid(t),
                "label": _canon_rel(label),
                "source_file": f,
            })
    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

# --- Load: rel_df holds one row per relation read from the mentions files,
# tri_df one row per triple read from the triples files.
rel_df = load_mentions_df(MENTIONS_DIR)
tri_df = load_triples_df(TRIPLES_DIR)

In [13]:
# --- Basic cleaning
def _coerce_sid(value):
    """Return *value* as an ``int`` when convertible, otherwise unchanged."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return value


for df in [rel_df, tri_df]:
    # An empty load may produce a frame without columns; create them so the
    # steps below work on empty input instead of raising KeyError.
    for col in ("doc_id", "sentence_idx", "label"):
        if col not in df.columns:
            df[col] = pd.Series(dtype="object")
    # Drop rows without sentence_idx or label.
    df.dropna(subset=["sentence_idx", "label"], inplace=True)
    # A missing label is canonicalised to "" upstream, which dropna does not
    # catch; remove those rows too so they never form a "doc::sent::" key.
    blank_label = df["label"].astype(str).str.strip() == ""
    df.drop(index=df.index[blank_label], inplace=True)
    # Convert sentence_idx to int value by value. A column-wide
    # astype(int, errors="ignore") gives up on the whole column as soon as one
    # value is not convertible, leaving e.g. 3.0 and 3 as different keys.
    df["sentence_idx"] = df["sentence_idx"].map(_coerce_sid)
    # Unified key doc::sent::label used to match mentions against triples.
    df["key"] = df["doc_id"].astype(str) + "::" + df["sentence_idx"].astype(str) + "::" + df["label"].astype(str)