In [1]:
from contextizer.hybrid.metrics_ext import (
    entropy_topics,
    redundancy_score,
    keywords_diversity_ext,
    semantic_variance,
    coherence_semantic
)


[INFO] contextizer.router: Contextizer adaptativo cargado correctamente 


In [2]:
import json

with open('outputs_doc_topics/', 'r', encoding='utf-8') as f:
    data = json.load(f)

from contextizer.schemas import TopicsDocMeta
topics_meta = TopicsDocMeta(**data["meta"]["topics_doc"])


In [3]:
H = entropy_topics(topics_meta)
R = redundancy_score(topics_meta)
D = keywords_diversity_ext(topics_meta)

print(f"Entropy: {H:.3f}")
print(f"Redundancy: {R:.3f}")
print(f"Diversity: {D:.3f}")


Entropy: 0.000
Redundancy: 0.000
Diversity: 1.000


In [4]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('all-MiniLM-L6-v2')

V = semantic_variance(topics_meta, embedder)
C = coherence_semantic(topics_meta, embedder)

print(f"Semantic Variance: {V:.4f}")
print(f"Semantic Coherence: {C:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Semantic Variance: 0.0000
Semantic Coherence: 0.3574


In [5]:
import pandas as pd

metrics = {
    "Entropy": [H],
    "Redundancy": [R],
    "Keyword Diversity": [D],
    "Semantic Variance": [V],
    "Semantic Coherence": [C]
}

pd.DataFrame(metrics)


Unnamed: 0,Entropy,Redundancy,Keyword Diversity,Semantic Variance,Semantic Coherence
0,0.0,0.0,1.0,0.0,0.35735


In [12]:
from contextizer.metrics import (
    coverage, outlier_rate, topic_size_stats, keywords_diversity
)
from contextizer.hybrid.metrics_ext import (
    entropy_topics, redundancy_score, keywords_diversity_ext,
    semantic_variance, coherence_semantic
)

from contextizer.schemas import TopicsDocMeta
import json
from sentence_transformers import SentenceTransformer

# Carga el archivo de salida del Contextizer
data = json.load(open("outputs_doc_topics/DOC-D9BDC7EE25DC_doc_topics.json"))
dt = TopicsDocMeta(**data["meta"]["topics_doc"])

# Embedding model para las métricas semánticas
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Módulo base
base_metrics = {
    "Coverage": coverage(dt),
    "Outlier Rate": outlier_rate(dt),
    "Topic Size (Median)": topic_size_stats(dt)["median"],
    "Keyword Diversity (basic)": keywords_diversity(dt),
}

# Módulo extendido
semantic_metrics = {
    "Entropy": entropy_topics(dt),
    "Redundancy": redundancy_score(dt),
    "Keyword Diversity (ext)": keywords_diversity_ext(dt),
    "Semantic Variance": semantic_variance(dt, embedder),
    "Semantic Coherence": coherence_semantic(dt, embedder),
}

# Combinar resultados
import pandas as pd
pd.DataFrame([base_metrics | semantic_metrics])


Unnamed: 0,Coverage,Outlier Rate,Topic Size (Median),Keyword Diversity (basic),Entropy,Redundancy,Keyword Diversity (ext),Semantic Variance,Semantic Coherence
0,1.0,0.0,2.0,0.958333,0.982141,0.0,0.958333,0.424735,0.303534


In [9]:
from contextizer.hybrid.keyword_fusion import fuse_keywords
from sentence_transformers import SentenceTransformer

texts = ["Contrato de prestación de servicios profesionales entre TechNova y Innovate Consulting Group."]
embedder = SentenceTransformer("all-MiniLM-L6-v2")

merged, keybert_only, emb_matrix = fuse_keywords(texts, embedder, use_keybert=True)
print("Merged keywords:", merged)
print("KeyBERT-only keywords:", keybert_only)




Merged keywords: ['contrato', 'prestación', 'servicios', 'profesionales', 'entre', 'technova', 'innovate', 'consulting', 'group.', 'group']
KeyBERT-only keywords: ['consulting', 'innovate', 'technova', 'profesionales', 'servicios', 'prestación', 'group', 'entre', 'contrato']


In [11]:
context_quality = (
    coverage * (1 - outlier_rate)
    * (keyword_diversity_ext)
    * (1 - redundancy)
    * semantic_coherence
)


TypeError: unsupported operand type(s) for -: 'int' and 'function'

In [15]:
import json
from pathlib import Path
import pandas as pd

# Directorio donde se guardan los outputs
dir_docs = Path("outputs_doc_topics")

records = []
for file in dir_docs.glob("*.json"):
    data = json.loads(file.read_text(encoding="utf-8"))
    meta = data.get("meta", {}).get("topics_doc", {})
    metrics = meta.get("metrics_ext", {})
    if metrics:
        record = {"doc_id": data.get("doc_id", file.stem)}
        record.update(metrics)
        record["n_topics"] = meta.get("n_topics", None)
        records.append(record)

df = pd.DataFrame(records)
df.head()


In [16]:
import matplotlib.pyplot as plt

df.set_index("doc_id")[[
    "entropy_topics",
    "redundancy_score",
    "keywords_diversity_ext",
    "semantic_variance",
    "coherence_semantic"
]].plot(kind="bar", figsize=(12,6))
plt.title("Comparativo de métricas extendidas por documento")
plt.ylabel("Valor normalizado (0–1)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


KeyError: "None of ['doc_id'] are in the columns"