In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np
import sys
import resource
import faiss
import time
import json
import re
from tqdm.auto import tqdm
from scipy.spatial.distance import cdist
from pathlib import Path
from textwrap import dedent


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# ---- tiny memory helpers (simple & cross-platform-ish) ----
def _ru_maxrss_mb() -> float:
    ru = resource.getrusage(resource.RUSAGE_SELF)
    # Linux: KB, macOS/BSD: bytes
    if sys.platform.startswith("linux"):
        return ru.ru_maxrss / 1024.0
    return ru.ru_maxrss / (1024.0 * 1024.0)

def _rss_now_mb() -> float:
    try:
        import psutil
        return psutil.Process().memory_info().rss / (1024.0 * 1024.0)
    except Exception:
        return 0.0

def _measure(callable_fn, *args, **kwargs):
    """Run callable_fn(*args, **kwargs) and return (result, elapsed_s, mem_delta_mb)."""
    t0 = time.perf_counter()
    ru0 = _ru_maxrss_mb()
    rss0 = _rss_now_mb()
    result = callable_fn(*args, **kwargs)
    elapsed = time.perf_counter() - t0
    mem_delta = max(0.0, _ru_maxrss_mb() - ru0, _rss_now_mb() - rss0)
    return result, elapsed, mem_delta

In [9]:
# Load a JSON Lines file where every record carries a list of triples
# ('triples') and the sentence they belong to ('sentence').
# NOTE(review): the recorded run of this cell failed with FileNotFoundError
# for this path -- confirm the file location before re-running.
triples_per_sentence = []  # one list of serialized triples per record, aligned with sentences_list
sentences_list = []

with open("data/textualization/japan/original/gpt-4o-mini.txt", "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Reading lines"):
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        obj = json.loads(line)
        triples_per_sentence.append([' '.join(triple) for triple in obj['triples']])
        sentences_list.append(obj["sentence"])

# Flat list of every serialized triple. It is NOT index-aligned with
# sentences_list (one sentence can own several triples).
serialized_triples_list = [text for group in triples_per_sentence for text in group]
print(f"Collected {len(serialized_triples_list)} items.")

# Preview from the grouped data so each sentence is shown next to its own
# triples; indexing the flat list with the sentence index mismatches them and
# can run past the end of sentences_list.
for group, sentence in zip(triples_per_sentence[:3], sentences_list[:3]):
    for text in group:
        print(f"- serialized_triples: {text}")
    print(f"  sentence          : {sentence}")

FileNotFoundError: [Errno 2] No such file or directory: 'data/textualization/japan/original/gpt-4o-mini.txt'

In [None]:


def parse_tsv_json_lines(data_path: Path):
    """Read a JSON Lines file into two index-aligned lists.

    Each non-blank line of the file is a JSON object with the keys
    'serialized_triples' and 'sentence'. Whitespace-only lines are skipped.
    The first few pairs are printed as a sanity check.

    Args:
        data_path: path of the file to read (opened as UTF-8 text).

    Returns:
        (serialized_triples_list, sentences_list): the 'serialized_triples'
        and 'sentence' values of every record, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the two expected keys.
    """
    serialized_triples_list = []
    sentences_list = []

    with open(data_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading lines"):
            if not line.strip():
                # A stray blank line would otherwise abort the whole load.
                continue
            obj = json.loads(line)
            serialized_triples_list.append(obj["serialized_triples"])
            sentences_list.append(obj["sentence"])

    print(f"Collected {len(serialized_triples_list)} items.")
    for triples, sentence in zip(serialized_triples_list[:3], sentences_list[:3]):
        print(f"- serialized_triples: {triples}")
        print(f"  sentence          : {sentence}")

    return serialized_triples_list, sentences_list


Reading lines: 100000it [00:00, 170503.49it/s]

Collected 170210 items.
- serialized_triples: Mikhail Belyaev date of death 01 January 1918
  sentence          : Mikhail Alekseyevich Belyaev (Russian: ; December 23, 1863Â - 1918) was a Russian general of the Infantry, statesman, Chief of Staff of the Imperial Russian Army from August 1, 1914 to August 10, 1916, and was the last Minister of War of the Russian Empire from January 3, 1917 to February 28, 1917.
- serialized_triples: Mikhail Belyaev allegiance Russian Empire
  sentence          : Shiels Jewellers is an Australian jewellery retailer and was founded by Jack Shiels in Adelaide in 1945.
- serialized_triples: Mikhail Belyaev position held minister of war
  sentence          : The framing story is set in the 21st century and follows Desmond Miles as Assassin 's Creed II relives the genetic memories of his ancestor Ezio Auditore da Firenze.





In [3]:


def _validate_finite(name: str, x: np.ndarray) -> None:
    if not np.isfinite(x).all():
        bad_rows = np.unique(np.argwhere(~np.isfinite(x))[:, 0])[:10]
        raise ValueError(f"{name}: found non-finite values; first bad rows: {bad_rows}")

def generate_embeddings(
    originals: list[str],
    generated: list[str],
    model: SentenceTransformer,
    batch_size: int = 64,
) -> tuple[np.ndarray, np.ndarray]:
    """Encode both text collections with ``model`` using identical options.

    ``originals`` is encoded first, then ``generated``. Both calls pass
    ``convert_to_numpy=True``, ``show_progress_bar=True`` and
    ``normalize_embeddings=True`` together with the given ``batch_size``.

    Returns:
        (orig_emb, gen_emb): the two results of ``model.encode``, in that order.
    """
    print("Generating embeddings...")
    encode_options = {
        "batch_size": batch_size,
        "convert_to_numpy": True,
        "show_progress_bar": True,
        "normalize_embeddings": True,
    }
    orig_emb, gen_emb = [
        model.encode(texts, **encode_options) for texts in (originals, generated)
    ]
    print("Embeddings generated.")
    return orig_emb, gen_emb


def _nearest_cosine_distances(
    index_vectors: np.ndarray,
    query_vectors: np.ndarray,
    hnsw_M: int,
    ef_construction: int,
    ef_search: int,
) -> np.ndarray:
    """For every query row, return the cosine distance to its nearest indexed row.

    Builds a FAISS HNSW index over ``index_vectors`` and runs a 1-NN search.
    The index uses the L2 metric and returns squared L2 distances; for
    unit-length vectors d2 = 2 * (1 - cos), hence cos_dist = d2 / 2.
    """
    index = faiss.IndexHNSWFlat(index_vectors.shape[1], hnsw_M)  # L2, returns squared L2 distances
    # efConstruction must be set before add(); efSearch before search().
    index.hnsw.efConstruction = ef_construction
    index.hnsw.efSearch = ef_search
    index.add(index_vectors)
    squared_l2, _ = index.search(query_vectors, 1)  # shape (n_queries, 1)
    return squared_l2[:, 0] / 2.0


def bigs_scores_hnsw(
    original_embeddings: np.ndarray,
    generated_embeddings: np.ndarray,
    hnsw_M: int = 16,
    ef_construction: int = 200,
    ef_search: int = 128,
) -> tuple[float, float, float, float, float, float]:
    """
    Bidirectional nearest-neighbour cosine-distance statistics via FAISS HNSW.

    Cosine distance is obtained from L2-normalized vectors + L2 metric
    (d2 = 2*(1-cos); cos_dist = d2/2). The inputs are NOT normalized here:
    callers must pass unit-length rows for the values to be cosine distances.

    Directions:
        right ("document -> graph"): each row of ``original_embeddings`` is
            searched against an index built over ``generated_embeddings``.
        left ("graph -> document"): each row of ``generated_embeddings`` is
            searched against an index built over ``original_embeddings``.

    Args:
        original_embeddings: 2D array, one embedding per row.
        generated_embeddings: 2D array with the same number of columns.
        hnsw_M: HNSW ``M`` parameter passed to ``faiss.IndexHNSWFlat``.
        ef_construction: value assigned to ``hnsw.efConstruction``.
        ef_search: value assigned to ``hnsw.efSearch``.

    Returns:
        (score_r, score_r_std, score_r_med, score_l, score_l_std, score_l_med):
        mean, standard deviation and median of the nearest-neighbour cosine
        distances for the right and then the left direction.

    Raises:
        ValueError: if the inputs are not 2D, differ in dimensionality, are
            empty, or contain non-finite values.
    """

    X = np.ascontiguousarray(original_embeddings, dtype="float32")
    Y = np.ascontiguousarray(generated_embeddings, dtype="float32")

    if X.ndim != 2 or Y.ndim != 2 or X.shape[1] != Y.shape[1]:
        raise ValueError("Embeddings must be 2D and have matching dimensions.")

    # Fail with a clear message instead of an obscure zero-size reduction error.
    if X.size == 0 or Y.size == 0:
        raise ValueError("Embeddings must be non-empty.")

    _validate_finite("X (norm)", X)
    _validate_finite("Y (norm)", Y)

    print("X min/max:", X.min(), X.max(), "Y min/max:", Y.min(), Y.max())

    # On a MacBook M1 the following was needed to avoid an error:
    #try:
    #    faiss.omp_set_num_threads(1)
    #except Exception:
    #    pass

    print("Calculating BIGS -> ...")
    # --- Right: document -> graph --- (index over Y, queried with X)
    right_min = _nearest_cosine_distances(Y, X, hnsw_M, ef_construction, ef_search)

    print("Calculating BIGS <- ...")
    # --- Left: graph -> document --- (index over X, queried with Y)
    left_min = _nearest_cosine_distances(X, Y, hnsw_M, ef_construction, ef_search)

    # Stats
    score_r = float(right_min.mean())
    score_r_std = float(right_min.std())
    score_r_med = float(np.median(right_min))

    score_l = float(left_min.mean())
    score_l_std = float(left_min.std())
    score_l_med = float(np.median(left_min))

    return (score_r, score_r_std, score_r_med, score_l, score_l_std, score_l_med)

In [5]:
def run_experiment_for_tsv(
    tsv_path: "Path | str",
    model_name: str,
    *,
    batch_size: int = 64,
    hnsw_M: int = 16,
    ef_construction: int = 200,
    ef_search: int = 128,
) -> dict[str, object]:
    """Parse -> embed (timed/mem) -> BIGS (timed/mem). Returns a dict with all stats.

    Args:
        tsv_path: data file handed to ``parse_tsv_json_lines``; a ``Path`` or
            a plain string.
        model_name: name passed to ``SentenceTransformer``.
        batch_size: encoding batch size.
        hnsw_M, ef_construction, ef_search: HNSW parameters forwarded to
            ``bigs_scores_hnsw``.

    Returns:
        A flat dict with the dataset/model description, the HNSW parameters,
        the encode/search timings in seconds, the best-effort memory deltas
        in MB and the rounded BIGS mean/std/median for both directions.
        Model loading is not included in the timings.

    Raises:
        ValueError: if the file yields no records.
    """
    # Accept plain strings too; `.name` below needs a Path.
    tsv_path = Path(tsv_path)

    # 1) Parse
    serialized_triples, sentences = parse_tsv_json_lines(tsv_path)
    n = len(serialized_triples)
    if n == 0 or not sentences:
        # Fail early rather than deep inside encoding / index search.
        raise ValueError(f"No records found in {tsv_path}")

    # 2) Build model and generate embeddings (timed/mem)
    model = SentenceTransformer(model_name)

    (embeds, enc_time, enc_mem) = _measure(
        generate_embeddings,
        serialized_triples, sentences, model,
        batch_size=batch_size
    )
    orig_emb, gen_emb = embeds
    d = int(orig_emb.shape[1])

    # 3) BIGS with FAISS HNSW (timed/mem)
    (bigs, search_time, search_mem) = _measure(
        bigs_scores_hnsw,
        orig_emb, gen_emb,
        hnsw_M=hnsw_M,
        ef_construction=ef_construction,
        ef_search=ef_search,
    )
    (score_r, score_r_std, score_r_med,
     score_l, score_l_std, score_l_med) = bigs

    # 4) Package results
    return {
        # dataset / model
        "file": tsv_path.name,
        "samples": n,
        "model_name": model_name,
        "embedding_dim": d,
        "batch_size": batch_size,
        # hnsw params
        "hnsw_M": hnsw_M,
        "ef_construction": ef_construction,
        "ef_search": ef_search,
        # timings
        "encode_time_s": round(enc_time, 4),
        "search_time_s": round(search_time, 4),
        "total_time_s": round(enc_time + search_time, 4),
        # memory (best-effort process delta)
        "encode_mem_delta_mb": round(enc_mem, 2),
        "search_mem_delta_mb": round(search_mem, 2),
        # BIGS stats
        "bigs_r_mean": round(score_r, 6),
        "bigs_r_std": round(score_r_std, 6),
        "bigs_r_med": round(score_r_med, 6),
        "bigs_l_mean": round(score_l, 6),
        "bigs_l_std": round(score_l_std, 6),
        "bigs_l_med": round(score_l_med, 6),
    }

In [8]:
# Input data file for the experiment run below; the path is relative, so it is
# resolved against the kernel's current working directory.
japan = Path("data/triplet_lists/japan/langchain/gpt-4o-mini.txt")

In [None]:
# Run the full parse -> embed -> BIGS pipeline on the `japan` file and show
# the resulting stats dict.
results_50k = run_experiment_for_tsv(
    japan,
    model_name="sentence-transformers/all-mpnet-base-v2",
    batch_size=256,
    hnsw_M=16,
    ef_construction=200,
    ef_search=128,
)
print(results_50k)