In [3]:
# ───────────────────────────────────────────────────────────────────────────────
# Preprocess: split the labelled CSV into a float32 .npy matrix (memory-mappable
# by later cells) and a small metadata CSV
# ───────────────────────────────────────────────────────────────────────────────

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd, numpy as np

# Input CSV and the two artefacts this cell writes (all on Drive)
csv_in   = "/content/drive/My Drive/79k_topic_entity_labeled_embedded.csv"
emb_npy  = "/content/drive/My Drive/embeddings.npy"
meta_csv = "/content/drive/My Drive/meta.csv"

# Read only the two label columns plus every emb_* column
df = pd.read_csv(
    csv_in,
    usecols=lambda name: name in ("topic_o","entity_e") or name.startswith("emb_"),
    low_memory=False
)

# 1) Embedding matrix: the emb_* columns in DataFrame column order, cast to
#    float32 and written with np.save (a later cell opens it with mmap_mode)
emb_cols = list(filter(lambda name: name.startswith("emb_"), df.columns))
E = df.loc[:, emb_cols].to_numpy().astype("float32")
np.save(emb_npy, E)
print(f"✅ Saved embeddings ({E.shape}) → {emb_npy}")

# 2) Metadata: the (topic_o, entity_e) labels, one row per embedding row
meta = df.loc[:, ["topic_o","entity_e"]]
meta.to_csv(meta_csv, index=False)
print(f"✅ Saved metadata ({meta.shape}) → {meta_csv}")

# Drop the large in-memory objects before the next cell runs
del df, E, meta


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Saved embeddings ((390529, 768)) → /content/drive/My Drive/embeddings.npy
✅ Saved metadata ((390529, 2)) → /content/drive/My Drive/meta.csv


In [4]:
# Step 3: Reliability-Weighted Network & MISI (with fallback for large clusters)
# ───────────────────────────────────────────────────────────────────────────────

import pandas as pd, numpy as np, gc
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg     import cg
from tqdm                    import tqdm

# Inputs: the two files written by the preprocessing cell
meta_csv = "/content/drive/My Drive/meta.csv"
emb_npy = "/content/drive/My Drive/embeddings.npy"

# Outputs: per-document rows, then the two aggregated tables
out_doc = "/content/drive/My Drive/step3_results.csv"
out_te = "/content/drive/My Drive/topic_entity_misi.csv"
out_topic = "/content/drive/My Drive/topic_misi.csv"

In [5]:
# 1) Load metadata & memory-map embeddings
meta     = pd.read_csv(meta_csv)
# mmap_mode="r" opens the .npy read-only without loading the matrix into RAM
E_mmap   = np.load(emb_npy, mmap_mode="r")  # float32 array (n_docs,768)
n_docs, d = E_mmap.shape

# Later cells use meta's row index as row positions into E_mmap, so both files
# must describe the same documents in the same order. Fail fast if the row
# counts disagree (e.g. a stale meta.csv / embeddings.npy pair left on Drive).
if len(meta) != n_docs:
    raise ValueError(
        f"meta.csv has {len(meta)} rows but embeddings.npy has {n_docs}; "
        "re-run the preprocessing cell so both files match"
    )
print(f"⏱  Loaded metadata {meta.shape}, embeddings memmap {n_docs}×{d}")


⏱  Loaded metadata (390529, 2), embeddings memmap 390529×768


In [6]:
# 2) Precompute the L2 norm of every row.
# The result is one value per document, but squaring the whole memmap at once
# would first materialise a full-size temporary copy of the matrix in RAM.
# Each row's norm depends on that row alone, so processing NORM_CHUNK rows at a
# time yields the same values while keeping the temporary small.
NORM_CHUNK = 16_384
norms = np.empty(E_mmap.shape[0], dtype=E_mmap.dtype)
for _start in range(0, E_mmap.shape[0], NORM_CHUNK):
    _block = E_mmap[_start:_start + NORM_CHUNK]
    norms[_start:_start + NORM_CHUNK] = np.sqrt((_block**2).sum(axis=1))

In [7]:
# 3) Start the per-document results file from scratch: mode "w" truncates any
#    previous contents, leaving only the column header line
csv_header = "topic_o,entity_e,w_tilde,misi\n"
with open(out_doc, "w") as results_fh:
    results_fh.write(csv_header)

In [8]:
# 4) Utility functions
def build_graph(X):
    """Return the dense affinity matrix B with B[i, j] = 1 + cos(x_i, x_j).

    ``X`` holds one vector per row. The pairwise cosine similarities between
    rows come from ``cosine_similarity`` and are shifted up by one.
    """
    pairwise_cos = cosine_similarity(X)
    return pairwise_cos + 1

def refine_weights(B, w0, mu=0.1):
    """Smooth the initial weights ``w0`` over the affinity graph ``B``.

    Solves ``(I + 2*mu*(D - B)) w = w0`` with conjugate gradients, where ``D``
    is the diagonal matrix holding the row sums of ``B``.

    Parameters
    ----------
    B : (k, k) array
        Pairwise affinity matrix. It is not modified.
    w0 : (k,) array
        Initial weights (right-hand side of the linear system).
    mu : float, default 0.1
        Factor multiplying the graph Laplacian ``D - B``.

    Returns
    -------
    (k,) ndarray
        The solution returned by ``cg``. If the solver reports a non-zero
        status, a ``RuntimeWarning`` is emitted and the last iterate is still
        returned.

    Notes
    -----
    Every row of ``D - B`` sums to zero, so a constant ``w0`` is already the
    exact solution of the system: uniform initial weights come back unchanged
    (up to solver tolerance).
    """
    import warnings  # local import: only needed to report non-convergence

    # Assemble A = I + 2*mu*(D - B) in a single k×k float64 buffer instead of
    # materialising D, I and (D - B) as separate dense matrices:
    #   off-diagonal entries: -2*mu*B[i, j]
    #   diagonal entries:      1 + 2*mu*(deg[i] - B[i, i])
    B = np.asarray(B, dtype=np.float64)
    deg = B.sum(axis=1)
    A = (-2.0 * mu) * B              # new array; the caller's B is untouched
    A[np.diag_indices_from(A)] += 1.0 + 2.0 * mu * deg

    # No tolerance argument is passed: its keyword name differs across SciPy
    # versions, so the solver default is used for compatibility.
    w_t, info = cg(A, w0)
    if info != 0:
        # info > 0: stopped without reaching the tolerance; info < 0: illegal
        # input or breakdown. Previously this status was silently discarded.
        warnings.warn(
            f"refine_weights: cg did not converge (info={info}); "
            "returning the last iterate",
            RuntimeWarning,
        )
    return w_t

def consensus_and_misi(X, w_tilde):
    """Compute the weighted consensus direction and each row's MISI score.

    The consensus ``x_star`` is the ``w_tilde``-weighted sum of the rows of
    ``X``, rescaled to unit Euclidean length. The score of row ``i`` is
    ``1 - X[i] · x_star``.

    Returns the pair ``(x_star, misi)``.
    """
    weighted_rows = w_tilde[:, None] * X
    x_star = weighted_rows.sum(axis=0)
    x_star /= np.linalg.norm(x_star)
    projections = X.dot(x_star)
    return x_star, (1 - projections)

In [9]:
# 5) Parameters
# Clusters with more than MAX_CLUSTER documents skip the dense k×k graph
# smoothing and fall back to the plain mean consensus
MAX_CLUSTER = 5_000

In [10]:
# 6) Loop over each cluster with tqdm
# NOTE: groupby uses the pandas default dropna=True, so rows whose topic_o or
# entity_e is missing belong to no group and get no row in the output.
grouped    = meta.groupby(["topic_o","entity_e"])
n_clusters = grouped.ngroups
print(f"🚀 Processing {n_clusters} clusters (threshold={MAX_CLUSTER})…")

# The loop below only ever appends to out_doc. Reset the file (header only)
# right here so that re-running this cell replaces the previous results
# instead of appending a second copy of every document row.
with open(out_doc, "w") as f:
    f.write("topic_o,entity_e,w_tilde,misi\n")

for (topic,entity), sub in tqdm(grouped, total=n_clusters, desc="Clusters"):
    # meta was read with the default 0..n-1 index, so the index labels of the
    # group double as row positions into E_mmap / norms
    idx   = sub.index.values
    X     = E_mmap[idx] / norms[idx,None]   # (k,768) unit-vectors
    k     = len(idx)

    # initial weights = 1 for all docs
    # NOTE(review): every row of (D - B) sums to zero, so with an all-ones w0
    # the exact solution of (I + 2*mu*(D - B)) w = w0 is w = w0 — as written,
    # the smoothing step cannot change the weights. Confirm whether non-uniform
    # initial reliabilities were intended here.
    w0    = np.ones(k, dtype=float)

    if k <= MAX_CLUSTER:
        # full graph smoothing (dense k×k affinity matrix)
        B      = build_graph(X)
        w_t    = refine_weights(B, w0, mu=0.1)
        _, misi = consensus_and_misi(X, w_t)
    else:
        # fallback: simple weighted mean consensus, no k×k matrix is built
        w_t    = w0.copy()  # no smoothing
        # consensus = average of X since w0=1
        x_star = X.mean(axis=0)
        x_star /= np.linalg.norm(x_star)
        misi   = 1 - X.dot(x_star)

    # stream minimal results: one row per document of this cluster
    # (the scalar topic/entity values are broadcast to the length of w_t)
    out = pd.DataFrame({
        "topic_o":  topic,
        "entity_e": entity,
        "w_tilde":  w_t,
        "misi":     misi
    })
    out.to_csv(out_doc, mode="a", header=False, index=False)

    # free memory before the next cluster is loaded
    del X, w0, w_t, misi, out
    if k <= MAX_CLUSTER:
        del B
    gc.collect()

print(f"✅ Document‐level MISI written to {out_doc}")

🚀 Processing 1096 clusters (threshold=5000)…


Clusters: 100%|██████████| 1096/1096 [01:28<00:00, 12.45it/s]

✅ Document‐level MISI written to /content/drive/My Drive/step3_results.csv





In [11]:
# 7) Aggregate Topic–Entity MISI
# Re-read the streamed per-document rows, then average misi within each
# (topic_o, entity_e) pair
df3 = pd.read_csv(out_doc)
pair_means = df3.groupby(["topic_o","entity_e"], as_index=False)["misi"].mean()
te_misi = pair_means.rename(columns={"misi":"misi_topic_entity"})
te_misi.to_csv(out_te, index=False)
print(f"✅ Topic–Entity aggregated → {out_te}")

✅ Topic–Entity aggregated → /content/drive/My Drive/topic_entity_misi.csv


In [12]:
# 8) Aggregate Topic‐Level MISI
# Unweighted mean of the topic–entity means: each (topic, entity) pair counts
# once, regardless of how many documents it contains
topic_means = te_misi.groupby("topic_o", as_index=False)["misi_topic_entity"].mean()
topic_misi = topic_means.rename(columns={"misi_topic_entity":"misi_topic"})
topic_misi.to_csv(out_topic, index=False)
print(f"✅ Topic‐Level aggregated → {out_topic}")


✅ Topic‐Level aggregated → /content/drive/My Drive/topic_misi.csv


In [13]:
# 9) Preview the first rows of both aggregate tables
for heading, table in (
    ("\nSample Topic–Entity MISI:\n", te_misi),
    ("\nSample Topic‐Level MISI:\n", topic_misi),
):
    print(heading, table.head(), sep="")


Sample Topic–Entity MISI:
    topic_o   entity_e  misi_topic_entity
0  Business  Agreement       9.560575e-08
1  Business      Ahmad      -5.378800e-08
2  Business      Ameri       5.414981e-01
3  Business      Apple       4.173550e-01
4  Business       Asia       3.726852e-01

Sample Topic‐Level MISI:
         topic_o  misi_topic
0       Business    0.213503
1       CrimeLaw    0.210306
2  Entertainment    0.204988
3    Environment    0.211383
4         Health    0.199918
