In [None]:
# Mount Google Drive (Colab only) so files under /content/drive become readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install bertopic umap-learn sentence-transformers hdbscan

Collecting bertopic
  Downloading bertopic-0.17.4-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.4-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.4


In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
"""
FULL TOPIC MODEL OPTIMIZATION SCRIPT
KMeans + LDA + BERTopic
- Grid search (4 configs per model)
- Scores: coherence (C_v, UMass), diversity (cosine, JSD), stability (Jaccard, Spearman)
- Select best config per model
- Refit best models, visualize, and store topics/assignments.

Requires:
    pip install sentence-transformers bertopic hdbscan umap-learn gensim seaborn
"""

# 0. IMPORTS & UTILITIES

import json
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# ML / NLP
import torch
from sentence_transformers import SentenceTransformer

from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jensenshannon
from scipy.optimize import linear_sum_assignment
from scipy.stats import spearmanr

# Gensim coherence
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel


# 1. CONFIG
# Input corpus and the JSONL fields holding the text and its timestamp.
DATA_PATH = "/content/drive/MyDrive/NLP /artificial_filtered_output.jsonl"
TEXT_COL = "proposition"
TIME_COL = "timestamp"

# Sentence-embedding model and the device it runs on (GPU when one is visible).
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Number of top words kept per topic.
TOP_WORDS = 10

# Topic-count limits per model family.
MAX_TOPICS_KMEANS = 10    # every KMeans config uses <= 10 clusters
MAX_TOPICS_LDA = 10       # every LDA config uses <= 10 topics
MAX_TOPICS_BERTOPIC = 15  # BERTopic metrics only look at the 15 most frequent topics

# Number of differently-seeded fits per configuration (feeds the stability scores).
N_STABILITY_RUNS = 3

# Hyperparameter grids: 4 configs per model.
kmeans_grid = [
    {"n_clusters": 5},
    {"n_clusters": 7},
    {"n_clusters": 9},
    {"n_clusters": 10},
]

lda_grid = [
    {"n_topics": 5},
    {"n_topics": 7},
    {"n_topics": 9},
    {"n_topics": 10},
]

# BERTopic: two (min_topic_size, n_neighbors) settings, each crossed with two min_dist values.
bertopic_grid = [
    {"min_topic_size": size, "n_neighbors": neighbors, "min_dist": dist}
    for size, neighbors in ((30, 15), (50, 40))
    for dist in (0.0, 0.30)
]

sns.set(style="whitegrid")


# 2. DATA LOADING

def load_jsonl(path):
    """
    Read a JSON-Lines file into a DataFrame (one row per non-blank line).

    path: location of the .jsonl file.

    The file is decoded as UTF-8 explicitly; without it `open` falls back to
    the platform's locale encoding and non-ASCII text can be mis-decoded.
    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # blank / whitespace-only lines are skipped
                rows.append(json.loads(line))
    return pd.DataFrame(rows)

# Load the corpus, parse timestamps (unparseable values become NaT) and drop
# every row that lacks either the text or a valid timestamp.
df = load_jsonl(DATA_PATH)
df = (
    df.assign(**{TIME_COL: pd.to_datetime(df[TIME_COL], errors="coerce")})
    .dropna(subset=[TEXT_COL, TIME_COL])
    .reset_index(drop=True)
)

# Plain Python lists, aligned row-for-row with df.
docs = df[TEXT_COL].astype(str).to_list()
timestamps = df[TIME_COL].to_list()

print(f"Loaded {len(df):,} rows.")


# 3. EMBEDDINGS

def generate_embeddings(texts, model_name, device):
    """
    Encode `texts` with a SentenceTransformer model.

    texts: list of strings to embed.
    model_name: model identifier handed to SentenceTransformer.
    device: device string handed to SentenceTransformer.

    Returns the array produced by `encode` (convert_to_numpy=True).
    """
    print(f"\n--- Generating embeddings with {model_name} ---")
    encoder = SentenceTransformer(model_name, device=device)
    encode_options = {
        "batch_size": 256,
        "show_progress_bar": True,
        "convert_to_numpy": True,
    }
    vectors = encoder.encode(texts, **encode_options)
    print("Embeddings shape:", vectors.shape)
    return vectors

# Encode the corpus once; this same matrix is reused by every grid-search run,
# the refits of the best models and the PCA plot further down.
embeddings = generate_embeddings(docs, EMBEDDING_MODEL, device)


# 4. COHERENCE & DIVERSITY

def compute_coherence_gensim(texts, topic_words, coherence_type="c_v"):
    """
    Topic coherence of `topic_words` measured against `texts` with gensim.

    texts: list of raw strings (the reference corpus)
    topic_words: list of list-of-words, e.g. [["ai", "model"], ...]
    coherence_type: gensim coherence measure name, e.g. "c_v" or "u_mass"

    Returns the coherence score, or 0.0 when there is no non-empty topic.

    The reference corpus is lower-cased and tokenized with the same pattern
    the sklearn vectorizers use by default. The topic words in this script
    come from those vectorizers, so a plain `str.split()` (case-sensitive,
    punctuation attached) would leave many topic words unmatched.
    """
    import re  # local import: the regex is only needed by this function

    # Topics without any word cannot be scored by gensim, so leave them out.
    usable_topics = [list(words) for words in topic_words if len(words) > 0]
    if not usable_topics:
        return 0.0

    token_pattern = re.compile(r"(?u)\b\w\w+\b")  # sklearn's default token_pattern
    tokenized = [token_pattern.findall(t.lower()) for t in texts]
    dictionary = Dictionary(tokenized)
    corpus = [dictionary.doc2bow(t) for t in tokenized]
    cm = CoherenceModel(
        topics=usable_topics,
        texts=tokenized,
        corpus=corpus,
        dictionary=dictionary,
        coherence=coherence_type,
    )
    return cm.get_coherence()


def calculate_topic_diversity(topic_words, metric="cosine"):
    """
    Average pairwise (dis)similarity between topics, based on word overlap.

    Each topic becomes a 0/1 indicator vector over the union vocabulary.

    topic_words: list of list-of-words
    metric: "cosine" -> mean pairwise cosine similarity
            "jsd"    -> mean pairwise Jensen-Shannon distance

    Returns 0.0 when there are no topics, no words at all, or fewer than two
    topics. Raises ValueError for any other `metric` value.
    """
    if not topic_words:
        return 0.0

    vocabulary = sorted(set().union(*topic_words))
    if not vocabulary:
        return 0.0

    n_topics = len(topic_words)
    if n_topics < 2:
        return 0.0

    # One row per topic, one column per vocabulary word.
    column_of = {word: col for col, word in enumerate(vocabulary)}
    indicator = np.zeros((n_topics, len(vocabulary)))
    for row, words in enumerate(topic_words):
        for word in words:
            indicator[row, column_of[word]] = 1

    if metric == "cosine":
        similarity = cosine_similarity(indicator)
        pairwise = similarity[np.triu_indices(n_topics, 1)]
        return float(pairwise.mean())

    if metric == "jsd":
        # Normalise rows to distributions; an all-zero row becomes uniform.
        distributions = []
        for row_vec in indicator:
            total = row_vec.sum()
            distributions.append(
                row_vec / total if total != 0 else np.ones_like(row_vec) / len(row_vec)
            )
        distances = [
            jensenshannon(distributions[i], distributions[j])
            for i in range(n_topics)
            for j in range(i + 1, n_topics)
        ]
        return float(np.mean(distances))

    raise ValueError("metric must be 'cosine' or 'jsd'")


# 5. STABILITY (JACCARD + SPEARMAN)

def align_topics(wordsets_A, wordsets_B):
    """
    Match topics of A to topics of B with the Hungarian algorithm.

    The cost of pairing two topics is 1 - Jaccard(words_A, words_B); a pair of
    two empty word lists costs 1.0. Returns a list of (index_in_A, index_in_B)
    tuples, or [] when either side has no topics.
    """
    if len(wordsets_A) == 0 or len(wordsets_B) == 0:
        return []

    sets_a = [set(words) for words in wordsets_A]
    sets_b = [set(words) for words in wordsets_B]

    # Start from the "nothing in common" cost and lower it where words overlap.
    cost = np.ones((len(sets_a), len(sets_b)))
    for i, words_a in enumerate(sets_a):
        for j, words_b in enumerate(sets_b):
            union_size = len(words_a | words_b)
            if union_size != 0:
                cost[i, j] = 1 - len(words_a & words_b) / union_size

    rows, cols = linear_sum_assignment(cost)
    return list(zip(rows, cols))


def compute_jaccard(listA, listB):
    """Jaccard similarity of two word lists (as sets); 0.0 if both are empty."""
    words_a = set(listA)
    words_b = set(listB)
    union = words_a | words_b
    return len(words_a & words_b) / len(union) if union else 0.0


def compute_spearman(listA, listB, max_rank):
    """
    Spearman correlation between the word rankings of two topics.

    Every word in the union of both lists gets its position in each list;
    a word missing from a list gets `max_rank` there. Returns 0.0 when the
    correlation is undefined (NaN).
    """
    # Position of the first occurrence of each word, mirroring list.index().
    position_a = {}
    for pos, word in enumerate(listA):
        position_a.setdefault(word, pos)
    position_b = {}
    for pos, word in enumerate(listB):
        position_b.setdefault(word, pos)

    union_words = list(set(listA) | set(listB))
    ranks_a = [position_a.get(word, max_rank) for word in union_words]
    ranks_b = [position_b.get(word, max_rank) for word in union_words]

    rho, _pvalue = spearmanr(ranks_a, ranks_b)
    if np.isnan(rho):
        return 0.0
    return float(rho)


def model_stability(topicwords_runs, max_rank=TOP_WORDS):
    """
    Pairwise topic stability across repeated runs of one configuration.

    topicwords_runs: list with one entry per run; each entry is that run's
        list of list-of-words.
    max_rank: rank given to a word that is missing from one of two topics
        when computing the Spearman correlation.

    Returns a DataFrame with one row per unordered pair of runs and columns
    "pair" (e.g. "run1_run2"), "mean_jaccard", "mean_spearman"; an empty
    DataFrame when fewer than two runs are given.

    All pairs are compared for any number of runs. The earlier hard-coded
    pair list failed with exactly two runs and left out run2-vs-run3 when
    three runs were supplied.
    """
    from itertools import combinations  # local import: only used here

    if len(topicwords_runs) < 2:
        return pd.DataFrame()

    results = []
    numbered_runs = list(enumerate(topicwords_runs, start=1))
    for (num_a, A), (num_b, B) in combinations(numbered_runs, 2):
        pair_name = f"run{num_a}_run{num_b}"

        # Hungarian matching of topics between the two runs.
        matches = align_topics(A, B)
        if not matches:
            results.append({
                "pair": pair_name,
                "mean_jaccard": 0.0,
                "mean_spearman": 0.0
            })
            continue

        jaccs = [compute_jaccard(A[a_idx], B[b_idx]) for a_idx, b_idx in matches]
        rhos = [
            compute_spearman(A[a_idx], B[b_idx], max_rank)
            for a_idx, b_idx in matches
        ]

        results.append({
            "pair": pair_name,
            "mean_jaccard": float(np.mean(jaccs)),
            "mean_spearman": float(np.mean(rhos)),
        })

    return pd.DataFrame(results)


# 6. MODEL RUNNERS (ONE RUN)

def run_kmeans_topic_words(texts, embeddings, n_clusters, n_top_words, seed):
    """
    Cluster `embeddings` with KMeans and describe each cluster by its top
    TF-IDF terms.

    texts: documents, aligned with the rows of `embeddings`.
    n_clusters / seed: forwarded to KMeans (seed as random_state).
    n_top_words: number of terms kept per cluster.

    Returns (labels, topic_words); a cluster with no documents gets [].
    """
    labels = KMeans(
        n_clusters=n_clusters, random_state=seed, n_init="auto"
    ).fit_predict(embeddings)

    tfidf = TfidfVectorizer(stop_words="english")
    doc_term = tfidf.fit_transform(texts)
    terms = tfidf.get_feature_names_out()

    def _top_terms(cluster_id):
        """Terms with the highest mean TF-IDF weight inside one cluster."""
        members = np.flatnonzero(labels == cluster_id)
        if members.size == 0:
            return []
        mean_weights = np.asarray(doc_term[members].mean(axis=0)).ravel()
        ranked = mean_weights.argsort()[-n_top_words:][::-1]
        return [terms[t] for t in ranked]

    topic_words = [_top_terms(c) for c in range(n_clusters)]
    return labels, topic_words


def run_lda(texts, n_topics, n_top_words, seed):
    """
    Fit an online LDA model on bag-of-words counts of `texts`.

    n_topics: number of LDA components.
    n_top_words: number of terms kept per topic.
    seed: random_state of the LDA model.

    Returns (lda, vec, topic_words, doc_topic_ids), where doc_topic_ids holds
    the highest-weight topic of each document.
    """
    vec = CountVectorizer(stop_words="english", max_df=0.95, min_df=5)
    counts = vec.fit_transform(texts)

    lda = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=seed,
        max_iter=10,
        learning_method="online",
    )
    lda.fit(counts)

    terms = vec.get_feature_names_out()
    topic_words = [
        [terms[i] for i in weights.argsort()[-n_top_words:][::-1]]
        for weights in lda.components_
    ]

    doc_topic_ids = lda.transform(counts).argmax(axis=1)

    return lda, vec, topic_words, doc_topic_ids


def get_bertopic_top_words_limited(model, topics, max_topics, n_top_words):
    """
    Top words of the `max_topics` most frequent topics, outliers (-1) excluded.

    model: object whose get_topic(topic_id) returns (word, score) pairs.
    topics: per-document topic ids.

    Returns (topic_words, top_topic_ids), both ordered by topic frequency;
    ([], []) when every document is an outlier.
    """
    assigned = pd.Series(topics)
    assigned = assigned[assigned != -1]
    if assigned.empty:
        return [], []

    frequencies = assigned.value_counts()
    top_topic_ids = frequencies.index[:max_topics].tolist()

    topic_words = [
        [word for word, _score in model.get_topic(tid)[:n_top_words]]
        for tid in top_topic_ids
    ]

    return topic_words, top_topic_ids


def run_bertopic(texts, timestamps, embeddings, hp, seed, max_topics, n_top_words):
    """
    Fit one BERTopic model on precomputed embeddings.

    hp: dict with "n_neighbors", "min_dist" and "min_topic_size".
    seed: random_state of the UMAP reducer.
    max_topics / n_top_words: limits applied when extracting topic words.
    timestamps: accepted for signature symmetry; not read here.

    Returns (model, topics, topic_words, top_topic_ids).
    """
    n_neighbors = hp["n_neighbors"]
    min_dist = hp["min_dist"]
    min_topic_size = hp["min_topic_size"]

    reducer = UMAP(
        n_neighbors=n_neighbors,
        n_components=5,
        min_dist=min_dist,
        metric="cosine",
        random_state=seed,
    )
    clusterer = HDBSCAN(
        min_cluster_size=min_topic_size,
        min_samples=5,
        prediction_data=True,
    )

    # embedding_model=None because the embeddings are supplied to fit_transform.
    model = BERTopic(
        embedding_model=None,
        umap_model=reducer,
        hdbscan_model=clusterer,
        min_topic_size=min_topic_size,
        verbose=False,
    )

    topics, _probs = model.fit_transform(texts, embeddings)
    topic_words, top_topic_ids = get_bertopic_top_words_limited(
        model, topics, max_topics=max_topics, n_top_words=n_top_words
    )

    return model, topics, topic_words, top_topic_ids


# 7. OPTIMIZATION LOOP

def optimize_family(
    model_type,
    hp_grid,
    texts,
    timestamps,
    embeddings,
    n_runs,
    max_topics_kmeans,
    max_topics_lda,
    max_topics_bertopic,
    top_words
):
    """
    Grid search over `hp_grid` for one model family.

    model_type: "KMeans", "LDA" or "BERTopic"; anything else raises ValueError
        on the first fit.
    hp_grid: list of hyperparameter dicts, one per configuration.
    n_runs: number of fits per configuration, each with its own seed.
    max_topics_kmeans / max_topics_lda: accepted but not read in this function.
    max_topics_bertopic: topic limit passed on to run_bertopic.
    top_words: number of words per topic, also used as max_rank for stability.

    Returns:
        metrics_df: one row per (config, run) with coherence/diversity scores
        stab_df: one row per config with stability averaged over run pairs
    """

    def _fit_once(hp, seed):
        """Fit a single model and return (topic_words, n_topics)."""
        if model_type == "KMeans":
            _labels, words = run_kmeans_topic_words(
                texts, embeddings, hp["n_clusters"], top_words, seed
            )
            # every cluster counts as a topic
            return words, hp["n_clusters"]
        if model_type == "LDA":
            _lda, _vec, words, _doc_ids = run_lda(
                texts, hp["n_topics"], top_words, seed
            )
            return words, hp["n_topics"]
        if model_type == "BERTopic":
            _model, _topics, words, _ids = run_bertopic(
                texts, timestamps, embeddings, hp,
                seed=seed,
                max_topics=max_topics_bertopic,
                n_top_words=top_words
            )
            return words, len(words)
        raise ValueError("Unknown model_type")

    metrics_rows, stab_rows = [], []

    print(f"\n############## OPTIMIZING {model_type} ###############")

    for hp in hp_grid:
        config_id = f"{model_type}_" + "_".join(f"{k}={v}" for k, v in hp.items())
        print(f"\nConfig: {config_id}")

        topicwords_runs = []
        for r in range(1, n_runs + 1):
            seed = 1000 + 10 * r  # distinct, reproducible seed per run

            topic_words, n_topics = _fit_once(hp, seed)
            topicwords_runs.append(topic_words)

            # Per-run coherence and diversity.
            metrics_rows.append({
                "model": model_type,
                "config_id": config_id,
                "run": r,
                "n_topics": n_topics,
                "C_v": compute_coherence_gensim(texts, topic_words, "c_v"),
                "C_umass": compute_coherence_gensim(texts, topic_words, "u_mass"),
                "Mean_Cos": calculate_topic_diversity(topic_words, "cosine"),
                "Mean_JSD": calculate_topic_diversity(topic_words, "jsd"),
            })

        # Per-config stability, averaged over the compared run pairs.
        stab_df_cfg = model_stability(topicwords_runs, max_rank=top_words)
        has_pairs = not stab_df_cfg.empty
        stab_rows.append({
            "model": model_type,
            "config_id": config_id,
            "Stab_Jaccard_mean": stab_df_cfg["mean_jaccard"].mean() if has_pairs else 0.0,
            "Stab_Spearman_mean": stab_df_cfg["mean_spearman"].mean() if has_pairs else 0.0,
        })

    return pd.DataFrame(metrics_rows), pd.DataFrame(stab_rows)


# Run optimization for each family
# Arguments that are identical for all three families.
_search_kwargs = dict(
    texts=docs,
    timestamps=timestamps,
    embeddings=embeddings,
    n_runs=N_STABILITY_RUNS,
    max_topics_kmeans=MAX_TOPICS_KMEANS,
    max_topics_lda=MAX_TOPICS_LDA,
    max_topics_bertopic=MAX_TOPICS_BERTOPIC,
    top_words=TOP_WORDS,
)

kmeans_metrics, kmeans_stab = optimize_family("KMeans", kmeans_grid, **_search_kwargs)
lda_metrics, lda_stab = optimize_family("LDA", lda_grid, **_search_kwargs)
bertopic_metrics, bertopic_stab = optimize_family("BERTopic", bertopic_grid, **_search_kwargs)

# Stack the per-family tables into one metrics table and one stability table.
all_metrics = pd.concat(
    [kmeans_metrics, lda_metrics, bertopic_metrics],
    ignore_index=True,
)
all_stab = pd.concat(
    [kmeans_stab, lda_stab, bertopic_stab],
    ignore_index=True,
)


# 8. SUMMARY & MODEL SELECTION

def summarize_with_stability(metrics_df, stab_df, model_name):
    """
    Aggregate per-run metrics of one model family and pick its best config.

    metrics_df: per-run rows with "model", "config_id" and the metric columns.
    stab_df: per-config stability rows with "model" and "config_id".
    model_name: value of the "model" column to keep.

    Prints the summary ordered by mean C_v (descending) and returns
    (summary, best_config_id), where the best config has the highest mean C_v.
    """
    family_metrics = metrics_df[metrics_df["model"] == model_name]
    family_stab = stab_df[stab_df["model"] == model_name]

    # Mean and std over runs for each metric, as flat "<metric>_<stat>" columns.
    metric_cols = ["n_topics", "C_v", "C_umass", "Mean_Cos", "Mean_JSD"]
    named_aggs = {
        f"{col}_{stat}": (col, stat)
        for col in metric_cols
        for stat in ("mean", "std")
    }
    agg = family_metrics.groupby("config_id").agg(**named_aggs).reset_index()

    agg["Diversity_1_minus_cos"] = 1 - agg["Mean_Cos_mean"]

    summary = agg.merge(family_stab, on="config_id", how="left")
    ranked = summary.sort_values("C_v_mean", ascending=False)

    print(f"\n### SUMMARY: {model_name} ###")
    print(ranked)

    # Selection criterion: highest mean C_v (change here to use another metric).
    best_config_id = ranked.iloc[0]["config_id"]

    return summary, best_config_id

# Summarise each family from the combined tables and keep its best config id.
kmeans_summary, best_kmeans_id = summarize_with_stability(
    metrics_df=all_metrics, stab_df=all_stab, model_name="KMeans"
)
lda_summary, best_lda_id = summarize_with_stability(
    metrics_df=all_metrics, stab_df=all_stab, model_name="LDA"
)
bertopic_summary, best_bertopic_id = summarize_with_stability(
    metrics_df=all_metrics, stab_df=all_stab, model_name="BERTopic"
)

print("\nBEST CONFIGS:")
print("  KMeans  :", best_kmeans_id)
print("  LDA     :", best_lda_id)
print("  BERTopic:", best_bertopic_id)


# Extract best hyperparams
def parse_hp_from_id(config_id, model_type):
    """
    Rebuild the hyperparameter dict encoded in a config_id.

    config_id like: 'KMeans_n_clusters=10' or
                    'LDA_n_topics=7' or
                    'BERTopic_min_topic_size=50_n_neighbors=15_min_dist=0.0'
    model_type: family prefix ("KMeans", "LDA", "BERTopic"); stripped from the
        front of config_id when present.

    Returns e.g. {"n_clusters": 10}; values are cast to int, then float, and
    otherwise kept as strings.

    Hyperparameter names contain underscores themselves, so the id cannot be
    split on "_" naively. Tokens without "=" are collected and prepended to
    the next "name=value" token to restore the full name.

    Raises ValueError when the id ends with tokens that belong to no
    "name=value" pair.
    """
    prefix = f"{model_type}_"
    body = config_id[len(prefix):] if config_id.startswith(prefix) else config_id

    hp = {}
    key_parts = []  # leading pieces of the name currently being assembled
    for token in body.split("_"):
        if "=" not in token:
            key_parts.append(token)
            continue

        key_tail, raw_value = token.split("=", 1)
        key = "_".join(key_parts + [key_tail])
        key_parts = []

        try:
            value = int(raw_value)
        except ValueError:
            try:
                value = float(raw_value)
            except ValueError:
                value = raw_value
        hp[key] = value

    if key_parts:
        raise ValueError(
            f"Malformed config_id {config_id!r}: trailing segment "
            f"{'_'.join(key_parts)!r} has no '=value' part"
        )
    return hp

# Turn the winning config ids back into hyperparameter dicts.
best_kmeans_hp = parse_hp_from_id(config_id=best_kmeans_id, model_type="KMeans")
best_lda_hp = parse_hp_from_id(config_id=best_lda_id, model_type="LDA")
best_bertopic_hp = parse_hp_from_id(config_id=best_bertopic_id, model_type="BERTopic")

print("\nParsed best hyperparams:")
print("KMeans :", best_kmeans_hp)
print("LDA    :", best_lda_hp)
print("BERTop :", best_bertopic_hp)


# 9. REFIT BEST MODELS & STORE TOPICS

# --- KMeans best ---
# Refit with the selected cluster count and store each document's cluster.
km_labels_best, km_topic_words_best = run_kmeans_topic_words(
    texts=docs,
    embeddings=embeddings,
    n_clusters=best_kmeans_hp["n_clusters"],
    n_top_words=TOP_WORDS,
    seed=42,
)
df["kmeans_best_cluster"] = km_labels_best

# One summary row per cluster: its top terms and how many documents it holds.
km_topic_df = pd.DataFrame([
    {
        "model": "KMeans",
        "topic_id": cid,
        "top_terms": ", ".join(words),
        "n_docs": int((df["kmeans_best_cluster"] == cid).sum()),
    }
    for cid, words in enumerate(km_topic_words_best)
])

# --- LDA best ---
# Refit with the selected topic count and store each document's dominant topic.
lda_best, lda_vec_best, lda_topic_words_best, lda_doc_topic_ids_best = run_lda(
    texts=docs,
    n_topics=best_lda_hp["n_topics"],
    n_top_words=TOP_WORDS,
    seed=42,
)
df["lda_best_topic"] = lda_doc_topic_ids_best

# One summary row per topic: its top terms and how many documents it holds.
lda_topic_df = pd.DataFrame([
    {
        "model": "LDA",
        "topic_id": tid,
        "top_terms": ", ".join(words),
        "n_docs": int((df["lda_best_topic"] == tid).sum()),
    }
    for tid, words in enumerate(lda_topic_words_best)
])

# --- BERTopic best ---
# Refit with the selected hyperparameters and store each document's topic id.
ber_best_model, ber_topics_best, ber_topic_words_best, ber_top_ids_best = run_bertopic(
    texts=docs,
    timestamps=timestamps,
    embeddings=embeddings,
    hp=best_bertopic_hp,
    seed=42,
    max_topics=MAX_TOPICS_BERTOPIC,
    n_top_words=TOP_WORDS,
)
df["bertopic_best_topic"] = ber_topics_best

# One summary row per retained topic; ids and word lists are parallel lists.
ber_topic_df = pd.DataFrame([
    {
        "model": "BERTopic",
        "topic_id": tid,
        "top_terms": ", ".join(words),
        "n_docs": int((df["bertopic_best_topic"] == tid).sum()),
    }
    for tid, words in zip(ber_top_ids_best, ber_topic_words_best)
])

# Combined topic summary (best configs only)
best_topic_summary = pd.concat(
    (km_topic_df, lda_topic_df, ber_topic_df), ignore_index=True
)

# Save outputs
out_dir = Path("./topic_outputs")
out_dir.mkdir(parents=True, exist_ok=True)

_summary_csv = out_dir / "best_topic_summary.csv"
_assignments_csv = out_dir / "docs_with_topics.csv"
best_topic_summary.to_csv(_summary_csv, index=False)
df.to_csv(_assignments_csv, index=False)

print(f"\nSaved best topic summary to {out_dir/'best_topic_summary.csv'}")
print(f"Saved document-topic assignments to {out_dir/'docs_with_topics.csv'}")


# 10. VISUALIZATIONS (BEST MODELS)

sns.set(style="whitegrid")

# KMeans PCA
# Project the embeddings to 2-D and colour each point by its KMeans cluster.
print("\n--- Visualizing best KMeans model (PCA) ---")
pca = PCA(n_components=2, random_state=42)
coords = pca.fit_transform(embeddings)

fig_km, ax_km = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=coords[:, 0],
    y=coords[:, 1],
    hue=km_labels_best,
    palette="tab20",
    s=8,
    alpha=0.6,
    linewidth=0,
    ax=ax_km,
)
ax_km.set_title("Best KMeans Clusters (PCA projection)")
# Legend sits outside the axes so it does not cover the points.
ax_km.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc="upper left", ncol=1)
fig_km.tight_layout()
fig_km.savefig(out_dir / "kmeans_pca.png", dpi=300)
plt.show()

# LDA topic frequencies
# Bar chart of how many documents have each LDA topic as their dominant topic.
print("\n--- LDA best topic frequencies ---")
lda_counts = df["lda_best_topic"].value_counts().sort_index()

fig_lda, ax_lda = plt.subplots(figsize=(10, 4))
sns.barplot(x=lda_counts.index, y=lda_counts.values, ax=ax_lda)
ax_lda.set_xlabel("LDA Topic")
ax_lda.set_ylabel("Number of documents")
ax_lda.set_title("LDA Best Topic Distribution")
fig_lda.tight_layout()
fig_lda.savefig(out_dir / "lda_topic_distribution.png", dpi=300)
plt.show()

# BERTopic topic frequencies (only top MAX_TOPICS_BERTOPIC)
# The banner is built from MAX_TOPICS_BERTOPIC (like the plot title below) so
# it stays correct when the limit is changed in the config section.
print(f"\n--- BERTopic best topic frequencies (top {MAX_TOPICS_BERTOPIC}) ---")
s = pd.Series(ber_topics_best)
s = s[s != -1]  # drop BERTopic's outlier bucket
# Keep the most frequent topics, then sort ascending for the horizontal bar order.
counts = s.value_counts().head(MAX_TOPICS_BERTOPIC).sort_values(ascending=True)
plt.figure(figsize=(10, 6))
# Topic ids are cast to str so seaborn treats them as categories.
sns.barplot(x=counts.values, y=counts.index.astype(str), orient="h")
plt.xlabel("Number of documents")
plt.ylabel("BERTopic Topic ID")
plt.title(f"BERTopic Best Topics (Top {MAX_TOPICS_BERTOPIC} by frequency)")
plt.tight_layout()
plt.savefig(out_dir / "bertopic_topic_distribution.png", dpi=300)
plt.show()

# Closing recap of what the script produced, one item per line.
print(
    "\nDone. You now have:",
    "- Per-config grid search metrics (all_metrics)",
    "- Per-config stability (all_stab)",
    "- Best configs per model (printed above)",
    "- Best topics saved in ./topic_outputs/",
    sep="\n",
)


Device: cpu
Loaded 63,084 rows.

--- Generating embeddings with all-MiniLM-L6-v2 ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/247 [00:00<?, ?it/s]

Embeddings shape: (63084, 384)

############## OPTIMIZING KMeans ###############

Config: KMeans_n_clusters=5

Config: KMeans_n_clusters=7

Config: KMeans_n_clusters=9

Config: KMeans_n_clusters=10

############## OPTIMIZING LDA ###############

Config: LDA_n_topics=5

Config: LDA_n_topics=7

Config: LDA_n_topics=9

Config: LDA_n_topics=10

############## OPTIMIZING BERTopic ###############

Config: BERTopic_min_topic_size=30_n_neighbors=15_min_dist=0.0

Config: BERTopic_min_topic_size=30_n_neighbors=15_min_dist=0.3

Config: BERTopic_min_topic_size=50_n_neighbors=40_min_dist=0.0

Config: BERTopic_min_topic_size=50_n_neighbors=40_min_dist=0.3

### SUMMARY: KMeans ###
              config_id  n_topics_mean  n_topics_std  C_v_mean   C_v_std  \
0  KMeans_n_clusters=10           10.0           0.0  0.440472  0.014988   
3   KMeans_n_clusters=9            9.0           0.0  0.426000  0.013013   
2   KMeans_n_clusters=7            7.0           0.0  0.422637  0.002941   
1   KMeans_n_clusters

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
#!/usr/bin/env python3
"""
fit_best_topics_jsonl.py

Use the *already selected* best configurations for KMeans, LDA, and BERTopic
to (re)fit final topic models on the full proposition corpus, and export:

- Fitted models (via joblib)
- Document–topic assignments
- Top-n words per topic

This version:
- Reads from the JSONL proposition file used in the main paper:
    "/content/drive/MyDrive/NLP /artificial_filtered_output.jsonl"
- Uses the 'proposition' field as text.
- Handles BERTopic probabilities robustly (no AxisError).
"""

import os
import re
import json
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation

from bertopic import BERTopic
from umap import UMAP

# Seed shared by KMeans, LDA and UMAP in this script.
RANDOM_STATE = 42

# 1. Paths and config IDs (from your finished grid-search summaries)

# Same JSONL file and text field as the grid-search script.
INPUT_JSONL = "/content/drive/MyDrive/NLP /artificial_filtered_output.jsonl"
TEXT_COL    = "proposition"   # field in the JSONL
TIME_COL    = "timestamp"     # if present; not strictly needed here

# All models, topic-word tables and assignments are written below this folder.
OUT_DIR   = "./topic_models_best"
os.makedirs(OUT_DIR, exist_ok=True)

# Winning configuration ids, in the "<Model>_<name>=<value>_..." format.
# NOTE(review): these are hard-coded rather than read from the grid-search
# results; re-check them whenever the search is re-run.
BEST_KMEANS_ID   = "KMeans_n_clusters=10"
BEST_LDA_ID      = "LDA_n_topics=10"
BEST_BERTOPIC_ID = "BERTopic_min_topic_size=30_n_neighbors=15_min_dist=0.0"

# 2. Robust parser for config_id → hyperparameter dict

def parse_hp_from_id(config_id: str, model_type: str) -> dict:
    """
    Parse config_id strings of the form:
      - "KMeans_n_clusters=10"
      - "LDA_n_topics=10"
      - "BERTopic_min_topic_size=30_n_neighbors=15_min_dist=0.0"

    into a dict of hyperparameters:
      {"n_clusters": 10}, {"n_topics": 10}, etc.

    Hyperparameter names contain underscores themselves, so segments without
    '=' are the leading pieces of the next name and are joined back onto it.
    Dropping them would produce keys such as "clusters" or "size", and every
    later `hp.get("n_clusters", default)` would silently use its default.

    Values are cast to int, float or bool ("true"/"false") where possible and
    kept as strings otherwise. Trailing segments that are not followed by any
    "name=value" pair are ignored, so malformed ids do not crash.
    """
    hp = {}
    # Strip model prefix if present
    if config_id.startswith(model_type + "_"):
        config_id = config_id[len(model_type) + 1:]

    key_parts = []  # leading pieces of the name currently being assembled
    for segment in config_id.split("_"):
        if "=" not in segment:
            key_parts.append(segment)
            continue

        key_tail, raw_value = segment.split("=", 1)
        key = "_".join(key_parts + [key_tail])
        key_parts = []

        v_strip = raw_value.strip()
        if re.fullmatch(r"-?\d+", v_strip):
            v_cast = int(v_strip)
        else:
            # float / bool / fallback string
            try:
                v_cast = float(v_strip)
            except ValueError:
                if v_strip.lower() in {"true", "false"}:
                    v_cast = (v_strip.lower() == "true")
                else:
                    v_cast = v_strip
        hp[key] = v_cast
    return hp

# 3. Data loading + vectorization

print(f"Loading corpus from JSONL: {INPUT_JSONL}")
df = pd.read_json(INPUT_JSONL, lines=True)
if TEXT_COL not in df.columns:
    raise KeyError(f"Expected '{TEXT_COL}' column in JSONL, found: {df.columns.tolist()}")

# Drop rows without text before casting: astype(str) would otherwise turn
# missing values into literal "nan"/"None" documents that get modelled.
# The index is reset so df stays aligned with `texts` and with the label
# arrays produced from it.
n_rows_read = len(df)
df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)
if len(df) < n_rows_read:
    print(f"Dropped {n_rows_read - len(df):,} rows with missing '{TEXT_COL}'")

texts = df[TEXT_COL].astype(str).tolist()
n_docs = len(texts)
print(f"Loaded {n_docs:,} propositions")

# Simple BoW for KMeans/LDA
# Terms in more than 95% of documents or in fewer than 10 documents are ignored.
vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=10,
    stop_words="english"
)
X_bow = vectorizer.fit_transform(texts)
vocab = np.array(vectorizer.get_feature_names_out())
print(f"BoW matrix shape: {X_bow.shape}")

# 4. Fit best KMeans

def fit_best_kmeans(X, config_id):
    """
    Fit KMeans on `X` with the cluster count encoded in `config_id`.

    Falls back to 10 clusters when the id carries no "n_clusters" entry.
    Returns (fitted_model, labels).
    """
    parsed = parse_hp_from_id(config_id, "KMeans")
    n_clusters = int(parsed.get("n_clusters", 10))
    print(f"\n[Best KMeans] n_clusters={n_clusters}")

    km = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE, n_init="auto")
    km.fit(X)
    return km, km.labels_

# Final KMeans model, fitted on the bag-of-words matrix.
kmeans_model, kmeans_labels = fit_best_kmeans(X=X_bow, config_id=BEST_KMEANS_ID)

# 5. Fit best LDA

def fit_best_lda(X, config_id):
    """
    Fit batch LDA on `X` with the topic count encoded in `config_id`.

    Falls back to 10 topics when the id carries no "n_topics" entry.
    Returns (fitted_model, doc_topic_matrix, labels), where labels holds the
    highest-weight topic of each document.
    """
    parsed = parse_hp_from_id(config_id, "LDA")
    n_topics = int(parsed.get("n_topics", 10))
    print(f"\n[Best LDA] n_topics={n_topics}")

    lda = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=RANDOM_STATE,
        learning_method="batch",
    )
    doc_topic = lda.fit_transform(X)
    return lda, doc_topic, np.argmax(doc_topic, axis=1)

# Final LDA model, fitted on the bag-of-words matrix.
lda_model, lda_doc_topic, lda_labels = fit_best_lda(X=X_bow, config_id=BEST_LDA_ID)

# 6. Fit best BERTopic (with probabilities)

def fit_best_bertopic(texts, config_id):
    """
    Fit BERTopic on `texts` with the hyperparameters encoded in `config_id`.

    Entries missing from the id fall back to min_topic_size=30,
    n_neighbors=15 and min_dist=0.0. Probabilities are requested from the
    model. Returns (topic_model, topics, probs).
    """
    parsed = parse_hp_from_id(config_id, "BERTopic")
    min_topic_size = int(parsed.get("min_topic_size", 30))
    n_neighbors = int(parsed.get("n_neighbors", 15))
    min_dist = float(parsed.get("min_dist", 0.0))

    print(
        f"\n[Best BERTopic] min_topic_size={min_topic_size}, "
        f"n_neighbors={n_neighbors}, min_dist={min_dist}"
    )

    umap_settings = {
        "n_neighbors": n_neighbors,
        "n_components": 5,
        "min_dist": min_dist,
        "metric": "cosine",
        "random_state": RANDOM_STATE,
    }
    topic_model = BERTopic(
        umap_model=UMAP(**umap_settings),
        min_topic_size=min_topic_size,
        calculate_probabilities=True,  # so fit_transform also returns probabilities
        verbose=True,
    )

    topics, probs = topic_model.fit_transform(texts)
    return topic_model, topics, probs

# Final BERTopic model, fitted on the raw texts.
bertopic_model, bertopic_topics, bertopic_probs = fit_best_bertopic(
    texts=texts, config_id=BEST_BERTOPIC_ID
)

# 7. Helpers to extract top words per topic

def top_words_from_centroids(model, vocab, n_top=10):
    """
    Highest-weighted vocabulary terms of every KMeans cluster centre.

    model: object exposing `cluster_centers_` (clusters x vocabulary).
    vocab: array of terms, indexable by an integer array.

    Returns {cluster_index: [term, ...]} with terms in descending weight.
    """
    return {
        k: vocab[np.flip(np.argsort(center))[:n_top]].tolist()
        for k, center in enumerate(model.cluster_centers_)
    }

def top_words_from_lda(model, vocab, n_top=10):
    """
    Highest-weighted vocabulary terms of every LDA topic.

    model: object exposing `components_` (topics x vocabulary).
    vocab: array of terms, indexable by an integer array.

    Returns {topic_index: [term, ...]} with terms in descending weight.
    """
    return {
        k: vocab[np.flip(np.argsort(weights))[:n_top]].tolist()
        for k, weights in enumerate(model.components_)
    }

def top_words_from_bertopic(model, n_top=10):
    """Collect the first `n_top` words of every BERTopic topic.

    Topic id -1 (BERTopic's outlier bucket) is left out. Words come from
    `model.get_topic(topic_id)`, which yields (word, score) pairs.
    """
    return {
        tid: [term for term, _score in model.get_topic(tid)[:n_top]]
        for tid in model.get_topics().keys()
        if tid != -1
    }

kmeans_top_words   = top_words_from_centroids(kmeans_model, vocab, n_top=15)
lda_top_words      = top_words_from_lda(lda_model, vocab, n_top=15)
bertopic_top_words = top_words_from_bertopic(bertopic_model, n_top=15)

# 8. Save models, doc–topic assignments, and topic-word tables

print("\nSaving models and outputs...")

# Models (plus the BoW vectorizer needed to reuse KMeans / LDA on new text)
for fitted_obj, out_name in (
    (kmeans_model,   "kmeans_best.joblib"),
    (lda_model,      "lda_best.joblib"),
    (bertopic_model, "bertopic_best.joblib"),
    (vectorizer,     "vectorizer_bow.joblib"),
):
    joblib.dump(fitted_obj, os.path.join(OUT_DIR, out_name))

# Topic words: one JSON file per model, topic id -> list of top words
for topic_words, out_name in (
    (kmeans_top_words,   "kmeans_topics.json"),
    (lda_top_words,      "lda_topics.json"),
    (bertopic_top_words, "bertopic_topics.json"),
):
    with open(os.path.join(OUT_DIR, out_name), "w") as f:
        json.dump(topic_words, f, indent=2)

# Document-level assignments
doc_out = df.copy()
doc_out["kmeans_topic"]   = kmeans_labels
doc_out["lda_topic"]      = lda_labels
doc_out["bertopic_topic"] = bertopic_topics

# Robust handling of BERTopic probabilities
if bertopic_probs is not None:
    bertopic_probs = np.asarray(bertopic_probs)
    if bertopic_probs.ndim == 2:
        # max probability per document
        doc_out["bertopic_max_prob"] = bertopic_probs.max(axis=1)

        # Probability of the assigned topic. Outlier documents carry topic id
        # -1; used as a column index that would silently select the LAST
        # topic's probability, so outliers get NaN instead.
        topic_ids = np.asarray(bertopic_topics, dtype=int)
        has_topic = topic_ids >= 0
        assigned_prob = np.full(len(topic_ids), np.nan)
        assigned_prob[has_topic] = bertopic_probs[
            np.flatnonzero(has_topic), topic_ids[has_topic]
        ]
        doc_out["bertopic_topic_prob"] = assigned_prob
    else:
        print(
            f"[Warning] bertopic_probs has ndim={bertopic_probs.ndim}; "
            "skipping probability-based columns."
        )

doc_out.to_csv(os.path.join(OUT_DIR, "documents_with_topics.csv"), index=False)

print(f"Done. Outputs written to: {OUT_DIR}")

Loading corpus from JSONL: /content/drive/MyDrive/NLP /artificial_filtered_output.jsonl
Loaded 63,084 propositions
BoW matrix shape: (63084, 11351)

[Best KMeans] n_clusters=10

[Best LDA] n_topics=10


2025-12-10 23:27:48,781 - BERTopic - Embedding - Transforming documents to embeddings.



[Best BERTopic] min_topic_size=30, n_neighbors=15, min_dist=0.0


Batches:   0%|          | 0/1972 [00:00<?, ?it/s]

2025-12-10 23:28:15,500 - BERTopic - Embedding - Completed ✓
2025-12-10 23:28:15,503 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-10 23:29:33,681 - BERTopic - Dimensionality - Completed ✓
2025-12-10 23:29:33,683 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-10 23:31:25,405 - BERTopic - Cluster - Completed ✓
2025-12-10 23:31:25,420 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-10 23:31:28,333 - BERTopic - Representation - Completed ✓



Saving models and outputs...
Done. Outputs written to: ./topic_models_best


In [None]:
#!/usr/bin/env python3
"""
fit_and_analyze_topics_deep.py

Use the *already selected* best configurations for KMeans, LDA, and BERTopic
to (re)fit final topic models on the full proposition corpus, and export:

- Fitted models (via joblib)
- Document–topic assignments
- Per-topic / per-cluster summaries:
    * Top words
    * Representative propositions
    * SoA label histograms
    * SoA / sentiment / time summaries
    * Cross-tabs between KMeans, LDA, BERTopic

Assumptions:
- You have a JSONL file with at least 'proposition' (and optionally 'timestamp').
- You have a CSV with NLI-based SoA labels & scores and sentiment:
    "/content/drive/MyDrive/NLP /ai_human_agency_inferences_mnli_triple_labels_winner_meta_goldclf.csv"
  containing at least:
    'proposition', 'combined_soa_label', 'combined_agency_score', 'sentiment_score'

Adjust paths, column names, and configs as needed.
"""

import os
import re
import json
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation

from bertopic import BERTopic
from umap import UMAP

RANDOM_STATE = 42

# 1. Paths and config IDs

INPUT_JSONL = "/content/drive/MyDrive/NLP /artificial_filtered_output.jsonl"
SOA_CSV     = "/content/drive/MyDrive/NLP /ai_human_agency_inferences_mnli_triple_labels_winner_meta_goldclf.csv"

TEXT_COL    = "proposition"
TIME_COL    = "timestamp"   # optional; ignore if absent

# SoA / sentiment columns from the CSV
SOA_LABEL_COL = "combined_soa_label"
SOA_SCORE_COL = "combined_agency_score"
SENT_COL      = "sentiment_score"

OUT_DIR   = "./topic_models_best_deep"
os.makedirs(OUT_DIR, exist_ok=True)

BEST_KMEANS_ID   = "KMeans_n_clusters=10"
BEST_LDA_ID      = "LDA_n_topics=10"
BEST_BERTOPIC_ID = "BERTopic_min_topic_size=30_n_neighbors=15_min_dist=0.0"

# 2. Robust parser for config_id → hyperparameter dict

def parse_hp_from_id(config_id: str, model_type: str) -> dict:
    """
    Parse a config_id string into a dict of hyperparameters.

    Supported forms:
      - "KMeans_n_clusters=10"
      - "LDA_n_topics=10"
      - "BERTopic_min_topic_size=30_n_neighbors=15_min_dist=0.0"

    The optional leading "<model_type>_" prefix is removed first. Hyperparameter
    names may themselves contain underscores ("min_topic_size"), so the string
    is NOT split on "_": each pair is matched as "<key>=<value>", where the key
    is everything between the previous value (or the start) and "=", and the
    value runs up to the next "_" or "=". Trailing text without "=" is ignored.

    Values are cast to int, then float, then bool ("true"/"false", any case);
    anything else is kept as a stripped string.

    Args:
        config_id: Config identifier such as "KMeans_n_clusters=10".
        model_type: Model prefix to strip, e.g. "KMeans".

    Returns:
        dict mapping full hyperparameter names (e.g. "n_clusters") to values.
    """
    if config_id.startswith(model_type + "_"):
        config_id = config_id[len(model_type) + 1:]

    # key: identifier that may contain "_"; value: anything without "_" or "=".
    pair_pattern = re.compile(r"(?:^|_)([A-Za-z][A-Za-z0-9_]*)=([^_=]*)")

    hp = {}
    for match in pair_pattern.finditer(config_id):
        key = match.group(1)
        v_strip = match.group(2).strip()
        if re.fullmatch(r"-?\d+", v_strip):
            v_cast = int(v_strip)
        else:
            try:
                v_cast = float(v_strip)
            except ValueError:
                if v_strip.lower() in {"true", "false"}:
                    v_cast = (v_strip.lower() == "true")
                else:
                    v_cast = v_strip
        hp[key] = v_cast
    return hp

# 3. Data loading + merge SoA features

print(f"Loading propositions from JSONL: {INPUT_JSONL}")
df_json = pd.read_json(INPUT_JSONL, lines=True)

if TEXT_COL not in df_json.columns:
    raise KeyError(f"Expected '{TEXT_COL}' in JSONL; found: {df_json.columns.tolist()}")

print(f"Loading SoA / NLI features from: {SOA_CSV}")
df_soa = pd.read_csv(SOA_CSV)

if TEXT_COL not in df_soa.columns:
    raise KeyError(f"Expected '{TEXT_COL}' in SoA CSV; found: {df_soa.columns.tolist()}")

# Keep only the SoA-relevant columns plus proposition
soa_cols_keep = [c for c in df_soa.columns
                 if c in {TEXT_COL, SOA_LABEL_COL, SOA_SCORE_COL, SENT_COL}]
df_soa = df_soa[soa_cols_keep]

# The merge key is the proposition text. If the CSV lists a proposition more
# than once, a left merge emits one output row per CSV match, which silently
# duplicates documents in the corpus the topic models are fitted on. Keep a
# single SoA row per proposition (the first one) so the merge stays 1:1 with
# the JSONL rows.
n_dup_soa = int(df_soa.duplicated(subset=TEXT_COL).sum())
if n_dup_soa:
    print(
        f"[Warning] SoA CSV has {n_dup_soa:,} duplicate proposition rows; "
        "keeping the first occurrence of each."
    )
    df_soa = df_soa.drop_duplicates(subset=TEXT_COL, keep="first")

print("Merging JSONL and SoA CSV on proposition text...")
df = df_json.merge(
    df_soa,
    on=TEXT_COL,
    how="left",
    suffixes=("", "_soa"),
    validate="many_to_one",  # raise instead of fanning out if keys repeat
)
assert len(df) == len(df_json), "left merge must not change the number of documents"

texts = df[TEXT_COL].astype(str).tolist()
n_docs = len(texts)
print(f"Merged dataframe has {n_docs:,} rows")

# 4. BoW for KMeans/LDA

vectorizer_bow = CountVectorizer(
    max_df=0.95,
    min_df=10,
    stop_words="english"
)
X_bow = vectorizer_bow.fit_transform(texts)
vocab = np.array(vectorizer_bow.get_feature_names_out())
print(f"BoW matrix shape: {X_bow.shape}")

# 5. Fit best KMeans

def fit_best_kmeans(X, config_id):
    """Fit KMeans on `X` with the cluster count encoded in `config_id`.

    Falls back to 10 clusters when the parsed config has no "n_clusters".

    Returns:
        (fitted KMeans model, cluster label per row of `X`).
    """
    n_clusters = int(parse_hp_from_id(config_id, "KMeans").get("n_clusters", 10))
    print(f"\n[Best KMeans] n_clusters={n_clusters}")

    clusterer = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE, n_init="auto")
    assignments = clusterer.fit_predict(X)
    return clusterer, assignments

kmeans_model, kmeans_labels = fit_best_kmeans(X_bow, BEST_KMEANS_ID)

# 6. Fit best LDA

def fit_best_lda(X, config_id):
    """Fit batch LDA on `X` with the topic count encoded in `config_id`.

    Falls back to 10 topics when the parsed config has no "n_topics".

    Returns:
        (fitted LDA model, document-topic matrix, argmax topic per document).
    """
    n_topics = int(parse_hp_from_id(config_id, "LDA").get("n_topics", 10))
    print(f"\n[Best LDA] n_topics={n_topics}")

    topic_model = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=RANDOM_STATE,
        learning_method="batch",
    )
    theta = topic_model.fit_transform(X)
    hard_labels = theta.argmax(axis=1)  # most probable topic per document
    return topic_model, theta, hard_labels

lda_model, lda_doc_topic, lda_labels = fit_best_lda(X_bow, BEST_LDA_ID)

# 7. Fit best BERTopic

def fit_best_bertopic(texts, config_id):
    """Fit BERTopic on `texts` using the settings parsed from `config_id`.

    Defaults when a setting is absent from the parsed config:
    min_topic_size=30, n_neighbors=15, min_dist=0.0.

    Returns:
        (fitted BERTopic model, topic id per document, probabilities as
        returned by `BERTopic.fit_transform`).
    """
    settings = parse_hp_from_id(config_id, "BERTopic")
    min_topic_size = int(settings.get("min_topic_size", 30))
    n_neighbors = int(settings.get("n_neighbors", 15))
    min_dist = float(settings.get("min_dist", 0.0))

    message = (
        f"\n[Best BERTopic] min_topic_size={min_topic_size}, "
        f"n_neighbors={n_neighbors}, min_dist={min_dist}"
    )
    print(message)

    # Seeded 5-dimensional cosine UMAP handed to BERTopic as its reducer.
    umap_reducer = UMAP(
        n_neighbors=n_neighbors,
        n_components=5,
        min_dist=min_dist,
        metric="cosine",
        random_state=RANDOM_STATE,
    )
    bt_model = BERTopic(
        umap_model=umap_reducer,
        min_topic_size=min_topic_size,
        calculate_probabilities=True,
        verbose=True,
    )
    assigned, probabilities = bt_model.fit_transform(texts)
    return bt_model, assigned, probabilities

bertopic_model, bertopic_topics, bertopic_probs = fit_best_bertopic(
    texts, BEST_BERTOPIC_ID
)

# 8. Helpers: top words and TF–IDF cluster keywords

def top_words_from_centroids(model, vocab, n_top=10):
    """Top `n_top` BoW terms per KMeans cluster, ranked by centroid weight.

    Returns a dict keyed by cluster index (plain int); `vocab` must be
    indexable with an integer array and aligned with the centroid columns.
    """
    return {
        k: vocab[np.argsort(center)[::-1][:n_top]].tolist()
        for k, center in enumerate(model.cluster_centers_)
    }

def top_words_from_lda(model, vocab, n_top=10):
    """Top `n_top` terms per LDA topic, ranked by `model.components_` weight.

    Returns a dict keyed by topic index (plain int).
    """
    return {
        topic_idx: vocab[np.argsort(weights)[::-1][:n_top]].tolist()
        for topic_idx, weights in enumerate(model.components_)
    }

def top_words_from_bertopic(model, n_top=10):
    """First `n_top` words of each BERTopic topic, skipping outlier topic -1.

    Each topic's words are read from `model.get_topic(topic_id)`, whose items
    are (word, score) pairs; only the words are kept.
    """
    words_by_topic = {}
    for tid in model.get_topics().keys():
        if tid != -1:
            ranked_pairs = model.get_topic(tid)[:n_top]
            words_by_topic[tid] = [pair[0] for pair in ranked_pairs]
    return words_by_topic

def get_kmeans_cluster_keywords_tfidf(texts, labels, n_top_words=12,
                                      min_df=5, max_df=0.9):
    """
    TF–IDF keywords per cluster.

    A fresh English-stop-word TF–IDF model is fitted on `texts`; for every
    distinct value in `labels` the mean TF–IDF vector of its documents is
    computed and the `n_top_words` highest-scoring terms are returned,
    best first.

    Returns:
        dict keyed by cluster id as a plain Python int (JSON-serialisable),
        each value a list of terms.
    """
    tfidf = TfidfVectorizer(stop_words="english", min_df=min_df, max_df=max_df)
    doc_term = tfidf.fit_transform(texts)
    terms = np.array(tfidf.get_feature_names_out())

    label_arr = np.asarray(labels)
    keywords_by_cluster = {}

    for cluster in np.unique(label_arr):
        key = int(cluster)  # numpy integers are not valid JSON dict keys
        member_rows = np.where(label_arr == cluster)[0]
        if len(member_rows) > 0:
            mean_vec = doc_term[member_rows].mean(axis=0).A1
            best_first = mean_vec.argsort()[-n_top_words:][::-1]
            keywords_by_cluster[key] = terms[best_first].tolist()
        else:
            keywords_by_cluster[key] = []

    return keywords_by_cluster

kmeans_top_words_bow   = top_words_from_centroids(kmeans_model, vocab, n_top=15)
lda_top_words          = top_words_from_lda(lda_model, vocab, n_top=15)
bertopic_top_words     = top_words_from_bertopic(bertopic_model, n_top=15)
kmeans_keywords_tfidf  = get_kmeans_cluster_keywords_tfidf(texts, kmeans_labels)

# 9. Save models + basic doc–topic assignments

print("\nSaving models and basic doc–topic outputs...")

# Fitted estimators plus the BoW vectorizer they were trained with
for estimator, target_name in (
    (kmeans_model,   "kmeans_best.joblib"),
    (lda_model,      "lda_best.joblib"),
    (bertopic_model, "bertopic_best.joblib"),
    (vectorizer_bow, "vectorizer_bow.joblib"),
):
    joblib.dump(estimator, os.path.join(OUT_DIR, target_name))

# Topic-word dictionaries, one JSON file each
for word_table, target_name in (
    (kmeans_top_words_bow,  "kmeans_topics_bow.json"),
    (kmeans_keywords_tfidf, "kmeans_topics_tfidf.json"),
    (lda_top_words,         "lda_topics.json"),
    (bertopic_top_words,    "bertopic_topics.json"),
):
    with open(os.path.join(OUT_DIR, target_name), "w") as f:
        json.dump(word_table, f, indent=2)

# Document-level assignments
doc_out = df.copy()
doc_out["kmeans_topic"]   = kmeans_labels
doc_out["lda_topic"]      = lda_labels
doc_out["bertopic_topic"] = bertopic_topics

if bertopic_probs is not None:
    bertopic_probs = np.asarray(bertopic_probs)
    if bertopic_probs.ndim == 2:
        # Highest probability over all topics, per document
        doc_out["bertopic_max_prob"] = bertopic_probs.max(axis=1)

        # Probability of the topic each document was assigned to. Outliers
        # have topic id -1; as a column index that would silently pick the
        # LAST topic's probability, so outlier rows are set to NaN instead.
        topic_ids = np.asarray(bertopic_topics, dtype=int)
        has_topic = topic_ids >= 0
        assigned_prob = np.full(len(topic_ids), np.nan)
        assigned_prob[has_topic] = bertopic_probs[
            np.flatnonzero(has_topic), topic_ids[has_topic]
        ]
        doc_out["bertopic_topic_prob"] = assigned_prob
    else:
        print(
            f"[Warning] bertopic_probs ndim={bertopic_probs.ndim}; "
            "skipping probability-based columns."
        )

doc_out.to_csv(os.path.join(OUT_DIR, "documents_with_topics.csv"), index=False)

# 10-12. Deep per-topic analysis for LDA, KMeans and BERTopic
#
# Each analysis reports the same things per topic / cluster: SoA label shares,
# mean SoA score and sentiment, (for LDA and BERTopic) the time span, the
# dominant topic of the other two models, and the most representative
# propositions. The shared pieces live in the helpers below; every section only
# supplies its own membership subset and ranking score.

# Optional columns: only summarised when present in doc_out.
has_soa = SOA_LABEL_COL in doc_out.columns
has_soa_score = SOA_SCORE_COL in doc_out.columns
has_sent = SENT_COL in doc_out.columns
has_time = TIME_COL in doc_out.columns

N_REP_DOCS = 10  # representative documents exported per topic / cluster


def _soa_stats(subset):
    """SoA label shares, mean SoA score and mean sentiment of `subset`.

    Every value is NaN when `subset` is empty or its source column is absent.
    Shares are computed by `value_counts(normalize=True)` on the label column.
    """
    stats = {
        "prop_neg_soa": np.nan,
        "prop_non_soa": np.nan,
        "prop_pos_soa": np.nan,
        "mean_soa_score": np.nan,
        "mean_sentiment": np.nan,
    }
    if len(subset) == 0:
        return stats
    if has_soa:
        shares = subset[SOA_LABEL_COL].value_counts(normalize=True)
        stats["prop_neg_soa"] = float(shares.get("NEG_SOA", 0.0))
        stats["prop_non_soa"] = float(shares.get("NON_SOA", 0.0))
        stats["prop_pos_soa"] = float(shares.get("POS_SOA", 0.0))
    if has_soa_score:
        stats["mean_soa_score"] = float(subset[SOA_SCORE_COL].mean())
    if has_sent:
        stats["mean_sentiment"] = float(subset[SENT_COL].mean())
    return stats


def _time_span(subset):
    """Earliest / latest TIME_COL value of `subset` (NaN if empty or no time column)."""
    if has_time and len(subset) > 0:
        return {"time_min": subset[TIME_COL].min(), "time_max": subset[TIME_COL].max()}
    return {"time_min": np.nan, "time_max": np.nan}


def _dominant(subset, column):
    """Most frequent value of `column` in `subset` as an int (NaN if `subset` is empty)."""
    if len(subset) == 0:
        return np.nan
    return int(subset[column].value_counts().idxmax())


def _topic_summary(id_key, id_value, subset, dominant_cols, with_time=True):
    """One summary record for a topic / cluster.

    `dominant_cols` maps output key -> doc_out column whose most frequent value
    is reported. Key order is fixed because it defines the CSV column order.
    """
    record = {id_key: id_value, "n_docs": int(len(subset))}
    record.update(_soa_stats(subset))
    if with_time:
        record.update(_time_span(subset))
    for out_key, column in dominant_cols.items():
        record[out_key] = _dominant(subset, column)
    return record


def _rep_docs(doc_indices, score_key, scores):
    """JSON-ready records for representative documents.

    `scores[i]` is the ranking score reported for `doc_indices[i]` under
    `score_key`; rows are looked up in doc_out by index label.
    """
    records = []
    for d_id, score in zip(doc_indices, scores):
        row = doc_out.loc[d_id]
        records.append({
            "doc_index": int(d_id),
            "proposition": str(row[TEXT_COL]),
            score_key: float(score),
            SOA_LABEL_COL: str(row.get(SOA_LABEL_COL, "")),
            SOA_SCORE_COL: float(row.get(SOA_SCORE_COL, np.nan)),
            SENT_COL: float(row.get(SENT_COL, np.nan)),
        })
    return records


# 10. Deep LDA analysis: per-topic summaries + top docs

print("\nBuilding deep LDA topic summaries...")

n_lda_topics = lda_model.n_components
lda_summaries = []
lda_top_docs = {}

for k in range(n_lda_topics):
    subset = doc_out[doc_out["lda_topic"] == k]
    # Index labels double as row positions into lda_doc_topic; this assumes
    # doc_out still carries the default 0..n-1 index.
    idx = subset.index.to_numpy()

    # Most representative docs: largest theta_{d,k} among docs assigned to k
    theta_k = lda_doc_topic[:, k]
    top_doc_indices = idx[np.argsort(theta_k[idx])[::-1][:N_REP_DOCS]]

    lda_summaries.append(_topic_summary(
        "topic_id", k, subset,
        {"dom_kmeans_topic": "kmeans_topic", "dom_bertopic_topic": "bertopic_topic"},
    ))
    lda_top_docs[k] = _rep_docs(top_doc_indices, "theta_dk", theta_k[top_doc_indices])

lda_summary_df = pd.DataFrame(lda_summaries)
lda_summary_df.to_csv(os.path.join(OUT_DIR, "lda_topic_summary.csv"), index=False)

with open(os.path.join(OUT_DIR, "lda_top_docs.json"), "w") as f:
    json.dump(lda_top_docs, f, indent=2)

# 11. Deep KMeans analysis: per-cluster summaries + top docs

print("Building deep KMeans cluster summaries...")

kmeans_summaries = []
kmeans_top_docs = {}

unique_clusters = np.unique(kmeans_labels)

# For representative docs (BoW-based), pre-compute distances
# (If you prefer SBERT embeddings, replace this with your dense matrix.)
dists_all = kmeans_model.transform(X_bow)

for c in unique_clusters:
    c_int = int(c)
    subset = doc_out[doc_out["kmeans_topic"] == c_int]
    idx = subset.index.to_numpy()

    # Distances to cluster centroid c, smaller = more representative
    cluster_dists = dists_all[idx, c_int]
    top_local = np.argsort(cluster_dists)[:N_REP_DOCS]

    # KMeans summaries carry no time_min / time_max columns.
    kmeans_summaries.append(_topic_summary(
        "cluster_id", c_int, subset,
        {"dom_lda_topic": "lda_topic", "dom_bertopic_topic": "bertopic_topic"},
        with_time=False,
    ))
    kmeans_top_docs[c_int] = _rep_docs(
        idx[top_local], "dist_to_centroid", cluster_dists[top_local]
    )

kmeans_summary_df = pd.DataFrame(kmeans_summaries)
kmeans_summary_df.to_csv(os.path.join(OUT_DIR, "kmeans_cluster_summary.csv"), index=False)

with open(os.path.join(OUT_DIR, "kmeans_top_docs.json"), "w") as f:
    json.dump(kmeans_top_docs, f, indent=2)

# 12. Deep BERTopic analysis: per-topic summaries + top docs

print("Building deep BERTopic topic summaries...")

bertopic_summaries = []
bertopic_top_docs = {}

# Outlier topic -1 is not summarised.
unique_topics = sorted(t for t in np.unique(bertopic_topics) if t != -1)
has_topic_prob = "bertopic_topic_prob" in doc_out.columns

for t in unique_topics:
    t_int = int(t)
    subset = doc_out[doc_out["bertopic_topic"] == t_int]
    idx = subset.index.to_numpy()

    # Representative docs: highest BERTopic topic probability. Without the
    # probability column every doc gets the same rank score and NaN is reported.
    if has_topic_prob:
        probs_local = subset["bertopic_topic_prob"].to_numpy()
    else:
        probs_local = np.ones_like(idx, dtype=float)
    top_local = np.argsort(probs_local)[::-1][:N_REP_DOCS]
    if has_topic_prob:
        reported_probs = probs_local[top_local]
    else:
        reported_probs = np.full(len(top_local), np.nan)

    bertopic_summaries.append(_topic_summary(
        "topic_id", t_int, subset,
        {"dom_kmeans_topic": "kmeans_topic", "dom_lda_topic": "lda_topic"},
    ))
    bertopic_top_docs[t_int] = _rep_docs(idx[top_local], "topic_prob", reported_probs)

bertopic_summary_df = pd.DataFrame(bertopic_summaries)
bertopic_summary_df.to_csv(os.path.join(OUT_DIR, "bertopic_topic_summary.csv"), index=False)

with open(os.path.join(OUT_DIR, "bertopic_top_docs.json"), "w") as f:
    json.dump(bertopic_top_docs, f, indent=2)

# 13. Cross-tabs between models

print("Computing cross-tabs between KMeans, LDA, and BERTopic...")

# Document counts for every pair of model assignments (rows x columns)
ct_lda_kmeans = pd.crosstab(doc_out["lda_topic"], doc_out["kmeans_topic"])
ct_lda_bertopic = pd.crosstab(doc_out["lda_topic"], doc_out["bertopic_topic"])
ct_kmeans_bertopic = pd.crosstab(doc_out["kmeans_topic"], doc_out["bertopic_topic"])

for table, csv_name in (
    (ct_lda_kmeans,      "crosstab_lda_kmeans.csv"),
    (ct_lda_bertopic,    "crosstab_lda_bertopic.csv"),
    (ct_kmeans_bertopic, "crosstab_kmeans_bertopic.csv"),
):
    table.to_csv(os.path.join(OUT_DIR, csv_name))

print(f"\nDone. Deep topic outputs written to: {OUT_DIR}")

Loading propositions from JSONL: /content/drive/MyDrive/NLP /artificial_filtered_output.jsonl
Loading SoA / NLI features from: /content/drive/MyDrive/NLP /ai_human_agency_inferences_mnli_triple_labels_winner_meta_goldclf.csv


  df_soa = pd.read_csv(SOA_CSV)


Merging JSONL and SoA CSV on proposition text...
Merged dataframe has 64,078 rows
BoW matrix shape: (64078, 11372)

[Best KMeans] n_clusters=10

[Best LDA] n_topics=10


2025-12-11 00:26:00,030 - BERTopic - Embedding - Transforming documents to embeddings.



[Best BERTopic] min_topic_size=30, n_neighbors=15, min_dist=0.0


Batches:   0%|          | 0/2003 [00:00<?, ?it/s]

2025-12-11 00:26:27,002 - BERTopic - Embedding - Completed ✓
2025-12-11 00:26:27,003 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-11 00:27:49,049 - BERTopic - Dimensionality - Completed ✓
2025-12-11 00:27:49,051 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-11 00:29:56,445 - BERTopic - Cluster - Completed ✓
2025-12-11 00:29:56,461 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-11 00:29:59,546 - BERTopic - Representation - Completed ✓



Saving models and basic doc–topic outputs...

Building deep LDA topic summaries...
Building deep KMeans cluster summaries...
Building deep BERTopic topic summaries...
Computing cross-tabs between KMeans, LDA, and BERTopic...

Done. Deep topic outputs written to: ./topic_models_best_deep


In [None]:

# 14. Ensure stable doc_id

# doc_id is the row position after an index reset. If the JSONL already
# provides a unique ID column named "doc_id", it is used as-is.
if "doc_id" not in doc_out.columns:
    doc_out = doc_out.reset_index(drop=True)
    doc_out["doc_id"] = np.arange(len(doc_out))

# Keep a handy reference
doc_ids = doc_out["doc_id"].to_numpy()
n_docs = len(doc_ids)


# 15. Master document table

# Optional extras (only included if present)
optional_cols = [
    TIME_COL,          # "timestamp" if in JSONL
    "subreddit",
    "author",
    SOA_LABEL_COL,
    SOA_SCORE_COL,
    SENT_COL
]
available_extras = [col for col in optional_cols if col in doc_out.columns]
# dict.fromkeys drops repeated names while keeping first-seen order
master_cols = list(dict.fromkeys(["doc_id", TEXT_COL, *available_extras]))

master_df = doc_out[master_cols].copy()
master_df.to_csv(os.path.join(OUT_DIR, "master_documents.csv"), index=False)
# If you like Parquet:
# master_df.to_parquet(os.path.join(OUT_DIR, "master_documents.parquet"), index=False)
# If you like Parquet:
# master_df.to_parquet(os.path.join(OUT_DIR, "master_documents.parquet"), index=False)

# 16. Per-model doc→topic outputs

# 16a. KMeans doc→topic (+ distance to centroid)
print("Exporting per-document KMeans assignments...")

# Distances are recomputed so this cell does not rely on an earlier
# `dists_all` still being in scope.
dists_all = kmeans_model.transform(X_bow)
row_positions = np.arange(n_docs)
assigned_cluster = doc_out["kmeans_topic"].astype(int).to_numpy()
# Paired indexing picks, for every row, the distance to its own centroid.
doc_out["kmeans_dist_to_centroid"] = dists_all[row_positions, assigned_cluster]

kmeans_doc_cols = ["doc_id", "kmeans_topic", "kmeans_dist_to_centroid"]
kmeans_doc = doc_out[kmeans_doc_cols].copy()
kmeans_doc.to_csv(os.path.join(OUT_DIR, "kmeans_doc_topics.csv"), index=False)

# 16b. BERTopic doc→topic (+ probability)
print("Exporting per-document BERTopic assignments...")

# Probability columns only exist when BERTopic returned a 2-D matrix.
bertopic_doc_cols = ["doc_id", "bertopic_topic"] + [
    prob_col
    for prob_col in ("bertopic_topic_prob", "bertopic_max_prob")
    if prob_col in doc_out.columns
]

bertopic_doc = doc_out[bertopic_doc_cols].copy()
bertopic_doc.to_csv(os.path.join(OUT_DIR, "bertopic_doc_topics.csv"), index=False)

# 16c. LDA doc→topic (wide and long)
print("Exporting per-document LDA assignments...")

n_lda_topics = lda_model.n_components
lda_topic_cols = [f"lda_t{k}" for k in range(n_lda_topics)]

# Wide: one row per document, one probability column per topic
lda_wide = pd.DataFrame(lda_doc_topic, columns=lda_topic_cols)
lda_wide.insert(0, "doc_id", doc_out["doc_id"].to_numpy())
lda_wide.to_csv(os.path.join(OUT_DIR, "lda_doc_topics_wide.csv"), index=False)

# Long format: (doc_id, lda_topic, lda_prob)
lda_long = lda_wide.melt(id_vars=["doc_id"], var_name="lda_topic", value_name="lda_prob")
# Column names such as "lda_t3" become the integer topic id 3
topic_number = lda_long["lda_topic"].str.replace("lda_t", "", regex=False)
lda_long["lda_topic"] = topic_number.astype(int)
lda_long.to_csv(os.path.join(OUT_DIR, "lda_doc_topics_long.csv"), index=False)

# 17. Per-model topic→word tables (topic_id, word, weight)

print("Exporting topic→word tables...")

# 17a. LDA topic→word (use normalized components as weights)
lda_tw_records = []
for k in range(n_lda_topics):
    comp = lda_model.components_[k]
    comp_total = comp.sum()
    # Normalise to a share of the topic's total mass; raw weights if that total is not positive
    word_weights = comp / comp_total if comp_total > 0 else comp
    # top 50 words per topic (tweak if you like)
    lda_tw_records.extend(
        {"topic_id": int(k), "word": vocab[j], "weight": float(word_weights[j])}
        for j in np.argsort(comp)[::-1][:50]
    )

lda_tw_df = pd.DataFrame(lda_tw_records)
lda_tw_df.to_csv(os.path.join(OUT_DIR, "lda_topic_words.csv"), index=False)

# 17b. KMeans topic→word via TF–IDF centroids
tfidf_vec = TfidfVectorizer(stop_words="english", min_df=5, max_df=0.9)
X_tfidf = tfidf_vec.fit_transform(texts)
vocab_tfidf = np.array(tfidf_vec.get_feature_names_out())

kmeans_tw_records = []
for c in np.unique(kmeans_labels):
    c_int = int(c)
    idx_c = np.where(kmeans_labels == c_int)[0]
    if len(idx_c) == 0:
        continue
    # Mean TF–IDF vector of the cluster's documents
    centroid = X_tfidf[idx_c].mean(axis=0).A1
    kmeans_tw_records.extend(
        {"topic_id": c_int, "word": vocab_tfidf[j], "weight": float(centroid[j])}
        for j in np.argsort(centroid)[::-1][:50]
        if not centroid[j] <= 0  # drop terms with zero / negative mean weight
    )

kmeans_tw_df = pd.DataFrame(kmeans_tw_records)
kmeans_tw_df.to_csv(os.path.join(OUT_DIR, "kmeans_topic_words.csv"), index=False)

# 17c. BERTopic topic→word (weights as returned by get_topics())
bertopic_tw_records = [
    {"topic_id": int(topic_id), "word": word, "weight": float(weight)}
    for topic_id, word_scores in bertopic_model.get_topics().items()
    if topic_id != -1  # skip outlier/noise topic
    for (word, weight) in word_scores[:50]
]

bertopic_tw_df = pd.DataFrame(bertopic_tw_records)
bertopic_tw_df.to_csv(os.path.join(OUT_DIR, "bertopic_topic_words.csv"), index=False)

# 18. Optional: export document embeddings

print("Attempting to export document embeddings from BERTopic's embedding model...")

# Public attribute first, private one as a fallback.
emb_model = getattr(bertopic_model, "embedding_model", None)
if emb_model is None:
    emb_model = getattr(bertopic_model, "_embedding_model", None)

# Choose the embedding method up front instead of wrapping the call in
# `except AttributeError`: that would also swallow AttributeErrors raised
# *inside* a working embed() and silently retry through .transform(),
# hiding the real failure.
embed_fn = None
if emb_model is not None:
    embed_fn = getattr(emb_model, "embed", None) or getattr(emb_model, "transform", None)

if emb_model is None:
    print(
        "[Warning] Could not find an embedding_model on BERTopic; "
        "skipping embedding export."
    )
elif embed_fn is None:
    print(
        "[Warning] BERTopic's embedding model has neither .embed nor .transform; "
        "skipping embedding export."
    )
else:
    doc_embeddings = np.asarray(embed_fn(texts))
    emb_cols = [f"e{i}" for i in range(doc_embeddings.shape[1])]
    emb_df = pd.DataFrame(doc_embeddings, columns=emb_cols)
    emb_df.insert(0, "doc_id", doc_out["doc_id"].to_numpy())

    # Parquet is nicer for big dense matrices
    emb_df.to_parquet(
        os.path.join(OUT_DIR, "document_embeddings.parquet"),
        index=False
    )
    # If you really want CSV (bigger files):
    # emb_df.to_csv(os.path.join(OUT_DIR, "document_embeddings.csv"), index=False)

print("\nExtra exports complete.")


Exporting per-document KMeans assignments...
Exporting per-document BERTopic assignments...
Exporting per-document LDA assignments...
Exporting topic→word tables...
Attempting to export document embeddings from BERTopic's embedding model...

Extra exports complete.


In [None]:
#!/usr/bin/env python3
"""
viz_lda_soa_topics.py

Visualize LDA topics and their SoA composition using:

- lda_topic_summary.csv
- lda_topics.json

Outputs:
- lda_topics_stacked_soa.png     (stacked bar of NEG/NON/POS proportions per topic)
- lda_topics_mean_soa_score.png  (mean SoA score per topic)

Assumes:
- Columns in lda_topic_summary.csv (from your deep script):
    topic_id, n_docs,
    prop_neg_soa, prop_non_soa, prop_pos_soa,
    mean_soa_score, mean_sentiment,
    time_min, time_max,
    dom_kmeans_topic, dom_bertopic_topic
- lda_topics.json maps topic_id -> list of top words.
"""

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# 1. Paths

BASE_DIR = Path("./topic_models_best_deep")  # change if needed
summary_path = BASE_DIR / "lda_topic_summary.csv"
topics_path  = BASE_DIR / "lda_topics.json"

# 2. Load data

df = pd.read_csv(summary_path)

with open(topics_path, "r") as f:
    lda_topics = json.load(f)

# JSON object keys are always strings; turn them back into ints where possible
def _as_int_key(key):
    """Return `key` converted to int, or `key` unchanged if it is not int-like."""
    try:
        return int(key)
    except (TypeError, ValueError):
        return key

lda_topics_int = {_as_int_key(raw_key): topic_words
                  for raw_key, topic_words in lda_topics.items()}

# 3. Build human-readable topic labels

def make_topic_label(topic_id, words, n_docs, top_n=4):
    """Build a short display label for one topic.

    Example: ``T0: ai / technology / robots / jobs (n=1234)``.

    Args:
        topic_id: Topic identifier, rendered after the ``T`` prefix.
        words: Top words for the topic (list or tuple). Any other value
            (``None``, NaN, ...) or an empty sequence is rendered as ``...``.
        n_docs: Number of documents in the topic, rendered as ``(n=...)``.
        top_n: Maximum number of words to include in the label.

    Returns:
        The formatted label string.
    """
    # Check the type rather than truthiness: `not words` raises for NumPy
    # arrays and lets a NaN through to the slice below, where it fails.
    if not isinstance(words, (list, tuple)) or len(words) == 0:
        core = "..."
    else:
        # str() keeps the join working if an entry is not already a string.
        core = " / ".join(str(w) for w in words[:top_n])
    return f"T{topic_id}: {core} (n={n_docs})"

# Look up each topic's top words; topics absent from the JSON get an empty list.
df["top_words"] = pd.Series(
    [lda_topics_int.get(topic_key, []) for topic_key in df["topic_id"]],
    index=df.index,
)

# One display label per topic, built from its id, top words and size.
df["topic_label"] = [
    make_topic_label(topic_key, top_words, size)
    for topic_key, top_words, size in zip(df["topic_id"], df["top_words"], df["n_docs"])
]

# Sort topics by size, smallest first (ascending).
df = df.sort_values(by="n_docs", ascending=True)

# 4. Stacked bar: topic size + SoA composition

# Summary columns holding the per-topic SoA proportions, and the display
# names used for them in the legend.
soa_type_order = ["prop_neg_soa", "prop_non_soa", "prop_pos_soa"]
soa_type_map = {
    "prop_neg_soa": "NEG_SOA",
    "prop_non_soa": "NON_SOA",
    "prop_pos_soa": "POS_SOA",
}

# Melt to long format: one row per (topic, SoA type).
soa_long = df[["topic_id", "topic_label"] + soa_type_order].melt(
    id_vars=["topic_id", "topic_label"],
    value_vars=soa_type_order,
    var_name="soa_type",
    value_name="prop"
)
soa_long["soa_type"] = soa_long["soa_type"].map(soa_type_map)

# Plot
sns.set(style="whitegrid", font_scale=0.9)

# Figure height grows with the number of topics so labels stay readable.
fig, ax = plt.subplots(figsize=(10, max(5, 0.4 * df.shape[0])))

# Pivot back to wide form (topic x SoA type) for manual stacking.
pivot = soa_long.pivot(
    index="topic_label",
    columns="soa_type",
    values="prop"
).fillna(0.0)

# Ensure consistent column order
pivot = pivot[["NEG_SOA", "NON_SOA", "POS_SOA"]]

# pivot() returns its index sorted by label text ("T10..." before "T2..."),
# which discards the row order of df. Restore df's order so the bars follow
# the topic-size sort applied to df before plotting.
pivot = pivot.reindex(df["topic_label"].tolist())

# Horizontal stacked bars: each segment starts where the previous one ended.
bottom = None
colors = {
    "NEG_SOA": "#d62728",  # red-ish
    "NON_SOA": "#7f7f7f",  # gray
    "POS_SOA": "#2ca02c",  # green-ish
}

for soa in pivot.columns:
    vals = pivot[soa]
    ax.barh(
        pivot.index,
        vals,
        left=bottom,  # None on the first pass -> bars start at 0
        label=soa,
        color=colors.get(soa, None),
        alpha=0.9
    )
    if bottom is None:
        bottom = vals.copy()
    else:
        bottom = bottom + vals

ax.set_xlabel("Proportion of documents")
ax.set_ylabel("LDA Topic")
ax.set_title("LDA topics by SoA label composition (NEG / NON / POS)")

# Legend sits outside the axes so it never covers the bars.
ax.legend(title="SoA label", bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0.)

plt.tight_layout()
out_path1 = BASE_DIR / "lda_topics_stacked_soa.png"
plt.savefig(out_path1, dpi=300)
plt.close(fig)

print(f"Saved stacked SoA composition figure to: {out_path1}")

# 5. Mean SoA score per topic (lollipop / bar)

# Order rows from the lowest to the highest mean SoA score.
df_score = df.sort_values(by="mean_soa_score", ascending=True)

fig, ax = plt.subplots(figsize=(8, max(5, 0.4 * len(df_score))))

y = df_score["topic_label"]
x = df_score["mean_soa_score"]

# Red for a negative mean score, green for everything else.
colors_score = []
for val in x:
    colors_score.append("#d62728" if val < 0 else "#2ca02c")

# Lollipop: a gray stem from 0 to the score, with a colored dot at the end.
ax.hlines(y=y, xmin=0, xmax=x, color="lightgray", alpha=0.8)
ax.scatter(x, y, color=colors_score, s=60)

# Dashed reference line at zero.
ax.axvline(0.0, color="black", linewidth=1, linestyle="--", alpha=0.7)

ax.set(
    xlabel="Mean SoA score (Entailment − Contradiction)",
    ylabel="LDA Topic",
    title="Mean SoA polarity per LDA topic",
)

fig.tight_layout()
out_path2 = BASE_DIR / "lda_topics_mean_soa_score.png"
fig.savefig(out_path2, dpi=300)
plt.close(fig)

print(f"Saved mean SoA score figure to: {out_path2}")


Saved stacked SoA composition figure to: topic_models_best_deep/lda_topics_stacked_soa.png
Saved mean SoA score figure to: topic_models_best_deep/lda_topics_mean_soa_score.png


In [None]:
#!/usr/bin/env python3
"""
viz_bertopic_soa.py

Visualizations for BERTopic + SoA on r/artificial:

1. Stacked bar chart of NEG/NON/POS proportions per topic (top N by size).
2. Scatterplot of topic size vs mean SoA polarity.
3. Time series of mean SoA polarity per topic over time (selected topics).
4. Sankey diagram: topics -> SoA labels (flows by document count).

Assumes files created by fit_and_analyze_topics_deep.py:

- ./topic_models_best_deep/bertopic_topic_summary.csv
- ./topic_models_best_deep/bertopic_topics.json
- ./topic_models_best_deep/bertopic_doc_topics.csv
- ./topic_models_best_deep/master_documents.csv
"""

from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# 1. Paths and basic loading

BASE = Path("./topic_models_best_deep")

summary_path = BASE.joinpath("bertopic_topic_summary.csv")
topics_path = BASE.joinpath("bertopic_topics.json")
doc_topics_path = BASE.joinpath("bertopic_doc_topics.csv")
master_path = BASE.joinpath("master_documents.csv")  # has SoA + timestamp

df_sum = pd.read_csv(summary_path)
df_docs = pd.read_csv(doc_topics_path)
df_master = pd.read_csv(master_path)

with open(topics_path, "r") as f:
    topics_dict = json.load(f)

# JSON object keys are read back as strings; turn the numeric ones into ints
# and keep any non-numeric key unchanged.
topics_dict_int = {}
for raw_key in topics_dict:
    try:
        topic_key = int(raw_key)
    except (TypeError, ValueError):
        topic_key = raw_key
    topics_dict_int[topic_key] = topics_dict[raw_key]

# Column names used below; edit here if the input files use different names.
TEXT_COL = "proposition"
SOA_LABEL_COL = "combined_soa_label"
SOA_SCORE_COL = "combined_agency_score"
TIME_COL = "timestamp"

# 2. Build human-readable topic labels

def make_topic_label(topic_id, words, n_docs, top_n=4):
    """Format a topic as ``T12: ai / tools / workflow / build (n=1234)``.

    ``words`` must be a non-empty list or tuple for its first ``top_n``
    entries to be shown; anything else is rendered as ``...``.
    """
    has_words = isinstance(words, (list, tuple)) and len(words) != 0
    core = " / ".join(words[:top_n]) if has_words else "..."
    return f"T{topic_id}: {core} (n={n_docs})"

# Top words per topic; topics absent from the JSON get an empty list.
df_sum["top_words"] = pd.Series(
    [topics_dict_int.get(topic_key, []) for topic_key in df_sum["topic_id"]],
    index=df_sum.index,
)

# Display label per topic, built from its id, top words and size.
df_sum["topic_label"] = [
    make_topic_label(topic_key, top_words, size)
    for topic_key, top_words, size in zip(
        df_sum["topic_id"], df_sum["top_words"], df_sum["n_docs"]
    )
]

# 3. FIGURE 1: stacked SoA composition (top N topics by size)

TOP_N = 15  # adjust as desired

# The TOP_N largest topics, smallest first. barh draws the first row at the
# bottom, so the largest topic ends up at the top of the chart.
df_top = df_sum.sort_values("n_docs", ascending=True).tail(TOP_N)

# Melt the three proportion columns to long format: one row per (topic, SoA type).
soa_cols = ["prop_neg_soa", "prop_non_soa", "prop_pos_soa"]
soa_long = df_top[["topic_id", "topic_label"] + soa_cols].melt(
    id_vars=["topic_id", "topic_label"],
    value_vars=soa_cols,
    var_name="soa_type",
    value_name="prop"
)

# Column names -> legend names.
soa_type_map = {
    "prop_neg_soa": "NEG_SOA",
    "prop_non_soa": "NON_SOA",
    "prop_pos_soa": "POS_SOA"
}
soa_long["soa_type"] = soa_long["soa_type"].map(soa_type_map)

# Pivot for manual stacking
pivot = soa_long.pivot(index="topic_label", columns="soa_type", values="prop").fillna(0.0)
pivot = pivot[["NEG_SOA", "NON_SOA", "POS_SOA"]]

# pivot() returns its index sorted by label text ("T10..." before "T2..."),
# which discards the size ordering of df_top. Restore it so the bars are
# actually drawn in order of topic size.
pivot = pivot.reindex(df_top["topic_label"].tolist())

sns.set(style="whitegrid", font_scale=0.9)
fig, ax = plt.subplots(figsize=(10, max(6, 0.4 * pivot.shape[0])))

# Each segment starts where the previous one ended.
bottom = None
colors = {"NEG_SOA": "#d62728", "NON_SOA": "#7f7f7f", "POS_SOA": "#2ca02c"}

for soa in pivot.columns:
    vals = pivot[soa]
    ax.barh(
        pivot.index,
        vals,
        left=bottom,  # None on the first pass -> bars start at 0
        label=soa,
        color=colors.get(soa),
        alpha=0.9
    )
    if bottom is None:
        bottom = vals.copy()
    else:
        bottom = bottom + vals

ax.set_xlabel("Proportion of documents")
ax.set_ylabel("BERTopic topic")
ax.set_title("Top BERTopic topics by SoA label composition (NEG / NON / POS)")
ax.legend(title="SoA label", bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0.)

plt.tight_layout()
out1 = BASE / "bertopic_topics_stacked_soa_topN.png"
plt.savefig(out1, dpi=300)
plt.close(fig)
print(f"Saved stacked SoA figure to {out1}")

# 4. FIGURE 2: size vs mean SoA polarity scatter

df_scatter = df_sum.copy()

fig, ax = plt.subplots(figsize=(8, 6))

x = df_scatter["mean_soa_score"]
y = df_scatter["n_docs"]
# Marker area scales with topic size relative to the largest topic.
sizes = 50 + 200 * (y / y.max())

# Green for a non-negative mean SoA score, red otherwise.
colors_sc = np.where(x >= 0, "#2ca02c", "#d62728")

ax.scatter(x, y, s=sizes, c=colors_sc, alpha=0.7, edgecolor="none")
ax.axvline(0.0, color="black", linestyle="--", linewidth=1, alpha=0.7)

ax.set(
    xlabel="Mean SoA score (Entailment − Contradiction)",
    ylabel="Number of documents (topic size)",
    title="BERTopic: topic size vs mean SoA polarity",
)

# Annotate notable topics with their id: the 5 largest (centered), the 3 most
# negative (text to the left of the point) and the 3 most positive (text to
# the right of the point).
annotation_groups = (
    (df_scatter.nlargest(5, "n_docs"), "center"),
    (df_scatter.nsmallest(3, "mean_soa_score"), "right"),
    (df_scatter.nlargest(3, "mean_soa_score"), "left"),
)
for subset, h_align in annotation_groups:
    for _, row in subset.iterrows():
        ax.text(row["mean_soa_score"], row["n_docs"],
                f"T{int(row['topic_id'])}", fontsize=8,
                ha=h_align, va="bottom")

fig.tight_layout()
out2 = BASE / "bertopic_topics_size_vs_mean_soa.png"
fig.savefig(out2, dpi=300)
plt.close(fig)
print(f"Saved size vs SoA scatter to {out2}")

# 5. FIGURE 3: time-series of mean SoA per topic (Plotly)

# Merge doc-level SoA + timestamps with BERTopic assignments.
df_merged = df_docs.merge(df_master, on="doc_id", how="left")

if TIME_COL in df_merged.columns:
    # Build the time-series input as a separate frame. Dropping rows with
    # unparseable timestamps from df_merged itself would also remove those
    # documents from the Sankey counts computed later from df_merged.
    parsed_time = pd.to_datetime(df_merged[TIME_COL], errors="coerce")
    df_ts = df_merged.assign(**{TIME_COL: parsed_time}).dropna(subset=[TIME_COL])

    # Temporal resolution: calendar quarters, rendered as text (e.g. "2023Q1").
    df_ts = df_ts.assign(
        year_quarter=df_ts[TIME_COL].dt.to_period("Q").astype(str)
    )

    # Focus on a subset of topics: the 6 largest by document count.
    top_topics = df_sum.sort_values("n_docs", ascending=False)["topic_id"].head(6).tolist()

    # Mean SoA score per (topic, quarter).
    ts = (
        df_ts[df_ts["bertopic_topic"].isin(top_topics)]
        .groupby(["bertopic_topic", "year_quarter"])[SOA_SCORE_COL]
        .mean()
        .reset_index()
        .rename(columns={SOA_SCORE_COL: "mean_soa_score"})
    )

    # Attach readable labels
    label_map = df_sum.set_index("topic_id")["topic_label"].to_dict()
    ts["topic_label"] = ts["bertopic_topic"].map(label_map)

    # The x axis is categorical (strings). Pin its order explicitly: otherwise
    # Plotly orders categories by first appearance in the data, so a quarter
    # missing from the first topic's trace can land out of sequence.
    # "YYYYQn" strings sort chronologically as plain text.
    quarter_order = sorted(ts["year_quarter"].unique())

    fig_ts = px.line(
        ts,
        x="year_quarter",
        y="mean_soa_score",
        color="topic_label",
        markers=True,
        category_orders={"year_quarter": quarter_order},
        title="Mean SoA polarity over time for selected BERTopic topics",
    )
    fig_ts.update_layout(
        xaxis_title="Year–Quarter",
        yaxis_title="Mean SoA score (Entailment − Contradiction)",
        legend_title="BERTopic topic",
    )
    out3 = BASE / "bertopic_topics_time_soa.html"
    fig_ts.write_html(out3)
    print(f"Saved time-series SoA figure to {out3}")
else:
    print("[Warning] No timestamp column in master_documents; skipping time-series plot.")

# 6. FIGURE 4: Sankey from topics -> SoA labels

# Flows are counted from the doc-level labels; without that column there is
# nothing to draw.
if SOA_LABEL_COL not in df_merged.columns:
    print("[Warning] No SoA label column in master_documents; skipping Sankey plot.")
else:
    # Restrict to the 10 largest topics for readability.
    top_topics_sankey = (
        df_sum.sort_values("n_docs", ascending=False)["topic_id"].head(10).tolist()
    )
    in_top_topics = df_merged["bertopic_topic"].isin(top_topics_sankey)
    df_sankey = df_merged[in_top_topics].copy()

    # Short node names for the topics: "T<id>".
    short_label_map = {tid: f"T{tid}" for tid in top_topics_sankey}
    df_sankey["topic_short"] = df_sankey["bertopic_topic"].map(short_label_map)

    # Keep only rows carrying one of the three SoA labels.
    has_known_label = df_sankey[SOA_LABEL_COL].isin(["NEG_SOA", "NON_SOA", "POS_SOA"])
    df_sankey = df_sankey[has_known_label]

    # One flow per (topic, SoA label), weighted by the number of distinct documents.
    flows = (
        df_sankey.groupby(["topic_short", SOA_LABEL_COL])["doc_id"]
        .nunique()
        .reset_index()
        .rename(columns={"doc_id": "count"})
    )

    # Node list: topic nodes first, then the three SoA label nodes.
    topic_nodes = sorted(flows["topic_short"].unique())
    soa_nodes = ["NEG_SOA", "NON_SOA", "POS_SOA"]
    all_nodes = topic_nodes + soa_nodes

    # Node name -> position in all_nodes, which is how Sankey links refer to nodes.
    node_index = dict(zip(all_nodes, range(len(all_nodes))))

    sources = flows["topic_short"].map(node_index).tolist()
    targets = flows[SOA_LABEL_COL].map(node_index).tolist()
    values = flows["count"].tolist()

    # Topic nodes share one color; SoA nodes use red / gray / green.
    node_colors = ["#aec7e8"] * len(topic_nodes) + ["#d62728", "#7f7f7f", "#2ca02c"]

    sankey_trace = go.Sankey(
        arrangement="snap",
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_nodes,
            color=node_colors,
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
        ),
    )
    fig_sk = go.Figure(data=[sankey_trace])

    fig_sk.update_layout(
        title_text="Flow from BERTopic topics to SoA labels",
        font_size=11
    )

    out4 = BASE / "bertopic_topics_sankey_soa.html"
    fig_sk.write_html(out4)
    print(f"Saved Sankey SoA figure to {out4}")


Saved stacked SoA figure to topic_models_best_deep/bertopic_topics_stacked_soa_topN.png
Saved size vs SoA scatter to topic_models_best_deep/bertopic_topics_size_vs_mean_soa.png
Saved time-series SoA figure to topic_models_best_deep/bertopic_topics_time_soa.html
Saved Sankey SoA figure to topic_models_best_deep/bertopic_topics_sankey_soa.html


In [None]:
#!/usr/bin/env python3
"""
viz_bertopic_core.py

BERTopic core visualizations for top 10 topics:
1) Colorful bar chart of topic frequencies.
2) Butterfly plot of NEG_SOA vs POS_SOA proportions.

Inputs (from fit_and_analyze_topics_deep.py):
- ./topic_models_best_deep/bertopic_topic_summary.csv
- ./topic_models_best_deep/bertopic_topics.json
"""

from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Paths & load

BASE = Path("./topic_models_best_deep")
summary_path = BASE.joinpath("bertopic_topic_summary.csv")
topics_path = BASE.joinpath("bertopic_topics.json")

df = pd.read_csv(summary_path)

with open(topics_path, "r") as f:
    topics_dict = json.load(f)

# JSON object keys are read back as strings; turn the numeric ones into ints
# and keep any non-numeric key unchanged.
topics_dict_int = {}
for raw_key in topics_dict:
    try:
        topic_key = int(raw_key)
    except (TypeError, ValueError):
        topic_key = raw_key
    topics_dict_int[topic_key] = topics_dict[raw_key]

# 2. Focus on 10 largest topics & build labels

top10 = df.sort_values("n_docs", ascending=False).head(10).copy()

# Raw top words per topic (for reference / checking); empty list if unknown.
top10["top_words"] = pd.Series(
    [topics_dict_int.get(topic_key, []) for topic_key in top10["topic_id"]],
    index=top10.index,
)

# Manually chosen semantic labels based on top words for these 10 topics
semantic_label_map = {
    0: "AGI futures & speculation",
    1: "Courses & learning resources",
    2: "Chatbots & customer service",
    3: "ChatGPT use & impressions",
    4: "Voice / TTS & audio cloning",
    5: "LLM reasoning & capabilities",
    6: "General AI hot takes",
    7: "AI music & songs",
    8: "AI video & YouTube content",
    9: "AI art & visual creativity",
}

top10["semantic_label"] = top10["topic_id"].map(semantic_label_map)


def _semantic_or_fallback(r):
    """Return the row's semantic label, or ``T<id>`` when the map had no entry."""
    if isinstance(r["semantic_label"], str):
        return r["semantic_label"]
    return f"T{int(r['topic_id'])}"


def _label_with_size(r):
    """Return the richer y-axis label: id, semantic label and document count."""
    return f"T{int(r['topic_id'])}: {r['semantic_label']} (n={int(r['n_docs'])})"


# Ids missing from the map come back as NaN; replace those with "T<id>".
top10["semantic_label"] = top10.apply(_semantic_or_fallback, axis=1)

# Richer label including the topic size, used on the frequency chart's y-axis.
top10["label_with_n"] = top10.apply(_label_with_size, axis=1)

# Sort by size, smallest first, for the horizontal bar plots below.
top10 = top10.sort_values(by="n_docs", ascending=True)

sns.set(style="whitegrid", font_scale=0.9)

# 3. FIGURE 1 — colorful bar chart of topic frequencies

n_topics = top10.shape[0]
fig, ax = plt.subplots(figsize=(10, max(6, 0.5 * n_topics)))

# One bar per topic at integer y positions, one palette color per bar.
y_pos = np.arange(n_topics)
counts = top10["n_docs"].to_numpy()

# Use a qualitative palette
palette = sns.color_palette("tab10", n_colors=n_topics)
colors = palette

ax.barh(y_pos, counts, color=colors)

ax.set_yticks(y_pos)
ax.set_yticklabels(top10["label_with_n"])
ax.set(
    xlabel="Number of propositions",
    ylabel="BERTopic topic",
    title="Top 10 BERTopic topics in r/artificial (topic frequencies)",
)

fig.tight_layout()
out_freq = BASE / "bertopic_top10_frequencies.png"
fig.savefig(out_freq, dpi=300)
plt.close(fig)

print(f"Saved frequency bar chart to {out_freq}")

# 4. FIGURE 2 — “Butterfly” NEG_SOA vs POS_SOA plot

# Local import: only this figure needs a custom tick formatter.
from matplotlib.ticker import FuncFormatter

fig, ax = plt.subplots(figsize=(10, max(6, 0.5 * top10.shape[0])))

y_pos = np.arange(top10.shape[0])

neg = top10["prop_neg_soa"].to_numpy()
pos = top10["prop_pos_soa"].to_numpy()

# Left side: NEG_SOA, drawn with negated widths so the bars extend leftwards.
ax.barh(y_pos, -neg, color="#d62728", alpha=0.9, label="NEG_SOA")

# Right side: POS_SOA (positive values)
ax.barh(y_pos, pos, color="#2ca02c", alpha=0.9, label="POS_SOA")

# Vertical zero line in the middle
ax.axvline(0.0, color="black", linestyle="--", linewidth=1)

ax.set_yticks(y_pos)
ax.set_yticklabels(top10["semantic_label"])
ax.set_xlabel("Proportion of documents (NEG left, POS right)")
ax.set_ylabel("BERTopic topic")
ax.set_title("Positive vs negative agency framing across top 10 BERTopic topics")

# Make x-axis symmetric for interpretability
max_prop = max(neg.max(), pos.max())
if not np.isfinite(max_prop) or max_prop <= 0:
    # All-zero or NaN proportions would give identical/invalid axis limits;
    # fall back to the full proportion range.
    max_prop = 1.0
ax.set_xlim(-max_prop * 1.1, max_prop * 1.1)

# The NEG bars use negated widths only to point left. Show tick labels as
# absolute values so the axis never reads as a negative proportion.
ax.xaxis.set_major_formatter(FuncFormatter(lambda v, _pos: f"{abs(v):g}"))

# Reverse y-axis so biggest topics are at the bottom (optional)
ax.invert_yaxis()

ax.legend(loc="upper right")

plt.tight_layout()
out_bfly = BASE / "bertopic_top10_butterfly_soa.png"
plt.savefig(out_bfly, dpi=300)
plt.close(fig)

print(f"Saved NEG vs POS SoA butterfly chart to {out_bfly}")


Saved frequency bar chart to topic_models_best_deep/bertopic_top10_frequencies.png
Saved NEG vs POS SoA butterfly chart to topic_models_best_deep/bertopic_top10_butterfly_soa.png


In [None]:
#!/usr/bin/env python3
"""
viz_kmeans_soa.py

Plot NEG_SOA / NON_SOA / POS_SOA composition per KMeans cluster
using kmeans_cluster_summary.csv produced by fit_and_analyze_topics_deep.py.
"""

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

BASE = Path("./topic_models_best_deep")
summary_path = BASE / "kmeans_cluster_summary.csv"

df = pd.read_csv(summary_path)

# Manually assigned semantic labels, keyed by KMeans cluster id.
kmeans_cluster_labels = {
    0: "Optimistic AI tools & productivity",
    1: "Loss of control / AI takeover anxiety",
    2: "Meta-discussion about AI and the community",
    3: "Technical implementation & model-building talk",
    4: "Ethical risks, regulation, and governance",
    5: "Human learning, skill development, and self-efficacy with AI",
    6: "Creative uses of AI (art, writing, projects)",
    7: "Economic and labor displacement / job security",
    8: "Everyday troubleshooting and practical help requests",
    9: "Philosophical debates about mind, agency, and consciousness",
}

# Cluster ids missing from the map come back as NaN, which would be rendered
# as the text "nan" in the axis labels. Fall back to a generic name instead.
mapped_labels = df["cluster_id"].map(kmeans_cluster_labels)
df["semantic_label"] = [
    label if isinstance(label, str) else f"Cluster {int(cluster_id)}"
    for label, cluster_id in zip(mapped_labels, df["cluster_id"])
]

# Axis label: cluster id, semantic label and document count.
df["label_with_n"] = df.apply(
    lambda r: f"C{int(r['cluster_id'])}: {r['semantic_label']} (n={int(r['n_docs'])})",
    axis=1,
)

# Sort by size, smallest first; barh draws the first row at the bottom.
df = df.sort_values("n_docs", ascending=True)

sns.set(style="whitegrid", font_scale=0.9)

fig, ax = plt.subplots(figsize=(10, 6))

neg = df["prop_neg_soa"].to_numpy()
non = df["prop_non_soa"].to_numpy()
pos = df["prop_pos_soa"].to_numpy()
y_pos = np.arange(df.shape[0])

# Stacked horizontal bars, drawn NEG -> NON -> POS. Each entry is
# (legend name, bar widths, color, left offset); the first segment uses the
# default offset and each later one starts where the previous ones end.
segments = (
    ("NEG_SOA", neg, "#d62728", None),
    ("NON_SOA", non, "#7f7f7f", neg),
    ("POS_SOA", pos, "#2ca02c", neg + non),
)
for soa_name, widths, bar_color, left_edge in segments:
    ax.barh(y_pos, widths, left=left_edge, color=bar_color, label=soa_name)

ax.set_yticks(y_pos)
ax.set_yticklabels(df["label_with_n"])
ax.set(
    xlabel="Proportion of documents",
    ylabel="KMeans cluster",
    title="SoA label composition across KMeans clusters",
)

# Legend sits outside the axes so it does not cover the bars.
ax.legend(title="SoA label", bbox_to_anchor=(1.02, 1), loc="upper left")

fig.tight_layout()
out_path = BASE / "kmeans_clusters_stacked_soa.png"
fig.savefig(out_path, dpi=300)
plt.close(fig)

print(f"Saved KMeans SoA stacked plot to {out_path}")


Saved KMeans SoA stacked plot to topic_models_best_deep/kmeans_clusters_stacked_soa.png


In [None]:
#!/usr/bin/env python3
"""
recompute_kmeans_soa_with_bart_gold.py

Recompute SoA label composition per KMeans cluster using the
fine-tuned BART gold labels (bart_goldclf_soa_label).
"""

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Paths

BASE = Path("./topic_models_best_deep")

DOC_TOPICS = BASE.joinpath("documents_with_topics.csv")
# The "NLP " folder name includes a trailing space; keep the path exactly as written.
BART_GOLD = Path("/content/drive/MyDrive/NLP /ai_human_agency_inferences_mnli_bart_goldclf.csv")

# Column names used in the two input files.
TEXT_COL = "proposition"
KMEANS_COL = "kmeans_topic"
SOA_LABEL_COL = "bart_goldclf_soa_label"  # <--- use BART-gold labels

# Semantic cluster labels; the position in the tuple is the KMeans cluster id.
kmeans_cluster_labels = dict(enumerate((
    "Optimistic AI tools & productivity",
    "Loss of control / AI takeover anxiety",
    "Meta-discussion about AI and the community",
    "Technical implementation & model-building talk",
    "Ethical risks, regulation, and governance",
    "Human learning, skill development, and self-efficacy with AI",
    "Creative uses of AI (art, writing, projects)",
    "Economic and labor displacement / job security",
    "Everyday troubleshooting and practical help requests",
    "Philosophical debates about mind, agency, and consciousness",
)))

# Load data and merge BART-gold labels

print(f"Loading doc–topic assignments from {DOC_TOPICS}")
df_docs = pd.read_csv(DOC_TOPICS)

print(f"Loading BART-gold SoA labels from {BART_GOLD}")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-types DtypeWarning.
df_gold = pd.read_csv(BART_GOLD, low_memory=False)

if TEXT_COL not in df_docs.columns:
    raise KeyError(f"{TEXT_COL} missing from documents_with_topics.csv")

if TEXT_COL not in df_gold.columns or SOA_LABEL_COL not in df_gold.columns:
    raise KeyError(f"Need {TEXT_COL} and {SOA_LABEL_COL} in BART-gold CSV")

df_gold = df_gold[[TEXT_COL, SOA_LABEL_COL]]

# The merge key is the proposition text. If the same text occurs more than
# once in the gold file, a left merge emits one output row per match, which
# would inflate cluster sizes and skew the proportions. Keep one label per text.
n_dupes = int(df_gold.duplicated(subset=TEXT_COL).sum())
if n_dupes:
    print(f"Dropping {n_dupes} duplicate propositions from BART-gold labels (keeping first)")
    df_gold = df_gold.drop_duplicates(subset=TEXT_COL, keep="first")

print("Merging on proposition text...")
# NOTE(review): if documents_with_topics.csv already contains a
# SOA_LABEL_COL column, the gold labels land in f"{SOA_LABEL_COL}_gold" and
# the code below reads the pre-existing column instead — confirm.
df = df_docs.merge(
    df_gold,
    on=TEXT_COL,
    how="left",
    suffixes=("", "_gold"),
    validate="many_to_one",  # fail loudly if the gold side is still not unique
)

# Sanity check
missing = df[SOA_LABEL_COL].isna().mean()
print(f"Fraction of docs missing BART-gold label: {missing:.3%}")

# Aggregate SoA composition per KMeans cluster

grouped = df.groupby(KMEANS_COL)

rows = []
for cluster_id, sub in grouped:
    n_docs = len(sub)
    # Label shares within the cluster. value_counts ignores missing labels,
    # so the shares are relative to the labeled documents only.
    vc = sub[SOA_LABEL_COL].value_counts(normalize=True)
    prop_neg = float(vc.get("NEG_SOA", 0.0))
    prop_non = float(vc.get("NON_SOA", 0.0))
    prop_pos = float(vc.get("POS_SOA", 0.0))

    rows.append({
        "cluster_id": int(cluster_id),
        "n_docs": int(n_docs),
        "prop_neg_soa": prop_neg,
        "prop_non_soa": prop_non,
        "prop_pos_soa": prop_pos,
    })

df_summary = pd.DataFrame(rows)

# Cluster ids missing from the label map come back as NaN, which would be
# rendered as the text "nan" in the axis labels. Fall back to a generic name.
mapped_labels = df_summary["cluster_id"].map(kmeans_cluster_labels)
df_summary["semantic_label"] = [
    label if isinstance(label, str) else f"Cluster {int(cid)}"
    for label, cid in zip(mapped_labels, df_summary["cluster_id"])
]

# Axis label: cluster id, semantic label and document count.
df_summary["label_with_n"] = df_summary.apply(
    lambda r: f"C{int(r['cluster_id'])}: {r['semantic_label']} (n={int(r['n_docs'])})",
    axis=1,
)

# Smallest cluster first; barh draws the first row at the bottom.
df_summary = df_summary.sort_values("n_docs", ascending=True)

# Plot stacked NEG / NON / POS per cluster

sns.set(style="whitegrid", font_scale=0.9)

fig, ax = plt.subplots(figsize=(10, 6))

y_pos = np.arange(df_summary.shape[0])
neg = df_summary["prop_neg_soa"].to_numpy()
non = df_summary["prop_non_soa"].to_numpy()
pos = df_summary["prop_pos_soa"].to_numpy()

# Each entry is (legend name, bar widths, color, left offset). The first
# segment uses the default offset; each later one starts where the previous
# segments end.
segments = (
    ("NEG_SOA", neg, "#d62728", None),
    ("NON_SOA", non, "#7f7f7f", neg),
    ("POS_SOA", pos, "#2ca02c", neg + non),
)
for soa_name, widths, bar_color, left_edge in segments:
    ax.barh(y_pos, widths, left=left_edge, color=bar_color, label=soa_name)

ax.set_yticks(y_pos)
ax.set_yticklabels(df_summary["label_with_n"])
ax.set(
    xlabel="Proportion of documents",
    ylabel="KMeans cluster",
    title="SoA label composition across KMeans clusters (BART gold labels)",
)

# Legend sits outside the axes so it does not cover the bars.
ax.legend(title="SoA label", bbox_to_anchor=(1.02, 1), loc="upper left")

fig.tight_layout()
out_path = BASE / "kmeans_clusters_stacked_soa_bart_gold.png"
fig.savefig(out_path, dpi=300)
plt.close(fig)

print(f"Saved updated KMeans SoA stacked plot to {out_path}")


Loading doc–topic assignments from topic_models_best_deep/documents_with_topics.csv
Loading BART-gold SoA labels from /content/drive/MyDrive/NLP /ai_human_agency_inferences_mnli_bart_goldclf.csv



Columns (1,3,4,5,6,16,17,20,22,24,25,26,27,28,31) have mixed types. Specify dtype option on import or set low_memory=False.



Merging on proposition text...
Fraction of docs missing BART-gold label: 0.004%
Saved updated KMeans SoA stacked plot to topic_models_best_deep/kmeans_clusters_stacked_soa_bart_gold.png
