In [None]:
import os
from pathlib import Path
import sys

# --- ENVIRONMENT SWITCH ---
# True  -> local machine with Google Drive Desktop mounted
# False -> Google Colab cloud runtime
RUNNING_LOCALLY = False

if not RUNNING_LOCALLY:
    # Colab: mount Drive first, then point at the project folder inside it
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = Path('/content/drive/My Drive/Colab Projects/AI Public Trust')
else:
    # Local: put the directory two levels above the working directory on
    # sys.path so that `from src.*` imports resolve
    _REPO_ROOT = str(Path(os.getcwd()).resolve().parents[1])
    if _REPO_ROOT not in sys.path:
        sys.path.insert(0, _REPO_ROOT)
    # Standard macOS path for Google Drive Desktop
    BASE_PATH = Path('/Volumes/GoogleDrive/My Drive/Colab Projects/AI Public Trust')

# Project folders shared across notebooks, all derived from BASE_PATH
twits_folder = BASE_PATH / 'Raw Data' / 'Twits'
test_folder = BASE_PATH / 'Raw Data'
datasets_folder = BASE_PATH / 'Data Sets'
cleanedds_folder = datasets_folder / 'Cleaned Data'
networks_folder = datasets_folder / 'Networks'
literature_folder = BASE_PATH / 'Literature'
topic_models_folder = BASE_PATH / 'Models' / 'Topic Modeling'


In [None]:
# # 3.3 Add LDA Topics to Tweets
#
# **Goal**: Train LDA on tweet text (using scikit-learn + CountVectorizer), evaluate (perplexity + coherence),
# create diagnostics (pyLDAvis + t-SNE + UMAP), and **enrich the sentiment JSONL** with topic results (for K=5),
# preserving all existing tweet fields. Artifacts are stored in Drive alongside prior steps.
#
# **Reads** (all paths are relative to `BASE_PATH`, which on Colab is `/content/drive/My Drive/Colab Projects/AI Public Trust`):
# - `Data Sets/Cleaned Data/AItrust_pruned_twits_with_sentiment.json` (JSONL)
#
# **Writes (main)**:
# - `Data Sets/Cleaned Data/AItrust_pruned_twits_with_sentiment_and_topics_k5.json` (JSONL)
# - Optional block files if chunking: `..._blockNNN.json`
#
# **Writes (models/diagnostics)** under `Models/Topic Modeling/LDA/`:
# - `lda_k5_model.joblib`, `lda_k5_vectorizer.joblib`, `lda_grid_results.csv`
# - `lda_k5_doc_topic_matrix.npy` (train set), `lda_k5_pyLDAvis.html`, `lda_k5_tsne.png`, `lda_k5_umap.png`
# - `lda_k5_topics_metadata.json` (full) **and** a slim copy: `Data Sets/Cleaned Data/AItrust_topics_k5_metadata.json`
#
# **Notes**:
# - Keeps substantive elements of the rough script: grid over K, held-out perplexity, coherence (gensim c_v),
#   pyLDAvis, t-SNE/UMAP, top terms, topic label dictionary; adds robust streaming enrichment to JSONL.
# - Vectorizer/model are saved and reused to transform the full corpus in chunks.


In [None]:
# do this ONCE
import os
if not RUNNING_LOCALLY:
    print('Running Colab setup shell commands...')
    !pip -q install "numpy==1.26.4" "scipy==1.10.1"
else:
    print('Running locally: Skipping Colab shell setup.')

import os; os.kill(os.getpid(), 9)  # force Colab runtime restart



[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m58.9/58.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m18.3/18.3 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m34.1/34.1 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-contrib-p

In [None]:
import os
if not RUNNING_LOCALLY:
    print('Running Colab setup shell commands...')
    !pip -q install "gensim==4.3.2" "umap-learn==0.5.5" "pyLDAvis==3.4.1"
else:
    print('Running locally: Skipping Colab shell setup.')



In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 0. Environment & Imports
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# from google.colab import drive
import os, re, json, glob, math, gc, sys
from pathlib import Path
import numpy as np
import pandas as pd
from collections import defaultdict

# Plotting
import matplotlib.pyplot as plt

# Modeling stack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE

# UMAP for visualization
try:
    import umap
except ImportError:
import os
if not RUNNING_LOCALLY:
    print('Running Colab setup shell commands...')
    !pip -q install umap-learn==0.5.5
else:
    print('Running locally: Skipping Colab shell setup.')

    import umap

# Gensim for coherence
try:
    from gensim.corpora import Dictionary as GensimDictionary
    from gensim.models import CoherenceModel
except Exception:
import os
if not RUNNING_LOCALLY:
    print('Running Colab setup shell commands...')
    !pip -q install gensim==4.3.2
else:
    print('Running locally: Skipping Colab shell setup.')

    from gensim.corpora import Dictionary as GensimDictionary
    from gensim.models import CoherenceModel

# NLTK for tokenization/stopwords (for coherence pipeline)
import nltk

# Download each resource only when it is not already available locally
for _resource, _package in (('tokenizers/punkt', 'punkt'), ('corpora/stopwords', 'stopwords')):
    try:
        nltk.data.find(_resource)
    except LookupError:
        nltk.download(_package, quiet=True)

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# pyLDAvis
import importlib, subprocess, sys


def _load_pyldavis_prepare():
    """Import pyLDAvis and return ``(pyLDAvis module, prepare function for sklearn LDA)``.

    The scikit-learn adapter is looked up under each module name it is known by,
    newest first: ``pyLDAvis.lda_model`` (3.4 series), then the two names the
    original cell tried (``pyLDAvis.sklearn``, ``pyLDAvis.sklearn_model``).

    Raises:
        ImportError: if pyLDAvis is missing or none of the adapter modules exists.
    """
    package = importlib.import_module("pyLDAvis")
    for module_name in ("pyLDAvis.lda_model", "pyLDAvis.sklearn", "pyLDAvis.sklearn_model"):
        try:
            return package, importlib.import_module(module_name).prepare
        except ImportError:
            continue
    raise ImportError("No scikit-learn adapter module found in the installed pyLDAvis")


try:
    pyLDAvis, sklearn_lda_prepare = _load_pyldavis_prepare()
except Exception:
    # Install the same release the Colab setup cell pins (3.4.1) instead of the
    # older 3.2.2, then retry once.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "pyLDAvis==3.4.1"])
    importlib.invalidate_caches()  # make the freshly installed package importable
    pyLDAvis, sklearn_lda_prepare = _load_pyldavis_prepare()

# Persistence
from joblib import dump, load


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 1. Drive Mount & Paths
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# drive.mount('/content/drive')

# Base project folder: reuse BASE_PATH from the environment-switch cell.
# (BASE used to be assigned by a line that is now commented out, which left the
# name undefined and made this cell fail with NameError.)
BASE = BASE_PATH
DATA_DIR = BASE / 'Data Sets' / 'Cleaned Data'
MODELS_DIR = BASE / 'Models' / 'Topic Modeling' / 'LDA'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Inputs
INPUT_JSONL = DATA_DIR / 'AItrust_pruned_twits_with_sentiment.json'
# Untouched reference to the full input; INPUT_JSONL may later be re-pointed at a sample file
ORIG_INPUT_JSONL = INPUT_JSONL

# Main enrichment output
OUTPUT_JSONL = DATA_DIR / 'AItrust_pruned_twits_with_sentiment_and_topics_k5.json'
# Slim topics metadata copy lives in Cleaned Data
SLIM_META_JSON = DATA_DIR / 'AItrust_topics_k5_metadata.json'

# Model artifacts
LDA_MODEL_PATH = MODELS_DIR / 'lda_k5_model.joblib'
VECT_PATH      = MODELS_DIR / 'lda_k5_vectorizer.joblib'
DOC_TOPIC_NPY  = MODELS_DIR / 'lda_k5_doc_topic_matrix.npy'
PYLDAVIS_HTML  = MODELS_DIR / 'lda_k5_pyLDAvis.html'
TSNE_PNG       = MODELS_DIR / 'lda_k5_tsne.png'
UMAP_PNG       = MODELS_DIR / 'lda_k5_umap.png'
GRID_CSV       = MODELS_DIR / 'lda_grid_results.csv'
FULL_META_JSON = MODELS_DIR / 'lda_k5_topics_metadata.json'

# Optional block writing (for very large corpora)
BLOCK_BASENAME = OUTPUT_JSONL.stem + '_block'

print('INPUT  :', INPUT_JSONL)
print('OUTPUT :', OUTPUT_JSONL)
print('MODELS :', MODELS_DIR)


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 2. Parameters (kept explicit)
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# --- Training / sampling controls ---
K_TARGET = 5                # number of topics used for the enrichment file
# Cap on the number of docs used for model training (adjust if needed).
# Previous setting kept for reference: TRAIN_SAMPLE_MAX = 5000
TRAIN_SAMPLE_MAX = 25000
TEST_SIZE = 0.10            # fraction held out for perplexity
RANDOM_STATE = 0
SEEDS = [0]                 # can be extended, e.g. [0, 1, 2]

# --- Grid of topic counts for diagnostics (perplexity + coherence) ---
TOPIC_GRID = [3, 5, 8, 10, 12, 15, 20, 25, 30]

# --- CountVectorizer settings (kept close to the rough script) ---
VECT_KW = {
    'max_df': 0.95,
    'min_df': 5,
}

# --- scikit-learn LDA settings ---
LDA_KW = {
    'learning_method': 'batch',    # consistent, deterministic per seed
    'learning_decay': 0.7,         # matches rough script default
    'max_iter': 20,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}

# --- Chunking for the full-corpus transform ---
READ_CHUNK = 100_000        # number of lines per streaming chunk
WRITE_BLOCKS = True         # write block files to avoid a single massive write

# --- Visualization settings ---
TSNE_KW = {
    'n_components': 2,
    'learning_rate': 'auto',
    'init': 'random',
    'perplexity': 30,
    'random_state': RANDOM_STATE,
}
UMAP_KW = {
    'n_neighbors': 25,
    'min_dist': 0.10,
    'n_components': 2,
    'metric': 'cosine',
    'random_state': RANDOM_STATE,
}



In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 3. Utilities: tweet text normalization & tokenization for coherence
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Light, non-destructive tweet cleanup (kept parameterized)
# URL_RE: an http:// or https:// URL, up to the next whitespace character
URL_RE = re.compile(r'https?://\S+')
# MENTION_RE: '@' followed by one or more word characters
MENTION_RE = re.compile(r'@\w+')
# HASHTAG_RE: '#' followed by word characters; group 1 captures the tag text without '#'
HASHTAG_RE = re.compile(r'#(\w+)')
# WHITESPACE_RE: any run of whitespace
WHITESPACE_RE = re.compile(r'\s+')

# English stopword set from NLTK, used to filter tokens
EN_STOP = set(stopwords.words('english'))
# Tokens start with an ASCII letter and continue with at least one more letter,
# underscore, hyphen or apostrophe, so single characters and digits never form a token
TOKENIZER = RegexpTokenizer(r"[A-Za-z][A-Za-z_\-']+")


def normalize_text(t: str) -> str:
    """Return a lower-cased, whitespace-collapsed copy of a tweet with noise removed.

    Steps, in order: drop the retweet marker ``RT``, replace URLs and @mentions
    with a space, keep hashtag words but strip the leading ``#``, lower-case,
    then collapse whitespace runs to single spaces and trim both ends.

    Args:
        t: Raw tweet text. Any non-string value yields ``''``.

    Returns:
        The normalized text (possibly empty).
    """
    if not isinstance(t, str):
        return ''
    # Drop "RT " only when it is a token of its own. The previous plain
    # replace('RT ', ' ') also truncated upper-case words that merely end in
    # "RT" (e.g. "SMART " became "SMA ").
    t = re.sub(r'(?<!\w)RT ', ' ', t)
    t = URL_RE.sub(' ', t)
    t = MENTION_RE.sub(' ', t)
    # keep hashtag terms without '#'
    t = HASHTAG_RE.sub(lambda m: ' ' + m.group(1) + ' ', t)
    t = t.lower()
    t = WHITESPACE_RE.sub(' ', t).strip()
    return t


def simple_tokenize(t: str):
    """Normalize `t`, split it with TOKENIZER and return the tokens that are
    longer than one character and not in the EN_STOP stopword set."""
    kept = []
    for token in TOKENIZER.tokenize(normalize_text(t)):
        if len(token) > 1 and token not in EN_STOP:
            kept.append(token)
    return kept


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 4. Read a training sample from JSONL (stream-safe)
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

def stream_jsonl(path: Path):
    """Lazily yield one parsed JSON value per line of the UTF-8 file at `path`.

    Blank (whitespace-only) lines and lines that are not valid JSON are skipped
    silently.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            if raw_line.strip():
                try:
                    parsed = json.loads(raw_line)
                except json.JSONDecodeError:
                    continue
                yield parsed

# Gather training/validation docs: take records in file order, skip those
# without text, and stop once TRAIN_SAMPLE_MAX docs have been collected
train_docs, train_ids = [], []
for rec in stream_jsonl(INPUT_JSONL):
    raw_text = rec.get('text', '')
    if raw_text:
        train_docs.append(normalize_text(raw_text))
        train_ids.append(rec.get('id'))
        if len(train_docs) >= TRAIN_SAMPLE_MAX:
            break

print(f"Loaded {len(train_docs):,} training docs from JSONL (cap={TRAIN_SAMPLE_MAX:,}).")


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 5. Vectorize & split
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
X_train_full = None

# Fit the bag-of-words vocabulary on the training sample and vectorize it
vect = CountVectorizer(**VECT_KW)
X_all = vect.fit_transform(train_docs)

# train_test_split is given placeholder labels; only the two feature splits are kept
y_dummy = np.zeros(len(train_docs))
X_train, X_test = train_test_split(
    X_all,
    y_dummy,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)[:2]
print("Vectorized. Shapes:", X_train.shape, X_test.shape)


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 6. Grid search over K: train, save, evaluate (perplexity) & collect results
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Resume from the results CSV of an earlier run when there is one
res_df = (
    pd.read_csv(GRID_CSV)
    if GRID_CSV.exists()
    else pd.DataFrame(columns=["k", "seed", "perplexity"])
)

for k in TOPIC_GRID:
    for seed in SEEDS:
        tag = f"test_k{k}_s{seed}"
        model_path = MODELS_DIR / (tag + ".joblib")
        docTopic_path = MODELS_DIR / (tag + "_docTopic.joblib")

        # A grid point counts as done only when both of its artifacts are on disk
        already_trained = model_path.exists() and docTopic_path.exists()
        if already_trained:
            print(f"‚è©  {tag} exists ‚Äì skipping train.")
            continue

        # All shared LDA settings except random_state, which is set per seed
        lda_kwargs = dict(LDA_KW)
        lda_kwargs.pop('random_state', None)
        lda = LatentDirichletAllocation(n_components=k, random_state=seed, **lda_kwargs)
        lda.fit(X_train)
        doc_topic = lda.transform(X_train)  # doc-topic matrix of the training split

        # Held-out perplexity
        perp = lda.perplexity(X_test)
        print(f"‚úÖ k={k}, seed={seed} ‚Üí Perplexity(Test) = {perp:.2f}")

        # Persist this grid point's model and doc-topic matrix
        dump(lda, model_path)
        dump(doc_topic, docTopic_path)

        # Record the result and rewrite the CSV right away
        res_df.loc[len(res_df)] = {"k": k, "seed": seed, "perplexity": perp}
        res_df.to_csv(GRID_CSV, index=False)

# Perplexity vs K, averaged over seeds
if len(res_df) > 0:
    avg_perp = res_df.groupby('k')['perplexity'].mean().sort_index()
    plt.figure(figsize=(8,5))
    plt.plot(avg_perp.index, avg_perp.values, marker='o')
    plt.title('Held-out Perplexity vs Number of Topics (k)')
    plt.xlabel('Number of Topics (k)')
    plt.ylabel('Avg Perplexity (Test)')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 7. Coherence (gensim c_v) using simple_tokenize ‚Üí Dictionary ‚Üí CoherenceModel
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Tokenize the same training docs with simple_tokenize; this is a different
# tokenization pipeline from the one CountVectorizer applies
print("Tokenizing training docs for coherence (gensim c_v)‚Ä¶")
tokenized_docs = list(map(simple_tokenize, train_docs))
gensim_dict = GensimDictionary(tokenized_docs)
print(f"‚úÖ Tokenized {len(tokenized_docs):,} documents; vocab size={len(gensim_dict):,}")

# Helper: extract top-N words per topic from a fitted sklearn LDA

def _top_words_from_sklearn_lda(lda_model, vocab, topn=15):
    """Map each topic index to its `topn` highest-weighted terms.

    Args:
        lda_model: object exposing ``components_``, one row of term weights per topic.
        vocab: indexable sequence giving the term for each column of ``components_``.
        topn: number of terms to keep per topic.

    Returns:
        dict ``{topic_id: [(term, weight), ...]}`` with terms in descending weight order.
    """
    def _ranked_terms(weights):
        order = np.argsort(weights)[::-1][:topn]
        return [(vocab[j], float(weights[j])) for j in order]

    return {
        topic_id: _ranked_terms(weights)
        for topic_id, weights in enumerate(lda_model.components_)
    }

# c_v coherence for every (k, seed) grid point whose model file exists
coherence_scores = {}
for k in TOPIC_GRID:
    for seed in SEEDS:
        tag = f"test_k{k}_s{seed}"
        model_path = MODELS_DIR / f"{tag}.joblib"
        if model_path.exists():
            lda_model = load(model_path)
            vocab = np.array(vect.get_feature_names_out())
            top_terms = _top_words_from_sklearn_lda(lda_model, vocab, topn=15)
            # One word list per topic (weights dropped), in ascending topic-id order
            topic_words = [
                [term for term, _weight in top_terms[tid]]
                for tid in sorted(top_terms)
            ]
            cm = CoherenceModel(
                topics=topic_words,
                texts=tokenized_docs,
                dictionary=gensim_dict,
                coherence='c_v',
            )
            cv = float(cm.get_coherence())
            coherence_scores[(k, seed)] = cv
            print(f"‚úÖ k={k}, seed={seed} ‚Üí c_v = {cv:.3f}")

# Coherence vs K, averaged over seeds
if coherence_scores:
    from collections import defaultdict
    buckets = defaultdict(list)
    for (k, seed), score in coherence_scores.items():
        buckets[k].append(score)
    avg_cv = {k: sum(scores) / len(scores) for k, scores in buckets.items()}
    sk = sorted(avg_cv)
    mean_scores = [avg_cv[k] for k in sk]
    plt.figure(figsize=(8,5))
    plt.plot(sk, mean_scores, marker='o')
    plt.title('Topic Coherence (c_v) vs Number of Topics (k)')
    plt.xlabel('Number of Topics (k)')
    plt.ylabel('Avg Coherence (c_v)')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 8. Train final LDA at K_TARGET and create diagnostics (pyLDAvis, t-SNE, UMAP)
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
print(f"\nTraining final LDA with K={K_TARGET} for enrichment‚Ä¶")
# Shared LDA settings with the topic count and seed set explicitly
_final_lda_kw = dict(LDA_KW, n_components=K_TARGET, random_state=RANDOM_STATE)
lda_k = LatentDirichletAllocation(**_final_lda_kw)
train_doc_topic = lda_k.fit(X_train).transform(X_train)

# Persist the model, the vectorizer and the training doc-topic matrix
dump(lda_k, LDA_MODEL_PATH)
dump(vect, VECT_PATH)
np.save(DOC_TOPIC_NPY, train_doc_topic)
print("Saved:", LDA_MODEL_PATH.name, VECT_PATH.name, DOC_TOPIC_NPY.name)

# pyLDAvis panel (built on the training split)
print("Preparing pyLDAvis panel‚Ä¶")
class _VectCompat:
    """Adapter exposing a fitted vectorizer's vocabulary under both accessor names.

    Older pyLDAvis releases call ``get_feature_names()`` on the vectorizer,
    newer ones call ``get_feature_names_out()``. The wrapped vectorizer only
    needs to provide ``get_feature_names_out()``.
    """

    def __init__(self, v):
        # v: the fitted vectorizer being wrapped
        self.v = v

    def get_feature_names(self):
        """Return the vocabulary as a plain list (legacy accessor name)."""
        # pyLDAvis expects a list-like; convert to list for safety
        return list(self.v.get_feature_names_out())

    def get_feature_names_out(self):
        """Return the vocabulary exactly as the wrapped vectorizer reports it."""
        return self.v.get_feature_names_out()

vect_for_vis = _VectCompat(vect)
vis_panel = sklearn_lda_prepare(lda_k, X_train, vect_for_vis, mds='pcoa')
pyLDAvis.save_html(vis_panel, str(PYLDAVIS_HTML))
print("Saved:", PYLDAVIS_HTML.name)

# Each training doc is coloured by its highest-probability topic
dom_topic = train_doc_topic.argmax(axis=1)


def _save_embedding_scatter(xy, title, png_path):
    """Scatter the 2-D embedding `xy` coloured by `dom_topic`, save it to `png_path`, close the figure."""
    plt.figure(figsize=(8,6))
    plt.scatter(xy[:,0], xy[:,1], c=dom_topic, s=5)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(png_path, dpi=300)
    plt.close()
    print("Saved:", png_path.name)


print("t-SNE plot‚Ä¶")
tsne = TSNE(**TSNE_KW)
tsne_xy = tsne.fit_transform(train_doc_topic)
_save_embedding_scatter(tsne_xy, f"t-SNE of doc‚Äìtopic vectors (k={K_TARGET})", TSNE_PNG)

print("UMAP plot‚Ä¶")
reducer = umap.UMAP(**UMAP_KW)
umap_xy = reducer.fit_transform(train_doc_topic)
_save_embedding_scatter(umap_xy, f"UMAP of doc‚Äìtopic vectors (k={K_TARGET})", UMAP_PNG)


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 9. Topic labels & metadata exports
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Optional human-readable topic labels, keyed first by K and then by topic id.
# The entries below are placeholders; edit them to match your taxonomy.
TOPIC_LABELS = {
    5: {
        0: "Topic 0",
        1: "Topic 1",
        2: "Topic 2",
        3: "Topic 3",
        4: "Topic 4",
    },
    # Labelings for other values of K (8, 12, ‚Ä¶) can be added here
}

# Per-topic metadata for K_TARGET: id, label and the TOPN heaviest terms
vocab = np.array(vect.get_feature_names_out())
components = lda_k.components_
TOPN = 15
_labels_for_k = TOPIC_LABELS.get(K_TARGET, {})
meta_topics = [
    {
        "topic_id": tid,
        "label": _labels_for_k.get(tid, f"Topic {tid}"),
        "top_terms": [
            {"term": vocab[i], "weight": float(weights[i])}
            for i in np.argsort(weights)[::-1][:TOPN]
        ],
    }
    for tid, weights in enumerate(components)
]

full_meta = {
    "k": K_TARGET,
    "topics": meta_topics,
    "vocabulary_size": int(len(vocab)),
    "vectorizer_params": VECT_KW,
    "lda_params": {**LDA_KW, "n_components": K_TARGET},
}

with open(FULL_META_JSON, 'w', encoding='utf-8') as meta_file:
    json.dump(full_meta, meta_file, ensure_ascii=False, indent=2)
print("Saved:", FULL_META_JSON.name)

# Slim quick-reference copy in Cleaned Data: same topics, term strings only (no weights)
slim_topics = {
    "k": K_TARGET,
    "topics": [
        {
            "topic_id": topic["topic_id"],
            "label": topic["label"],
            "top_terms": [entry["term"] for entry in topic["top_terms"]],
        }
        for topic in meta_topics
    ],
}
with open(SLIM_META_JSON, 'w', encoding='utf-8') as meta_file:
    json.dump(slim_topics, meta_file, ensure_ascii=False, indent=2)
print("Saved:", SLIM_META_JSON.name)


In [None]:
# --- Toggle: restrict enrichment to a small sample without touching the main loop ---
SAMPLE_ENRICH   = False        # False = full run
SAMPLE_ENRICH_N = 50_000      # number of lines copied into the sample

# The sample is always cut from the original full file
SOURCE = ORIG_INPUT_JSONL

if not SAMPLE_ENRICH:
    INPUT_JSONL    = ORIG_INPUT_JSONL
    OUTPUT_JSONL   = DATA_DIR / "AItrust_pruned_twits_with_sentiment_and_topics_k5.json"
    print(f"üöÄ Enrichment will run on FULL file: {INPUT_JSONL.name}")
else:
    sample_path = DATA_DIR / f"{SOURCE.stem}__SAMPLE{SAMPLE_ENRICH_N}.json"
    n = 0
    # Copy non-blank lines verbatim until the sample size is reached
    with open(SOURCE, 'r', encoding='utf-8') as fin, open(sample_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            if line.strip():
                fout.write(line)
                n += 1
                if n >= SAMPLE_ENRICH_N:
                    break
    INPUT_JSONL    = sample_path
    OUTPUT_JSONL   = DATA_DIR / f"AItrust_pruned_twits_with_sentiment_and_topics_k5__SAMPLE{SAMPLE_ENRICH_N}.json"
    print(f"üîé Enrichment LIMITED to first {n:,} lines ‚Üí using: {sample_path.name}")
    print(f"üìù Output will be: {OUTPUT_JSONL.name}")

# Block files are named after whichever output file was selected above
BLOCK_BASENAME = OUTPUT_JSONL.stem + "_block"



In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 10. Stream the full sentiment JSONL ‚Üí add topics (K=5) ‚Üí write JSONL (+ blocks)
# (BATCHED version: faster, same outputs)
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
from itertools import islice
import time, gc

# Load persisted vectorizer & model to ensure consistency
vect = load(VECT_PATH)
lda_k = load(LDA_MODEL_PATH)

# Ensure output parent exists
OUTPUT_JSONL.parent.mkdir(parents=True, exist_ok=True)

# Clear only this step's old block files.
# Deletion stays best-effort, but a failure is now reported: a leftover block
# matches the merge glob and would be copied into the merged output.
if WRITE_BLOCKS:
    for old in DATA_DIR.glob(BLOCK_BASENAME + "*.json"):
        try:
            old.unlink()
        except Exception as exc:
            print(f"WARNING: could not delete old block file {old.name}: {exc}")

# ---- speed knobs (tune as RAM allows) ----
BATCH_DOCS       = 50_000    # try 50k; drop to 20k if RAM tight
WRITE_BLOCK_SIZE = 200_000   # write to disk every ~200k lines
PROGRESS_EVERY   = 50_000

# ---- state ----
block_idx = 1          # number of the next block file to write
written_total = 0      # lines flushed so far
processed_total = 0    # records handled so far
write_buffer = []      # serialized JSON lines waiting to be flushed
start_time = time.time()

def flush_block():
    """Write the buffered JSON lines to disk and reset the buffer.

    With WRITE_BLOCKS enabled the lines go to the next numbered block file in
    DATA_DIR. Otherwise they are written straight to OUTPUT_JSONL: the first
    flush of a run truncates the file, later flushes append. (Previously the
    buffer was discarded in that mode while still being counted as written.)

    Updates the module-level ``write_buffer``, ``written_total`` and
    ``block_idx``. Does nothing when the buffer is empty.
    """
    global write_buffer, written_total, block_idx
    if not write_buffer:
        return
    payload = "\n".join(write_buffer) + "\n"
    if WRITE_BLOCKS:
        block_path = DATA_DIR / f"{BLOCK_BASENAME}{block_idx:03d}.json"
        with open(block_path, 'w', encoding='utf-8') as fout:
            fout.write(payload)
        print(f"Wrote block {block_idx:03d} with {len(write_buffer):,} lines ‚Üí {block_path.name}")
        block_idx += 1
    else:
        # written_total is still 0 on the first flush, so an output file left
        # over from an earlier run is overwritten rather than extended
        mode = 'w' if written_total == 0 else 'a'
        with open(OUTPUT_JSONL, mode, encoding='utf-8') as fout:
            fout.write(payload)
    written_total += len(write_buffer)
    write_buffer = []

def process_batch(records):
    """Add LDA topic fields to a batch of records and queue them for writing.

    Records whose ``"text"`` is a non-empty string are normalized, vectorized
    and transformed in one call each, then receive ``lda_k5_topic_id``,
    ``lda_k5_topic_dist`` and ``lda_k5_topic_label``. Every record, with or
    without text, is serialized to JSON and appended to ``write_buffer``.

    Args:
        records: list of dicts parsed from JSONL lines; mutated in place.

    Side effects:
        Increments ``processed_total``, prints a progress line whenever the
        count crosses a multiple of PROGRESS_EVERY, and calls ``flush_block()``
        once the buffer holds at least WRITE_BLOCK_SIZE lines.
    """
    global processed_total

    # collect texts to transform, remembering which record each one came from
    idxs, texts = [], []
    for i, r in enumerate(records):
        t = r.get("text", "")
        if isinstance(t, str) and t:
            idxs.append(i)
            texts.append(normalize_text(t))

    # one vectorize+transform per batch
    if idxs:
        Xb = vect.transform(texts)
        Db = lda_k.transform(Xb)              # (len(idxs), K)
        argmax = Db.argmax(axis=1)
        labels = TOPIC_LABELS.get(K_TARGET, {})
        for j, i_rec in enumerate(idxs):
            tid = int(argmax[j])
            r = records[i_rec]
            r["lda_k5_topic_id"]   = tid
            r["lda_k5_topic_dist"] = Db[j].tolist()   # plain Python floats
            r["lda_k5_topic_label"]= labels.get(tid, f"Topic {tid}")

    # serialize all records (including ones without text)
    for r in records:
        write_buffer.append(json.dumps(r, ensure_ascii=False))

    # Report progress when the running total crosses a multiple of
    # PROGRESS_EVERY. An exact "% == 0" test stops firing as soon as one
    # skipped line makes a batch shorter than BATCH_DOCS.
    previous_total = processed_total
    processed_total += len(records)
    if processed_total // PROGRESS_EVERY > previous_total // PROGRESS_EVERY:
        elapsed = time.time() - start_time
        rate = processed_total / max(elapsed, 1e-9)
        print(f"Progress: {processed_total:,} processed | {rate:,.0f}/s | {int(elapsed)}s elapsed")

    if len(write_buffer) >= WRITE_BLOCK_SIZE:
        flush_block()

# Read the input BATCH_DOCS lines at a time until the file is exhausted
with open(INPUT_JSONL, 'r', encoding='utf-8') as fin:
    for lines in iter(lambda: list(islice(fin, BATCH_DOCS)), []):
        # parse JSON lines; blank or malformed lines are dropped
        batch_records = []
        for line in map(str.strip, lines):
            if line:
                try:
                    batch_records.append(json.loads(line))
                except json.JSONDecodeError:
                    pass

        process_batch(batch_records)
        gc.collect()

# write whatever is still buffered, then merge
flush_block()

if WRITE_BLOCKS:
    print("Merging blocks ‚Üí", OUTPUT_JSONL.name)
    with open(OUTPUT_JSONL, 'w', encoding='utf-8') as fout:
        # blocks are concatenated in sorted path order
        for blk in sorted(DATA_DIR.glob(BLOCK_BASENAME + "*.json")):
            with open(blk, 'r', encoding='utf-8') as fin:
                fout.writelines(fin)
    print("‚úÖ Merged", len(list(DATA_DIR.glob(BLOCK_BASENAME + "*.json"))), "blocks ‚Üí", OUTPUT_JSONL.name)

elapsed_total = time.time() - start_time
print(f"\n‚úÖ Enrichment done. Lines written: {written_total:,} | "
      f"Processed: {processed_total:,} | Elapsed: {int(elapsed_total)}s")


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 11. (Optional) Merge block files ‚Üí single JSONL
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Rebuild OUTPUT_JSONL by concatenating every block file in sorted path order
if WRITE_BLOCKS:
    print("Merging blocks ‚Üí", OUTPUT_JSONL.name)
    with open(OUTPUT_JSONL, 'w', encoding='utf-8') as fout:
        for blk in sorted(DATA_DIR.glob(BLOCK_BASENAME + "*.json")):
            with open(blk, 'r', encoding='utf-8') as fin:
                fout.writelines(fin)
    print("‚úÖ Merged", len(list(DATA_DIR.glob(BLOCK_BASENAME + "*.json"))), "blocks ‚Üí", OUTPUT_JSONL.name)


In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 12. Sanity check: peek a few records
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
print("\nSample enriched records:")
peek_n = 3
with open(OUTPUT_JSONL, 'r', encoding='utf-8') as f:
    # zip stops after peek_n lines without reading any further into the file
    for i, line in zip(range(peek_n), f):
        try:
            rec = json.loads(line)
            summary = {
                'id': rec.get('id'),
                'topic_id': rec.get('lda_k5_topic_id'),
                'topic_label': rec.get('lda_k5_topic_label'),
                # first three topic probabilities, rounded for display
                'topic_dist_0_2': [round(x, 3) for x in rec.get('lda_k5_topic_dist', [])[:3]],
                'sentiment': rec.get('sentiment'),
            }
            print(summary)
        except Exception as e:
            print('Decode error on peek line', i, e)


### *Temporary* -- the merge tripped because Google Drive's FUSE mount glitched ("Transport endpoint is not connected").

In [None]:
!wc -l "/content/drive/MyDrive/AI Public Trust/Data Sets/Cleaned Data/AItrust_pruned_twits_with_sentiment_and_topics_k5.json"


In [None]:
import re, gzip, os, io
from pathlib import Path

import zlib  # only needed for the error type raised on corrupt deflate data

# Paths (match your project layout)
# NOTE(review): other cells in this notebook build paths from
# '/content/drive/My Drive/Colab Projects/AI Public Trust' -- confirm which
# Drive root is the current one before relying on this cell.
DATA_DIR = Path("/content/drive/MyDrive/AI Public Trust/Data Sets/Cleaned Data")
BASENAME = "AItrust_pruned_twits_with_sentiment_and_topics_k5"
OUT_GZ   = DATA_DIR / f"{BASENAME}.jsonl.gz"
OUT_PART = DATA_DIR / f"{BASENAME}.jsonl.gz.part"   # temp file, renamed on success

# 1) Gather block files in numeric order (the index is parsed from the name,
#    so ordering does not depend on zero padding).
pat = re.compile(rf"^{re.escape(BASENAME)}_block(\d+)\.json$")
blocks = []
for p in DATA_DIR.glob(f"{BASENAME}_block*.json"):
    m = pat.match(p.name)
    if m:
        blocks.append((int(m.group(1)), p))
blocks.sort(key=lambda x: x[0])
block_paths = [p for _, p in blocks]

if not block_paths:
    raise SystemExit("No block files found. Nothing to merge.")

print(f"Found {len(block_paths)} block(s). First: {block_paths[0].name}  Last: {block_paths[-1].name}")

# 2) Stream-merge into gzip (written to the .part file first)
lines = 0
FLUSH_EVERY = 500_000   # progress-report interval in lines (nothing is flushed)

# If a previous .part exists (e.g., after an interruption), remove it
try:
    OUT_PART.unlink()
except FileNotFoundError:
    pass

with gzip.open(OUT_PART, "wt", encoding="utf-8", compresslevel=6) as fout:
    for i, blk in enumerate(block_paths, 1):
        with blk.open("r", encoding="utf-8") as fin:
            for line in fin:
                # A block whose final record lacks a newline would otherwise be
                # glued to the first record of the next block.
                if not line.endswith("\n"):
                    line += "\n"
                fout.write(line)
                lines += 1
                if lines % FLUSH_EVERY == 0:
                    # progress indicator
                    print(f"Progress: {lines:,} lines written (through block {i}/{len(block_paths)})")

print(f"Merge-to-gzip complete ‚Üí {OUT_PART.name}  |  Lines: {lines:,}")

# 3) Gzip integrity check: decompress the whole stream in-process. Reading to
#    EOF makes the gzip module validate the trailer (CRC and length), and it
#    avoids handing an interpolated path to a shell.
try:
    with gzip.open(OUT_PART, "rb") as check:
        while check.read(1 << 20):
            pass
except (OSError, EOFError, zlib.error) as exc:
    raise SystemExit("Gzip integrity check failed. Do not rename .part; please rerun this cell.") from exc

# 4) Move .part over the final name. Path.replace overwrites an existing
#    target in one step, so there is no window in which neither file exists.
OUT_PART.replace(OUT_GZ)
print(f"‚úÖ Final file ready: {OUT_GZ.name}  |  Total lines (written): {lines:,}")


In [None]:
# Decompress the merged gzip to stdout and count its lines (one record per line).
!zcat "/content/drive/MyDrive/AI Public Trust/Data Sets/Cleaned Data/AItrust_pruned_twits_with_sentiment_and_topics_k5.jsonl.gz" | wc -l


In [None]:
import gzip, json, itertools, random
# Parse the first 10 records of the merged gzip and display the first 3.
p = "/content/drive/MyDrive/AI Public Trust/Data Sets/Cleaned Data/AItrust_pruned_twits_with_sentiment_and_topics_k5.jsonl.gz"
sample = []
with gzip.open(p, "rt", encoding="utf-8") as f:
    for raw in itertools.islice(f, 10):
        sample.append(json.loads(raw))
sample[:3]


## Quick Analysis stuff (to redo properly)

In [None]:
import gzip, json
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer

DATA_PATH = "/content/drive/MyDrive/AI Public Trust/Data Sets/Cleaned Data/AItrust_pruned_twits_with_sentiment_and_topics_k5.jsonl.gz"

# --- Pass 1: collect counts + text samples ---
topic_counts = Counter()                       # tweets per topic id
sentiment_counts = Counter()                   # tweets per sentiment label
topic_sentiment_counts = defaultdict(Counter)  # topic id -> sentiment label -> count
topic_texts = defaultdict(list)  # hold a small sample of texts per topic
topic_corpus = defaultdict(list) # collect texts for n-gram extraction

MAX_SAMPLE_PER_TOPIC = 20_000   # cap so we don't overload RAM

with gzip.open(DATA_PATH, "rt", encoding="utf-8") as f:
    for line in f:
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            # Skip malformed lines only; a bare `except` would also swallow
            # KeyboardInterrupt and make this long loop impossible to stop.
            continue

        t_id = obj.get("lda_k5_topic_id")
        sent = obj.get("sentiment_label")
        txt = obj.get("text", "")

        # Records without a topic assignment or without text are ignored.
        if t_id is None or not txt:
            continue

        topic_counts[t_id] += 1
        sentiment_counts[sent] += 1
        topic_sentiment_counts[t_id][sent] += 1

        # Keep the first three tweets of each topic as display examples.
        if len(topic_texts[t_id]) < 3:
            topic_texts[t_id].append(txt)

        if len(topic_corpus[t_id]) < MAX_SAMPLE_PER_TOPIC:
            topic_corpus[t_id].append(txt)

print("Pass 1 complete")

# --- Pass 2: extract top n-grams per topic ---
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1,2), max_features=5000)

topic_top_terms = {}   # topic id -> [(term, frequency), ...], most frequent first
for t_id, texts in topic_corpus.items():
    if not texts:
        continue
    try:
        X = vectorizer.fit_transform(texts)
    except ValueError:
        # sklearn raises ValueError when no term survives (e.g. every document
        # consists of stop words); leave this topic without n-grams.
        continue
    freqs = X.sum(axis=0).A1
    terms = vectorizer.get_feature_names_out()
    top_idx = freqs.argsort()[::-1][:15]
    topic_top_terms[t_id] = [(terms[i], freqs[i]) for i in top_idx]

print("Pass 2 complete")

# --- Display nicely ---
# Sentiment labels may be None (missing field); str() keeps the width format
# spec from raising TypeError on NoneType.
print("\n=== Global Sentiment Counts ===")
for s, c in sentiment_counts.most_common():
    print(f" {str(s):<8} {c:,}")

print("\n=== Topics Overview ===")
for t_id, c in topic_counts.most_common():
    print(f"\n--- Topic {t_id} | {c:,} tweets ---")
    for s, sc in topic_sentiment_counts[t_id].most_common():
        print(f"   {str(s):<8}: {sc:,}")

    print("   Top n-grams:")
    for term, freq in topic_top_terms.get(t_id, []):
        print(f"     {term:<20} {freq}")

    print("   Example tweets:")
    for ex in topic_texts[t_id]:
        print("    -", ex.replace("\n", " ")[:200])


In [None]:
import re, gzip, json
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import numpy as np

# Merged, gzipped JSONL that the analysis below streams from
DATA_FILE = Path("/content/drive/MyDrive/AI Public Trust/Data Sets/Cleaned Data/AItrust_pruned_twits_with_sentiment_and_topics_k5.jsonl.gz")

# -- Cleaning helpers for TF-IDF --
URL_RE   = re.compile(r'https?://\S+')   # http/https links
MENT_RE  = re.compile(r'@\w+')           # @mentions
HASH_RE  = re.compile(r'#(\w+)')         # hashtags; group 1 is the bare word
NONALPH  = re.compile(r'[^a-zA-Z]+')     # runs of anything that is not an ASCII letter
SHORT_RE = re.compile(r'\b\w{1,2}\b')    # one- and two-character tokens

# Twitter boilerplate tokens that `tokenizer` drops.
TWITTER_STOP = {
    'rt','amp','https','http','tco','via','img','video','tweet','retweet',
    'follow','join','click','share','subscribe','like'
}


def clean_text_basic(s: str) -> str:
    """Lower-case `s` and reduce it to space-separated ASCII-letter words.

    Substitutions run in a fixed order: URLs and @mentions become a space,
    hashtags keep their word but lose the '#', every run of non-letters becomes
    a space, and tokens of one or two characters are removed. Whitespace is
    then collapsed and trimmed.
    """
    cleaned = s.lower()
    for pattern, replacement in (
        (URL_RE, ' '),
        (MENT_RE, ' '),
        (HASH_RE, r' \1 '),
        (NONALPH, ' '),
        (SHORT_RE, ' '),
    ):
        cleaned = pattern.sub(replacement, cleaned)
    # Only ASCII letters and spaces remain here, so split/join collapses and
    # trims the whitespace.
    return ' '.join(cleaned.split())


def tokenizer(s: str):
    """Split `s` on whitespace and return the tokens not in TWITTER_STOP."""
    kept = []
    for tok in s.split():
        if tok in TWITTER_STOP:
            continue
        kept.append(tok)
    return kept

# ‚îÄ‚îÄ Step 1: Stream sample ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Parallel lists: index i of every list describes the same tweet.
sample_texts = []
sample_topics = []
sample_sents  = []
sample_dists  = []
sample_dates  = []

MAX_SAMPLE = 500_000  # adjust up/down based on RAM/time
with gzip.open(DATA_FILE, "rt", encoding="utf-8") as fin:
    for line in fin:
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            # Skip malformed lines only (a bare `except` also traps Ctrl-C).
            continue
        txt = obj.get("text", "")
        if not txt:
            continue

        sample_texts.append(txt)
        sample_topics.append(obj.get("lda_k5_topic_id"))
        sample_sents.append(obj.get("sentiment_label"))
        sample_dists.append(obj.get("lda_k5_topic_dist"))
        # `or ""` also covers an explicit null, which .get()'s default does not.
        sample_dates.append((obj.get("created_at") or "")[:10])  # YYYY-MM-DD

        # Cap on tweets actually kept, not on lines read.
        if len(sample_texts) >= MAX_SAMPLE:
            break

print(f"‚úÖ Loaded {len(sample_texts):,} tweets for analysis")

# -- Step 2: Basic counts --
topic_counts = Counter(sample_topics)
print("\n=== Topic counts (argmax assignment) ===")
# Records without a topic id are counted under None; sort it last instead of
# letting sorted() fail on a None/int comparison.
for t in sorted(topic_counts, key=lambda k: (k is None, k)):
    c = topic_counts[t]
    print(f"Topic {t}: {c:,} ({c/len(sample_texts):.1%})")

# ‚îÄ‚îÄ Step 3: TF-IDF distinctive words per topic (with cleaning) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

# URL_RE / MENT_RE / HASH_RE / NONALPH / SHORT_RE, TWITTER_STOP and
# clean_text_basic() are defined once in the cleaning-helpers section at the
# top of this cell and reused here, so there is a single definition to maintain.

# Domain-global words to suppress across topics
GLOBAL_STOP = {
    'ai','chatgpt','gpt','openai','artificial','intelligence','artificialintelligence',
    'google','bard','microsoft','bing','llm','ml','nlp','dataset','model','models'
}

# Combined stop set, built once rather than on every tokenizer call (the
# tokenizer runs once per tweet).
_TFIDF_STOP = frozenset(TWITTER_STOP | GLOBAL_STOP)


def tokenizer_clean(s: str):
    """Split `s` on whitespace and drop Twitter-boilerplate and domain-global tokens."""
    return [t for t in s.split() if t not in _TFIDF_STOP]

# Vectorizer tuned to drop very common/rare terms and compress TF
tfidf = TfidfVectorizer(
    preprocessor=clean_text_basic,
    tokenizer=tokenizer_clean,
    token_pattern=None,   # a custom tokenizer is supplied; None avoids sklearn's
                          # "token_pattern will not be used" warning
    stop_words='english',
    max_df=0.20,          # drop terms in >20% of docs (kills global boilerplate)
    min_df=50,            # keep terms seen in at least 50 docs (for stability on 500k)
    sublinear_tf=True,    # log(1 + tf)
    max_features=20_000,
)

X = tfidf.fit_transform(sample_texts)
terms = np.array(tfidf.get_feature_names_out())
topics_arr = np.array(sample_topics)

# For each topic: mean TF-IDF inside the topic minus mean TF-IDF in all other
# tweets; the 15 terms with the largest difference are kept.
topic_top_terms = {}
for t in sorted(set(x for x in topics_arr if x is not None)):
    mask_in = (topics_arr == t)
    mask_out = ~mask_in
    # A contrast needs at least one tweet on each side.
    if mask_in.sum() == 0 or mask_out.sum() == 0:
        continue

    mean_in  = X[mask_in].mean(axis=0).A1
    mean_out = X[mask_out].mean(axis=0).A1
    lift = mean_in - mean_out                # contrastive TF-IDF ("topic - rest")

    top_idx = np.argsort(lift)[::-1][:15]
    topic_top_terms[t] = list(terms[top_idx])

print("\n=== Top contrastive TF-IDF terms per topic (cleaned) ===")
for t in sorted(topic_top_terms):
    print(f"Topic {t}: {', '.join(topic_top_terms[t])}")


# --- Step 3b: ngrams per topic ---
def top_ngrams(texts, n=15, ngram_range=(1,3)):
    """Return the `n` most frequent n-grams in `texts` as (ngram, count) pairs.

    Parameters:
        texts: iterable of raw document strings.
        n: number of n-grams to return.
        ngram_range: (min_n, max_n) passed to CountVectorizer.

    Returns:
        List of (ngram, count) tuples, most frequent first. Empty when `texts`
        is empty or when no term survives vectorization.
    """
    texts = list(texts)
    if not texts:
        # CountVectorizer cannot be fitted on zero documents.
        return []
    vec = CountVectorizer(
        stop_words='english',
        ngram_range=ngram_range,
        max_features=50000,
        lowercase=True,
    )
    try:
        X = vec.fit_transform(texts)
    except ValueError:
        # Raised by sklearn for an empty vocabulary (e.g. stop words only).
        return []
    counts = np.array(X.sum(axis=0)).ravel()
    vocab = np.array(vec.get_feature_names_out())
    idx = counts.argsort()[::-1][:n]
    return list(zip(vocab[idx], counts[idx]))

# --- Step 3c: N-grams per topic ---
# Group the texts by topic id in a single pass instead of rescanning the whole
# sample once per topic.
texts_by_topic = {}
for txt, tid in zip(sample_texts, sample_topics):
    texts_by_topic.setdefault(tid, []).append(txt)

topic_ngrams = {}
# A missing topic id (None) sorts last rather than breaking sorted() with a
# None/int comparison.
for t in sorted(texts_by_topic, key=lambda k: (k is None, k)):
    topic_ngrams[t] = top_ngrams(texts_by_topic[t], n=15, ngram_range=(1,3))

print("\n=== Top n-grams per topic ===")
for t, grams in topic_ngrams.items():
    tops = ", ".join([f"{g} ({c:,})" for g,c in grams])
    print(f"Topic {t}: {tops}")






In [None]:
# Define human-readable topic labels
topic_labels = {
    0: "General AI / Tools & Utility",
    1: "Memes / Culture",
    2: "Crypto / NFT / Giveaways / Hype",
    3: "Tech / Research / ML Ethics",
    4: "ChatGPT vs Google Bard"
}

# -- Step 5: Sentiment by topic (stacked proportions) --
# .get() with a generated fallback keeps a missing or unlabelled topic id from
# raising KeyError; such rows appear under "Topic <id>".
df = pd.DataFrame({
    "topic": [topic_labels.get(t, f"Topic {t}") for t in sample_topics],
    "sentiment": sample_sents
})
sent_by_topic = df.groupby(["topic","sentiment"]).size().unstack(fill_value=0)
# Divide each row by its total so every bar sums to 1.
ax = (sent_by_topic
      .div(sent_by_topic.sum(axis=1), axis=0)
      .plot(kind="bar", stacked=True, figsize=(10,5)))
ax.set_title("Sentiment distribution by topic (sample)")
ax.set_ylabel("Proportion")
ax.set_xlabel("Topic")
plt.tight_layout()
plt.show()

# ‚îÄ‚îÄ Step 6: Timeline by topic (monthly proportions) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# One row per tweet with a parseable date; unparseable dates become NaT and are
# dropped. A missing or unlabelled topic id falls back to "Topic <id>" instead
# of raising KeyError.
df_time = pd.DataFrame({
    "topic": [topic_labels.get(t, f"Topic {t}") for t in sample_topics],
    "date": pd.to_datetime(sample_dates, errors="coerce")
}).dropna(subset=["date"])
df_time["month"] = df_time["date"].dt.to_period("M")
timeline = df_time.groupby(["month","topic"]).size().unstack(fill_value=0)
# Divide each month by its total so the lines show shares, not raw counts.
ax = (timeline
      .div(timeline.sum(axis=1), axis=0)
      .plot(figsize=(12,6)))
ax.set_title("Topic prevalence over time (sample)")
ax.set_ylabel("Proportion of tweets")
ax.set_xlabel("Month")
plt.tight_layout()
plt.show()


In [None]:
# ‚îÄ‚îÄ Step 4: Overlap (conditional) + Heatmap + Topic-Overlap Graph ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

# Parameters
THRESH    = 0.25   # topic membership threshold
EDGE_MIN  = 0.25   # only draw edges with conditional overlap > EDGE_MIN
ROUND_TO  = 3      # display rounding for the table/heatmap

# Number of topics, taken from the first record that has a distribution
# (records may carry None when the field was missing).
K = next((len(d) for d in sample_dists if d is not None), 0)
labels_in_order = [topic_labels.get(i, f"Topic {i}") for i in range(K)]

# Keep only distributions of the expected length.
valid_dists = [d for d in sample_dists if d is not None and len(d) == K]
if valid_dists and K:
    # None entries become NaN, and NaN > THRESH is False, i.e. "not a member".
    dist_matrix = np.array(valid_dists, dtype=float)
else:
    dist_matrix = np.empty((0, K), dtype=float)
with np.errstate(invalid='ignore'):
    above = (dist_matrix > THRESH).astype(np.int64)   # rows = tweets, cols = topics

# Build conditional overlap: P(Tj > THRESH | Ti > THRESH)
counts_i = above.sum(axis=0).astype(float)    # denominator per row i
overlap = (above.T @ above).astype(float)     # [i, j] = tweets where both i and j clear THRESH

# Normalize rows (handle zero denominators and set diagonal conventionally to 1)
for i in range(K):
    if counts_i[i] > 0:
        overlap[i, :] /= counts_i[i]
    else:
        overlap[i, i] = 1.0

df_overlap = pd.DataFrame(overlap, index=labels_in_order, columns=labels_in_order)

print(f"\n=== Conditional overlap matrix: P(Tj > {THRESH} | Ti > {THRESH}) ===")
display(df_overlap.round(ROUND_TO))

# Heatmap of the conditional-overlap matrix (matplotlib, single plot)
plt.figure(figsize=(7.5, 6))
plt.imshow(df_overlap.values, aspect='auto')  # default colormap
plt.colorbar(label="Conditional overlap")
tick_positions = np.arange(K)
plt.xticks(ticks=tick_positions, labels=labels_in_order, rotation=45, ha='right')
plt.yticks(ticks=tick_positions, labels=labels_in_order)
plt.title(f"Topic Overlap Heatmap  (threshold={THRESH})")
# Write each cell's value at its centre (x = column j, y = row i).
for (i, j), val in np.ndenumerate(df_overlap.values):
    plt.text(j, i, f"{val:.2f}", ha='center', va='center', fontsize=9)
plt.tight_layout()
plt.show()

# Topic overlap graph (spring layout).
# The matrix is directional (row i, column j = P(Tj | Ti)); for readability one
# undirected edge is drawn per topic pair, weighted by the larger of the two
# directions, and labelled with both values.
G = nx.Graph()
G.add_nodes_from(labels_in_order)

# Add an edge for every unordered pair whose stronger direction exceeds EDGE_MIN.
for i in range(K):
    for j in range(i + 1, K):
        forward, backward = overlap[i, j], overlap[j, i]
        strongest = max(forward, backward)  # symmetric edge weight for drawing
        if not (strongest > EDGE_MIN):
            continue
        G.add_edge(labels_in_order[i], labels_in_order[j],
                   weight=strongest, w_ij=forward, w_ji=backward)

pos = nx.spring_layout(G, seed=42)  # deterministic layout
weights = [attrs['weight'] for _, _, attrs in G.edges(data=True)]
# Square the weight so stronger overlaps stand out, then scale for visibility.
edge_widths = [4 * (w ** 2) for w in weights]

plt.figure(figsize=(8, 6))
nx.draw_networkx_nodes(G, pos, node_size=1200)
nx.draw_networkx_labels(G, pos, font_size=9)
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.6)
# Edge labels show both stored directions, rounded to two decimals.
edge_labels = {}
for u, v, attrs in G.edges(data=True):
    edge_labels[(u, v)] = f"{attrs['w_ij']:.2f}/{attrs['w_ji']:.2f}"
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)
plt.title(f"Topic Overlap Graph (edges where overlap > {EDGE_MIN})")
plt.axis('off')
plt.tight_layout()
plt.show()


In [None]:
# ‚îÄ‚îÄ Step 6b-alt: Sentiment over time per topic (stacked AREA, proportions) ‚îÄ‚îÄ

# One row per tweet with a parseable date. A missing or unlabelled topic id
# falls back to "Topic <id>" instead of raising KeyError.
df_time_sent = pd.DataFrame({
    "topic": [topic_labels.get(t, f"Topic {t}") for t in sample_topics],
    "date": pd.to_datetime(sample_dates, errors="coerce"),
    "sentiment": sample_sents
}).dropna(subset=["date"])
df_time_sent["month"] = df_time_sent["date"].dt.to_period("M").astype(str)

# Count tweets by (month, topic, sentiment)
counts_mts = (df_time_sent
              .groupby(["month", "topic", "sentiment"])
              .size()
              .rename("n")
              .reset_index())

# Convert counts to proportions within (month, topic).
# transform("sum") broadcasts each group's total back onto its rows, so the
# "month" and "topic" columns are always kept. groupby.apply() returning whole
# frames is deprecated for this and can drop the grouping columns.
group_totals = counts_mts.groupby(["month", "topic"])["n"].transform("sum")
props_mts = (counts_mts
             .assign(prop=counts_mts["n"] / group_totals)
             .reset_index(drop=True))

months_sorted = sorted(props_mts["month"].unique())
sent_order = ["negative","neutral","positive"]

# Topic labels in ascending topic-id order, de-duplicated. A missing id (None)
# sorts last and unlabelled ids fall back to "Topic <id>", so neither sorted()
# nor the label lookup can raise.
topics_order = list(dict.fromkeys(
    topic_labels.get(t, f"Topic {t}")
    for t in sorted(set(sample_topics), key=lambda k: (k is None, k))
))

# One stacked-area chart per topic: share of each sentiment per month.
for topic_name in topics_order:
    sub = props_mts[props_mts["topic"] == topic_name]
    if sub.empty:
        # No dated tweets for this topic: nothing to plot.
        continue
    pivot = (sub.pivot_table(index="month", columns="sentiment", values="prop", fill_value=0.0)
                 .reindex(index=months_sorted, fill_value=0.0))
    # Known sentiments first in a fixed order, then any other label present.
    cols_present = [c for c in sent_order if c in pivot.columns] + [c for c in pivot.columns if c not in sent_order]
    pivot = pivot[cols_present]

    ax = pivot.plot(kind="area", stacked=True, figsize=(12,5), alpha=0.8)
    ax.set_ylim(0,1)
    ax.set_ylabel("Proportion")
    ax.set_xlabel("Month")
    ax.set_title(f"Sentiment over time ‚Äî {topic_name}")
    ax.legend(title="Sentiment", loc="upper right", ncol=3, frameon=False)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()



## TEMPORARY TESTS --

In [None]:
import os
# List the contents of DATA_DIR. DATA_DIR is not assigned in this cell, so this
# depends on a cell that defines it having been run earlier in the session.
print(os.listdir(DATA_DIR))


In [None]:
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Quick author-doc training (in-memory, capped) ‚Üí new LDA + vectorizer (K=5)
# Keeps hashtags/tickers, de-emphasizes RT boilerplate, light domain stoplist
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
from pathlib import Path
import gzip, json, re, unicodedata, numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from joblib import dump

# --- Paths (match your project) ---
BASE       = Path('/content/drive/My Drive/Colab Projects/AI Public Trust')
DATA_DIR   = BASE / 'Data Sets' / 'Cleaned Data'
MODELS_DIR = BASE / 'Models' / 'Topic Modeling' / 'LDA'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Input: any merged gz with 'text' works (sentiment-only or enriched)
INPUT_GZ = DATA_DIR / 'AItrust_pruned_twits_with_sentiment_and_topics_k5.jsonl.gz'

# --- Small, safe knobs ---
MAX_TWEETS_READ        = 2_000_000   # how many tweets to stream for building author docs
MAX_AUTHORS            = 150_000     # cap unique authors
MIN_TWEETS_PER_AUTHOR  = 3           # need at least this many tweets to keep an author-doc
MAX_DOCS_FOR_TRAIN     = 200_000     # cap author-docs used to fit LDA
K_TARGET               = 5           # <-- five topics

# Outputs: NEW names so nothing gets overwritten.
# The topic count in the tag and in the model filename is derived from K_TARGET
# so the artifact names cannot disagree with the model that was fitted
# (for K_TARGET = 5 the names are identical to the previous hard-coded ones).
RUN_TAG          = f'authorlite_k{K_TARGET}'
VECT_PATH_NEW    = MODELS_DIR / f'{RUN_TAG}_vectorizer.joblib'
MODEL_PATH_NEW   = MODELS_DIR / f'{RUN_TAG}_lda_k{K_TARGET}.joblib'
META_JSON_NEW    = MODELS_DIR / f'{RUN_TAG}_topics_metadata.json'

# --- Cleaning (keeps hashtags/tickers; uses original text for RTs when present) ---
url_re   = re.compile(r'https?://\S+')   # http/https links
space_re = re.compile(r'\s+')            # any run of whitespace


def normalize_text(s: str) -> str:
    """Lower-case `s`, blank out URLs, apply NFKC, and collapse whitespace.

    Hashtags, mentions and tickers are left untouched.
    """
    without_urls = url_re.sub(' ', s.lower())
    folded = unicodedata.normalize("NFKC", without_urls)
    return space_re.sub(' ', folded).strip()

def get_text(obj):
    """Return the text to analyse for one tweet record.

    For records whose 'type' is 'retweeted', the 'text' of
    'referenced_tweets_dictionary' is preferred when it is present and
    non-empty; otherwise the record's own 'text' is used ('' when missing).
    """
    own_text = obj.get('text') or ''
    if obj.get('type') != 'retweeted':
        return own_text
    referenced = obj.get('referenced_tweets_dictionary') or {}
    return referenced.get('text') or own_text

# --- Build author docs (in memory, bounded) ---
auth_texts = defaultdict(list)   # author id -> list of normalized tweet texts
with gzip.open(INPUT_GZ, 'rt', encoding='utf-8') as fin:
    for i, line in enumerate(fin, 1):
        if i > MAX_TWEETS_READ:
            break
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue
        aid = str(obj.get('author_id') or '')
        if not aid:
            continue
        # Normalize before testing for retweet shells: the prefix test is
        # lower-case, so it can only match lower-cased text.
        txt = normalize_text(get_text(obj))
        if not txt or txt.startswith('rt @'):  # drop bare "RT @" shells
            continue
        # Soft cap on the number of authors: once MAX_AUTHORS distinct authors
        # are held, tweets from new authors are skipped while authors already
        # seen keep accumulating.
        if aid not in auth_texts and len(auth_texts) >= MAX_AUTHORS:
            continue
        auth_texts[aid].append(txt)

# filter to authors with enough tweets, then cap total docs
author_docs = [" \n".join(v) for (a, v) in auth_texts.items() if len(v) >= MIN_TWEETS_PER_AUTHOR]
if len(author_docs) > MAX_DOCS_FOR_TRAIN:
    author_docs = author_docs[:MAX_DOCS_FOR_TRAIN]

print(f"Author docs for training: {len(author_docs):,} (from ~{len(auth_texts):,} authors)")

# --- Light domain stoplist removal (so generic words don‚Äôt dominate) ---
# Generic domain and boilerplate words removed from the author documents.
DOMAIN_STOP = {
    'ai','chatgpt','gpt','openai','bard','google','bing','microsoft',
    'rt','https','http','amp'
}


def strip_domain_terms(text: str) -> str:
    """Replace every whole-word occurrence of a DOMAIN_STOP term in `text` with a space.

    Matching is case-sensitive and bounded by regex word boundaries on both sides.
    """
    alternation = '|'.join(re.escape(term) for term in DOMAIN_STOP)
    pattern = r'\b(' + alternation + r')\b'
    return re.sub(pattern, ' ', text)

# Remove the domain stoplist terms from every author document before vectorizing.
author_docs = [strip_domain_terms(t) for t in author_docs]

# --- Vectorize (keep hashtags/tickers, include bigrams) ---
vect = CountVectorizer(
    lowercase=True,
    stop_words='english',
    token_pattern=r"(?u)\b[#@$]?[a-zA-Z0-9_]{2,}\b",  # keep #aiart $BTC etc.
    ngram_range=(1,2),
    min_df=5,
    max_features=75_000
)
X = vect.fit_transform(author_docs)
print("Vocab size:", len(vect.get_feature_names_out()))

# --- LDA fit (batch is stable for topic quality) ---
# NOTE(review): scikit-learn documents learning_decay as a parameter of the
# online learning method -- confirm it is intended alongside 'batch'.
lda = LatentDirichletAllocation(
    n_components=K_TARGET,
    learning_method='batch',
    learning_decay=0.9,
    random_state=42,
    max_iter=30,
    evaluate_every=5,
    n_jobs=-1
)
lda.fit(X)
# Report the topic count that was actually fitted, not a hard-coded number.
print(f"LDA trained for K={K_TARGET}.")

# --- Save artifacts ---
dump(vect, VECT_PATH_NEW)
dump(lda,  MODEL_PATH_NEW)

# --- Quick topic preview & metadata (no heuristic labels) ---
import json
terms = np.array(vect.get_feature_names_out())
TOPN  = 20
# For each topic, the TOPN vocabulary terms with the largest component weight.
topics_meta = []
for t in range(K_TARGET):
    comp = lda.components_[t]
    top_idx = comp.argsort()[::-1][:TOPN]
    top_terms = terms[top_idx].tolist()
    topics_meta.append({"topic_id": t, "top_terms": top_terms})

# Read the recorded vectorizer settings from the fitted object instead of
# re-typing them, so the metadata cannot drift from the vectorizer that was
# saved. (ngram_range is a tuple and is written as a JSON list.)
fitted_params = vect.get_params()
recorded_param_names = ("token_pattern", "ngram_range", "min_df", "max_features", "stop_words")

with open(META_JSON_NEW, 'w', encoding='utf-8') as f:
    json.dump({
        "k": K_TARGET,
        "vectorizer_params": {name: fitted_params[name] for name in recorded_param_names},
        "domain_stop": sorted(DOMAIN_STOP),
        "topics": topics_meta
    }, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Saved: {VECT_PATH_NEW.name}, {MODEL_PATH_NEW.name}, {META_JSON_NEW.name}")
print("\nPreview (top terms only):")
for m in topics_meta:
    print(f"Topic {m['topic_id']:>2}: " + ", ".join(m['top_terms'][:12]))



In [None]:
# %%
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# 10. Enrich tweets with author-trained topics (safe, batched, non-overwriting)
# ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
import os, re, json, time, gzip, io, math, unicodedata
from pathlib import Path
import numpy as np
from joblib import load
from collections import Counter

# ‚îÄ‚îÄ Paths (match Ignacio layout) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
BASE       = Path('/content/drive/My Drive/Colab Projects/AI Public Trust')
DATA_DIR   = BASE / 'Data Sets' / 'Cleaned Data'
MODELS_DIR = BASE / 'Models' / 'Topic Modeling' / 'LDA'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Candidate inputs in order of preference; the first one that exists is used.
# Both gzip and plain JSONL are accepted.
PREFERRED_INPUTS = [
    DATA_DIR / candidate_name
    for candidate_name in (
        'AItrust_pruned_twits_with_sentiment_and_topics_k5.jsonl.gz',  # merged+gz
        'AItrust_pruned_twits_with_sentiment_and_topics_k5.jsonl',     # merged (plain)
        'AItrust_pruned_twits_with_sentiment.jsonl.gz',                # pre-topic (gz)
        'AItrust_pruned_twits_with_sentiment.jsonl',                   # pre-topic (plain)
    )
]

INPUT_PATH = next((cand for cand in PREFERRED_INPUTS if cand.exists()), None)
if INPUT_PATH is None:
    raise FileNotFoundError("No input file found. Expected one of: " + ", ".join(map(str, PREFERRED_INPUTS)))

# ‚îÄ‚îÄ Which author model to use (from the quick author-doc training cell) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# NOTE(review): the quick author-doc training cell in this notebook uses
# RUN_TAG = 'authorlite_k5' with K_TARGET = 5 and writes
# '<RUN_TAG>_vectorizer.joblib', '<RUN_TAG>_lda_k5.joblib' and
# '<RUN_TAG>_topics_metadata.json'. Those names do not line up with the ones
# built below -- confirm which training run these artifacts come from.
RUN_TAG        = 'authorlite'   # keep in sync with your training cell
K_TARGET       = 15             # keep in sync with your training cell
VECT_PATH_NEW  = MODELS_DIR / f'{RUN_TAG}_vectorizer.joblib'
MODEL_PATH_NEW = MODELS_DIR / f'{RUN_TAG}_lda_k{K_TARGET}.joblib'
META_JSON_NEW  = MODELS_DIR / f'{RUN_TAG}_k{K_TARGET}_topics_metadata.json'

# Stop at the first artifact that is not on disk.
for artifact in (VECT_PATH_NEW, MODEL_PATH_NEW, META_JSON_NEW):
    if artifact.exists():
        continue
    raise FileNotFoundError(f"Missing model artifact: {artifact.name}. "
                            f"Run the author-doc training cell first.")

# -- Outputs: every name carries the run tag and K, so other runs' outputs are
# never overwritten --
OUT_STEM       = f'AItrust_with_author_topics_{RUN_TAG}_k{K_TARGET}'
OUTPUT_JSONL   = DATA_DIR / f'{OUT_STEM}.jsonl'           # final (optional merge target)
BLOCK_BASENAME = f'{OUT_STEM}_block'                      # block prefix
WRITE_BLOCKS   = True
READ_CHUNK     = 200_000                                  # how many lines per block file
BATCH_SIZE     = 50_000                                   # vectorize/transform in batches

# ‚îÄ‚îÄ Minimal normalization (match trainer) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
url_re   = re.compile(r'https?://\S+')   # http/https links
space_re = re.compile(r'\s+')            # any run of whitespace


def normalize_text(s: str) -> str:
    """Lower-case `s`, blank out URLs, apply NFKC, and collapse whitespace."""
    without_urls = url_re.sub(' ', s.lower())
    folded = unicodedata.normalize("NFKC", without_urls)
    return space_re.sub(' ', folded).strip()

def get_text(obj):
    """Return the text to score for one tweet record.

    For records whose 'type' is 'retweeted', the 'text' of
    'referenced_tweets_dictionary' is preferred when it is present and
    non-empty; otherwise the record's own 'text' is used ('' when missing).
    """
    own_text = obj.get('text') or ''
    if obj.get('type') != 'retweeted':
        return own_text
    referenced = obj.get('referenced_tweets_dictionary') or {}
    return referenced.get('text') or own_text

# ‚îÄ‚îÄ Load model artifacts & labels ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Fitted vectorizer and LDA model saved by the training step.
vect = load(VECT_PATH_NEW)
lda  = load(MODEL_PATH_NEW)

with open(META_JSON_NEW, 'r', encoding='utf-8') as f:
    meta = json.load(f)

# Map topic id -> label. An entry without a 'label' key gets "Topic <id>".
topic_labels = {}
for m in meta.get('topics', []):
    topic_labels[m['topic_id']] = m.get('label', f'Topic {m["topic_id"]}')
# Make sure every id in 0..K_TARGET-1 has a label even if the metadata lacks it.
for t in range(K_TARGET):
    if t not in topic_labels:
        topic_labels[t] = f'Topic {t}'

# ‚îÄ‚îÄ Optional: clear old blocks from this RUN_TAG (safe) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Remove block files left by a previous run with this output stem. A file that
# could not be removed is reported rather than silently ignored: anything still
# matching this prefix would be picked up by a later glob-based merge.
for old in DATA_DIR.glob(BLOCK_BASENAME + "*.json"):
    try:
        old.unlink()
    except FileNotFoundError:
        # Already gone: nothing to do.
        pass
    except OSError as exc:
        print(f"WARNING: could not remove stale block {old.name}: {exc}")

# ‚îÄ‚îÄ Reader (auto-detect .gz) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
def open_any(path: Path):
    """Open `path` for text reading, through gzip when its name ends in '.gz'.

    Text is decoded as UTF-8 and undecodable bytes are dropped
    (errors='ignore'). Returns the open file object.
    """
    is_gzip = str(path).endswith('.gz')
    opener, mode = (gzip.open, 'rt') if is_gzip else (open, 'r')
    return opener(path, mode, encoding='utf-8', errors='ignore')

# ‚îÄ‚îÄ Batched enrichment ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
# Run banner: which input, model and vectorizer are used and how output is written.
for banner_line in (
    f"Input : {INPUT_PATH.name}",
    f"Model : {MODEL_PATH_NEW.name}",
    f"Vector: {VECT_PATH_NEW.name}",
    f"Write : blocks={WRITE_BLOCKS}  chunk={READ_CHUNK:,}  batch={BATCH_SIZE:,}",
    f"Out   : stem={OUT_STEM}",
):
    print(banner_line)

# Counters and buffers shared with flush_block() / process_batch().
start_time      = time.time()
processed_total = written_total = 0
block_idx       = 1           # index used in the next block file name
topic_counter   = Counter()   # tweets assigned to each topic id

buffer_lines    = []      # for block flush
buffer_json     = []      # for vectorization batch

def flush_block():
    """Write the buffered lines to the next numbered block file and reset the buffer.

    Does nothing when the buffer is empty. Otherwise it writes one line per
    buffered entry, adds the number written to `written_total`, empties
    `buffer_lines` and advances `block_idx`.
    """
    global block_idx, written_total, buffer_lines
    if len(buffer_lines) == 0:
        return
    block_path = DATA_DIR / f"{BLOCK_BASENAME}{block_idx:03d}.json"
    with open(block_path, 'w', encoding='utf-8') as fout:
        # Each entry is followed by exactly one newline.
        fout.writelines(entry + "\n" for entry in buffer_lines)
    print(f"Wrote block {block_idx:03d} with {len(buffer_lines):,} lines ‚Üí {block_path.name}")
    written_total += len(buffer_lines)
    block_idx += 1
    buffer_lines = []

def process_batch(objs):
    """Assign an LDA topic to every record in `objs` and serialize the results.

    Each record is modified in place: three keys prefixed with
    lda_<RUN_TAG>_k<K_TARGET>_topic_ are added (id, label, dist) and no existing
    key is removed. `topic_counter` is incremented for each assigned topic id.

    Args:
        objs: List of parsed tweet records (must support item assignment).

    Returns:
        A list of JSON strings, one per record, in input order; an empty list
        when `objs` is empty.
    """
    if not objs:
        return []

    # The key names do not depend on the record, so build them once per batch.
    id_key = f'lda_{RUN_TAG}_k{K_TARGET}_topic_id'
    label_key = f'lda_{RUN_TAG}_k{K_TARGET}_topic_label'
    dist_key = f'lda_{RUN_TAG}_k{K_TARGET}_topic_dist'

    # A record with no text is scored as an empty string.
    cleaned = [normalize_text(get_text(rec) or "") for rec in objs]
    # One row of topic weights per record -- presumably (len(objs), K); TODO confirm.
    doc_topic = lda.transform(vect.transform(cleaned))

    serialized = []
    for rec, weights in zip(objs, doc_topic):
        best = int(np.argmax(weights))  # dominant topic = index of the largest weight
        best_label = topic_labels.get(best, f"Topic {best}")
        rec[id_key] = best
        rec[label_key] = best_label
        rec[dist_key] = [float(w) for w in weights]  # plain floats so json can encode them
        topic_counter[best] += 1
        serialized.append(json.dumps(rec, ensure_ascii=False))
    return serialized

# ── Stream, batch, write ──────────────────────────────────────────────────────
# Input lines that could not be enriched: invalid JSON, or valid JSON that is not
# an object. Counted so the loss is visible instead of silent.
skipped_total = 0

with open_any(INPUT_PATH) as fin:
    # Single-file mode keeps one output handle open for the whole run; block mode
    # writes through flush_block() and needs no handle here.
    out_main = None if WRITE_BLOCKS else open(OUTPUT_JSONL, 'w', encoding='utf-8')

    try:
        for line in fin:
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                skipped_total += 1
                continue
            if not isinstance(obj, dict):
                # process_batch() adds keys to each record, so only JSON objects
                # can be enriched; anything else would fail mid-batch.
                skipped_total += 1
                continue
            buffer_json.append(obj)
            processed_total += 1

            # Score and emit a full batch.
            if len(buffer_json) >= BATCH_SIZE:
                enriched = process_batch(buffer_json)
                buffer_json = []
                if out_main is not None:
                    out_main.write("\n".join(enriched) + "\n")
                    written_total += len(enriched)
                else:
                    buffer_lines.extend(enriched)
                    if len(buffer_lines) >= READ_CHUNK:
                        flush_block()

            # Progress report every 200,000 parsed records.
            if processed_total % 200_000 == 0:
                elapsed = time.time() - start_time
                rate = processed_total / max(elapsed, 1e-9)  # guard against a zero elapsed time
                print(f"Progress: {processed_total:,} processed | {rate:,.0f}/s | {int(elapsed)}s elapsed")

        # Tail batch: whatever is left after the last full batch.
        if buffer_json:
            enriched = process_batch(buffer_json)
            buffer_json = []
            if out_main is not None:
                out_main.write("\n".join(enriched) + "\n")
                written_total += len(enriched)
            else:
                buffer_lines.extend(enriched)

        # Block mode: write out the final, possibly partial, block.
        if out_main is None:
            flush_block()

    finally:
        # Runs on success, on any exception and on KeyboardInterrupt, so the output
        # handle is never leaked; close() also flushes pending writes.
        if out_main is not None:
            out_main.close()

if skipped_total:
    print(f"Skipped {skipped_total:,} input line(s) that were not valid JSON objects")

# Final report: totals, per-topic counts, and where the output actually went.
elapsed = time.time() - start_time
print("\n‚úÖ Author-topic enrichment done.")
print(f"Processed: {processed_total:,} | Written: {written_total:,} | Elapsed: {int(elapsed)}s")
print("Counts by topic_id:")
for tid in range(K_TARGET):
    # str() keeps the padded format working if a label loaded from JSON is not a string.
    print(f"  {tid:2d} ({str(topic_labels[tid]):25}): {topic_counter.get(tid,0):,}")

print("\nNote:")
# Describe the destination that was really used: block files only exist when
# WRITE_BLOCKS is on; otherwise everything went to the single OUTPUT_JSONL file.
if WRITE_BLOCKS:
    output_desc = f"blocks named {BLOCK_BASENAME}###.json in: {DATA_DIR}"
else:
    output_desc = f"a single JSONL file: {OUTPUT_JSONL}"
print(f"‚Ä¢ Outputs saved as {output_desc}")
print(f"‚Ä¢ This did NOT overwrite the  previous k=5 fields; it added new keys prefixed with lda_{RUN_TAG}_k{K_TARGET}_‚Ä¶")
