# Topic Modeling

In [None]:
import os
import gc
import math
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# --------------------------------------------------------------------------
# 1) FILE PATHS AND SETTINGS
# --------------------------------------------------------------------------
# Input CSV holding the raw tweets, and the name of the column with the text.
CSV_FILE_PATH = "../data/csv/tweets.csv" 
TEXT_COLUMN_NAME = "Tweet"        
# Files written by embed_tweets(): the embedding matrix and a table with
# the "tweet_id" and text columns.
TWEET_EMBEDDINGS_OUTPUT_PATH = "../data/npy/tweet_embeddings.npy"
TWEETS_METADATA_PATH = "../data/csv/tweets_metadata.csv"

# Number of tweets handed to encode() per iteration of the loop in
# embed_tweets(); the model-level batch size is set separately at that call.
BATCH_SIZE_EMBEDDING = 256
# Worker count given to encode() as its num_workers argument.
NUM_WORKERS_EMBEDDING = 4
# Model name passed to the SentenceTransformer constructor.
TRANSFORMER_MODEL_NAME = "all-mpnet-base-v2"

def embed_tweets():
    """
    Embed every tweet of the input CSV with Sentence-BERT.

    The CSV at CSV_FILE_PATH is read in one go (no chunking). Two files are
    written, creating their parent directories when needed:
      1) TWEETS_METADATA_PATH         - CSV with "tweet_id" and the text column
      2) TWEET_EMBEDDINGS_OUTPUT_PATH - float32 array with one row per tweet,
                                        in the same row order as the metadata

    Raises:
        FileNotFoundError: if CSV_FILE_PATH does not exist.
        ValueError: if TEXT_COLUMN_NAME is not a column of the CSV.
    """
    import inspect  # local import: only used to probe encode()'s signature

    if not os.path.exists(CSV_FILE_PATH):
        raise FileNotFoundError(f"CSV file not found: {CSV_FILE_PATH}")

    df_local = pd.read_csv(CSV_FILE_PATH, encoding='utf-8')
    if TEXT_COLUMN_NAME not in df_local.columns:
        raise ValueError(f"Column '{TEXT_COLUMN_NAME}' not found in CSV.")

    # Convert all tweets to strings. Missing cells are mapped to "" first;
    # otherwise astype(str) turns them into the literal text "nan", which
    # would then be embedded as if it were a real tweet.
    tweets = df_local[TEXT_COLUMN_NAME].fillna("").astype(str).tolist()
    num_tweets = len(tweets)
    print(f"Loaded {num_tweets} tweets from {CSV_FILE_PATH}.")

    # Keep an existing ID column, else create a 1-based index ID
    if "tweet_id" not in df_local.columns:
        df_local["tweet_id"] = df_local.index + 1  # simple numeric ID

    # Make sure both output locations exist before any (slow) work is done,
    # so a missing directory fails fast instead of after embedding.
    for out_path in (TWEETS_METADATA_PATH, TWEET_EMBEDDINGS_OUTPUT_PATH):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Save tweet metadata
    df_local[["tweet_id", TEXT_COLUMN_NAME]].to_csv(TWEETS_METADATA_PATH, index=False, encoding='utf-8')
    print(f"Saved tweets metadata to {TWEETS_METADATA_PATH}")

    # 2) Embedding
    print(f"Embedding {num_tweets} tweets in batches of {BATCH_SIZE_EMBEDDING}...")
    embed_model = SentenceTransformer(TRANSFORMER_MODEL_NAME)

    encode_kwargs = {
        "show_progress_bar": False,
        "batch_size": 8,  # smaller sub-batch to reduce memory usage
    }
    # Not every sentence-transformers release exposes a `num_workers`
    # parameter on encode(); passing it blindly can raise TypeError, so it
    # is only forwarded when the installed signature declares it.
    if "num_workers" in inspect.signature(embed_model.encode).parameters:
        encode_kwargs["num_workers"] = NUM_WORKERS_EMBEDDING

    batch_arrays = []
    num_batches = math.ceil(num_tweets / BATCH_SIZE_EMBEDDING)

    for batch_idx in range(num_batches):
        start_idx = batch_idx * BATCH_SIZE_EMBEDDING
        batch_texts = tweets[start_idx:start_idx + BATCH_SIZE_EMBEDDING]

        # Encode this batch; keep it as one float32 block instead of
        # scattering its rows into a Python list.
        batch_embeds = embed_model.encode(batch_texts, **encode_kwargs)
        batch_arrays.append(np.asarray(batch_embeds, dtype=np.float32))
        print(f"Processed batch {batch_idx+1}/{num_batches} ({len(batch_texts)} tweets).")

    if batch_arrays:
        tweet_embeddings = np.concatenate(batch_arrays, axis=0)
    else:
        # No tweets at all: still save a 2-D array so that row indexing in
        # later steps does not fail on a 1-D array.
        tweet_embeddings = np.empty((0, 0), dtype=np.float32)
    print(f"Final embeddings shape: {tweet_embeddings.shape}")

    # Save embeddings to disk
    np.save(TWEET_EMBEDDINGS_OUTPUT_PATH, tweet_embeddings)
    print(f"Embeddings saved to {TWEET_EMBEDDINGS_OUTPUT_PATH}")

    # Drop the large objects (model included) before returning.
    del batch_arrays, tweet_embeddings, embed_model
    gc.collect()

# Entry point: run the embedding step when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    embed_tweets()
    print("Step 1 (tweets embedding) completed successfully.")



In [None]:
import os
import gc
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.corpora.dictionary import Dictionary

from bertopic import BERTopic
import umap

from sklearn.neighbors import NearestNeighbors

# -----------------------------------------------------------------------------
# 1) FILE PATHS
# -----------------------------------------------------------------------------
# Inputs: these must be the very files written by the embedding step, which
# saves them under "../data/" relative to the working directory. Using a
# different prefix here makes this step fail with FileNotFoundError (or read
# stale files) when both steps run from the same directory.
TWEET_EMBEDDINGS_OUTPUT_PATH = "../data/npy/tweet_embeddings.npy"
TWEETS_METADATA_PATH         = "../data/csv/tweets_metadata.csv"

# Outputs of this step, stored next to the metadata CSV.
TWEETS_WITH_TOPICS_CSV       = "../data/csv/tweets_with_topics.csv"
TOPIC_INFO_CSV               = "../data/csv/tweets_topic_info.csv"

# -----------------------------------------------------------------------------
# 2) HYPERPARAMETERS FOR TOPIC MODELING
# -----------------------------------------------------------------------------
# Text-cleaning resources used by preprocess_text().
# NOTE: stopwords.words() needs the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download("stopwords")).
STOP_WORDS      = set(stopwords.words('english'))
STEMMER         = PorterStemmer()

# Vocabulary pruning thresholds passed to Dictionary.filter_extremes().
NO_BELOW        = 2      # Filter out tokens that appear in fewer than 2 tweets
NO_ABOVE        = 0.90   # Filter out tokens that appear in more than 90% of tweets

# UMAP settings (n_neighbors / n_components) used in fit_topic_model().
UMAP_N_NEIGHBORS   = 15
UMAP_N_COMPONENTS  = 10
# BERTopic settings (min_topic_size / n_gram_range / nr_topics).
MIN_TOPIC_SIZE     = 5
N_GRAM_RANGE       = (1, 3)
NR_TOPICS          = 25

# -----------------------------------------------------------------------------
# 3) BASIC PREPROCESSING
# -----------------------------------------------------------------------------
def preprocess_text(text: str):
    """
    Turn raw text into a list of stemmed tokens.

    The text is lowercased, every character other than a-z or whitespace is
    replaced by a space, and the result is split on whitespace. Tokens that
    are stopwords or have two characters or fewer are discarded; the rest are
    stemmed with the module-level stemmer and returned in their original order.
    """
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())

    stems = []
    for word in letters_only.split():
        # Skip very short words and stopwords before stemming
        if len(word) <= 2 or word in STOP_WORDS:
            continue
        stems.append(STEMMER.stem(word))
    return stems

# -----------------------------------------------------------------------------
# 4) OPTIONAL: OUTLIER REASSIGNMENT
# -----------------------------------------------------------------------------
def reassign_outliers_to_nearest_cluster(assigned_topics, embeddings):
    """
    Reassign outlier documents (topic -1) to the topic of their nearest
    non-outlier document.

    "Nearest" is decided by a 1-nearest-neighbour search over the embeddings
    using NearestNeighbors with its default metric (Euclidean distance in
    scikit-learn).

    Args:
        assigned_topics: sequence of topic ids, one per document; -1 marks an
            outlier. The input is not modified.
        embeddings: array-like with one row per document, in the same order
            as assigned_topics.

    Returns:
        A new numpy array of topic ids. It is unchanged relative to the input
        when there are no outliers, or when every document is an outlier
        (there is then no cluster to assign them to).
    """
    assigned_topics = np.array(assigned_topics)  # np.array copies, so the caller's data is untouched
    embeddings = np.asarray(embeddings)          # allow list-like input for mask indexing

    outlier_mask = (assigned_topics == -1)
    if not outlier_mask.any():
        return assigned_topics  # No outliers => no changes

    non_outlier_mask = ~outlier_mask
    if not non_outlier_mask.any():
        # Fitting NearestNeighbors on zero samples would raise; with no
        # clustered document there is nothing to reassign to.
        print(f"Found {outlier_mask.sum()} outliers but no clustered documents; leaving topics unchanged.")
        return assigned_topics

    print(f"Found {outlier_mask.sum()} outliers. Reassigning them to the nearest cluster...")

    # Index of the closest non-outlier row for every outlier row
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(embeddings[non_outlier_mask])
    _, nearest = nbrs.kneighbors(embeddings[outlier_mask])

    # Boolean-mask selection keeps row order, so the i-th outlier lines up
    # with nearest[i]. The right-hand side is a copy taken before assignment.
    non_outlier_topics = assigned_topics[non_outlier_mask]
    assigned_topics[outlier_mask] = non_outlier_topics[nearest[:, 0]]

    return assigned_topics

# -----------------------------------------------------------------------------
# 5) MAIN PIPELINE: LOAD EMBEDDINGS, PREPROCESS TEXT, FIT BERTopic
# -----------------------------------------------------------------------------
def fit_topic_model():
    """
    Fit BERTopic on the tweets using the precomputed embeddings.

    Steps: load the metadata CSV and the embedding matrix, tokenize each tweet
    with preprocess_text(), prune the vocabulary with a Gensim Dictionary,
    drop tweets left without tokens (and their embedding rows), fit BERTopic
    with a custom UMAP model, reassign outliers (-1) to the nearest cluster,
    and write two CSV files:
      - TWEETS_WITH_TOPICS_CSV: the retained tweets with "tokens" and "topic_id"
      - TOPIC_INFO_CSV: the table returned by BERTopic.get_topic_info()

    Raises:
        FileNotFoundError: if the metadata or embeddings file is missing.
        ValueError: if the embeddings do not have one row per metadata row,
            or if no tweet survives preprocessing.
    """
    # 1) Check file existence
    if not os.path.exists(TWEETS_METADATA_PATH):
        raise FileNotFoundError(f"Tweet metadata not found at: {TWEETS_METADATA_PATH}")
    if not os.path.exists(TWEET_EMBEDDINGS_OUTPUT_PATH):
        raise FileNotFoundError(f"Tweet embeddings not found at: {TWEET_EMBEDDINGS_OUTPUT_PATH}")

    # 2) Load metadata and embeddings
    df_tweets = pd.read_csv(TWEETS_METADATA_PATH, encoding='utf-8')
    tweet_embeddings = np.load(TWEET_EMBEDDINGS_OUTPUT_PATH)

    print(f"Number of tweets loaded: {len(df_tweets)}")
    print(f"Embeddings shape: {tweet_embeddings.shape}")

    # Rows are matched by position below, so a size mismatch would silently
    # pair tweets with the wrong vectors (or fail later with an IndexError).
    if tweet_embeddings.ndim != 2 or tweet_embeddings.shape[0] != len(df_tweets):
        raise ValueError(
            f"Embeddings shape {tweet_embeddings.shape} does not match "
            f"{len(df_tweets)} metadata rows."
        )

    # 3) Preprocess text & filter out empty tokens
    #    Missing tweets are mapped to "" so they are dropped here instead of
    #    being tokenized as the literal word "nan".
    df_tweets["tokens"] = df_tweets["Tweet"].fillna("").astype(str).apply(preprocess_text)
    # .copy() makes the filtered frame independent, so the column
    # assignments below do not write into a slice of the original frame.
    df_tweets = df_tweets[df_tweets["tokens"].apply(len) > 0].copy()
    print(f"Number of tweets after removing empty tokens: {len(df_tweets)}")

    # 4) Filter extreme tokens via Gensim Dictionary
    dictionary = Dictionary(df_tweets["tokens"])
    dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE)

    # Rebuild tokens so only in-dictionary words remain
    kept_vocab = dictionary.token2id
    df_tweets["tokens"] = [
        [w for w in tok_list if w in kept_vocab]
        for tok_list in df_tweets["tokens"]
    ]
    df_tweets = df_tweets[df_tweets["tokens"].apply(len) > 0].copy()
    print(f"Number of tweets after dictionary filtering: {len(df_tweets)}")

    if df_tweets.empty:
        raise ValueError("No tweets left after preprocessing; nothing to model.")

    # 5) Filter embeddings to match the final set of tweets
    #    read_csv gives a 0..n-1 index and the filters above keep the original
    #    labels, so the index values are the row positions in the matrix.
    retained_indices = df_tweets.index.values
    filtered_embeddings = tweet_embeddings[retained_indices, :]
    print(f"Filtered embeddings shape: {filtered_embeddings.shape}")

    # 6) Prepare text input for BERTopic
    text_for_model = [" ".join(tok_list) for tok_list in df_tweets["tokens"]]

    # 7) Configure BERTopic
    my_umap = umap.UMAP(
        n_neighbors=UMAP_N_NEIGHBORS,
        n_components=UMAP_N_COMPONENTS,
        min_dist=0.1
    )

    # NOTE(review): calculate_probabilities=True is kept as-is, but the
    # probabilities returned by fit_transform are discarded below.
    topic_model = BERTopic(
        umap_model=my_umap,
        min_topic_size=MIN_TOPIC_SIZE,
        n_gram_range=N_GRAM_RANGE,
        nr_topics=NR_TOPICS,
        calculate_probabilities=True,
        verbose=True
    )

    # 8) Fit BERTopic
    print("Fitting BERTopic on tweets...")
    assigned_topics, _ = topic_model.fit_transform(text_for_model, filtered_embeddings)

    # 9) Reassign outliers from -1
    assigned_topics = reassign_outliers_to_nearest_cluster(assigned_topics, filtered_embeddings)
    df_tweets["topic_id"] = assigned_topics

    # 10) Extract topic info
    #     This table comes from the fitted model, which is not told about the
    #     reassignment in step 9; it reflects the model's own assignments.
    topic_info_df = topic_model.get_topic_info()
    print("BERTopic modeling complete.")
    print("=== Sample of topic info ===")
    print(topic_info_df.head(10))

    # 11) Save results (create the output directories if they are missing)
    for out_path in (TWEETS_WITH_TOPICS_CSV, TOPIC_INFO_CSV):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    df_tweets.to_csv(TWEETS_WITH_TOPICS_CSV, index=False, encoding='utf-8')
    topic_info_df.to_csv(TOPIC_INFO_CSV, index=False, encoding='utf-8')

    print(f"Saved tweets with topic assignments to: {TWEETS_WITH_TOPICS_CSV}")
    print(f"Saved topic info to: {TOPIC_INFO_CSV}")

    # Cleanup
    del tweet_embeddings, filtered_embeddings
    gc.collect()

# Entry point: run the topic-modeling step when this code is executed
# directly (not when it is imported as a module).
if __name__ == "__main__":
    fit_topic_model()
    print("Topic modeling step completed successfully.")
