Topic classification with BERTopic

Install transformers 4.41.0 for compatibility with spaCy and BERTopic:

`pip install transformers==4.41.0` 

Be careful when running this notebook: it is very hardware intensive.

In [4]:
import gc
import re
from pathlib import Path

import numpy as np
import pandas as pd
import psutil
import umap
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from umap import UMAP

In [5]:
# Load the posts dataset from JSON and preview the first rows
dataset_path = '../data/dataset.json'
df_posts = pd.read_json(dataset_path)
df_posts.head()

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936


Apply minimal preprocessing (Remove URLs and Mentions)

In [6]:
# Matches URLs (http..., www..., https...) and @mentions
_URL_OR_MENTION = re.compile(r'http\S+|www\S+|https\S+|@\w+')


def _strip_urls_and_mentions(value):
    """Return `value` with URLs and @mentions removed; missing values pass through unchanged."""
    if pd.notna(value):
        return _URL_OR_MENTION.sub('', value)
    return value


df_posts['text'] = df_posts['text'].apply(_strip_urls_and_mentions)

In [7]:
# Optional: uncomment to run the rest of the notebook on a reproducible (seed 42) 10% sample
#df_posts = df_posts.sample(frac=0.1, random_state=42)  # 10% of the dataset

In [8]:
# Monitor memory usage
def monitor_memory():
    """Report the resident memory of the current process.

    Returns:
        str: A message of the form ``"Memory Usage: X.XX GB"``, where the value
        is the process RSS reported by psutil divided by 1024**3.
    """
    rss_bytes = psutil.Process().memory_info().rss
    memory_gb = rss_bytes / (1024 ** 3)
    return f"Memory Usage: {memory_gb:.2f} GB"

# Create embeddings with memory monitoring and larger batches
def create_multifeature_embeddings(df_posts, sentence_model, batch_size=64):
    """Encode the 'text' column of `df_posts` with `sentence_model`.

    Memory usage is printed before and after encoding.

    Args:
        df_posts: DataFrame with a 'text' column; missing texts are encoded as
            empty strings.
        sentence_model: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=...)``.
        batch_size: Batch size forwarded to ``sentence_model.encode``.

    Returns:
        Whatever ``sentence_model.encode`` returns for the list of texts.
    """
    print(f"\nStarting embedding generation for {len(df_posts)} documents")
    print(monitor_memory())

    print("\nGenerating text embeddings...")
    texts = df_posts['text'].fillna("").tolist()
    text_embeddings = sentence_model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True
    )
    # Report memory once after encoding (the original printed this line twice in a row)
    print(monitor_memory())

    return text_embeddings

# Set up and train BERTopic model with memory optimization
def setup_bertopic_model(df_posts, batch_size=64):
    """Embed the posts and fit a BERTopic model on them.

    Args:
        df_posts: DataFrame with a 'text' column; missing texts are treated as
            empty strings.
        batch_size: Batch size forwarded to the embedding step.

    Returns:
        tuple: ``(topic_model, topics, probs)`` - the fitted BERTopic model and
        the two values returned by its ``fit_transform`` call.
    """
    print(f"\nDataset size: {len(df_posts)} documents")
    print(f"DataFrame memory usage: {df_posts.memory_usage().sum() / 1024**2:.2f} MB")
    print(monitor_memory())

    print("\nInitializing models...")
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Term extraction: English stop words removed, unigrams through trigrams.
    # Disabled options kept for reference:
    #   min_df = 350  -> terms must appear in at least 350 rows (0.5% of dataset)
    #   max_df = 0.8  -> terms must appear in less than 80% of the dataset (56,000 rows)
    vectorizer_options = {
        'stop_words': "english",
        'ngram_range': (1, 3),
    }
    vectorizer_model = CountVectorizer(**vectorizer_options)

    # Dimensionality reduction settings; random_state is fixed at 42
    umap_options = {
        'n_neighbors': 15,
        'n_components': 5,
        'min_dist': 0.0,
        'metric': 'cosine',
        'random_state': 42,
    }
    umap_model = UMAP(**umap_options)

    # Embeddings are computed up front and handed to fit_transform below
    embeddings = create_multifeature_embeddings(df_posts, sentence_model, batch_size)

    # Pass min_topic_size=20 here to set a manual minimum topic size
    topic_model = BERTopic(
        embedding_model=sentence_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        nr_topics='auto',
        calculate_probabilities=True,
        verbose=True,
    )

    print("\nFitting BERTopic model...")
    documents = df_posts['text'].fillna("").tolist()
    topics, probs = topic_model.fit_transform(documents=documents, embeddings=embeddings)

    return topic_model, topics, probs

# Analyze topics with memory considerations
def analyze_topics(topic_model, topics, df_posts):
    """Summarise the fitted topics and collect sample documents per topic.

    Side effect: adds (or overwrites) a 'topic' column on `df_posts`.

    Args:
        topic_model: Fitted model exposing ``get_topic_info``,
            ``visualize_topics`` and ``visualize_hierarchy``.
        topics: One topic id per row of `df_posts`; -1 is treated as the
            "no topic" label and excluded from counts and samples.
        df_posts: DataFrame with a 'text' column.

    Returns:
        tuple: ``(topic_info, topic_docs)`` where ``topic_info`` is the result
        of ``topic_model.get_topic_info()`` and ``topic_docs`` maps each topic
        id (except -1) to up to three example texts.
    """
    print("\nAnalyzing topics...")
    print(monitor_memory())

    topic_info = topic_model.get_topic_info()

    df_posts['topic'] = topics

    # Drop -1 explicitly instead of subtracting one from the count: the old
    # `len(unique_topics)-1` under-counted whenever no document was labelled -1.
    real_topics = set(topics) - {-1}
    print(f"\nFound {len(real_topics)} topics (excluding -1)")

    topic_docs = {}
    for topic in tqdm(sorted(real_topics)):
        topic_docs[topic] = df_posts[df_posts['topic'] == topic]['text'].head(3).tolist()

    # The figures are built but not returned or displayed here.
    # NOTE(review): presumably a smoke test that the visualizations can be generated - confirm.
    try:
        print("\nGenerating visualizations...")
        topic_model.visualize_topics()
        topic_model.visualize_hierarchy()
    except Exception as e:
        print(f"Warning: Visualization error: {e}")

    return topic_info, topic_docs

# Run the complete pipeline with memory monitoring
def run_topic_analysis(df_posts, batch_size=64):
    """Run the full pipeline: fit the topic model, analyse it and build a summary.

    Args:
        df_posts: DataFrame that must contain a 'text' column.
        batch_size: Batch size forwarded to the embedding step.

    Returns:
        tuple: ``(topic_model, summary, topics, probs)``. ``summary`` is a dict
        with 'num_topics' (distinct topic ids, not counting -1), 'topic_sizes'
        (the 'Count' column of the topic info) and 'top_topics' (first ten
        topic-info rows as records).

    Raises:
        ValueError: If `df_posts` has no 'text' column.
    """
    print(f"Starting analysis with batch size: {batch_size}")
    print(monitor_memory())

    if 'text' not in df_posts.columns:
        raise ValueError("Missing required column: text")

    topic_model, topics, probs = setup_bertopic_model(df_posts, batch_size)
    topic_info, topic_docs = analyze_topics(topic_model, topics, df_posts)

    summary = {
        # Remove -1 from the set rather than subtracting 1, so the count is
        # still right when no document was labelled -1.
        'num_topics': len(set(topics) - {-1}),
        'topic_sizes': topic_info['Count'].tolist(),
        'top_topics': topic_info.head(10).to_dict('records')
    }

    return topic_model, summary, topics, probs

In [None]:
# Fit the model on the full dataset, print a short summary and save the model
try:
    topic_model, summary, topics, probs = run_topic_analysis(df_posts, batch_size=64)

    print("\nAnalysis complete!")
    print(f"Found {summary['num_topics']} topics")
    print("\nTop 10 topics:")
    for topic in summary['top_topics']:
        print(f"Topic {topic['Topic']}: Size {topic['Count']}")

    topic_model.save("bertopic_model_large")

except Exception as e:
    print(f"Error during analysis: {e}")
    # Re-raise: the cells below need `topic_model`, so swallowing the error here
    # would only resurface later as an unrelated NameError without the traceback.
    raise

Starting analysis with batch size: 64
Memory Usage: 0.67 GB

Dataset size: 70260 documents
DataFrame memory usage: 2.68 MB
Memory Usage: 0.67 GB

Initializing models...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Starting embedding generation for 70260 documents
Memory Usage: 0.77 GB

Generating text embeddings...


Batches:   0%|          | 0/1098 [00:00<?, ?it/s]

2025-01-26 19:39:11,720 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Memory Usage: 1.24 GB
Memory Usage: 1.24 GB

Fitting BERTopic model...


2025-01-26 19:40:08,788 - BERTopic - Dimensionality - Completed ✓
2025-01-26 19:40:08,789 - BERTopic - Cluster - Start clustering the reduced embeddings


In [None]:
# 1. Save visualizations to HTML files
fig = topic_model.visualize_barchart(top_n_topics=20)
fig.write_html("../output/topic_barchart.html")

topic_model.visualize_topics().write_html("../output/topic_clusters.html")
topic_model.visualize_hierarchy().write_html("../output/topic_hierarchy.html")

# 2. Print text-based summary
topics_info = topic_model.get_topic_info()
print("\nMost frequent topics with their terms:")
for _, row in topics_info.head(20).iterrows():
    topic_id = row['Topic']
    size = row['Count']
    if topic_id != -1:
        terms = topic_model.get_topic(topic_id)
        print(f"\nTopic {topic_id} (Size: {size}):")
        # Print top 10 terms for each topic with their weights
        for term, weight in terms[:20]:
            print(f"  - {term}: {weight:.3f}")

In [37]:
# Transform documents into topics and probabilities
# NOTE(review): everything in this cell is commented out, so the cell does nothing when run.
# The following cell appears to perform the same steps (plus a topic reduction) -
# confirm this cell is superseded before removing it.

# Transform documents into topics and probabilities
#documents = df_posts['text'].tolist()
#topics, probs = topic_model.transform(documents)

# Reassign "Other" topics to the most probable topic
#import numpy as np
#for i, prob in enumerate(probs):
#    if topics[i] == -1:  # Check for "Other"
#        topics[i] = np.argmax(prob)  # Assign the most probable topic

# Add topic numbers to the dataframe
#df_posts['topic'] = topics

# Generate topic labels
#topic_labels = {}
#for topic_id in set(topics):
#    if topic_id != -1:
#        terms = topic_model.get_topic(topic_id)
#        topic_labels[topic_id] = ", ".join([term for term, _ in terms[:3]])
#    else:
#        topic_labels[topic_id] = "Other"

# Add descriptive topic labels to the dataframe
#df_posts['topic_label'] = df_posts['topic'].map(topic_labels)

# Save augmented dataframe
#df_posts.to_csv('../output/posts_with_topics.csv', index=False)

In [None]:
# 
documents = df_posts['text'].tolist()
topics, probs = topic_model.transform(documents)

# Handle "Other" first
num_others = np.sum(np.array(topics) == -1)
for i, prob in enumerate(probs):
    if topics[i] == -1:
        topics[i] = np.argmax(prob)

# Reduce topics to a smaller number for feature clustering
topic_model.reduce_topics(documents, nr_topics=30)

# Generate labels for the topics
topic_labels = {topic_id: ", ".join([term for term, _ in topic_model.get_topic(topic_id)[:3]]) 
               for topic_id in set(topics) if topic_id != -1}
topic_labels[-1] = "Other"

df_posts['topic'] = topics
df_posts['topic_label'] = df_posts['topic'].map(topic_labels)
df_posts.to_csv('../output/posts_with_topics.csv', index=False)