Topic classification with BERTopic

Install transformers 4.41.0 for compatability with spacy and BERTopic

`pip install transformers==4.41.0` 

Careful when running this, very hardware intensive

In [1]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import psutil
import gc
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_posts = pd.read_csv('../output/export_for_topic_classification.csv')
df_posts.head()

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,mentions,emojis,frequency,language
0,00:00:00,running a business means juggling countless ad...,2018569761,danielwoodard,1077866112,"['#HRtech', '#businessmanagement']",[],[],1,EN
1,00:00:00,liz truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430,['#politics'],[],[],1,EN
2,00:00:00,the uk is bracing for war as government buildi...,2059143248,ihooper,1007478642,"['#Ukrainewashed', '#WarPreparedness']",[],['🇺🇦'],1,EN
3,00:00:00,marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480,"['#FamilyTree', '#GeneticFacts']",[],['🧬'],1,EN
4,00:00:00,it's truly disgraceful how the indian national...,2001239278,michael51,1021455936,['#RationChorCongress'],[],"['🤦', '♂']",1,EN


In [3]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import psutil
import gc
from tqdm import tqdm

def monitor_memory():
    """Monitor memory usage"""
    process = psutil.Process()
    memory_gb = process.memory_info().rss / 1024 / 1024 / 1024
    return f"Memory Usage: {memory_gb:.2f} GB"

def create_multifeature_embeddings(df_posts, sentence_model, batch_size=64):
    """
    Create combined embeddings with memory monitoring and larger batches
    """
    print(f"\nStarting embedding generation for {len(df_posts)} documents")
    print(monitor_memory())
    
    def safe_join(items):
        if not items or (isinstance(items, list) and len(items) == 0):
            return ""
        return " ".join(str(item) for item in items)
    
    print("\nGenerating text embeddings...")
    text_embeddings = sentence_model.encode(
        df_posts['text'].fillna("").tolist(),
        batch_size=batch_size,
        show_progress_bar=True
    )
    print(monitor_memory())
    
    print("\nGenerating hashtag embeddings...")
    hashtag_embeddings = sentence_model.encode(
        [safe_join(tags) for tags in df_posts['hashtags']],
        batch_size=batch_size,
        show_progress_bar=True
    )
    print(monitor_memory())
    
    
    print(monitor_memory())
    
    print("\nCombining embeddings...")
    combined_embeddings = (
        0.95 * text_embeddings + 
        0.05 * hashtag_embeddings
    )
    
    # Clean up to free memory
    del text_embeddings, hashtag_embeddings
    gc.collect()
    
    print(monitor_memory())
    return combined_embeddings

def setup_bertopic_model(df_posts, batch_size=64):
    """
    Set up and train BERTopic model with memory optimization
    """
    print(f"\nDataset size: {len(df_posts)} documents")
    print(f"DataFrame memory usage: {df_posts.memory_usage().sum() / 1024**2:.2f} MB")
    print(monitor_memory())
    
    print("\nInitializing models...")
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Modified parameters for large dataset
    vectorizer_model = CountVectorizer(
        stop_words="english",
        #min_df=2,    # Term must appear in at least 3 documents
        #max_df=0.7,  # Ignore terms that appear in >50% of documents
        ngram_range=(1, 2)
    )
    
    # Optimized UMAP settings for large dataset
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        low_memory=True,
        random_state=42
    )
    
    # Create embeddings
    embeddings = create_multifeature_embeddings(df_posts, sentence_model, batch_size)
    
    # Initialize BERTopic with optimized settings
    topic_model = BERTopic(
        embedding_model=sentence_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        min_topic_size=30,  # Increased for larger dataset
        nr_topics='auto',
        calculate_probabilities=True,
        verbose=True
    )
    
    print("\nFitting BERTopic model...")
    topics, probs = topic_model.fit_transform(
        documents=df_posts['text'].fillna("").tolist(),
        embeddings=embeddings
    )
    
    return topic_model, topics, probs

def analyze_topics(topic_model, topics, df_posts):
    """
    Analyze topics with memory considerations
    """
    print("\nAnalyzing topics...")
    print(monitor_memory())
    
    topic_info = topic_model.get_topic_info()
    
    # More memory-efficient way to store topics
    df_posts['topic'] = topics
    
    # Get sample documents for each topic (limited to save memory)
    topic_docs = {}
    unique_topics = set(topics)
    print(f"\nFound {len(unique_topics)-1} topics (excluding -1)")
    
    for topic in tqdm(unique_topics):
        if topic != -1:
            topic_docs[topic] = df_posts[df_posts['topic'] == topic]['text'].head(3).tolist()
    
    try:
        print("\nGenerating visualizations...")
        topic_model.visualize_topics()
        topic_model.visualize_hierarchy()
    except Exception as e:
        print(f"Warning: Visualization error: {e}")
    
    return topic_info, topic_docs

def run_topic_analysis(df_posts, batch_size=64):
    """
    Run the complete pipeline with memory monitoring
    """
    print(f"Starting analysis with batch size: {batch_size}")
    print(monitor_memory())
    
    required_columns = ['text', 'hashtags']
    if not all(col in df_posts.columns for col in required_columns):
        raise ValueError(f"Missing columns. Required: {required_columns}")
    
    # Optionally sample for testing
    df_posts = df_posts.sample(n=10000, random_state=42)  # Uncomment to test with sample
    
    topic_model, topics, probs = setup_bertopic_model(df_posts, batch_size)
    topic_info, topic_docs = analyze_topics(topic_model, topics, df_posts)
    
    # Create memory-efficient summary
    summary = {
        'num_topics': len(set(topics)) - 1,
        'topic_sizes': topic_info['Count'].tolist(),
        'top_topics': topic_info.head(10).to_dict('records')
    }
    
    return topic_model, summary

In [4]:
# Then run:
try:
    # You can adjust batch size based on your memory
    topic_model, summary = run_topic_analysis(df_posts, batch_size=64)
    
    print(f"\nAnalysis complete!")
    print(f"Found {summary['num_topics']} topics")
    print("\nTop 10 topics:")
    for topic in summary['top_topics']:
        print(f"Topic {topic['Topic']}: Size {topic['Count']}")
    
    # Save model if needed
    topic_model.save("bertopic_model_large")
    
except Exception as e:
    print(f"Error during analysis: {e}")

Starting analysis with batch size: 64
Memory Usage: 0.57 GB

Dataset size: 10000 documents
DataFrame memory usage: 0.84 MB
Memory Usage: 0.57 GB

Initializing models...

Starting embedding generation for 10000 documents
Memory Usage: 0.67 GB

Generating text embeddings...


Batches: 100%|██████████| 157/157 [00:10<00:00, 15.28it/s]


Memory Usage: 1.25 GB

Generating hashtag embeddings...


Batches: 100%|██████████| 157/157 [00:08<00:00, 18.32it/s]
2025-01-22 21:27:10,968 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Memory Usage: 1.43 GB
Memory Usage: 1.43 GB

Combining embeddings...
Memory Usage: 1.48 GB

Fitting BERTopic model...


2025-01-22 21:27:27,290 - BERTopic - Dimensionality - Completed ✓
2025-01-22 21:27:27,291 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-22 21:27:28,021 - BERTopic - Cluster - Completed ✓
2025-01-22 21:27:28,022 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-22 21:27:28,695 - BERTopic - Representation - Completed ✓
2025-01-22 21:27:28,697 - BERTopic - Topic reduction - Reducing number of topics
2025-01-22 21:27:29,316 - BERTopic - Topic reduction - Reduced number of topics from 45 to 23



Analyzing topics...
Memory Usage: 1.15 GB

Found 22 topics (excluding -1)


100%|██████████| 23/23 [00:00<00:00, 5136.25it/s]



Generating visualizations...





Analysis complete!
Found 22 topics

Top 10 topics:
Topic -1: Size 4332
Topic 0: Size 2184
Topic 1: Size 897
Topic 2: Size 618
Topic 3: Size 438
Topic 4: Size 230
Topic 5: Size 179
Topic 6: Size 141
Topic 7: Size 133
Topic 8: Size 107


In [5]:
# 1. Save visualizations to HTML files
fig = topic_model.visualize_barchart(top_n_topics=20)
fig.write_html("../output/topic_barchart.html")

topic_model.visualize_topics().write_html("../output/topic_clusters.html")
topic_model.visualize_hierarchy().write_html("../output/topic_hierarchy.html")

# 2. Print text-based summary
topics_info = topic_model.get_topic_info()
print("\nMost frequent topics with their terms:")
for _, row in topics_info.head(20).iterrows():
    topic_id = row['Topic']
    size = row['Count']
    if topic_id != -1:
        terms = topic_model.get_topic(topic_id)
        print(f"\nTopic {topic_id} (Size: {size}):")
        # Print top 10 terms for each topic with their weights
        for term, weight in terms[:20]:
            print(f"  - {term}: {weight:.3f}")


Most frequent topics with their terms:

Topic 0 (Size: 2184):
  - business: 0.027
  - politics: 0.015
  - check: 0.010
  - time: 0.010
  - let: 0.008
  - change: 0.008
  - political: 0.007
  - like: 0.007
  - people: 0.007
  - marketing: 0.007

Topic 1 (Size: 897):
  - health: 0.057
  - mental: 0.043
  - mental health: 0.040
  - important: 0.016
  - care: 0.015
  - secretary: 0.014
  - healthcare: 0.013
  - support: 0.013
  - covid: 0.012
  - health secretary: 0.012

Topic 2 (Size: 618):
  - vote: 0.095
  - artist: 0.062
  - social artist: 0.059
  - voting: 0.056
  - let: 0.056
  - social: 0.054
  - army: 0.042
  - counts: 0.042
  - love: 0.039
  - vote counts: 0.037

Topic 3 (Size: 438):
  - vibes: 0.028
  - wait: 0.026
  - just: 0.024
  - ready: 0.023
  - amazing: 0.022
  - love: 0.019
  - shoutout: 0.017
  - believe: 0.017
  - like: 0.016
  - day: 0.016

Topic 4 (Size: 230):
  - chelsea: 0.032
  - tuchel: 0.029
  - sports: 0.028
  - game: 0.028
  - football: 0.027
  - business: 0.0