In [11]:
%autoreload 2
from bertopic import BERTopic

In [9]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from umap import UMAP
from hdbscan import HDBSCAN

def create_topic_model():
    """Create and fit a BERTopic model"""
    # Create sample data
    docs = fetch_20newsgroups(subset='all')['data'][:500]
    
    # Initialize BERTopic with specific models
    umap = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=True, random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='leaf', prediction_data=True)
    topic_model = BERTopic(umap_model=umap, hdbscan_model=hdbscan_model, embedding_model='all-MiniLM-L6-v2', verbose=True)
    
    # Fit the model
    topics, probs = topic_model.fit_transform(docs)
    return topic_model

def run_deletion_tests():
    """Run comprehensive tests for topic deletion and print detailed comparisons"""
    print("\n=== BERTopic Delete Topics Test Results ===\n")
    
    # Setup model
    print("Initializing and fitting BERTopic model...")
    topic_model = create_topic_model()
    
    # Record initial state
    initial_state = {
        'topic_sizes': topic_model.topic_sizes_.copy(),
        'topics_set': set(topic_model.topics_),
        'c_tf_idf_shape': topic_model.c_tf_idf_.shape,
        'embeddings_shape': topic_model.topic_embeddings_.shape,
        'representations_count': len(topic_model.topic_representations_),
    }
    
    # Get topics sorted by size (excluding -1)
    sorted_topics = sorted(
        [(topic, size) for topic, size in initial_state['topic_sizes'].items() if topic != -1],
        key=lambda x: x[1],
        reverse=True
    )
    
    print("\nInitial State:")
    print(f"Total topics: {len(sorted_topics)}")
    print(f"Topic sizes (top 5): {dict(sorted_topics[:5])}")
    print(f"Matrix shapes - c_tf_idf: {initial_state['c_tf_idf_shape']}, embeddings: {initial_state['embeddings_shape']}")
    
    # Select topics to delete (4th, 5th, 6th largest)
    topics_to_delete = [item[0] for item in sorted_topics[3:6]]
    print(f"\nDeleting topics: {topics_to_delete}")
    print(f"Original sizes of topics to delete: {[initial_state['topic_sizes'][t] for t in topics_to_delete]}")
    
    # Perform deletion
    try:
        topic_model.delete_topics(topics_to_delete)
        print("\n✓ Topic deletion completed")
    except Exception as e:
        print(f"\n❌ Error during topic deletion: {str(e)}")
        return
    
    # Analyze results
    print("\n=== Validation Results ===\n")
    
    # 1. Check topic counts
    new_topics = set(topic_model.topics_) - {-1}
    print(f"1. Topic Counts:")
    print(f"   Before: {len(sorted_topics)} topics")
    print(f"   After: {len(new_topics)} topics")
    print(f"   Expected: {len(sorted_topics) - len(topics_to_delete)} topics")
    
    # 2. Check size ordering
    new_sorted_topics = sorted(
        [(topic, size) for topic, size in topic_model.topic_sizes_.items() if topic != -1],
        key=lambda x: x[1],
        reverse=True
    )
    
    print("\n2. Size-based Ordering:")
    print(f"   Topic IDs by size: {[t[0] for t in new_sorted_topics]}")
    print(f"   Sizes: {[t[1] for t in new_sorted_topics]}")
    is_ordered = all(i == t[0] for i, t in enumerate(new_sorted_topics))
    print(f"   Correctly ordered by size: {is_ordered}")
    if not is_ordered:
        print("   WARNING: Topics not properly ordered by size!")
    
    # 3. Check matrix shapes
    print("\n3. Matrix Shapes:")
    print(f"   c_tf_idf - Before: {initial_state['c_tf_idf_shape']}, After: {topic_model.c_tf_idf_.shape}")
    print(f"   embeddings - Before: {initial_state['embeddings_shape']}, After: {topic_model.topic_embeddings_.shape}")
    
    # 4. Check representations
    print("\n4. Topic Representations:")
    print(f"   Before: {initial_state['representations_count']} representations")
    print(f"   After: {len(topic_model.topic_representations_)} representations")
    
    # 5. Check outlier topic
    print("\n5. Outlier Topic (-1):")
    print(f"   Present in topics_: {-1 in topic_model.topics_}")
    print(f"   Present in sizes: {-1 in topic_model.topic_sizes_}")
    print(f"   Present in representations: {-1 in topic_model.topic_representations_}")
    
    # 6. Verify topic deletion and reordering
    print("\n6. Topic Deletion and Reordering:")
    expected_topic_count = len(sorted_topics) - len(topics_to_delete)
    actual_topic_count = len([t for t in topic_model.topic_sizes_.keys() if t != -1])
    
    print(f"   Expected topic count: {expected_topic_count}")
    print(f"   Actual topic count: {actual_topic_count}")
    
    # Check sequential numbering
    expected_topic_numbers = set(range(expected_topic_count))
    actual_topic_numbers = set(t for t in topic_model.topic_sizes_.keys() if t != -1)
    sequential_numbering = expected_topic_numbers == actual_topic_numbers
    
    print(f"   Sequential topic numbering: {sequential_numbering}")
    if not sequential_numbering:
        print(f"   Expected topics: {sorted(expected_topic_numbers)}")
        print(f"   Actual topics: {sorted(actual_topic_numbers)}")
    
    # Check sizes match (excluding deleted topics)
    expected_sizes = sorted([size for topic, size in sorted_topics if topic not in topics_to_delete], reverse=True)
    actual_sizes = sorted([size for topic, size in topic_model.topic_sizes_.items() if topic != -1], reverse=True)
    sizes_match = expected_sizes == actual_sizes
    
    print(f"   Sizes match: {sizes_match}")
    if not sizes_match:
        print(f"   Expected sizes: {expected_sizes}")
        print(f"   Actual sizes: {actual_sizes}")

    # Update the validations dictionary
    validations = {
        "Topic count correct": actual_topic_count == expected_topic_count,
        "Size ordering correct": is_ordered,
        "Matrix shapes consistent": topic_model.c_tf_idf_.shape[0] == topic_model.topic_embeddings_.shape[0],
        "Sequential topic numbering": sequential_numbering,
        "Topic sizes preserved": sizes_match,
        "Outlier preserved": all([-1 in topic_model.topics_, -1 in topic_model.topic_sizes_, -1 in topic_model.topic_representations_])
    }

if __name__ == "__main__":
    run_deletion_tests()


=== BERTopic Delete Topics Test Results ===

Initializing and fitting BERTopic model...


2025-03-31 21:49:21,445 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 16/16 [00:12<00:00,  1.29it/s]
2025-03-31 21:49:34,425 - BERTopic - Embedding - Completed ✓
2025-03-31 21:49:34,426 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-31 21:49:35,290 - BERTopic - Dimensionality - Completed ✓
2025-03-31 21:49:35,291 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-31 21:49:35,307 - BERTopic - Cluster - Completed ✓
2025-03-31 21:49:35,309 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-03-31 21:49:35,493 - BERTopic - Representation - Completed ✓



Initial State:
Total topics: 73
Topic sizes (top 5): {0: 11, 1: 11, 2: 10, 3: 10, 4: 9}
Matrix shapes - c_tf_idf: (74, 19877), embeddings: (74, 384)

Deleting topics: [3, 4, 6]
Original sizes of topics to delete: [10, 9, 9]

✓ Topic deletion completed

=== Validation Results ===

1. Topic Counts:
   Before: 73 topics
   After: 70 topics
   Expected: 70 topics

2. Size-based Ordering:
   Topic IDs by size: [0, 1, 2, 4, 3, 5, 8, 6, 7, 13, 10, 9, 12, 11, 17, 15, 16, 14, 20, 21, 22, 18, 19, 23, 26, 28, 24, 27, 25, 29, 35, 36, 41, 37, 31, 30, 40, 38, 34, 33, 32, 42, 39, 43, 45, 50, 51, 49, 52, 53, 44, 48, 57, 47, 54, 55, 56, 46, 58, 61, 60, 59, 62, 63, 64, 65, 66, 67, 68, 69]
   Sizes: [11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
   Correctly ordered by size: False

3. Matrix Shapes:
   c_tf_idf - Before: (74, 19877), After