# Topic Modeling Evaluation

This notebook evaluates **all topic versions** in the database using multiple quantitative metrics:

## Metrics Computed

1. **Coherence Metrics** (C_v, NPMI, U_Mass) - How strongly each topic's top keywords co-occur in the news articles
2. **Topic Diversity** - Vocabulary overlap, unique words ratio
3. **Inter-Topic Similarity** - Embedding-based and keyword-based
4. **Assignment Quality** - Confidence scores, coverage, outlier ratio
5. **Silhouette Score** - Clustering quality in embedding space
6. **Keyword Informativeness** - TF-IDF distinctiveness
7. **Composite Score** - Weighted combination for ranking
8. **Per-Topic Analysis** - Individual topic quality within versions

## Outputs

- **CSVs**: Consolidated metrics, per-topic coherence, summary statistics
- **Visualizations**: Interactive Plotly charts
- **Recommendations**: Best version identification and rankings

## Setup & Data Loading

In [None]:
# Cell 1: Imports & Configuration
import psycopg2
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from scipy.spatial.distance import cosine
from scipy.stats import entropy
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px
import plotly.graph_objects as go
from bertopic import BERTopic
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import yaml
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append('..')
from src.db import get_db
from src.versions import list_versions
from dashboard.data.loaders import load_bertopic_model

print("All imports successful")

In [None]:
# Cell 2: Load Configuration & List Versions
# Read the YAML configuration file
with open('../config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Every stored topic version is evaluated; no filtering happens here
versions_list = list_versions(analysis_type='topics')
versions_df = pd.DataFrame(versions_list)

overview_columns = ['id', 'name', 'description', 'created_at']
print(f"Found {len(versions_df)} topic versions for evaluation")
display(versions_df[overview_columns])

# The ids drive every per-version loop further down
version_ids = versions_df['id'].tolist()
version_names = versions_df['name'].tolist()
print(f"\nEvaluating {len(version_ids)} versions: {version_names}")

## 1: Topic Coherence Metrics

In [None]:
# Cell 3: Helper Functions for Data Extraction
def load_version_data(version_id):
    """Fetch the topics, assignments, embeddings and model of one version.

    Args:
        version_id: Value matched against ``result_version_id`` in every
            query and passed to ``load_bertopic_model``.

    Returns:
        dict with the keys ``'topics'``, ``'assignments'`` and
        ``'embeddings'`` (DataFrames) plus ``'model'``, which is ``None``
        when the BERTopic model could not be loaded.
    """
    def parse_embedding(text):
        # "[v1,v2,...]" -> 1-D float array
        return np.fromstring(text.strip('[]'), sep=',')

    with get_db() as db:
        conn = db._conn
        schema = db.config['schema']

        # Topics without the outlier topic (-1). Both identifiers are kept:
        # 'topic_pk' is what article_analysis.primary_topic_id points at,
        # 'topic_id' is BERTopic's own topic number.
        topics_df = pd.read_sql(f"""
            SELECT id as topic_pk, topic_id, name, description, keywords, article_count
            FROM {schema}.topics
            WHERE result_version_id = %s AND topic_id != -1
            ORDER BY article_count DESC
        """, conn, params=(version_id,))

        # One row per analysed article, joined with its title and content.
        # primary_topic_id refers to topics.id (the PK), not topics.topic_id.
        assignments_df = pd.read_sql(f"""
            SELECT aa.article_id, aa.primary_topic_id, aa.topic_confidence,
                   na.title, na.content
            FROM {schema}.article_analysis aa
            JOIN {schema}.news_articles na ON aa.article_id = na.id
            WHERE aa.result_version_id = %s
        """, conn, params=(version_id,))

        # Article embeddings stored for this version
        embeddings_df = pd.read_sql(f"""
            SELECT e.article_id, e.embedding
            FROM {schema}.embeddings e
            WHERE e.result_version_id = %s
        """, conn, params=(version_id,))

        # Only the first value is inspected to decide whether the column
        # arrived as text and has to be parsed into arrays.
        stored_as_text = (
            len(embeddings_df) > 0
            and isinstance(embeddings_df['embedding'].iloc[0], str)
        )
        if stored_as_text:
            embeddings_df['embedding'] = embeddings_df['embedding'].apply(parse_embedding)

        # Best effort: a missing model is reported, not fatal
        try:
            model = load_bertopic_model(version_id)
        except Exception as e:
            print(f"Warning: Could not load BERTopic model for version {version_id}: {e}")
            model = None

    return dict(
        topics=topics_df,
        assignments=assignments_df,
        embeddings=embeddings_df,
        model=model,
    )

def prepare_corpus_for_coherence(assignments_df, bertopic_model):
    """Build the tokenized texts, dictionary and BoW corpus for Gensim.

    Each document is the article title and content joined by a space and
    tokenized with the analyzer of ``bertopic_model.vectorizer_model``, so
    coherence is computed with the same preprocessing as the vectorizer
    attached to the model.

    Args:
        assignments_df: DataFrame with ``title`` and ``content`` columns.
        bertopic_model: Object exposing ``vectorizer_model.build_analyzer()``.

    Returns:
        Tuple ``(texts, dictionary, corpus)``: the token lists, the Gensim
        ``Dictionary`` built from them and one ``doc2bow`` vector per text.
    """
    # Missing values become empty strings. str(None) / str(nan) would
    # otherwise add the literal tokens "None" / "nan" to the corpus.
    titles = assignments_df['title'].fillna('').astype(str)
    contents = assignments_df['content'].fillna('').astype(str)
    texts_raw = (titles + ' ' + contents).tolist()

    # The analyzer is the same callable for every document: build it once
    # instead of once per article.
    analyzer = bertopic_model.vectorizer_model.build_analyzer()
    texts = [analyzer(doc) for doc in texts_raw]

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    return texts, dictionary, corpus

print("Helper functions defined")

In [None]:
# Cell 4: Data Loading Loop
# Fetch the evaluation inputs of every version and report their sizes
version_data = {}
for version_id in version_ids:
    print(f"Loading data for version {version_id}...")
    loaded = load_version_data(version_id)
    version_data[version_id] = loaded
    for label, key in (('Topics', 'topics'),
                       ('Assignments', 'assignments'),
                       ('Embeddings', 'embeddings')):
        print(f"  {label}: {len(loaded[key])}")

print(f"\nData loaded for {len(version_data)} versions")

In [None]:
# Cell 5: Compute Coherence Scores
def compute_coherence_scores(topics_df, texts, dictionary, corpus):
    """Score the topics of one version with C_v, NPMI and U_Mass.

    Args:
        topics_df: DataFrame whose ``keywords`` column holds one keyword
            collection per topic.
        texts: Tokenized article texts (used by C_v and NPMI).
        dictionary: Gensim dictionary.
        corpus: Bag-of-words corpus (used by U_Mass).

    Returns:
        dict with, for each of ``c_v``, ``c_npmi`` and ``u_mass``, the
        aggregate score under that key and the per-topic list under
        ``<key>_per_topic``.
    """
    topics_keywords = topics_df['keywords'].tolist()

    # (coherence measure, progress label, argument carrying the documents)
    measures = (
        ('c_v', 'C_v', {'texts': texts}),
        ('c_npmi', 'NPMI', {'texts': texts}),
        ('u_mass', 'U_Mass', {'corpus': corpus}),
    )

    results = {}
    print("  Computing coherence scores...")

    for measure, label, documents in measures:
        print(f"    - {label}...")
        coherence_model = CoherenceModel(
            topics=topics_keywords,
            dictionary=dictionary,
            coherence=measure,
            **documents
        )
        results[measure] = coherence_model.get_coherence()
        results[f'{measure}_per_topic'] = coherence_model.get_coherence_per_topic()

    return results

In [None]:
# Compute coherence for all versions
# Versions without a loadable BERTopic model get no entry in coherence_results
coherence_results = {}
for version_id in version_ids:
    print(f"\nComputing coherence for version {version_id}...")
    data = version_data[version_id]
    bertopic_model = data['model']

    if bertopic_model is not None:
        # Tokenize with the model's own vectorizer, then score the topics
        texts, dictionary, corpus = prepare_corpus_for_coherence(
            data['assignments'], bertopic_model
        )
        version_scores = compute_coherence_scores(
            data['topics'], texts, dictionary, corpus
        )
        coherence_results[version_id] = version_scores

        print(f"  Results:")
        for label, key in (('C_v', 'c_v'), ('NPMI', 'c_npmi'), ('U_Mass', 'u_mass')):
            print(f"    {label}: {version_scores[key]:.3f}")
    else:
        print(f"  Skipping: BERTopic model not available")

In [None]:
# Cell 6: Coherence Results DataFrame
# One row per version that has coherence results (aggregate scores only)
coherence_df = pd.DataFrame([
    dict(
        version_id=vid,
        version_name=versions_df.loc[versions_df['id'] == vid, 'name'].values[0],
        c_v=results['c_v'],
        c_npmi=results['c_npmi'],
        u_mass=results['u_mass'],
    )
    for vid, results in coherence_results.items()
])

banner = "=" * 80
print(banner)
print("COHERENCE SCORES BY VERSION")
print(banner)
coherence_table = coherence_df[['version_name', 'c_v', 'c_npmi', 'u_mass']]
display(coherence_table.sort_values('c_v', ascending=False))

In [None]:
# Cell 7: Coherence Visualization
# Grouped bars: one group per version, one bar per coherence measure
coherence_bars = [
    go.Bar(name=label, x=coherence_df['version_name'], y=coherence_df[metric])
    for metric, label in [('c_v', 'C_v'), ('c_npmi', 'NPMI'), ('u_mass', 'U_Mass')]
]
fig = go.Figure(data=coherence_bars)

fig.update_layout(
    title='Coherence Scores Across Versions',
    xaxis_title='Version',
    yaxis_title='Coherence Score',
    barmode='group',
    height=500
)
fig.show()

## 2: Topic Diversity Metrics

In [None]:
# Cell 8: Compute Topic Diversity
def compute_topic_diversity(topics_df, top_n=10):
    """Measure how much the topics' top keywords overlap.

    Args:
        topics_df: DataFrame with a sliceable ``keywords`` value per topic.
        top_n: Number of leading keywords considered per topic.

    Returns:
        dict with ``avg_jaccard_overlap`` (mean pairwise Jaccard similarity
        of the keyword sets), ``unique_words_ratio`` (distinct words / all
        words), ``top_word_diversity`` (words used by exactly one topic /
        all words), ``num_topics`` and ``total_unique_words``.
    """
    topic_words = [set(row['keywords'][:top_n]) for _, row in topics_df.iterrows()]

    # Jaccard similarity of every unordered pair of topics
    overlaps = [
        (len(first & second) / len(first | second)) if (first | second) else 0
        for position, first in enumerate(topic_words)
        for second in topic_words[position + 1:]
    ]
    avg_overlap = np.mean(overlaps) if overlaps else 0

    # How many topics use each word
    word_counts = Counter(word for words in topic_words for word in words)
    total_words = sum(word_counts.values())
    unique_words = len(word_counts)
    single_topic_words = sum(1 for count in word_counts.values() if count == 1)

    if total_words > 0:
        unique_ratio = unique_words / total_words
        top_word_diversity = single_topic_words / total_words
    else:
        unique_ratio = 0
        top_word_diversity = 0

    return {
        'avg_jaccard_overlap': avg_overlap,
        'unique_words_ratio': unique_ratio,
        'top_word_diversity': top_word_diversity,
        'num_topics': len(topic_words),
        'total_unique_words': unique_words
    }

# Compute for all versions
# Diversity only needs the topic keywords, so every version gets an entry
diversity_results = {
    version_id: compute_topic_diversity(version_data[version_id]['topics'])
    for version_id in version_ids
}

print("Diversity computation complete")

In [None]:
# Cell 9: Diversity Results DataFrame
# One row per version: identifiers first, then every diversity metric
diversity_rows = []
for vid, metrics in diversity_results.items():
    row = {
        'version_id': vid,
        'version_name': versions_df.loc[versions_df['id'] == vid, 'name'].values[0],
    }
    row.update(metrics)
    diversity_rows.append(row)

diversity_df = pd.DataFrame(diversity_rows)

print("Topic Diversity Metrics:")
display(diversity_df)

In [None]:
# Cell 10: Diversity Visualization
# (legend label, metric column, bar colour) for each plotted metric
diversity_bars = [
    ('Avg Jaccard Overlap (lower is better)', 'avg_jaccard_overlap', 'indianred'),
    ('Unique Words Ratio (higher is better)', 'unique_words_ratio', 'lightseagreen'),
    ('Top-Word Diversity (higher is better)', 'top_word_diversity', 'lightsalmon'),
]

fig = go.Figure()
for bar_label, metric_column, bar_colour in diversity_bars:
    fig.add_trace(go.Bar(
        name=bar_label,
        x=diversity_df['version_name'],
        y=diversity_df[metric_column],
        marker_color=bar_colour
    ))

fig.update_layout(
    title='Topic Diversity Metrics',
    xaxis_title='Version',
    yaxis_title='Score',
    barmode='group',
    height=500
)
fig.show()

## 3: Inter-Topic Similarity

In [None]:
# Cell 11: Compute Inter-Topic Similarity
def compute_inter_topic_similarity(topics_df, embeddings_df, assignments_df):
    """Compare topics by the cosine similarity of their embedding centroids.

    A topic's centroid is the mean embedding of the articles assigned to
    it; topics without any embedded article are left out.

    Args:
        topics_df: DataFrame with ``topic_pk`` and ``topic_id`` columns.
        embeddings_df: DataFrame with ``article_id`` and ``embedding``.
        assignments_df: DataFrame with ``article_id`` and
            ``primary_topic_id``.

    Returns:
        dict with mean/median/std/max/min of the pairwise similarities,
        the square ``similarity_matrix`` (diagonal left at 0) and the
        ``topic_ids`` labelling its rows and columns. With fewer than two
        usable topics the statistics are NaN and the matrix is empty.
    """
    topic_embeddings = {}

    for _, topic in topics_df.iterrows():
        # primary_topic_id stores the topics primary key, so articles are
        # matched on topic_pk; topic_id only labels the results.
        is_assigned = assignments_df['primary_topic_id'] == topic['topic_pk']
        article_ids = assignments_df.loc[is_assigned, 'article_id'].tolist()

        is_embedded = embeddings_df['article_id'].isin(article_ids)
        topic_vecs = embeddings_df.loc[is_embedded, 'embedding'].tolist()

        if topic_vecs:
            topic_embeddings[topic['topic_id']] = np.mean(topic_vecs, axis=0)

    topic_ids = list(topic_embeddings)
    n_topics = len(topic_ids)

    print(f"  Found embeddings for {n_topics} topics out of {len(topics_df)}")

    # A similarity needs at least one pair of topics
    if n_topics < 2:
        print("  Not enough topics with embeddings to compute similarities.")
        return {
            'mean_similarity': np.nan,
            'median_similarity': np.nan,
            'std_similarity': np.nan,
            'max_similarity': np.nan,
            'min_similarity': np.nan,
            'similarity_matrix': np.array([]),
            'topic_ids': topic_ids
        }

    # Each unordered pair is computed once and mirrored into both halves
    similarity_matrix = np.zeros((n_topics, n_topics))
    pair_rows, pair_cols = np.triu_indices(n_topics, k=1)
    for i, j in zip(pair_rows, pair_cols):
        sim = 1 - cosine(topic_embeddings[topic_ids[i]], topic_embeddings[topic_ids[j]])
        similarity_matrix[i, j] = sim
        similarity_matrix[j, i] = sim

    # Statistics are taken over the pairs only (diagonal excluded)
    pair_sims = similarity_matrix[pair_rows, pair_cols]

    return {
        'mean_similarity': np.mean(pair_sims),
        'median_similarity': np.median(pair_sims),
        'std_similarity': np.std(pair_sims),
        'max_similarity': np.max(pair_sims),
        'min_similarity': np.min(pair_sims),
        'similarity_matrix': similarity_matrix,
        'topic_ids': topic_ids
    }

# Compute for all versions
# Centroid similarities per version, keyed by version id
similarity_results = {}
for version_id in version_ids:
    print(f"Computing inter-topic similarity for {version_id}...")
    data = version_data[version_id]
    version_similarity = compute_inter_topic_similarity(
        data['topics'], data['embeddings'], data['assignments']
    )
    similarity_results[version_id] = version_similarity

print("\nSimilarity computation complete")

In [None]:
# Cell 12: Similarity Results DataFrame
# Only the scalar statistics go into the table; the matrix stays in the dict
similarity_stat_keys = [
    'mean_similarity', 'median_similarity', 'std_similarity',
    'max_similarity', 'min_similarity',
]

similarity_df = pd.DataFrame([
    {
        'version_id': vid,
        'version_name': versions_df.loc[versions_df['id'] == vid, 'name'].values[0],
        **{key: results[key] for key in similarity_stat_keys}
    }
    for vid, results in similarity_results.items()
])

print("Inter-Topic Similarity Metrics:")
display(similarity_df)

In [None]:
# Cell 13: Similarity Heatmap
# Plot the first version that has a non-empty similarity matrix
for version_id in version_ids:
    results = similarity_results[version_id]
    version_name = versions_df.loc[versions_df['id'] == version_id, 'name'].values[0]
    matrix = results['similarity_matrix']
    axis_labels = results['topic_ids']

    # An empty matrix means fewer than 2 topics had embeddings
    if matrix.size == 0:
        print(f"Skipping {version_name}: fewer than 2 topics, no similarity matrix available")
        continue

    fig = px.imshow(
        matrix,
        labels=dict(x="Topic", y="Topic", color="Cosine Similarity"),
        x=axis_labels,
        y=axis_labels,
        color_continuous_scale='RdYlGn_r',
        title=f"Inter-Topic Similarity Matrix - {version_name}"
    )
    fig.update_layout(height=600, width=700)
    fig.show()
    break  # one heatmap is enough as an example

## 4: Assignment Quality Metrics

In [None]:
# Cell 14: Compute Assignment Quality
def compute_assignment_quality(assignments_df, topics_df):
    """Summarise assignment confidence, coverage and topic sizes.

    ``primary_topic_id`` references the topics primary key, so an article
    counts as covered only when that key is one of ``topics_df['topic_pk']``
    (the non-outlier topics). Everything else (a reference to the outlier
    topic, a NULL, or a literal -1) is an outlier. Comparing with -1 alone
    never matches primary keys and reports every article as covered.

    Args:
        assignments_df: DataFrame with ``primary_topic_id`` and
            ``topic_confidence`` columns.
        topics_df: DataFrame of the non-outlier topics. When it has no
            ``topic_pk`` column, the check falls back to treating -1 and
            missing ids as outliers.

    Returns:
        dict with confidence statistics (``mean_confidence``,
        ``median_confidence``, ``std_confidence``), ``outlier_ratio``,
        ``coverage``, the share of scored articles above 0.3 / 0.5 / 0.7
        (``high_conf_30/50/70``), ``num_topics`` and topic-size statistics
        (``mean/std/min/max_topic_size``, ``topic_size_cv``).
    """
    confidence_scores = assignments_df['topic_confidence'].dropna()
    n_scored = len(confidence_scores)

    def share_above(threshold):
        # Fraction of scored articles whose confidence exceeds the threshold
        if n_scored == 0:
            return 0
        return len(confidence_scores[confidence_scores > threshold]) / n_scored

    # Which articles point at a real (non-outlier) topic
    topic_refs = assignments_df['primary_topic_id']
    if 'topic_pk' in topics_df.columns:
        is_covered = topic_refs.isin(topics_df['topic_pk'])
    else:
        is_covered = topic_refs.notna() & (topic_refs != -1)

    total_articles = len(assignments_df)
    outliers = int((~is_covered).sum())
    outlier_ratio = outliers / total_articles if total_articles > 0 else 0

    # Articles per real topic
    topic_sizes = assignments_df[is_covered].groupby('primary_topic_id').size()
    mean_size = topic_sizes.mean()
    std_size = topic_sizes.std()

    return {
        'mean_confidence': confidence_scores.mean(),
        'median_confidence': confidence_scores.median(),
        'std_confidence': confidence_scores.std(),
        'outlier_ratio': outlier_ratio,
        'coverage': 1 - outlier_ratio,
        'high_conf_30': share_above(0.3),
        'high_conf_50': share_above(0.5),
        'high_conf_70': share_above(0.7),
        'num_topics': len(topics_df),
        'mean_topic_size': mean_size,
        'std_topic_size': std_size,
        'min_topic_size': topic_sizes.min(),
        'max_topic_size': topic_sizes.max(),
        # NaN mean (no topics) compares False and yields 0
        'topic_size_cv': std_size / mean_size if mean_size > 0 else 0
    }

# Compute for all versions
# Assignment quality needs only the assignments and the topic list
assignment_results = {}
for version_id in version_ids:
    data = version_data[version_id]
    assignment_results[version_id] = compute_assignment_quality(
        data['assignments'], data['topics']
    )

# The check mark had been stored mis-decoded ("‚úÖ"); restored here
print("✅ Assignment quality computation complete")

In [None]:
# Cell 15: Assignment Quality DataFrame
# One row per version: identifiers first, then every assignment metric
assignment_rows = []
for vid, metrics in assignment_results.items():
    row = {
        'version_id': vid,
        'version_name': versions_df.loc[versions_df['id'] == vid, 'name'].values[0],
    }
    row.update(metrics)
    assignment_rows.append(row)

assignment_df = pd.DataFrame(assignment_rows)

assignment_overview = [
    'version_name', 'mean_confidence', 'coverage',
    'outlier_ratio', 'high_conf_50', 'num_topics',
]
print("Assignment Quality Metrics:")
display(assignment_df[assignment_overview])

In [None]:
# Cell 16: Confidence Distribution Visualization
# One histogram per version, with the 0.3 / 0.5 / 0.7 thresholds marked
confidence_markers = [
    (0.3, "red", "0.3"),
    (0.5, "orange", "0.5"),
    (0.7, "green", "0.7"),
]

for version_id in version_ids:
    data = version_data[version_id]
    version_name = versions_df.loc[versions_df['id'] == version_id, 'name'].values[0]

    # Articles without a confidence value are left out of the histogram
    confidence_scores = data['assignments']['topic_confidence'].dropna()

    fig = px.histogram(
        confidence_scores,
        nbins=50,
        title=f'Confidence Score Distribution - {version_name}',
        labels={'value': 'Confidence Score', 'count': 'Frequency'}
    )
    for threshold, colour, text in confidence_markers:
        fig.add_vline(x=threshold, line_dash="dash", line_color=colour, annotation_text=text)
    fig.update_layout(height=400)
    fig.show()

In [None]:
# Cell 17: Coverage Comparison
# Both ratios are shown as percentages, side by side per version
coverage_bars = [
    ('Coverage (% non-outliers)', 'coverage', 'lightseagreen'),
    ('Outlier Ratio', 'outlier_ratio', 'indianred'),
]

fig = go.Figure()
for bar_label, ratio_column, bar_colour in coverage_bars:
    fig.add_trace(go.Bar(
        name=bar_label,
        x=assignment_df['version_name'],
        y=assignment_df[ratio_column] * 100,
        marker_color=bar_colour
    ))

fig.update_layout(
    title='Topic Coverage Across Versions',
    xaxis_title='Version',
    yaxis_title='Percentage (%)',
    barmode='group',
    height=500
)
fig.show()

## 5: Additional Advanced Metrics

In [None]:
# Cell 18: Silhouette Score
def compute_silhouette_score_for_version(embeddings_df, assignments_df,
                                         topics_df=None, sample_size=10000,
                                         random_state=42):
    """Compute the cosine silhouette score of the topic clustering.

    Args:
        embeddings_df: DataFrame with ``article_id`` and ``embedding``.
        assignments_df: DataFrame with ``article_id`` and
            ``primary_topic_id``.
        topics_df: Optional DataFrame of the non-outlier topics. When it
            has a ``topic_pk`` column, only articles whose
            ``primary_topic_id`` is one of those keys are scored.
            ``primary_topic_id`` references the topics primary key, so the
            ``!= -1`` check alone cannot recognise outlier articles.
        sample_size: Upper bound on the number of articles scored;
            ``None`` scores all of them.
        random_state: Seed for the sampling, so repeated runs give the
            same score.

    Returns:
        The silhouette score, or ``0.0`` when it is undefined (no usable
        articles, fewer than two topics, or as many topics as articles).
    """
    # Attach each embedding to its article's topic
    merged = embeddings_df.merge(
        assignments_df[['article_id', 'primary_topic_id']],
        on='article_id'
    )

    # Drop unassigned articles (missing id), the legacy -1 marker and,
    # when the topic list is known, anything that is not a real topic.
    topic_refs = merged['primary_topic_id']
    keep = topic_refs.notna() & (topic_refs != -1)
    if topics_df is not None and 'topic_pk' in topics_df.columns:
        keep &= topic_refs.isin(topics_df['topic_pk'])
    merged = merged[keep]

    if len(merged) == 0:
        return 0.0

    X = np.array(merged['embedding'].tolist())
    labels = merged['primary_topic_id'].values

    # The score is only defined for 2 <= n_labels <= n_samples - 1;
    # scikit-learn raises ValueError outside that range.
    n_labels = len(np.unique(labels))
    if n_labels < 2 or n_labels > len(X) - 1:
        return 0.0

    # Score a sample of the articles to keep the computation cheap
    if sample_size is not None:
        sample_size = min(sample_size, len(X))
    score = silhouette_score(
        X, labels, metric='cosine',
        sample_size=sample_size, random_state=random_state
    )

    return score

# Compute for all versions
silhouette_scores = {}
for version_id in version_ids:
    print(f"Computing silhouette score for {version_id}...")
    data = version_data[version_id]

    # primary_topic_id references the topics primary key, so outlier
    # articles cannot be recognised by comparing with -1. Keep only the
    # articles that point at one of the loaded (non-outlier) topics.
    scored_assignments = data['assignments']
    if 'topic_pk' in data['topics'].columns:
        points_at_topic = scored_assignments['primary_topic_id'].isin(
            data['topics']['topic_pk']
        )
        scored_assignments = scored_assignments[points_at_topic]

    silhouette_scores[version_id] = compute_silhouette_score_for_version(
        data['embeddings'], scored_assignments
    )
    print(f"  Score: {silhouette_scores[version_id]:.3f}")

silhouette_df = pd.DataFrame([
    {
        'version_id': vid,
        'version_name': versions_df[versions_df['id']==vid]['name'].values[0],
        'silhouette_score': score
    }
    for vid, score in silhouette_scores.items()
])

print("\nSilhouette Scores:")
display(silhouette_df.sort_values('silhouette_score', ascending=False))

In [None]:
# Cell 20: Composite Quality Score
def compute_composite_score(version_id):
    """Blend the per-version metrics into one weighted quality score.

    Reads the module-level ``coherence_results``, ``diversity_results``,
    ``assignment_results`` and ``silhouette_scores`` for ``version_id``.

    Returns:
        dict with the five components (``coherence``, ``diversity``,
        ``coverage``, ``silhouette``, ``low_overlap``) and their weighted
        sum under ``composite_score``.
    """
    # C_v, the unique-words ratio and coverage are used as they are
    # (treated as already lying in the 0-1 range).
    coherence = coherence_results[version_id]['c_v']
    version_diversity = diversity_results[version_id]
    diversity = version_diversity['unique_words_ratio']
    coverage = assignment_results[version_id]['coverage']

    # Silhouette is shifted from [-1, 1] onto [0, 1]
    silhouette = (silhouette_scores[version_id] + 1) / 2

    # Overlap is inverted so that higher is better, like the others
    low_overlap = 1 - version_diversity['avg_jaccard_overlap']

    # Weights: 0.3 semantic quality, 0.2 each for distinctiveness,
    # document coverage and clustering quality, 0.1 topic separation.
    composite = (
        0.3 * coherence +
        0.2 * diversity +
        0.2 * coverage +
        0.2 * silhouette +
        0.1 * low_overlap
    )

    return dict(
        coherence=coherence,
        diversity=diversity,
        coverage=coverage,
        silhouette=silhouette,
        low_overlap=low_overlap,
        composite_score=composite,
    )

# Versions whose BERTopic model could not be loaded have no coherence
# results; scoring them would raise KeyError, so they are skipped.
composite_results = {}
for version_id in version_ids:
    if version_id not in coherence_results:
        print(f"Skipping composite score for version {version_id}: no coherence results")
        continue
    composite_results[version_id] = compute_composite_score(version_id)

composite_columns = ['coherence', 'diversity', 'coverage', 'silhouette', 'low_overlap', 'composite_score']

# Explicit columns keep the frame well-formed even when nothing was scored
composite_df = pd.DataFrame(
    [
        {
            'version_id': vid,
            'version_name': versions_df[versions_df['id']==vid]['name'].values[0],
            **results
        }
        for vid, results in composite_results.items()
    ],
    columns=['version_id', 'version_name', *composite_columns]
)

print("Composite Quality Scores:")
display(composite_df[['version_name', *composite_columns]].sort_values('composite_score', ascending=False))

In [None]:
# Cell 21: Per-Topic Coherence Analysis
# Build one table of per-topic coherence scores for each version
per_topic_coherence = {}

for version_id in version_ids:
    # Versions skipped in the coherence step have nothing to analyse
    scores = coherence_results.get(version_id)
    if scores is None:
        print(f"Skipping per-topic analysis for version {version_id}: no coherence results")
        continue

    version_name = versions_df[versions_df['id']==version_id]['name'].values[0]
    topics = version_data[version_id]['topics']
    n_topics = len(topics)

    # Scores are taken in the same order as the rows of the topics table
    per_topic_coherence[version_id] = pd.DataFrame({
        'version_id': version_id,
        'version_name': version_name,
        'topic_id': topics['topic_id'].values,
        'topic_name': topics['name'].values,
        'article_count': topics['article_count'].values,
        'c_v': scores['c_v_per_topic'][:n_topics],
        'c_npmi': scores['c_npmi_per_topic'][:n_topics],
        'u_mass': scores['u_mass_per_topic'][:n_topics]
    })

# Combine all versions (pd.concat raises on an empty collection)
if per_topic_coherence:
    all_topics_df = pd.concat(per_topic_coherence.values(), ignore_index=True)
else:
    all_topics_df = pd.DataFrame(columns=[
        'version_id', 'version_name', 'topic_id', 'topic_name',
        'article_count', 'c_v', 'c_npmi', 'u_mass'
    ])

print(f"Per-topic analysis for {len(all_topics_df)} topics across {len(per_topic_coherence)} versions")
print("\nSample (showing first 20 topics):")
display(all_topics_df[['version_name', 'topic_name', 'article_count', 'c_v', 'c_npmi', 'u_mass']].head(20))

In [None]:
# Cell 22: Top & Bottom Performing Topics
def _print_topic_rows(rows):
    """Print one aligned line per topic: name, C_v and article count."""
    for _, row in rows.iterrows():
        # str() keeps topics with a missing name from crashing the slice
        print(f"  {str(row['topic_name'])[:50]:50s} | C_v: {row['c_v']:.3f} | Articles: {int(row['article_count'])}")

# For each version, identify best and worst topics
for version_id in version_ids:
    # Versions without per-topic scores (no coherence results) are skipped
    if version_id not in per_topic_coherence:
        continue

    version_name = versions_df[versions_df['id']==version_id]['name'].values[0]
    topic_scores = per_topic_coherence[version_id]

    print(f"\n{'='*60}")
    print(f"Version: {version_name}")
    print(f"{'='*60}")

    # The emoji in these headings had been stored mis-decoded; restored here
    print("\n🏆 Top 5 Topics by Coherence (C_v):")
    _print_topic_rows(topic_scores.nlargest(5, 'c_v'))

    print("\n⚠️  Bottom 5 Topics by Coherence (C_v):")
    _print_topic_rows(topic_scores.nsmallest(5, 'c_v'))

    # Below-median coherence combined with above-median size: large topics
    # that are probably mixing several subjects
    problematic = topic_scores[
        (topic_scores['c_v'] < topic_scores['c_v'].median()) &
        (topic_scores['article_count'] > topic_scores['article_count'].median())
    ]

    if len(problematic) > 0:
        print("\n🚨 Problematic Topics (Low coherence, High article count):")
        _print_topic_rows(problematic)

In [None]:
# Cell 23: Per-Topic Coherence Distribution
# Violin plot showing coherence distribution per version
fig = go.Figure()

for version_id in version_ids:
    # Versions without per-topic scores (no coherence results) are skipped
    if version_id not in per_topic_coherence:
        continue

    version_name = versions_df[versions_df['id']==version_id]['name'].values[0]
    topic_scores = per_topic_coherence[version_id]

    fig.add_trace(go.Violin(
        y=topic_scores['c_v'],
        name=version_name,
        box_visible=True,
        meanline_visible=True
    ))

fig.update_layout(
    title='Per-Topic Coherence Distribution Across Versions',
    yaxis_title='C_v Coherence Score',
    xaxis_title='Version',
    showlegend=True,
    height=500
)
fig.show()

In [None]:
# Cell 24: Topic Quality vs Size Scatter Plot
# Scatter plot: Article count vs Coherence for all topics
for version_id in version_ids:
    # Versions without per-topic scores (no coherence results) are skipped
    if version_id not in per_topic_coherence:
        continue

    version_name = versions_df[versions_df['id']==version_id]['name'].values[0]
    topic_scores = per_topic_coherence[version_id]

    fig = px.scatter(
        topic_scores,
        x='article_count',
        y='c_v',
        hover_data=['topic_name'],
        title=f'Topic Quality vs Size - {version_name}',
        labels={'article_count': 'Number of Articles', 'c_v': 'Coherence (C_v)'},
        size='article_count',
        size_max=20
    )

    # Reference lines: mean coherence (horizontal), median size (vertical)
    fig.add_hline(y=topic_scores['c_v'].mean(), line_dash="dash", line_color="red",
                  annotation_text="Mean Coherence")
    fig.add_vline(x=topic_scores['article_count'].median(), line_dash="dash", line_color="blue",
                  annotation_text="Median Size")

    fig.update_layout(height=500)
    fig.show()

In [None]:
# Cell 25: Consolidated Metrics Table
# Join every per-version metric table on the version identifiers. The joins
# are inner joins, so a version missing from any table is dropped.
version_keys = ['version_id', 'version_name']

consolidated_df = coherence_df.merge(
    diversity_df, on=version_keys, suffixes=('', '_diversity')
)
# Both the diversity and the assignment metrics carry 'num_topics'; the
# default merge suffixes turn them into 'num_topics_x' / 'num_topics_y'.
for metric_table in (similarity_df, assignment_df, silhouette_df):
    consolidated_df = consolidated_df.merge(metric_table, on=version_keys)
consolidated_df = consolidated_df.merge(
    composite_df[['version_id', 'composite_score']], on='version_id'
)

# Columns shown in the notebook; the CSV below keeps all of them
key_columns = [
    'version_name', 'c_v', 'unique_words_ratio', 'mean_similarity',
    'coverage', 'mean_confidence', 'silhouette_score', 'composite_score'
]

print("Consolidated Metrics (sorted by composite score):")
ranked_overview = consolidated_df[key_columns].sort_values('composite_score', ascending=False)
display(ranked_overview)

# Persist the full table next to the notebook
consolidated_df.to_csv('topic_evaluation_results.csv', index=False)
print("\nResults saved to topic_evaluation_results.csv")

In [None]:
# Cell 26: Radar Chart Comparison
# Radar chart comparing versions across metrics scaled to 0-1
from math import pi

def create_radar_chart(df, version_name):
    """Return the radar axes and the closed value polygon of one version.

    Args:
        df: DataFrame with ``version_name``, ``c_v``,
            ``unique_words_ratio``, ``coverage``, ``silhouette_score`` and
            ``avg_jaccard_overlap`` columns.
        version_name: Name of the version to extract; the first matching
            row is used.

    Returns:
        Tuple ``(categories, values)``: the five axis labels and six
        values, the first value being repeated at the end so the polygon
        closes.
    """
    categories = ['Coherence', 'Diversity', 'Coverage', 'Silhouette', 'Low Overlap']

    row = df.loc[df['version_name'] == version_name].iloc[0]

    # Silhouette is shifted from [-1, 1] to [0, 1]; overlap is inverted
    silhouette_scaled = (row['silhouette_score'] + 1) / 2
    low_overlap = 1 - row['avg_jaccard_overlap']
    values = [
        row['c_v'],
        row['unique_words_ratio'],
        row['coverage'],
        silhouette_scaled,
        low_overlap,
    ]

    # Repeat the first point so the outline is a closed shape
    values.append(values[0])

    return categories, values

# Create radar chart for each version
fig = go.Figure()

# The consolidated table is built with inner joins, so versions lacking any
# metric (e.g. no coherence results) are absent and must be skipped here.
consolidated_ids = set(consolidated_df['version_id'])

for version_id in version_ids:
    if version_id not in consolidated_ids:
        continue

    version_name = versions_df[versions_df['id']==version_id]['name'].values[0]
    categories, values = create_radar_chart(consolidated_df, version_name)

    fig.add_trace(go.Scatterpolar(
        r=values,
        # values arrive closed (first point repeated), so close the axes too
        theta=categories + [categories[0]],
        fill='toself',
        name=version_name
    ))

fig.update_layout(
    polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
    title='Topic Model Quality Comparison (Normalized Metrics)',
    showlegend=True,
    height=600
)
fig.show()

In [None]:
# Cell 29: Recommendations & Summary
# Generate recommendations based on metrics
section_rule = "=" * 60

print(section_rule)
print("TOPIC MODELING EVALUATION SUMMARY")
print(section_rule)

if consolidated_df.empty:
    # idxmax() raises on an empty frame; report the situation instead
    print("\nNo version has a complete set of metrics; nothing to rank.")
else:
    best_version = consolidated_df.loc[consolidated_df['composite_score'].idxmax()]

    # 'num_topics' exists in both the diversity and the assignment metrics,
    # so the merged table normally holds it as 'num_topics_x'. Fall back to
    # the plain name in case the merge produced no suffix.
    num_topics_col = 'num_topics_x' if 'num_topics_x' in consolidated_df.columns else 'num_topics'

    print(f"\nBest Overall Version: {best_version['version_name']}")
    print(f"Composite Score: {best_version['composite_score']:.3f}\n")

    # The bullets and emoji below had been stored mis-decoded; restored here
    print("Breakdown:")
    print(f"  • Coherence (C_v): {best_version['c_v']:.3f}")
    print(f"  • Topic Diversity: {best_version['unique_words_ratio']:.3f}")
    print(f"  • Coverage: {best_version['coverage']:.1%}")
    print(f"  • Silhouette Score: {best_version['silhouette_score']:.3f}")
    print(f"  • Mean Confidence: {best_version['mean_confidence']:.3f}")
    print(f"  • Number of Topics: {int(best_version[num_topics_col])}")
    print(f"  • Outlier Ratio: {best_version['outlier_ratio']:.1%}")

    print("\n" + section_rule)
    print("Version Rankings by Metric:")
    print(section_rule)

    metrics_to_rank = {
        'Coherence (C_v)': 'c_v',
        'Topic Diversity': 'unique_words_ratio',
        'Coverage': 'coverage',
        'Silhouette Score': 'silhouette_score',
        'Mean Confidence': 'mean_confidence'
    }

    # Top three versions for each metric (higher is better for all of them)
    for metric_name, metric_col in metrics_to_rank.items():
        ranked = consolidated_df.sort_values(metric_col, ascending=False)
        top_3 = ranked.head(3)['version_name'].tolist()
        print(f"\n{metric_name}:")
        for i, name in enumerate(top_3, 1):
            print(f"  {i}. {name}")

print("\n" + section_rule)
print("Per-Topic Insights:")
print(section_rule)

if all_topics_df.empty:
    print("\nNo per-topic coherence scores are available.")
else:
    # Best topic overall across all versions
    best_topic = all_topics_df.loc[all_topics_df['c_v'].idxmax()]
    print("\n🏆 Best Topic Overall (Coherence):")
    print(f"  Version: {best_topic['version_name']}")
    print(f"  Topic: {best_topic['topic_name']}")
    print(f"  Coherence (C_v): {best_topic['c_v']:.3f}")
    print(f"  Articles: {int(best_topic['article_count'])}")

    # Topic with most articles
    largest_topic = all_topics_df.loc[all_topics_df['article_count'].idxmax()]
    print("\n📊 Largest Topic:")
    print(f"  Version: {largest_topic['version_name']}")
    print(f"  Topic: {largest_topic['topic_name']}")
    print(f"  Coherence (C_v): {largest_topic['c_v']:.3f}")
    print(f"  Articles: {int(largest_topic['article_count'])}")

    # Spread of per-topic coherence within each version
    print("\n📈 Per-Topic Coherence Ranges:")
    for version_id in version_ids:
        # Versions without per-topic scores (no coherence results) are skipped
        if version_id not in per_topic_coherence:
            continue
        version_name = versions_df[versions_df['id']==version_id]['name'].values[0]
        topic_scores = per_topic_coherence[version_id]
        lowest = topic_scores['c_v'].min()
        highest = topic_scores['c_v'].max()
        print(f"  {version_name}: {lowest:.3f} - {highest:.3f} (range: {highest - lowest:.3f})")