# Topic Modeling Visualization Notebook

This notebook provides interactive visualization of LDA topic modeling results.

## Running this Notebook

To run this notebook:

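1. Make sure you have run the topic modeling system first to generate the necessary output files in `topic_analysis_results/` (`user_topic_data.csv` and `suspicious_users.csv` are required; `topics.csv` and `temporal_topic_data.pkl` are optional and their sections are skipped when missing)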
2. Install the required packages: `pip install pandas numpy matplotlib seaborn plotly ipywidgets wordcloud scikit-learn networkx`
3. Start Jupyter: `jupyter notebook` or `jupyter lab`
4. Open this notebook and run all cells

## Customization

You can customize this notebook by:

- Changing the file paths at the top if your results are in a different location
- Adjusting the visualization parameters (colors, sizes, etc.)
- Adding new visualizations based on your specific analysis needs




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display, HTML
import ipywidgets as widgets
from wordcloud import WordCloud
import datetime
import os

In [None]:
# Locations of the two result files this notebook always reads
USER_TOPIC_DATA_PATH = "topic_analysis_results/user_topic_data.csv"
SUSPICIOUS_USERS_PATH = "topic_analysis_results/suspicious_users.csv"

# Read both CSV files into DataFrames
user_topic_df, suspicious_df = (
    pd.read_csv(csv_path)
    for csv_path in (USER_TOPIC_DATA_PATH, SUSPICIOUS_USERS_PATH)
)

# Topic columns are recognised by their 'Topic_' prefix
topic_cols = list(filter(lambda name: name.startswith('Topic_'), user_topic_df.columns))
num_topics = len(topic_cols)

# One row per user; suspicious users are counted by summing the flag column
total_users = user_topic_df.shape[0]
suspicious_users = sum(suspicious_df['suspicious'].tolist())

print(f"Loaded topic data for {total_users} users across {num_topics} topics")
print(f"Found {suspicious_users} suspicious users")


# 1. Topic Word Distributions (if available)

In [None]:
# This cell is optional: it only runs when the modeling step exported a
# topics.csv file (columns read below: topic_id, word, weight).
topics_file = "topic_analysis_results/topics.csv"
if os.path.exists(topics_file):
    topics_df = pd.read_csv(topics_file)

    def display_topic_words(topic_id=None):
        """Render a word cloud and a top-15 bar chart for one topic.

        Args:
            topic_id: Value of the ``topic_id`` column to plot.
                ``widgets.interactive_output`` passes the slider value under
                this keyword; when omitted, the current value of
                ``topic_selector`` is used instead.
        """
        if topic_id is None:
            topic_id = topic_selector.value
        topic_words = topics_df[topics_df['topic_id'] == topic_id]

        # A topic id without rows would otherwise make WordCloud fail on an
        # empty frequency table.
        if topic_words.empty:
            print(f"No words found for topic {topic_id}")
            return

        # Create word cloud
        word_dict = dict(zip(topic_words['word'], topic_words['weight']))
        wc = WordCloud(width=800, height=400, background_color='white',
                       colormap='viridis', max_words=30)
        wc.generate_from_frequencies(word_dict)

        # Display
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Top Words for Topic {topic_id}')
        plt.show()

        # Also show the 15 heaviest words as a bar chart
        top_words = topic_words.sort_values('weight', ascending=False).head(15)
        plt.figure(figsize=(12, 5))
        sns.barplot(x='weight', y='word', data=top_words)
        plt.title(f'Top 15 Words for Topic {topic_id}')
        plt.tight_layout()
        plt.show()

    # Create a topic selector widget; ids are assumed to run 0..num_topics-1
    topic_selector = widgets.IntSlider(
        value=0,
        min=0,
        max=num_topics-1,
        step=1,
        description='Topic:',
        continuous_update=False
    )

    # Display interactive widget; the callback receives topic_id=<slider value>
    display(widgets.VBox([
        widgets.HTML('<h3>Topic Word Distributions</h3>'),
        topic_selector,
        widgets.interactive_output(display_topic_words, {'topic_id': topic_selector})
    ]))
else:
    print("Topics file not found. Skipping topic word visualization.")


# 2. User Topic Distribution Visualization


In [1]:
def plot_user_topics(user_id=None):
    """Show one user's topic mix as a donut chart and print their metrics.

    Topics whose proportion is at most 0.03 are folded into a single
    "Other" slice. After the chart, the user's concentration metrics are
    printed, followed by the reasons for a suspicious flag if one is set in
    ``suspicious_df``.

    Args:
        user_id: Value of the ``user_id`` column to plot.
            ``widgets.interactive_output`` passes the dropdown value under
            this keyword; when omitted, the current value of
            ``user_selector`` is used instead.
    """
    if user_id is None:
        user_id = user_selector.value

    # Get topic distribution for this user (first matching row)
    user_rows = user_topic_df[user_topic_df['user_id'] == user_id]
    if user_rows.empty:
        print(f"No topic data available for user {user_id}")
        return
    user_data = user_rows.iloc[0]

    # Extract topic proportions
    topic_props = [user_data[f'Topic_{i}'] for i in range(num_topics)]

    # Only include topics with significant contribution, largest first
    threshold = 0.03
    significant_topics = sorted(
        ((i, prop) for i, prop in enumerate(topic_props) if prop > threshold),
        key=lambda pair: pair[1],
        reverse=True,
    )
    other_prop = sum(prop for prop in topic_props if prop <= threshold)

    # Create labels and values; the remainder becomes one "Other" slice
    labels = [f'Topic {i} ({prop:.2f})' for i, prop in significant_topics]
    values = [prop for _, prop in significant_topics]
    if other_prop > 0:
        labels.append(f'Other ({other_prop:.2f})')
        values.append(other_prop)

    # Create pie chart with Plotly
    fig = go.Figure(data=[go.Pie(
        labels=labels,
        values=values,
        textinfo='percent',
        hoverinfo='label+percent',
        hole=0.3
    )])

    fig.update_layout(
        title=f'Topic Distribution for User {user_id}',
        height=500
    )

    fig.show()

    # Display user metrics
    print(f"User: {user_id}")
    print(f"Post count: {user_data['post_count']}")
    print(f"Gini coefficient: {user_data['gini_coefficient']:.4f}")
    print(f"Shannon entropy: {user_data['shannon_entropy']:.4f}")
    print(f"Top-1 topic ratio: {user_data['top1_ratio']:.4f}")
    print(f"Top-2 topics ratio: {user_data['top2_ratio']:.4f}")

    # Check if user is suspicious
    susp_rows = suspicious_df[suspicious_df['user_id'] == user_id]
    if susp_rows.empty:
        return
    susp_data = susp_rows.iloc[0]
    if not susp_data['suspicious']:
        return

    print("⚠️ This user has been flagged as suspicious")
    reason_by_flag = (
        ('suspicious_post_freq', "unusually high posting frequency"),
        ('suspicious_narrowness', "extremely narrow topic focus"),
        ('suspicious_duplicates', "high rate of similar/duplicate content"),
    )
    reasons = [text for flag, text in reason_by_flag if susp_data[flag]]
    # Skip the line entirely rather than print an empty "Reasons: "
    if reasons:
        print(f"Reasons: {', '.join(reasons)}")

# Create user selector from the distinct user ids in the topic data
user_list = sorted(user_topic_df['user_id'].unique())
user_selector = widgets.Dropdown(
    options=user_list,
    # An empty results file must not raise IndexError here; the temporal
    # user selector in this notebook uses the same guard.
    value=user_list[0] if user_list else None,
    description='User:',
    style={'description_width': 'initial'}
)

# Display interactive widget; the callback receives user_id=<dropdown value>
display(widgets.VBox([
    widgets.HTML('<h3>User Topic Distribution</h3>'),
    user_selector,
    widgets.interactive_output(plot_user_topics, {'user_id': user_selector})
]))


NameError: name 'user_topic_df' is not defined

# 3. Topic Concentration Visualization


In [None]:
# Scatter plot of a selectable concentration metric vs post count
def plot_topic_concentration(metric=None, log_scale=None):
    """Plot a concentration metric against post count for every user.

    Points are coloured by ``dominant_topic`` and sized by ``post_count``.
    A dashed red line marks the mean of the metric, and users flagged in
    ``suspicious_df`` are overlaid as red markers.

    Args:
        metric: Name of the ``user_topic_df`` column for the y axis.
            Defaults to the current value of ``metric_selector``.
        log_scale: Whether the post-count axis is logarithmic. Defaults to
            the current value of the ``log_scale`` checkbox.

    ``widgets.interactive_output`` passes both values as keyword arguments.
    """
    if metric is None:
        metric = metric_selector.value
    if log_scale is None:
        # The parameter shadows the module-level checkbox of the same name,
        # so the widget has to be looked up through globals().
        log_scale = globals()['log_scale'].value

    metric_label = metric.replace('_', ' ').title()

    # Create figure
    fig = px.scatter(
        user_topic_df,
        x='post_count',
        y=metric,
        hover_data=['user_id', 'post_count', 'gini_coefficient', 'shannon_entropy', 'top1_ratio'],
        color='dominant_topic',
        size='post_count',
        size_max=30,
        opacity=0.7,
        log_x=log_scale,
        title=f'User Topic Concentration ({metric_label}) vs Post Frequency'
    )

    # Add reference line for mean
    mean_val = user_topic_df[metric].mean()
    fig.add_hline(y=mean_val, line_dash="dash", line_color="red",
                  annotation_text=f"Mean: {mean_val:.3f}",
                  annotation_position="top right")

    # Highlight suspicious users if available
    if 'suspicious' in suspicious_df.columns:
        susp_users = suspicious_df[suspicious_df['suspicious']]['user_id'].tolist()
        susp_df = user_topic_df[user_topic_df['user_id'].isin(susp_users)]

        fig.add_trace(
            go.Scatter(
                x=susp_df['post_count'],
                y=susp_df[metric],
                mode='markers',
                marker=dict(
                    color='red',
                    size=15,
                    line=dict(color='black', width=2)
                ),
                name='Suspicious',
                text=susp_df['user_id'],
                hoverinfo='text'
            )
        )

    fig.update_layout(
        xaxis_title="Number of Posts",
        yaxis_title=f"{metric_label}",
        height=600,
        width=900
    )

    fig.show()

# Checkbox that switches the post-count axis to a logarithmic scale
log_scale = widgets.Checkbox(
    description='Log Scale for Post Count',
    value=False,
    style=dict(description_width='initial'),
)

# Dropdown of (label, column name) pairs for the concentration metric
metric_selector = widgets.Dropdown(
    description='Metric:',
    options=[
        ('Gini Coefficient', 'gini_coefficient'),
        ('Shannon Entropy', 'shannon_entropy'),
        ('Top-1 Topic Ratio', 'top1_ratio'),
        ('Top-2 Topics Ratio', 'top2_ratio'),
    ],
    value='gini_coefficient',
    style=dict(description_width='initial'),
)

# Heading, then the two controls side by side, then the plot output that
# is tied to both controls
display(widgets.VBox(children=[
    widgets.HTML('<h3>Topic Concentration Analysis</h3>'),
    widgets.HBox(children=[metric_selector, log_scale]),
    widgets.interactive_output(
        plot_topic_concentration,
        dict(metric=metric_selector, log_scale=log_scale),
    ),
]))

# 4. Suspicious Users Analysis


In [None]:
# This section only renders when at least one user is flagged as suspicious
if 'suspicious' in suspicious_df.columns and sum(suspicious_df['suspicious']) > 0:
    susp_users = suspicious_df[suspicious_df['suspicious']]

    # Display table of the first 10 suspicious users, shading the metrics
    display(widgets.HTML('<h3>Suspicious User Analysis</h3>'))
    display(susp_users[['user_id', 'post_count', 'gini_coefficient', 'top1_ratio',
                        'duplicate_post_ratio']].head(10).style.background_gradient(
                            cmap='Reds', subset=['gini_coefficient', 'top1_ratio', 'duplicate_post_ratio']
                        ))

    def plot_suspicious_comparison(metric=None):
        """Compare a metric's distribution for suspicious vs normal users.

        Every user in ``user_topic_df`` is labelled 'Suspicious' or 'Normal'
        from the flag in ``suspicious_df`` (users missing there count as
        normal) and the chosen metric is drawn as a violin plot per group.

        Args:
            metric: Column name to plot. ``widgets.interactive_output``
                passes the dropdown value under this keyword; when omitted,
                the current value of ``susp_metric_selector`` is used.
        """
        if metric is None:
            metric = susp_metric_selector.value

        # Bring the metric along when only the suspicious-user file carries
        # it; otherwise a selectable metric could be missing from the plot
        # data (the table above reads duplicate_post_ratio from that file).
        flag_columns = ['user_id', 'suspicious']
        if metric not in user_topic_df.columns and metric in suspicious_df.columns:
            flag_columns.append(metric)

        # Merge data
        merged_df = user_topic_df.merge(
            suspicious_df[flag_columns],
            on='user_id',
            how='left'
        )
        # Users absent from suspicious_df get NaN from the left join; treat
        # them as not suspicious and keep the column as real booleans.
        merged_df['suspicious'] = merged_df['suspicious'].fillna(False).astype(bool)
        merged_df['user_type'] = merged_df['suspicious'].map({True: 'Suspicious', False: 'Normal'})

        # Create violin plot
        metric_label = metric.replace('_', ' ').title()
        fig = px.violin(
            merged_df,
            x='user_type',
            y=metric,
            color='user_type',
            box=True,
            points='all',
            hover_data=['user_id', 'post_count'],
            title=f'Distribution of {metric_label}: Suspicious vs Normal Users'
        )

        fig.update_layout(height=500, width=700)
        fig.show()

    # Create metric selector for suspicious user analysis
    susp_metric_selector = widgets.Dropdown(
        options=[
            ('Gini Coefficient', 'gini_coefficient'),
            ('Shannon Entropy', 'shannon_entropy'),
            ('Top-1 Topic Ratio', 'top1_ratio'),
            ('Top-2 Topics Ratio', 'top2_ratio'),
            ('Duplicate Post Ratio', 'duplicate_post_ratio')
        ],
        value='gini_coefficient',
        description='Metric:',
        style={'description_width': 'initial'}
    )

    # Display interactive widget; the callback receives metric=<dropdown value>
    display(widgets.VBox([
        widgets.HTML('<h3>Suspicious vs Normal Users</h3>'),
        susp_metric_selector,
        widgets.interactive_output(plot_suspicious_comparison, {'metric': susp_metric_selector})
    ]))

# 5. Temporal Topic Analysis


In [None]:
# Load temporal data if available
temporal_data_file = "topic_analysis_results/temporal_topic_data.pkl"
if os.path.exists(temporal_data_file):
    import pickle
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only open a file produced by your own topic modeling run.
    with open(temporal_data_file, 'rb') as f:
        temporal_topic_data = pickle.load(f)

    def plot_temporal_topics(user_id=None, show_individual=None):
        """Plot how one user's topic proportions change over time.

        The five topics with the highest mean proportion are drawn as a
        stacked area chart and all remaining topics are summed into an
        'Other' band. Optionally a second chart shows the same five topics
        as separate lines.

        Assumes each entry of ``temporal_topic_data`` is a DataFrame indexed
        by time with one column per topic -- TODO confirm against the
        producer of the pickle file.

        Args:
            user_id: Key into ``temporal_topic_data``. Defaults to the
                current value of ``temporal_user_selector``.
            show_individual: Whether to draw the per-topic line chart.
                Defaults to the current value of the ``show_individual``
                checkbox.

        ``widgets.interactive_output`` passes both values as keyword
        arguments.
        """
        if user_id is None:
            user_id = temporal_user_selector.value
        if show_individual is None:
            # The parameter shadows the module-level checkbox of the same
            # name, so the widget has to be looked up through globals().
            show_individual = globals()['show_individual'].value

        if user_id not in temporal_topic_data:
            print(f"No temporal data available for user {user_id}")
            return

        # Get temporal data for this user
        user_temporal = temporal_topic_data[user_id]

        # Only include top topics
        top_n = 5
        avg_dist = user_temporal.mean()
        top_topics = avg_dist.nlargest(top_n).index.tolist()

        # Create 'Other' category for remaining topics
        user_temporal_plot = user_temporal.copy()
        user_temporal_plot['Other'] = user_temporal_plot.drop(columns=top_topics).sum(axis=1)

        # Plot using plotly
        fig = go.Figure()

        # Add one stacked band per top topic, then the 'Other' band last
        for topic in top_topics + ['Other']:
            fig.add_trace(go.Scatter(
                x=user_temporal_plot.index,
                y=user_temporal_plot[topic],
                mode='lines',
                stackgroup='one',
                name=topic
            ))

        fig.update_layout(
            title=f'Topic Evolution Over Time for User {user_id}',
            xaxis_title='Time',
            yaxis_title='Topic Proportion',
            hovermode='x unified',
            height=500,
            width=900
        )

        fig.show()

        # Also show line chart for individual topic evolution
        if show_individual:
            fig2 = go.Figure()

            for topic in top_topics:
                fig2.add_trace(go.Scatter(
                    x=user_temporal.index,
                    y=user_temporal[topic],
                    mode='lines+markers',
                    name=topic
                ))

            fig2.update_layout(
                title=f'Individual Topic Evolution for User {user_id}',
                xaxis_title='Time',
                yaxis_title='Topic Proportion',
                height=500,
                width=900
            )

            fig2.show()

    # Create user selector for temporal analysis
    temporal_user_list = sorted(temporal_topic_data)
    temporal_user_selector = widgets.Dropdown(
        options=temporal_user_list,
        value=temporal_user_list[0] if temporal_user_list else None,
        description='User:',
        style={'description_width': 'initial'}
    )

    # Create toggle for showing individual topic lines
    show_individual = widgets.Checkbox(
        value=False,
        description='Show Individual Topic Lines',
        style={'description_width': 'initial'}
    )

    # Display interactive widget; the callback receives both control values
    # as keyword arguments
    display(widgets.VBox([
        widgets.HTML('<h3>Temporal Topic Analysis</h3>'),
        widgets.HBox([temporal_user_selector, show_individual]),
        widgets.interactive_output(plot_temporal_topics, {
            'user_id': temporal_user_selector,
            'show_individual': show_individual
        })
    ]))
else:
    print("Temporal topic data not found. Skipping temporal analysis.")

# 6. Topic Similarity and Clustering


In [None]:
# Topic similarity visualization based on how topics co-occur across users

def plot_topic_similarity(show_network=None, min_similarity=None):
    """Show topic-to-topic similarity as a heatmap and optional network.

    Each topic is represented by its column of per-user proportions in
    ``user_topic_df``; similarity is the cosine similarity between those
    columns. In the network view, two topics are linked when their
    similarity is at least ``min_similarity`` and nodes are coloured by
    their number of links.

    Args:
        show_network: Whether to draw the network view. Defaults to the
            current value of the ``show_network`` checkbox.
        min_similarity: Minimum similarity for an edge. Defaults to the
            current value of the ``min_similarity`` slider.

    ``widgets.interactive_output`` passes both values as keyword arguments.
    """
    # The parameters shadow the module-level widgets of the same names, so
    # the widgets have to be looked up through globals().
    if show_network is None:
        show_network = globals()['show_network'].value
    if min_similarity is None:
        min_similarity = globals()['min_similarity'].value

    topic_labels = [f'Topic {i}' for i in range(num_topics)]

    # Create a similarity matrix based on user distributions
    # (one row per topic, one column per user)
    topic_vectors = np.array([user_topic_df[f'Topic_{i}'].values for i in range(num_topics)])

    # Calculate cosine similarity between topics
    from sklearn.metrics.pairwise import cosine_similarity
    similarity_matrix = cosine_similarity(topic_vectors)

    # Create heatmap
    fig = px.imshow(
        similarity_matrix,
        labels=dict(x="Topic", y="Topic", color="Similarity"),
        x=topic_labels,
        y=topic_labels,
        color_continuous_scale='Viridis'
    )

    fig.update_layout(
        title='Topic Similarity Matrix',
        height=600,
        width=800
    )

    fig.show()

    if not show_network:
        return

    # Create a network visualization of topic relationships
    import networkx as nx

    # Every topic is a node, even when it ends up with no edges
    G = nx.Graph()
    G.add_nodes_from(range(num_topics))

    # Add edges with weight based on similarity
    threshold = min_similarity
    for i in range(num_topics):
        for j in range(i+1, num_topics):
            if similarity_matrix[i, j] >= threshold:
                G.add_edge(i, j, weight=similarity_matrix[i, j])

    # Create positions (fixed seed keeps the layout stable between redraws)
    pos = nx.spring_layout(G, seed=42)

    # Create edge trace; None breaks the line between consecutive edges
    edge_x = []
    edge_y = []

    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='rgba(150,150,150,0.8)'),
        hoverinfo='none',
        mode='lines')

    # Create node trace; nodes were added as 0..num_topics-1, so iteration
    # order matches topic_labels
    node_x = [pos[node][0] for node in G.nodes()]
    node_y = [pos[node][1] for node in G.nodes()]

    # Color nodes by number of connections
    node_adjacencies = [len(neighbors) for _, neighbors in G.adjacency()]

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=topic_labels,
        textposition="top center",
        marker=dict(
            showscale=True,
            colorscale='Viridis',
            color=node_adjacencies,
            size=20,
            colorbar=dict(
                thickness=15,
                # `titleside` is rejected by current Plotly releases; the
                # nested title dict is the supported spelling.
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line_width=2))

    # Create figure
    network_fig = go.Figure(data=[edge_trace, node_trace],
                     layout=go.Layout(
                        title='Topic Similarity Network',
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20,l=5,r=5,t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        width=700,
                        height=600
                     ))

    network_fig.show()

# Controls for the topic similarity view: a toggle for the network chart
# and the similarity cut-off used when adding its edges
show_network = widgets.Checkbox(
    description='Show Network Visualization',
    value=True,
    style=dict(description_width='initial'),
)

min_similarity = widgets.FloatSlider(
    description='Min Similarity:',
    min=0.0,
    max=1.0,
    step=0.05,
    value=0.3,
    style=dict(description_width='initial'),
)

# Heading, then both controls on one row, then the plot output that is
# tied to both controls
display(widgets.VBox(children=[
    widgets.HTML('<h3>Topic Similarity Analysis</h3>'),
    widgets.HBox(children=[show_network, min_similarity]),
    widgets.interactive_output(
        plot_topic_similarity,
        dict(show_network=show_network, min_similarity=min_similarity),
    ),
]))

# 7. User Clustering based on Topic Distribution


In [None]:
def plot_user_clustering(num_clusters=None, show_cluster_stats=None, show_heatmap=None):
    """Cluster users by topic distribution and plot them in 2-D.

    Users are clustered with K-means on their topic proportions and placed
    with a t-SNE projection of the same matrix. Suspicious users are
    overlaid as red markers. Optionally a per-cluster statistics table, the
    top topics of each cluster and a cluster-by-topic heatmap are shown;
    the heatmap is only drawn when the statistics are enabled as well.

    Args:
        num_clusters: Requested number of K-means clusters. Defaults to the
            current value of the ``num_clusters`` slider.
        show_cluster_stats: Whether to show the statistics table and the
            dominant topics. Defaults to the ``show_cluster_stats`` checkbox.
        show_heatmap: Whether to show the cluster-topic heatmap. Defaults
            to the ``show_heatmap`` checkbox.

    ``widgets.interactive_output`` passes all three values as keyword
    arguments.
    """
    from sklearn.manifold import TSNE
    from sklearn.cluster import KMeans

    # The parameters shadow the module-level widgets of the same names, so
    # the widgets have to be looked up through globals().
    notebook_globals = globals()
    if num_clusters is None:
        num_clusters = notebook_globals['num_clusters'].value
    if show_cluster_stats is None:
        show_cluster_stats = notebook_globals['show_cluster_stats'].value
    if show_heatmap is None:
        show_heatmap = notebook_globals['show_heatmap'].value

    # Extract topic distributions for each user in one vectorised step
    # (rows are users, columns are topics)
    topic_columns = [f'Topic_{j}' for j in range(num_topics)]
    X = user_topic_df[topic_columns].to_numpy(dtype=float)

    # The perplexity passed to t-SNE below is only valid from two users
    # upwards, and K-means cannot form more clusters than there are users.
    n_users = len(X)
    if n_users < 2:
        print("At least two users are required for clustering.")
        return
    n_clusters = num_clusters
    if n_clusters > n_users:
        print(f"Only {n_users} users available; using {n_users} clusters instead of {n_clusters}.")
        n_clusters = n_users

    # Apply dimensionality reduction
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_users - 1))
    X_tsne = tsne.fit_transform(X)

    # Apply clustering (on the full topic vectors, not the 2-D projection)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)

    # Create DataFrame for plotting
    plot_df = pd.DataFrame({
        'x': X_tsne[:, 0],
        'y': X_tsne[:, 1],
        'user_id': user_topic_df['user_id'],
        'cluster': clusters,
        'post_count': user_topic_df['post_count'],
        'gini': user_topic_df['gini_coefficient']
    })

    # Add suspicious flag if available; users missing from suspicious_df
    # count as not suspicious, and the column is kept as real booleans so
    # it can be used as a mask below
    if 'suspicious' in suspicious_df.columns:
        susp_dict = dict(zip(suspicious_df['user_id'], suspicious_df['suspicious']))
        plot_df['suspicious'] = plot_df['user_id'].map(susp_dict).fillna(False).astype(bool)

    # Create scatter plot
    fig = px.scatter(
        plot_df,
        x='x',
        y='y',
        color='cluster',
        size='post_count',
        hover_data=['user_id', 'gini', 'post_count'],
        size_max=20,
        opacity=0.7,
        title=f'User Clustering based on Topic Distribution (K-means, {n_clusters} clusters)'
    )

    # Highlight suspicious users if available
    if 'suspicious' in plot_df.columns:
        susp_df = plot_df[plot_df['suspicious']]

        fig.add_trace(
            go.Scatter(
                x=susp_df['x'],
                y=susp_df['y'],
                mode='markers',
                marker=dict(
                    color='red',
                    size=15,
                    line=dict(color='black', width=2)
                ),
                name='Suspicious',
                text=susp_df['user_id'],
                hoverinfo='text'
            )
        )

    fig.update_layout(
        xaxis_title="t-SNE Dimension 1",
        yaxis_title="t-SNE Dimension 2",
        height=700,
        width=900
    )

    fig.show()

    if not show_cluster_stats:
        return

    # Show cluster statistics
    cluster_stats = plot_df.groupby('cluster').agg({
        'user_id': 'count',
        'gini': 'mean',
        'post_count': 'mean'
    }).reset_index()

    cluster_stats.columns = ['Cluster', 'User Count', 'Avg Gini', 'Avg Post Count']
    display(cluster_stats.style.background_gradient(cmap='Blues'))

    # Show dominant topics per cluster
    print("\nDominant Topics per Cluster:")
    cluster_topic_means = []

    for cluster_id in range(n_clusters):
        # Mean topic distribution over the rows assigned to this cluster
        topic_means = X[clusters == cluster_id].mean(axis=0)

        # Up to three strongest topics, strongest first (fewer when the
        # model has fewer than three topics)
        top_topics = np.argsort(topic_means)[::-1][:3]
        summary = ", ".join(
            f"Topic {topic} ({topic_means[topic]:.3f})" for topic in top_topics
        )
        print(f"Cluster {cluster_id}: {summary}")

        cluster_topic_means.append(topic_means)

    # Create heatmap of cluster-topic distributions
    if show_heatmap:
        topic_labels = [f'Topic {i}' for i in range(num_topics)]
        cluster_topic_df = pd.DataFrame(cluster_topic_means, columns=topic_labels)

        fig = px.imshow(
            cluster_topic_df,
            labels=dict(x="Topic", y="Cluster", color="Mean Weight"),
            x=topic_labels,
            y=[f'Cluster {i}' for i in range(n_clusters)],
            color_continuous_scale='Viridis',
            aspect="auto"
        )

        fig.update_layout(
            title='Cluster-Topic Distribution Heatmap',
            height=400,
            width=900
        )

        fig.show()

# Controls for the user clustering view: the cluster count and two toggles
# for the optional statistics and heatmap outputs
num_clusters = widgets.IntSlider(
    description='Clusters:',
    min=2,
    max=15,
    step=1,
    value=5,
    style=dict(description_width='initial'),
)

show_cluster_stats = widgets.Checkbox(
    description='Show Cluster Statistics',
    value=True,
    style=dict(description_width='initial'),
)

show_heatmap = widgets.Checkbox(
    description='Show Cluster-Topic Heatmap',
    value=True,
    style=dict(description_width='initial'),
)

# Heading, then the three controls on one row, then the plot output that
# is tied to all of them
display(widgets.VBox(children=[
    widgets.HTML('<h3>User Clustering Analysis</h3>'),
    widgets.HBox(children=[num_clusters, show_cluster_stats, show_heatmap]),
    widgets.interactive_output(
        plot_user_clustering,
        dict(
            num_clusters=num_clusters,
            show_cluster_stats=show_cluster_stats,
            show_heatmap=show_heatmap,
        ),
    ),
]))