In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import math

# Tunable settings for the analysis
top_n = 50  # how many subreddits each bar chart shows
case_sensitive = False  # when False, subreddit names are lower-cased before matching
thresholds = [1_000, 10_000, 100_000]  # minimum `unique_users` a subreddit needs, one chart per value

# Read the three input tables from the working directory
submissions = pd.read_csv("filtered_submissions.csv")
comments = pd.read_csv("filtered_comments.csv")
subreddit_user_counts = pd.read_csv("subreddit_user_counts.csv")

# Unless case-sensitive matching was requested, fold every subreddit name to
# lower case so the three tables agree on spelling
if not case_sensitive:
    for _frame in (submissions, comments, subreddit_user_counts):
        _frame['subreddit'] = _frame['subreddit'].str.lower()
    del _frame  # do not leave the loop variable behind in the module namespace

# Helper and entry point for the per-subreddit bar charts
def _plot_bars(series, ax, title, ylabel, color):
    """Draw `series` as a bar chart on `ax`, or a titled placeholder if it is empty."""
    if series.empty:
        # pandas refuses to bar-plot an empty Series, so label the panel
        # instead of letting the whole figure fail.
        ax.set_title(title)
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
    else:
        series.plot(kind='bar', ax=ax, color=color, title=title)
    ax.set_ylabel(ylabel)


def process_and_plot(data, user_counts, title_prefix):
    """Plot the top subreddits by raw and by normalized distinct-author counts.

    The first panel shows the `top_n` subreddits with the most distinct
    authors in `data`. Each further panel corresponds to one entry of the
    module-level `thresholds`: it keeps the subreddits whose `unique_users`
    is at least that threshold, divides their distinct-author count by
    `unique_users`, and shows the `top_n` largest ratios. Bars are green when
    the subreddit also appears in the raw top-`top_n` panel, blue otherwise.

    Args:
        data: DataFrame with at least the columns 'subreddit' and 'author'.
        user_counts: DataFrame with at least the columns 'subreddit' and
            'unique_users'.
        title_prefix: Text placed at the start of every panel title.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    # Distinct authors per subreddit
    sw_user_counts = data.groupby('subreddit')['author'].nunique()

    # One panel for the raw counts plus one per threshold, two panels per row.
    num_panels = len(thresholds) + 1
    num_rows = math.ceil(num_panels / 2)
    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row; otherwise zero or one threshold would yield a 1-D array.
    fig, axes = plt.subplots(
        nrows=num_rows, ncols=2, figsize=(20, 6 * num_rows), squeeze=False
    )
    panels = axes.ravel()  # row-major: panels[i] is axes[divmod(i, 2)]

    # Raw counts
    top_user_counts = sw_user_counts.sort_values(ascending=False).head(top_n)
    _plot_bars(
        top_user_counts,
        panels[0],
        f'{title_prefix} by Raw SW User Count',
        'Raw SW User Count',
        'blue',
    )
    raw_count_subreddits = set(top_user_counts.index)

    # Loop-invariant: total users indexed by subreddit name
    all_totals = user_counts.set_index('subreddit')['unique_users']

    for ax, threshold in zip(panels[1:], thresholds):
        # Keep subreddits at or above the threshold; also drop zero totals,
        # which would otherwise produce inf/NaN ratios for thresholds <= 0.
        totals = all_totals[(all_totals >= threshold) & (all_totals > 0)]
        # Names can collide (e.g. after lower-casing); keep the last row per
        # name, matching a dict that is filled row by row.
        totals = totals[~totals.index.duplicated(keep='last')]

        # Subreddits with no authors in `data` get a count of 0.
        normalized = sw_user_counts.reindex(totals.index, fill_value=0) / totals
        top_normalized = normalized.sort_values(ascending=False).head(top_n)
        bar_colors = [
            'green' if sub in raw_count_subreddits else 'blue'
            for sub in top_normalized.index
        ]

        _plot_bars(
            top_normalized,
            ax,
            f'{title_prefix} (Threshold: {threshold})',
            'Normalized SW User Post Count',
            bar_colors,
        )

    # With an even number of thresholds the last grid cell has nothing to show.
    for ax in panels[num_panels:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw the figure for submissions, then the one for comments
process_and_plot(
    data=submissions,
    user_counts=subreddit_user_counts,
    title_prefix='Top Subreddits for Submissions',
)
process_and_plot(
    data=comments,
    user_counts=subreddit_user_counts,
    title_prefix='Top Subreddits for Comments',
)
