In [2]:
import os
import re
import datetime
import matplotlib.pyplot as plt
from collections import defaultdict
from fileStreams import getFileJsonStream

# Configuration.
# Maps a subreddit label to the path of its compressed comment dump.
# Iteration order of this dict is the order in which subreddits are processed.
subreddits = {
    "democrats": r"datasets/democrats_comments.zst",
    "republican": r"datasets/Republican_comments.zst",
    # Additional dumps, currently disabled:
    # "conservative": r"datasets/Conservative_comments.zst",
    # "liberal": r"datasets/Liberal_comments.zst",
    # "backpacking": r"datasets/backpacking_comments.zst",
    # "vagabond": r"datasets/vagabond_comments.zst",
}

# Author names whose comments are counted as bot comments.
bots = {
    "AutoModerator",
    "election_info_bot",
}

# Period labels, in the order they are reported and plotted.
periods = [
    "before_2016",
    "2017_2020",
    "2021_2024",
]

def get_period(year):
    """Return the period label that *year* falls into, or ``None``.

    Years up to and including 2016 map to ``"before_2016"``; 2017-2020 map to
    ``"2017_2020"``; 2021-2024 map to ``"2021_2024"``. Any other value yields
    ``None``.
    """
    # Open-ended lower bucket: everything up to and including 2016.
    if year <= 2016:
        return "before_2016"

    # Closed ranges (both ends inclusive) checked in chronological order.
    bounded_ranges = (
        (2017, 2020, "2017_2020"),
        (2021, 2024, "2021_2024"),
    )
    for first_year, last_year, label in bounded_ranges:
        if first_year <= year <= last_year:
            return label
    return None

# Tallies per subreddit and period:
#   results[subreddit][period] -> {"bot": <bot comment count>, "total": <comment count>}
results = defaultdict(lambda: defaultdict(lambda: {"bot": 0, "total": 0}))

for subreddit, path in subreddits.items():
    print(f"Processing {subreddit} ...")
    with open(path, "rb") as f:
        stream = getFileJsonStream(path, f)
        if stream is None:
            print(f"Cannot open {path}")
            continue
        for row in stream:
            # Rows lacking either field cannot be attributed or dated; skip them.
            if "author" not in row or "created_utc" not in row:
                continue
            try:
                # Convert with an explicit UTC timezone (the field name indicates
                # a UTC epoch timestamp). A naive fromtimestamp() would use the
                # machine's local timezone and could shift comments made near
                # New Year into the neighbouring year, and thus the wrong period.
                year = datetime.datetime.fromtimestamp(
                    int(row["created_utc"]), tz=datetime.timezone.utc
                ).year
            except (TypeError, ValueError, OverflowError, OSError):
                # Non-numeric or out-of-range timestamp: skip the row.
                continue
            period = get_period(year)
            if period is None:
                # Year falls outside every reporting period.
                continue
            counts = results[subreddit][period]
            counts["total"] += 1
            if row["author"] in bots:
                counts["bot"] += 1

# Report and plot: for every subreddit, print the per-period and overall bot
# share, then save a bar chart of the per-period percentages.

# savefig() does not create missing directories, so make sure the target exists.
os.makedirs("output", exist_ok=True)

for subreddit in subreddits:
    print(f"\n==== {subreddit} ====")
    total_all = 0
    bot_all = 0
    bot_percents = []  # one percentage per entry of `periods`, same order
    for period in periods:
        total = results[subreddit][period]["total"]
        bot = results[subreddit][period]["bot"]
        # Guard against division by zero for periods with no comments.
        percent = (bot / total * 100) if total > 0 else 0
        total_all += total
        bot_all += bot
        bot_percents.append(percent)
        print(f"{period}: total={total}, bot={bot}, percent={percent:.2f}%")
    percent_all = (bot_all / total_all * 100) if total_all > 0 else 0
    print(f"ALL: total={total_all}, bot={bot_all}, percent={percent_all:.2f}%")

    plt.figure(figsize=(6,4))
    plt.bar(periods, bot_percents, color="#ff7f0e", alpha=0.7)
    plt.title(f"{subreddit}: % Comments from Bots (AutoModerator/election_info_bot)")
    plt.ylabel("% of Comments")
    plt.xlabel("Period")
    # Leave headroom above the tallest bar for its value label.
    plt.ylim(0, max(bot_percents)*1.2 + 1)
    plt.grid(axis="y", alpha=0.3)
    # Annotate each bar with its percentage, slightly above the bar top.
    for i, v in enumerate(bot_percents):
        plt.text(i, v+0.5, f"{v:.2f}%", ha="center", fontsize=10)
    plt.tight_layout()
    plt.savefig(f"output/bot_ratio_{subreddit}.png")
    # Close the figure so figures do not accumulate across subreddits.
    plt.close()
    print(f"Saved plot for {subreddit}")

print("Done.")

Processing democrats ...
Processing republican ...

==== democrats ====
before_2016: total=130578, bot=1368, percent=1.05%
2017_2020: total=494506, bot=15188, percent=3.07%
2021_2024: total=1386441, bot=35765, percent=2.58%
ALL: total=2011525, bot=52321, percent=2.60%
Saved plot for democrats

==== republican ====
before_2016: total=266598, bot=225, percent=0.08%
2017_2020: total=454438, bot=33965, percent=7.47%
2021_2024: total=684450, bot=63398, percent=9.26%
ALL: total=1405486, bot=97588, percent=6.94%
Saved plot for republican
Done.
