In [None]:
import os
import pickle
from tqdm import tqdm
from collections import defaultdict
from fileStreams import getFileJsonStream

def collect_comments_by_user(zst_path, skip_authors=()):
    """Group the comment bodies found in one dump file by their author.

    The file is opened in binary mode and handed to ``getFileJsonStream``,
    which is expected to yield one mapping per comment (presumably it picks a
    decoder from the file extension -- confirm in ``fileStreams``).

    Args:
        zst_path: Path of the comment dump to read.
        skip_authors: Optional author name, or iterable of author names, whose
            comments are dropped. Useful for placeholder authors such as
            ``"[deleted]"``, which would otherwise be aggregated as if they
            were a single (very large) user. Defaults to skipping nobody.

    Returns:
        A plain ``dict`` mapping each author to the list of that author's
        comment bodies, in the order they appear in the stream.

    Raises:
        ValueError: If ``getFileJsonStream`` returns ``None`` for this file.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(skip_authors, str):
        skip_authors = (skip_authors,)
    skipped = frozenset(skip_authors)

    user_comments = defaultdict(list)
    with open(zst_path, "rb") as f:
        jsonStream = getFileJsonStream(zst_path, f)
        if jsonStream is None:
            raise ValueError(f"Cannot read file: {zst_path}")
        for row in tqdm(jsonStream, desc=f"Processing {os.path.basename(zst_path)}"):
            # Rows lacking either field cannot be attributed; ignore them.
            if "author" not in row or "body" not in row:
                continue
            author = row["author"]
            if author in skipped:
                continue
            user_comments[author].append(row["body"])
    # Return a regular dict so lookups of unknown users raise instead of
    # silently creating empty entries.
    return dict(user_comments)

if __name__ == "__main__":
    # One comment dump per subreddit. Every entry ends with a comma so that
    # appending a new path cannot silently fuse two adjacent string literals
    # into a single bogus path.
    zst_paths = [
        "data/Republican_comments.zst",
        "data/democrats_comments.zst",
        # Add more subreddit comment files here if needed
    ]
    output_path = "output/user_comments/all_users_1.pkl"

    # Derive the directory from output_path so the two can never drift apart.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Merge the per-file results; a user active in several subreddits ends up
    # with one combined list of comments.
    all_user_comments = defaultdict(list)
    for zst_path in zst_paths:
        print(f"Handling {zst_path} ...")
        user_comments = collect_comments_by_user(zst_path)
        for user, comments in user_comments.items():
            all_user_comments[user].extend(comments)
        print(f"Collected {len(user_comments)} users from {zst_path}")

    total_comments = sum(len(coms) for coms in all_user_comments.values())
    print(f"Total unique users: {len(all_user_comments)}, total comments: {total_comments}")

    # Persist as a plain dict so readers of the pickle do not need defaultdict
    # semantics.
    with open(output_path, "wb") as f:
        pickle.dump(dict(all_user_comments), f)
    print(f"Saved to {output_path}")

Handling data/Republican_comments.zst ...


Processing Republican_comments.zst: 1405486it [00:05, 257621.31it/s]


Collected 115145 users from data/Republican_comments.zst
Handling data/democrats_comments.zst ...


Processing democrats_comments.zst: 2011525it [00:08, 238479.47it/s]


Collected 140087 users from data/democrats_comments.zst
Total unique users: 242287, total comments: 3417011
Saved to output/user_comments/all_users_1.pkl
Sample user: political, Comments: 16
First two comments: ['Somebody please take away his batteries.', "Why would you post this here? Republicans don't care if they are being manipulated by those in charge. The vast majority of them are natural followers that are frightened as little white rabbits unless a big masculine man with cowboy boots lead them to their bed at night.   \n  \nWhich is why they need so many guns. They're sceered."]


In [None]:
# Sanity check: show the first five users (in insertion order) together with
# how many comments each has and the text of their first comment.
sample_users = list(all_user_comments)[:5]
for user in sample_users:
    print(
        f"User: {user}, Number of comments: {len(all_user_comments[user])}",
        f"First comment: {all_user_comments[user][0]}",
        "---",
        sep="\n",
    )

User: political, Number of comments: 16
First comment: Somebody please take away his batteries.
---
User: [deleted], Number of comments: 668051
First comment: So now the "Left" and the liberal media is so insecure that they have to pontificate to the rest of us how "WRONG" a radio talk show host is? He is only one man, but with a voice that alarms the whole political and cultural establishment when he speaks. This type of media focus is very reminiscent of "police states". Only approved opinions are acceptable for the public to read and hear. Rush is that good and the establishment is that worried they have to shut him up. The more attention, the better.
---
User: mayonesa, Number of comments: 427
First comment: What a simple world we live in!

Religion bad, science good, sad bad, happy good!

No brains active there.
---
User: Turbo_X, Number of comments: 1
First comment: just now discovered r/republican, and as a centrist who leans left..... im glad this place exist, I only wish more 