Analyzing Subreddit Data from ~10 Years

In [13]:
datapath = "/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/"

In [43]:
import gzip
import json
import os
import pickle
from bz2 import BZ2File
from glob import glob
#from tqdm import tqdm
from tqdm.notebook import tqdm

In [48]:
def _for_each_comment(all_comment_files, fn, comment_checkpoint, file_checkpoint):
    """Run 'fn' once with each comment.  
    At the end of running 'fn', call comment_checkpoint with a line number.
    At the end of running a file, call the file_checkpoint.
    """
    errors = dict()
    for comment_file in all_comment_files:
        print(comment_file)
        fileref = BZ2File(comment_file)
        for linenum, line in enumerate(fileref):
            try:
                comment = json.loads(line)
                fn(comment)
            except InterruptedError:
                break
            except KeyboardInterrupt:
                break
            except Exception as e:
                errors[f"{comment_file}:{linenum}"] = e
            if comment_checkpoint:
                comment_checkpoint(linenum)
        if file_checkpoint:
            file_checkpoint()
    return errors

def for_each_comment(fn, comment_checkpoint=None, file_checkpoint=None):
    all_comment_files = glob(str(os.path.join(datapath, "**", "*.bz2")))
    return _for_each_comment(all_comment_files, fn, comment_checkpoint, file_checkpoint)

def for_each_comment_in_year(year, fn, comment_checkpoint=None, file_checkpoint=None):
    comment_files = glob(str(os.path.join(datapath, "**", f"RC_{year}-*.bz2")))
    return _for_each_comment(comment_files, fn, comment_checkpoint, file_checkpoint)

In [5]:
print(all_comment_files)

['/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-10.bz2', '/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-01.bz2', '/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-06.bz2', '/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-08.bz2', '/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-11.bz2', '/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-09.bz2', '/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-07.bz2', '/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-03.bz2', '/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-04.bz2', '/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2011/RC_2011-12.bz2', '/run/use

In [21]:
json.loads(comment)

{'id': 'c2nx2zj',
 'edited': False,
 'link_id': 't3_kwmiv',
 'downs': 0,
 'gilded': 0,
 'ups': -1,
 'author': 'Kah-Neth',
 'score': -1,
 'controversiality': 0,
 'subreddit': 'hardware',
 'parent_id': 't1_c2nttvb',
 'distinguished': None,
 'author_flair_css_class': None,
 'retrieved_on': 1427670115,
 'author_flair_text': None,
 'name': 't1_c2nx2zj',
 'created_utc': '1317427200',
 'score_hidden': False,
 'body': "A bigger and more important difference is the Xeon's have much more L2 cache.",
 'archived': True,
 'subreddit_id': 't5_2qh18'}

In [27]:
user_comments_by_subreddit = dict()  # Map of username -> subreddit comment count
def tally_comment(com):
    username = com.get('author')
    subreddit = com.get('subreddit')
    if username and subreddit:
        if username not in user_comments_by_subreddit:
            user_comments_by_subreddit[username] = dict()
        user_comments_by_subreddit[username][subreddit] = user_comments_by_subreddit[username].get(subreddit, 0)+1

In [49]:
import time
for year in range(2007, 2016):
    # Clear old user comments...
    print(f"Setting up {year}...")
    user_comments_by_subreddit = dict()
    def tally_comment(com):
        username = com.get('author')
        subreddit = com.get('subreddit')
        if username and subreddit:
            if username not in user_comments_by_subreddit:
                user_comments_by_subreddit[username] = dict()
            user_comments_by_subreddit[username][subreddit] = user_comments_by_subreddit[username].get(subreddit, 0)+1
    def sleep_after_5k(line_count):
        if line_count % 5000 == 0:
            time.sleep(1)
    def checkpoint():
        print("Saving...")
        with gzip.open(f"reddit_comment_tally_{year}.json.gz", 'wt') as fout:
            json.dump(user_comments_by_subreddit, fout)
    # Process comments for year...
    print("Processing...")
    _ = for_each_comment_in_year(year, tally_comment, comment_checkpoint=sleep_after_5k, file_checkpoint=checkpoint)

Setting up 2007...
Processing...
/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2007/RC_2007-10.bz2
/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2007/RC_2007-11.bz2
/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2007/RC_2007-12.bz2
Saving...
Setting up 2008...
Processing...
/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2008/RC_2008-10.bz2
/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2008/RC_2008-08.bz2
/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2008/RC_2008-06.bz2
/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2008/RC_2008-01.bz2
/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2008/RC_2008-11.bz2
/run/user/1000/gvfs/smb-share:server=freenas.local,share=mldata/reddit_data/2008/RC_2008-07.bz2


KeyboardInterrupt: 

In [29]:
len(user_comments_by_subreddit)

420861

In [41]:
user_comments_by_subreddit['omgitsjo']

KeyError: 'omgitsjo'

'/home/joseph/Dropbox/Source'