In [1]:
import bz2
import json
import random
from time import time
import csv
from nltk.tokenize import word_tokenize
import joblib
from joblib import Parallel, delayed
from tqdm import tqdm
import pickle

In [2]:
def get_subreddit_data(filename, banned_subreddits, max_comments=100000, print_freq=-1):
    """Load trimmed comment records from a bz2 file of JSON lines.

    Every line of the archive is decoded and parsed as one JSON object; only
    the ``subreddit`` and ``body`` fields are kept, plus a ``banned`` flag.

    Args:
        filename: Path of the bz2-compressed file, one JSON object per line.
        banned_subreddits: Container of subreddit names; membership decides
            the ``banned`` flag of each record.
        max_comments: Stop after this many lines have been read.
        print_freq: When positive, print the running line count every
            ``print_freq`` lines; otherwise stay silent.

    Returns:
        list of dicts with the keys ``subreddit``, ``banned`` and ``body``.
    """
    report_progress = print_freq > 0
    collected = []

    with bz2.BZ2File(filename, "r") as archive:
        for count, raw_line in enumerate(archive):
            if report_progress and count % print_freq == 0:
                print(count, flush=True)
            if count >= max_comments:
                break

            record = json.loads(raw_line.decode())
            name = record["subreddit"]
            collected.append({
                "subreddit": name,
                "banned": name in banned_subreddits,
                "body": record["body"],
            })

    return collected



def read_banned_subreddits(filename):
    """Read the banned-subreddit list and return the bare subreddit names.

    Each non-blank line is expected to look like ``r/name``. Everything up to
    and including the first ``r/`` is dropped, so ``/r/name`` is accepted as
    well, and a trailing ``/`` is ignored.

    Args:
        filename: Path of a text file with one subreddit per line.

    Returns:
        set of subreddit names without the ``r/`` prefix.

    Raises:
        IndexError: if a non-blank line contains no ``r/`` marker.
    """
    banned = set()
    with open(filename, "r") as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue
            # Split on the FIRST "r/" only. Splitting on every occurrence cut
            # names short whenever an "r" was followed by a slash, e.g.
            # "r/beer/" used to yield "bee".
            banned.add(entry.split("r/", 1)[1].rstrip("/"))
    return banned


In [3]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    merged = []
    for group in l:
        merged.extend(group)
    return merged

def divide_chunks(lst, n):
    """Yield consecutive slices of ``lst`` that are ``n`` items long.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    chunk_starts = range(0, len(lst), n)
    for begin in chunk_starts:
        end = begin + n
        yield lst[begin:end]




def chunkify_test(test, words_per_example=200):
    """Turn held-out comments into per-subreddit word-chunk examples.

    Comments are grouped by their ``subreddit`` key (groups appear in order
    of first occurrence), each group is chunked with ``chunkify_subreddit``
    and the per-subreddit results are flattened into one list.

    Args:
        test: Iterable of comment dicts carrying a ``subreddit`` key.
        words_per_example: Chunk length forwarded to ``chunkify_subreddit``.

    Returns:
        Flat list of the example dicts produced by ``chunkify_subreddit``.
    """
    by_subreddit = {}
    for comm in test:
        by_subreddit.setdefault(comm["subreddit"], []).append(comm)

    per_subreddit = [chunkify_subreddit(group, words_per_example)
                     for group in by_subreddit.values()]
    return flatten(per_subreddit)

    

def split_data(subreddit_to_comments, ratio, words_per_example=200):
    """Shuffle all comments and split them into train and test examples.

    The module-level ``random`` generator is re-seeded with 42 before the
    shuffle, so the split is the same on every call.

    Args:
        subreddit_to_comments: Mapping whose values are lists of comments.
        ratio: Fraction of the shuffled comments that goes to training.
        words_per_example: Chunk length forwarded to the chunkify helpers.

    Returns:
        Tuple ``(train_banned, train_notbanned, test_examples)``.
    """
    random.seed(42)

    pooled = flatten(list(subreddit_to_comments.values()))
    random.shuffle(pooled)

    cut = int(len(pooled) * ratio)
    train, test = pooled[:cut], pooled[cut:]

    train_banned, train_notbanned = chunkify_train(train, words_per_example)
    test_examples = chunkify_test(test, words_per_example)
    return train_banned, train_notbanned, test_examples
    

def write_train_examples(examples, filename):
    """Write training examples to a one-column CSV file.

    Args:
        examples: Iterable of token sequences; each one is joined with single
            spaces and written as one row.
        filename: Destination path; an existing file is overwritten.

    The file starts with the header row ``words``.
    """
    # newline="" is what the csv module requires of its file objects: without
    # it the writer's "\r\n" terminator becomes "\r\r\n" on Windows.
    # UTF-8 is set explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["words"])
        writer.writerows([" ".join(example)] for example in examples)


def write_test_examples(examples, filename):
    """Write test examples to a three-column CSV file.

    Args:
        examples: Iterable of dicts with the keys ``subreddit``, ``banned``
            and ``words``; ``words`` is joined with single spaces.
        filename: Destination path; an existing file is overwritten.

    The file starts with the header row ``subreddit,banned,words``.
    """
    # newline="" is what the csv module requires of its file objects: without
    # it the writer's "\r\n" terminator becomes "\r\r\n" on Windows.
    # UTF-8 is set explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["subreddit", "banned", "words"])
        writer.writerows(
            [example["subreddit"], example["banned"], " ".join(example["words"])]
            for example in examples
        )

    

In [4]:
# def process_block(comments, words_per_example=200, ratio=0.8):
#     random.shuffle(comments)
#     split_idx = int(len(comments) * ratio)
    
#     train = comments[:split_idx]
#     test = comments[split_idx:]
    
#     train_banned, train_notbanned = chunkify_train(train, words_per_example)
#     
#     return train_banned, train_notbanned, test

In [5]:
def chunkify_train(comments, words_per_example=200):
    """Tokenize training comments and cut them into fixed-length word chunks.

    The tokens of all banned comments are concatenated into one stream and
    the tokens of all other comments into a second stream; each stream is
    then split into chunks of ``words_per_example`` tokens (the last chunk of
    a stream may be shorter).

    Args:
        comments: Iterable of comment dicts with ``banned`` and ``body`` keys.
        words_per_example: Number of tokens per chunk.

    Returns:
        Tuple ``(banned_examples, notbanned_examples)`` of lists of chunks.
    """
    streams = {True: [], False: []}
    for comm in comments:
        streams[bool(comm["banned"])].extend(word_tokenize(comm["body"]))

    banned_examples = list(divide_chunks(streams[True], words_per_example))
    notbanned_examples = list(divide_chunks(streams[False], words_per_example))
    return banned_examples, notbanned_examples

def chunkify_subreddit(subreddit, words_per_example=200):
    """Build word-chunk examples from the comments of a single subreddit.

    Args:
        subreddit: Sequence of comment dicts (keys ``subreddit``, ``banned``,
            ``body``). The subreddit name and banned flag are taken from the
            first comment and applied to every example.
        words_per_example: Number of tokens per chunk; the last chunk may be
            shorter.

    Returns:
        list of dicts with the keys ``subreddit``, ``banned`` and ``words``.
        An empty input yields an empty list.
    """
    # No comments means no examples; without this guard subreddit[0] below
    # raised IndexError.
    if len(subreddit) == 0:
        return []

    first = subreddit[0]
    banned = first["banned"]
    my_subreddit = first["subreddit"]

    stream = []
    for comm in subreddit:
        stream.extend(word_tokenize(comm["body"]))

    return [{"subreddit": my_subreddit, "banned": banned, "words": chunk}
            for chunk in divide_chunks(stream, words_per_example)]

def chunkify_subreddits(subreddits, words_per_example=200):
    """Apply ``chunkify_subreddit`` to each entry of ``subreddits``.

    Returns one list of examples per input entry, in input order.
    """
    results = []
    for comment_group in subreddits:
        results.append(chunkify_subreddit(comment_group, words_per_example))
    return results

def group_test_subreddits(test, words_per_example=200):
    """Group comments by their ``subreddit`` key.

    Args:
        test: Iterable of comment dicts carrying a ``subreddit`` key.
        words_per_example: Not used by this function.

    Returns:
        dict mapping each subreddit name to the list of its comments, with
        comments in input order and keys in order of first occurrence.
    """
    grouped = {}
    for comm in test:
        grouped.setdefault(comm["subreddit"], []).append(comm)
    return grouped



In [11]:
# Configuration for the data-preparation run.
CACHED_COMMENTS = "Data/Cached/2016_10_10M_comments.pkl"
RATIO = 0.8              # fraction of the comments used for training
BLOCK_SIZE = 50000       # comments per parallel work unit
WORDS_PER_EXAMPLE = 200  # tokens per generated example
SEED = 42                # makes the shuffle, and so the split, repeatable

try:
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(CACHED_COMMENTS, "rb") as f:
        comments = pickle.load(f)
    print("Loaded from cache")

except Exception as err:
    # `Exception` rather than a bare `except:` so that interrupting the slow
    # cache load (KeyboardInterrupt) does not start a full rebuild that then
    # overwrites the cache.
    print("Failed to load from cache")
    print("  reason: %r" % (err,))
    data_folder = "Data/"

    banned_subreddits = read_banned_subreddits(data_folder + "banned-subreddits.txt")
    comments = get_subreddit_data(data_folder + "RC_2016-10.bz2", banned_subreddits,
                                  max_comments=int(1e7), print_freq=10000)
    with open(CACHED_COMMENTS, "wb") as f:
        pickle.dump(comments, f)

print("Shuffling")
# Seed before shuffling: the train/test boundary below depends on this order.
random.seed(SEED)
random.shuffle(comments)
split_idx = int(len(comments) * RATIO)

In [12]:
# Training split: the first `split_idx` comments, cut into blocks of
# BLOCK_SIZE comments so every block can be tokenized by a separate worker.
BLOCK_SIZE = 50000
train = comments[:split_idx]
train_blocks = list(divide_chunks(train, BLOCK_SIZE))

# Each entry of train_res is the (banned_examples, notbanned_examples) pair
# that chunkify_train returns for one block.
train_res = Parallel(n_jobs=-1)(
    delayed(chunkify_train)(block, WORDS_PER_EXAMPLE)
    for block in tqdm(train_blocks)
)


100%|██████████| 160/160 [07:15<00:00,  2.89s/it]


In [13]:
# Write the training examples to two one-column CSV files, one for banned
# and one for not-banned subreddits.
FNAME = "Data/Generated/200_words_10M_%s.csv"

# The `with` block closes (and therefore flushes) both files even if writing
# fails; previously the handles were never closed. newline="" is what the csv
# module requires of its file objects, and UTF-8 keeps non-ASCII text
# independent of the platform's default encoding.
with open(FNAME % "banned", "w", newline="", encoding="utf-8") as fbanned, \
        open(FNAME % "notbanned", "w", newline="", encoding="utf-8") as fnotbanned:
    writer_banned = csv.writer(fbanned)
    writer_notbanned = csv.writer(fnotbanned)

    writer_banned.writerow(["words"])
    writer_notbanned.writerow(["words"])

    # train_res holds one (banned chunks, not-banned chunks) pair per block.
    for banned_words_set, not_banned_words_set in tqdm(train_res):
        for banned_words in banned_words_set:
            writer_banned.writerow([' '.join(banned_words)])
        for not_banned_words in not_banned_words_set:
            writer_notbanned.writerow([' '.join(not_banned_words)])


100%|██████████| 160/160 [00:46<00:00,  3.35it/s]


In [14]:
# Held-out split: every comment from index `split_idx` onwards.
test = comments[split_idx:]
# Map each subreddit name to the list of its held-out comments.
test_subreddit_to_comments = group_test_subreddits(test)

In [15]:
# Spread the per-subreddit comment lists over blocks of
# (number of subreddits // 31) + 1 entries each, which gives at most 31 blocks.
# NOTE(review): 31 is presumably tied to the worker count of the original
# machine -- confirm before changing it.
test_blocks = list(
    divide_chunks(
        list(test_subreddit_to_comments.values()),
        len(test_subreddit_to_comments) // 31 + 1,
    )
)

# Each entry of test_res is the list that chunkify_subreddits returns for one
# block: one list of example dicts per subreddit.
test_res = Parallel(n_jobs=-1)(
    delayed(chunkify_subreddits)(subreddits, WORDS_PER_EXAMPLE)
    for subreddits in tqdm(test_blocks)
)

100%|██████████| 31/31 [00:00<00:00, 100.29it/s]


In [16]:
# Number of distinct subreddits that occur in the held-out split.
len(test_subreddit_to_comments)

21057

In [17]:
# Write every held-out example to one CSV file with the columns
# subreddit, banned, words.
FNAME = "Data/Generated/200_words_10M_test.csv"

# The `with` block closes (and therefore flushes) the file even if writing
# fails; previously the handle was never closed. newline="" is what the csv
# module requires of its file objects, and UTF-8 keeps non-ASCII text
# independent of the platform's default encoding.
with open(FNAME, "w", newline="", encoding="utf-8") as ftest:
    writer_test = csv.writer(ftest)
    writer_test.writerow(["subreddit", "banned", "words"])
    # test_res nesting: parallel batch -> subreddit -> example dict.
    for batch in test_res:
        for subreddit in batch:
            for example in subreddit:
                writer_test.writerow([example["subreddit"], example["banned"], ' '.join(example["words"])])