In [1]:
import os
import pickle
import datetime
import re
import html
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from fileStreams import getFileJsonStream
import numpy as np
from tqdm import tqdm
import random


# Initialize global resources once, at import time, so every call to
# preprocess_text shares them instead of rebuilding them per comment.
# English stopword list from NLTK, stored as a set for fast membership tests.
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

# POS tag cache (word -> NLTK POS tag) to avoid redundant tagging.
# Filled by preprocess_text; nothing in this file evicts entries.
POS_CACHE = {}
# Lemma cache ((word, WordNet POS) -> lemma) to avoid redundant lemmatization.
# Filled by preprocess_text; nothing in this file evicts entries.
LEMMA_CACHE = {}

def get_wordnet_pos(tag):
    """Map an NLTK POS tag onto the single-letter POS code WordNet expects.

    Tags beginning with J, V, N or R become 'a' (adjective), 'v' (verb),
    'n' (noun) and 'r' (adverb) respectively; every other tag falls back
    to 'n' (noun).
    """
    prefix_to_pos = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('N', 'n'),  # noun
        ('R', 'r'),  # adverb
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Anything unrecognised is treated as a noun.
    return 'n'

def preprocess_text(text, lemmatize=True, without_stopwords=True):
    """Clean a raw Reddit comment and return it as a list of lowercase words.

    Processing order:
      1. Decode HTML entities, apply NFKD normalization and drop the
         combining accents it produces (so "naïve" stays one word).
      2. Remove Markdown images, unwrap Markdown links (keeping the link
         text) and bold/italic markers, then remove remaining URLs.
      3. Remove subreddit (r/name, /r/name) and user (u/name, /u/name)
         mentions.
      4. Replace every run of characters outside A-Z/a-z with a space,
         lowercase, and split on whitespace.
      5. Optionally lemmatize each word using its POS tag.
      6. Optionally drop English stopwords (after lemmatization).
      7. Keep only words whose length is between 3 and 15 inclusive.

    Args:
        text: Raw comment body.
        lemmatize: When True, lemmatize words with the module-level
            LEMMATIZER, using POS_CACHE / LEMMA_CACHE to avoid repeat work.
        without_stopwords: When True, remove words found in STOP_WORDS.

    Returns:
        List of processed words in their original order; an empty list if
        nothing survives the cleaning.

    Note:
        Only words missing from POS_CACHE are passed to nltk.pos_tag, so a
        word is tagged without its full sentence context and keeps whatever
        tag it was given when first cached.
    """
    # Handle HTML entities
    text = html.unescape(text)

    # Unicode normalization. NFKD splits an accented letter into its base
    # letter plus a combining mark; the mark must be deleted here, otherwise
    # the letters-only cleanup below turns it into a space and cuts the word
    # in two ("naïve" -> "nai ve").
    text = unicodedata.normalize('NFKD', text)
    text = re.sub(r'[\u0300-\u036f]+', '', text)

    # Markdown is handled before URL stripping: the URL pattern would
    # otherwise swallow the closing ")" of "![alt](http://...)" and the
    # image pattern could never match, leaking alt text into the output.
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    text = re.sub(r'http\S+', '', text)

    # Remove subreddit and user references. The lookbehind keeps the pattern
    # from firing inside ordinary words ("for/against", "you/them").
    text = re.sub(r'(?<!\w)/?[ru]/\w+', '', text)

    # Basic text cleaning: keep ASCII letters only, lowercase everything
    text = re.sub("[^A-Za-z]+", ' ', text).lower()

    words = text.split()
    if not words:
        return []

    # Lemmatization first
    if lemmatize:
        # POS tagging (with cache): tag only words not seen before
        uncached_words = [w for w in words if w not in POS_CACHE]
        if uncached_words:
            for word, tag in nltk.pos_tag(uncached_words):
                POS_CACHE[word] = tag

        processed_words = []
        for word in words:
            wordnet_pos = get_wordnet_pos(POS_CACHE[word])
            lemma_key = (word, wordnet_pos)
            lemma = LEMMA_CACHE.get(lemma_key)
            if lemma is None:
                lemma = LEMMATIZER.lemmatize(word, pos=wordnet_pos)
                LEMMA_CACHE[lemma_key] = lemma
            processed_words.append(lemma)
    else:
        processed_words = words

    # Remove stopwords after lemmatization
    if without_stopwords:
        processed_words = [w for w in processed_words if w not in STOP_WORDS]

    # Remove all words with length <= 2 and > 15
    return [w for w in processed_words if 2 < len(w) <= 15]


def _save_batch(comments, output_dir, subreddit, batch_number):
    """Pickle one batch of processed comments and report where it went."""
    # Create the target directory on demand so a missing directory does not
    # abort the run after a whole batch has already been processed.
    os.makedirs(output_dir, exist_ok=True)
    save_path = f"{output_dir}/{subreddit}_batch{batch_number}.pkl"
    with open(save_path, "wb") as out_file:
        pickle.dump(comments, out_file)
    print(f"Saved {len(comments)} comments to {save_path}")


def process_and_save_comments(path, subreddit, output_dir, without_stopwords=True, batch_size=1000000):
    """Preprocess every comment in a dump file and pickle the results in batches.

    Rows are read through getFileJsonStream. A row is skipped when it lacks
    any of "body", "created_utc", "author" or "id", when its author is one
    of the known bots, or when preprocessing leaves no words. Each kept
    comment is stored as a dict with the keys "comment_id", "author",
    "date" (YYYY-MM-DD), "timestamp", "processed_text" and "original".

    Batches are written to "{output_dir}/{subreddit}_batch{N}.pkl" with N
    starting at 1; a final, smaller batch holds whatever remains.

    Args:
        path: Path of the comment dump to read.
        subreddit: Name used in progress messages and output file names.
        output_dir: Directory receiving the pickle files (created if absent).
        without_stopwords: Forwarded to preprocess_text.
        batch_size: Number of kept comments per pickle file.

    Returns:
        None. Prints progress and the total number of comments saved.
    """
    print(f"Processing file: {path}")

    required_keys = ("body", "created_utc", "author", "id")
    ignored_authors = {"AutoModerator", "election_info_bot"}

    batch_number = 1
    total_count = 0
    comments_batch = []

    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Unable to read file {path}")
            return

        for row in tqdm(jsonStream, desc=f"Processing {subreddit} comments"):
            if not all(key in row for key in required_keys):
                continue

            author = row["author"]
            if author in ignored_authors:
                continue

            text = row["body"]
            created_timestamp = row["created_utc"]
            # Interpret the timestamp explicitly in UTC; without tz= the
            # calendar date would follow the local timezone of whichever
            # machine runs the script.
            # NOTE(review): assumes created_utc is epoch seconds, as the
            # field name suggests — confirm against the dump format.
            date = datetime.datetime.fromtimestamp(
                int(created_timestamp), tz=datetime.timezone.utc
            )

            processed_words = preprocess_text(
                text, lemmatize=True, without_stopwords=without_stopwords
            )
            if not processed_words:
                continue

            # Save processed comment with metadata
            comments_batch.append({
                "comment_id": row["id"],
                "author": author,
                "date": date.strftime("%Y-%m-%d"),
                "timestamp": created_timestamp,
                "processed_text": processed_words,  # Original order preserved
                "original": text,
            })

            # The batch can only become full right after an append, so the
            # check lives here rather than running for skipped rows too.
            if len(comments_batch) >= batch_size:
                print(f"\nReached {batch_size} comments, saving batch {batch_number}...")
                _save_batch(comments_batch, output_dir, subreddit, batch_number)
                total_count += len(comments_batch)
                comments_batch = []
                batch_number += 1

    # Process any remaining comments
    if comments_batch:
        print(f"\nSaving remaining {len(comments_batch)} comments...")
        _save_batch(comments_batch, output_dir, subreddit, batch_number)
        total_count += len(comments_batch)

    print(f"\nCompleted processing {subreddit} comments!")
    print(f"Total comments saved: {total_count}")


def main():
    """Preprocess the comment dump of every configured subreddit.

    Seeds the random generators, then for each entry in the dataset table
    creates "processed_comments_2/<subreddit>" and runs
    process_and_save_comments on the dump with stopwords kept.
    """
    random.seed(23)
    np.random.seed(23)

    # Dataset table: subreddit label -> path of its comment dump
    files = {
        "democrats": r"datasets/democrats_comments.zst",
        "republican": r"datasets/Republican_comments.zst",
        "conservative": r"datasets/Conservative_comments.zst",
        "liberal": r"datasets/Liberal_comments.zst"
        # "vagabond": r"datasets/vagabond_comments.zst",
        # "backpacking": r"datasets/backpacking_comments.zst"
        # "cooking": r"datasets/Cooking_comments.zst",
        # "travel": r"datasets/travel_comments.zst",
        # "books": r"datasets/books_comments.zst",
        # "gaming": r"datasets/gaming_comments.zst",
        # "movies": r"datasets/movies_comments.zst",
        # "technology": r"datasets/technology_comments.zst",
        # "personalfinance": r"datasets/personalfinance_comments.zst"
    }

    # Every configured subreddit is processed, in the order listed above
    for subreddit, dump_path in files.items():
        target_dir = f"processed_comments_2/{subreddit}"
        os.makedirs(target_dir, exist_ok=True)
        print(f"\nProcessing subreddit: {subreddit}")
        process_and_save_comments(
            dump_path,
            subreddit,
            target_dir,
            without_stopwords=False,
            batch_size=1000000,
        )


if __name__ == "__main__":
    main()


Processing subreddit: democrats
Processing file: datasets/democrats_comments.zst


Processing democrats comments: 1035748it [00:33, 36947.44it/s]


Reached 1000000 comments, saving batch 1...
Saved 1000000 comments to processed_comments_2/democrats/democrats_batch1.pkl


Processing democrats comments: 2011525it [01:02, 32007.62it/s]



Saving remaining 941514 comments...
Saved 941514 comments to processed_comments_2/democrats/democrats_batch2.pkl

Completed processing democrats comments!
Total comments saved: 1941514

Processing subreddit: republican
Processing file: datasets/Republican_comments.zst


Processing republican comments: 1071593it [00:33, 37639.71it/s]


Reached 1000000 comments, saving batch 1...


Processing republican comments: 1082696it [00:36, 8237.65it/s] 

Saved 1000000 comments to processed_comments_2/republican/republican_batch1.pkl


Processing republican comments: 1405486it [00:44, 31463.31it/s]



Saving remaining 296609 comments...
Saved 296609 comments to processed_comments_2/republican/republican_batch2.pkl

Completed processing republican comments!
Total comments saved: 1296609

Processing subreddit: conservative
Processing file: datasets/Conservative_comments.zst


Processing conservative comments: 1006762it [00:35, 36803.70it/s]


Reached 1000000 comments, saving batch 1...


Processing conservative comments: 1014581it [00:38, 5820.53it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch1.pkl


Processing conservative comments: 2017516it [01:12, 33795.11it/s]


Reached 1000000 comments, saving batch 2...
Saved 1000000 comments to processed_comments_2/conservative/conservative_batch2.pkl


Processing conservative comments: 3027286it [01:46, 38016.38it/s]


Reached 1000000 comments, saving batch 3...


Processing conservative comments: 3031119it [01:48, 4961.89it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch3.pkl


Processing conservative comments: 4035096it [02:19, 31260.41it/s]


Reached 1000000 comments, saving batch 4...


Processing conservative comments: 4040739it [02:22, 5201.18it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch4.pkl


Processing conservative comments: 5041290it [02:56, 35423.96it/s]


Reached 1000000 comments, saving batch 5...


Processing conservative comments: 5048101it [02:58, 5489.24it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch5.pkl


Processing conservative comments: 6056257it [03:30, 38948.72it/s] 


Reached 1000000 comments, saving batch 6...
Saved 1000000 comments to processed_comments_2/conservative/conservative_batch6.pkl


Processing conservative comments: 7062498it [04:02, 35291.61it/s]


Reached 1000000 comments, saving batch 7...
Saved 1000000 comments to processed_comments_2/conservative/conservative_batch7.pkl


Processing conservative comments: 8074131it [04:32, 36454.52it/s]


Reached 1000000 comments, saving batch 8...


Processing conservative comments: 8081473it [04:34, 6967.19it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch8.pkl


Processing conservative comments: 9083618it [05:03, 40876.27it/s]


Reached 1000000 comments, saving batch 9...


Processing conservative comments: 9091379it [05:05, 7210.58it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch9.pkl


Processing conservative comments: 10094490it [05:36, 33497.27it/s]


Reached 1000000 comments, saving batch 10...


Processing conservative comments: 10100945it [05:38, 5879.85it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch10.pkl


Processing conservative comments: 11103686it [06:09, 37609.44it/s]


Reached 1000000 comments, saving batch 11...


Processing conservative comments: 11111381it [06:11, 7087.76it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch11.pkl


Processing conservative comments: 12109823it [06:41, 37346.77it/s]


Reached 1000000 comments, saving batch 12...
Saved 1000000 comments to processed_comments_2/conservative/conservative_batch12.pkl


Processing conservative comments: 13115643it [07:14, 38794.36it/s]


Reached 1000000 comments, saving batch 13...
Saved 1000000 comments to processed_comments_2/conservative/conservative_batch13.pkl


Processing conservative comments: 14120897it [07:44, 51417.42it/s]


Reached 1000000 comments, saving batch 14...
Saved 1000000 comments to processed_comments_2/conservative/conservative_batch14.pkl


Processing conservative comments: 15125693it [08:14, 45443.68it/s]


Reached 1000000 comments, saving batch 15...


Processing conservative comments: 15134435it [08:16, 8667.25it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch15.pkl


Processing conservative comments: 16132434it [08:44, 37425.75it/s]


Reached 1000000 comments, saving batch 16...


Processing conservative comments: 16139613it [08:46, 7006.94it/s] 

Saved 1000000 comments to processed_comments_2/conservative/conservative_batch16.pkl


Processing conservative comments: 17140972it [09:13, 44113.54it/s]


Reached 1000000 comments, saving batch 17...
Saved 1000000 comments to processed_comments_2/conservative/conservative_batch17.pkl


Processing conservative comments: 18144421it [09:42, 48903.54it/s]


Reached 1000000 comments, saving batch 18...
Saved 1000000 comments to processed_comments_2/conservative/conservative_batch18.pkl


Processing conservative comments: 18984143it [10:02, 31485.70it/s]



Saving remaining 832387 comments...
Saved 832387 comments to processed_comments_2/conservative/conservative_batch19.pkl

Completed processing conservative comments!
Total comments saved: 18832387

Processing subreddit: liberal
Processing file: datasets/Liberal_comments.zst


Processing liberal comments: 497079it [00:18, 27418.37it/s]



Saving remaining 492131 comments...
Saved 492131 comments to processed_comments_2/liberal/liberal_batch1.pkl

Completed processing liberal comments!
Total comments saved: 492131
