In [None]:
import os
import pickle
import datetime
import re
import html
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from fileStreams import getFileJsonStream
from collections import defaultdict

# NLTK data packages fetched once at import time.
#   stopwords / wordnet            -> back STOP_WORDS and LEMMATIZER below
#   punkt                          -> kept from the original setup (no tokenizer
#                                     call is visible in this file)
#   averaged_perceptron_tagger     -> model used by nltk.pos_tag on older NLTK
#   averaged_perceptron_tagger_eng -> name the same tagger is looked up under on
#                                     NLTK >= 3.9; without it nltk.pos_tag raises
#                                     LookupError on current releases
_NLTK_RESOURCES = (
    'stopwords',
    'wordnet',
    'punkt',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for _resource in _NLTK_RESOURCES:
    # quiet=True suppresses the downloader's progress output
    nltk.download(_resource, quiet=True)

# Shared, read-only resources built once so preprocess_text does not rebuild
# them for every comment
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

# word -> POS tag; filled lazily by preprocess_text to avoid redundant tagging
POS_CACHE = {}
# (word, wordnet_pos) -> lemma; filled lazily to avoid redundant lemmatization
LEMMA_CACHE = {}

def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the one-letter POS code WordNet expects.

    Only the first letter of the tag matters: J -> 'a' (adjective),
    V -> 'v' (verb), N -> 'n' (noun), R -> 'r' (adverb). Any other tag,
    including the empty string, falls back to 'n' (noun).
    """
    prefix_table = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('N', 'n'),  # noun
        ('R', 'r'),  # adverb
    )
    matches = (code for prefix, code in prefix_table if tag.startswith(prefix))
    # Unrecognised tags are treated as nouns
    return next(matches, 'n')

def preprocess_text(text, lemmatize=True, without_stopwords=True):
    """Clean one Reddit comment body and return it as a list of lowercase tokens.

    Steps, in order: HTML-entity unescaping, Unicode NFKD folding with the
    combining marks dropped, URL / Markdown / subreddit / user-mention
    stripping, replacement of every run of non-ASCII-letter characters by a
    space, removal of single letters other than "i", optional stop-word
    filtering and optional POS-aware lemmatization.

    Args:
        text: Raw comment text.
        lemmatize: When True, each token is lemmatized with the module-level
            LEMMATIZER, using a POS tag obtained from ``nltk.pos_tag``.
        without_stopwords: When True, tokens present in STOP_WORDS are
            dropped. This applies whether or not ``lemmatize`` is set.

    Returns:
        The surviving tokens in their original order, or an empty list when
        nothing is left after cleaning.
    """
    # Handle HTML entities (e.g. "&gt;" -> ">")
    text = html.unescape(text)

    # NFKD splits an accented letter into base letter + combining mark.
    # Drop the marks so "naïve" becomes "naive"; leaving them in would let the
    # letters-only filter below turn them into spaces ("nai ve").
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))

    # Remove URLs and Markdown formatting (images dropped, link/bold/italic
    # markers removed while their inner text is kept)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)

    # Remove subreddit and user references: /r/x, r/x, /u/x, u/x.
    # The lookbehind requires the reference to start at a word boundary so
    # ordinary text such as "for/against" or "our/their" is left intact.
    text = re.sub(r'(?<!\w)/?[ru]/\w+', '', text)

    # Keep ASCII letters only, collapse everything else to a space, lowercase
    text = re.sub("[^A-Za-z]+", ' ', text).lower()

    # Remove single letters (except 'i')
    text = re.sub(r'\b([a-hj-z])\b', '', text)

    words = text.split()

    # Stop-word filtering is independent of lemmatization
    if without_stopwords:
        words = [word for word in words if word not in STOP_WORDS]

    if not lemmatize or not words:
        return words

    # Only words never seen before are sent to the tagger. Each word keeps the
    # first tag it receives, and it is tagged alongside the other uncached
    # words only, not within its full sentence.
    uncached_words = [word for word in words if word not in POS_CACHE]
    if uncached_words:
        for word, tag in nltk.pos_tag(uncached_words):
            POS_CACHE[word] = tag

    processed_words = []
    for word in words:
        wordnet_pos = get_wordnet_pos(POS_CACHE[word])

        # Lemmas are cached per (word, POS) pair
        lemma_key = (word, wordnet_pos)
        lemma = LEMMA_CACHE.get(lemma_key)
        if lemma is None:
            lemma = LEMMATIZER.lemmatize(word, pos=wordnet_pos)
            LEMMA_CACHE[lemma_key] = lemma

        processed_words.append(lemma)

    return processed_words

def _save_batch(comments, output_dir, subreddit, batch_number):
    """Pickle one batch of processed comments and print where it was written."""
    save_path = f"{output_dir}/{subreddit}_batch{batch_number}.pkl"
    with open(save_path, "wb") as out_file:
        pickle.dump(comments, out_file)

    print(f"Saved {len(comments)} comments to {save_path}")


def process_and_save_comments(path, subreddit, without_stopwords=True, batch_size=1000000):
    """Stream a comment dump, preprocess each comment and pickle the results in batches.

    Rows missing any of "body", "created_utc", "author" or "id" are skipped,
    as are comments authored by "AutoModerator" or "election_info_bot" and
    comments whose text is empty after preprocessing. Every kept comment is
    stored as a dict with the keys "comment_id", "author", "date",
    "timestamp", "processed_text" and "original".

    Batches are written to
    ``processed_comments/<subreddit>/<subreddit>_batch<N>.pkl`` with N
    starting at 1; a final partial batch is written after the stream ends.

    Args:
        path: Location of the dump; opened in binary mode and handed to
            ``getFileJsonStream`` together with the open file object.
        subreddit: Name used for the output directory, file names and
            progress messages.
        without_stopwords: Forwarded to ``preprocess_text``.
        batch_size: Number of kept comments per pickle file.

    Returns:
        None. Progress and cache statistics are printed to stdout.
    """
    print(f"Processing file: {path}")

    batch_number = 1
    total_count = 0

    # Create output directory
    output_dir = f"processed_comments/{subreddit}"
    os.makedirs(output_dir, exist_ok=True)

    required_fields = ("body", "created_utc", "author", "id")
    skipped_authors = {"AutoModerator", "election_info_bot"}

    # Comments accumulated for the batch currently being built
    comments_batch = []

    print(f"Starting to process {subreddit} comments...")
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Unable to read file {path}")
            return

        for row in tqdm(jsonStream, desc=f"Processing {subreddit} comments"):
            if any(field not in row for field in required_fields):
                continue

            author = row["author"]
            if author in skipped_authors:
                continue

            text = row["body"]
            created_timestamp = row["created_utc"]
            # The field is named created_utc, so derive the calendar date in
            # UTC explicitly; a naive fromtimestamp() would use the machine's
            # local timezone and could shift the date by a day.
            date = datetime.datetime.fromtimestamp(
                int(created_timestamp), tz=datetime.timezone.utc
            )

            processed_words = preprocess_text(text, lemmatize=True, without_stopwords=without_stopwords)
            if not processed_words:
                continue

            # Save processed comment with metadata
            comments_batch.append({
                "comment_id": row["id"],
                "author": author,
                "date": date.strftime("%Y-%m-%d"),
                "timestamp": created_timestamp,
                "processed_text": processed_words,  # Original order preserved
                "original": text
            })

            # Flush once the batch is full
            if len(comments_batch) >= batch_size:
                print(f"\nReached {batch_size} comments, saving batch {batch_number}...")
                _save_batch(comments_batch, output_dir, subreddit, batch_number)

                total_count += len(comments_batch)
                comments_batch = []
                batch_number += 1

                # Report cache stats
                print(f"POS tag cache size: {len(POS_CACHE)} words")
                print(f"Lemma cache size: {len(LEMMA_CACHE)} word-POS pairs")

    # Write whatever is left after the stream is exhausted
    if comments_batch:
        print(f"\nSaving remaining {len(comments_batch)} comments...")
        _save_batch(comments_batch, output_dir, subreddit, batch_number)
        total_count += len(comments_batch)

    print(f"\nCompleted processing {subreddit} comments!")
    print(f"Total comments saved: {total_count}")
    print(f"Final POS tag cache size: {len(POS_CACHE)} words")
    print(f"Final lemma cache size: {len(LEMMA_CACHE)} word-POS pairs")

def main():
    """Process the comment dump of one hard-coded subreddit.

    Looks the chosen subreddit up in the table of known dump files and hands
    the path to ``process_and_save_comments``. If the name is not in the
    table, the available names are printed and nothing else happens; in
    particular no output directory is created for an unknown name.
    """
    # Define data file paths
    files = {
        "democrats": r"datasets/democrats_comments.zst",
        "republican": r"datasets/Republican_comments.zst",
        "conservative": r"datasets/Conservative_comments.zst",
        "liberal": r"datasets/Liberal_comments.zst",
        "backpacking": r"datasets/backpacking_comments.zst",
        "vagabond": r"datasets/vagabond_comments.zst"
    }

    # Choose subreddit to process
    subreddit_to_process = "conservative"  # Change this to process other subreddits

    if subreddit_to_process not in files:
        print(f"Subreddit not found: {subreddit_to_process}")
        print(f"Available subreddits: {', '.join(files)}")
        return

    # process_and_save_comments creates processed_comments/<subreddit> itself,
    # so the directory is not created here.
    process_and_save_comments(
        files[subreddit_to_process],
        subreddit_to_process,
        without_stopwords=True,
        batch_size=1000000
    )


if __name__ == "__main__":
    main()

Processing file: datasets/Republican_comments.zst
Starting to process republican comments...


Processing republican comments: 157671it [00:07, 21837.92it/s]

In [12]:
import pickle

def inspect_pkl_file(file_path, num_examples=5):
    """
    Unpickle a file and print a short preview of what it contains.

    The type of the loaded object is always printed. If it is a list, the
    item count and the first ``num_examples`` items follow; dict items are
    printed key by key, and a "processed_text" list longer than 10 entries
    is truncated to its first 10. Any non-list object is printed whole.

    Args:
        file_path: Path to the pickle file
        num_examples: Number of examples to show (default 5)
    """
    # NOTE: pickle.load executes arbitrary code from the file; only use this
    # on pickles you produced yourself.
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)

    print(f"Data type: {type(payload)}")

    # Anything that is not a list is dumped as-is
    if not isinstance(payload, list):
        print("Data is not a list. Structure:", payload)
        return

    item_count = len(payload)
    print(f"Number of items: {item_count}")
    print(f"\nShowing first {min(num_examples, item_count)} examples:")

    for position, record in enumerate(payload[:num_examples], start=1):
        print(f"\nExample {position}:")

        if not isinstance(record, dict):
            print(record)
            continue

        for field, content in record.items():
            is_long_token_list = (
                field == "processed_text"
                and isinstance(content, list)
                and len(content) > 10
            )
            if is_long_token_list:
                # Show only a few words of long token lists
                print(f"  {field}: {content[:10]} ... (total: {len(content)} words)")
            else:
                print(f"  {field}: {content}")

# Example usage: preview the first batch file of the "democrats" subreddit.
# The path follows the processed_comments/<subreddit>/<subreddit>_batch<N>.pkl
# layout; the file must already exist, otherwise open() raises FileNotFoundError.
inspect_pkl_file("processed_comments/democrats/democrats_batch1.pkl")

Data type: <class 'list'>
Number of items: 1000000

Showing first 5 examples:

Example 1:
  comment_id: c07p2u0
  author: Garak
  date: 2009-02-16
  timestamp: 1234791099
  processed_text: ['allow', 'legend', 'grow', 'ill', 'mythical', 'proportion', 'lie', 'fund', 'acorn', 'nowhere'] ... (total: 49 words)
  original: &gt;  And they have allowed its legend to grow to ill and mythical proportions: lies about funding for ACORN, which is nowhere mentioned in the bill. Gross and unanswered misrepresentations by McCain about the "honey bee insurance" provision.

This is a great point. Democrats have this habit of letting the Republicans not only control the conversation, but reduce it to a fourth-grade level. "Honey bee insurance!" (Insurance for livestock producers in general, including honeybees because they are [very important and are have had a rough few years](http://en.wikipedia.org/wiki/Colony_Collapse_Disorder).) "$30M for mice!" (Wetlands restoration.) "Fruit fly research!" (Genetic