In [None]:
import os
import pickle
import datetime
import re
import html
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from fileStreams import getFileJsonStream
import numpy as np
from tqdm import tqdm
import random


# Initialize global resources once
# Both objects are built at import time and shared by every preprocess_text
# call. NOTE(review): stopwords.words('english') presumably needs the NLTK
# 'stopwords' corpus to be downloaded beforehand - confirm on a fresh machine.
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

# POS tag cache to avoid redundant tagging
# Maps word -> NLTK POS tag; populated by preprocess_text.
POS_CACHE = {}
# Lemma cache to avoid redundant lemmatization
# Maps (word, wordnet_pos) -> lemma; populated by preprocess_text.
LEMMA_CACHE = {}

def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the matching WordNet POS letter.

    The tag is matched by prefix: ``J`` -> ``'a'`` (adjective),
    ``V`` -> ``'v'`` (verb), ``N`` -> ``'n'`` (noun), ``R`` -> ``'r'``
    (adverb). Any other tag, including an empty one, falls back to ``'n'``.
    """
    prefix_table = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('N', 'n'),  # noun
        ('R', 'r'),  # adverb
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unknown tags are treated as nouns.
    return 'n'

# Patterns are compiled once at import time because preprocess_text runs once
# per comment.
_URL_RE = re.compile(r'http\S+')
_MD_IMAGE_RE = re.compile(r'!\[.*?\]\(.*?\)')
_MD_LINK_RE = re.compile(r'\[(.*?)\]\(.*?\)')
_MD_BOLD_RE = re.compile(r'\*\*(.*?)\*\*')
_MD_ITALIC_RE = re.compile(r'\*(.*?)\*')
# Matches /r/name, r/name, /u/name and u/name. The lookbehind stops the
# pattern from firing inside ordinary slash-joined words ("her/him",
# "you/me"), which an unanchored r/ or u/ pattern would truncate.
_REDDIT_REF_RE = re.compile(r'(?<!\w)/?[ru]/\w+')
_NON_LETTER_RE = re.compile(r'[^A-Za-z]+')


def preprocess_text(text, lemmatize=True, without_stopwords=True):
    """Clean one Reddit comment body and return its tokens.

    Steps, in order: unescape HTML entities, fold accented characters to
    their ASCII base letters, strip URLs / Markdown markup / subreddit and
    user references, replace every non-letter run with a space, lowercase,
    split on whitespace, optionally lemmatize, optionally drop stopwords,
    and finally drop tokens shorter than 3 or longer than 15 characters.

    Args:
        text: Raw comment text.
        lemmatize: When true, each token is lemmatized with LEMMATIZER using
            a WordNet POS derived from ``nltk.pos_tag``. Tags are cached per
            word in POS_CACHE, so a word keeps the tag it received the first
            time it was seen.
        without_stopwords: When true, tokens found in STOP_WORDS are removed
            (after lemmatization).

    Returns:
        A list of lowercase tokens in their original order; an empty list
        when nothing survives cleaning.
    """
    # Handle HTML entities
    text = html.unescape(text)

    # Unicode normalization. Pure-ASCII text is unchanged by NFKD, so the
    # work is skipped for it. After decomposition the combining marks are
    # dropped; otherwise the letters-only cleanup below would turn each mark
    # into a space and split the word ("naïve" -> "nai ve").
    if not text.isascii():
        text = unicodedata.normalize('NFKD', text)
        text = ''.join(ch for ch in text if not unicodedata.combining(ch))

    # Remove URLs and Markdown formatting
    text = _URL_RE.sub('', text)
    text = _MD_IMAGE_RE.sub('', text)
    text = _MD_LINK_RE.sub(r'\1', text)
    text = _MD_BOLD_RE.sub(r'\1', text)
    text = _MD_ITALIC_RE.sub(r'\1', text)

    # Remove subreddit and user references
    text = _REDDIT_REF_RE.sub('', text)

    # Basic text cleaning: keep ASCII letters only, lowercase
    text = _NON_LETTER_RE.sub(' ', text).lower()

    words = text.split()
    if not words:
        return []

    # Lemmatization first
    if lemmatize:
        # POS tagging (with cache): only words never seen before are tagged.
        uncached_words = [w for w in words if w not in POS_CACHE]
        if uncached_words:
            for word, tag in nltk.pos_tag(uncached_words):
                POS_CACHE[word] = tag

        processed_words = []
        for word in words:
            lemma_key = (word, get_wordnet_pos(POS_CACHE[word]))
            lemma = LEMMA_CACHE.get(lemma_key)
            if lemma is None:
                lemma = LEMMATIZER.lemmatize(word, pos=lemma_key[1])
                LEMMA_CACHE[lemma_key] = lemma
            processed_words.append(lemma)
    else:
        processed_words = words

    # Remove stopwords after lemmatization
    if without_stopwords:
        processed_words = [w for w in processed_words if w not in STOP_WORDS]

    # Keep only words of length 3..15
    return [w for w in processed_words if 2 < len(w) <= 15]


def _save_comment_batch(comments_batch, output_dir, subreddit, batch_number):
    """Pickle one batch of comment dicts into output_dir and report the path."""
    # Create the directory here so the function also works when the caller
    # has not prepared it.
    os.makedirs(output_dir, exist_ok=True)
    save_path = f"{output_dir}/{subreddit}_batch{batch_number}.pkl"
    with open(save_path, "wb") as out_file:
        pickle.dump(comments_batch, out_file)
    print(f"Saved {len(comments_batch)} comments to {save_path}")


def process_and_save_comments(path, subreddit, output_dir, without_stopwords=True, batch_size=1000000):
    """Stream a comment dump, preprocess each comment, and pickle the results in batches.

    Rows lacking any of ``body``, ``created_utc``, ``author`` or ``id`` are
    skipped, as are comments by the bot accounts listed below and comments
    whose preprocessed token list is empty. Every ``batch_size`` kept comments
    are written to ``{output_dir}/{subreddit}_batch{N}.pkl`` (N starts at 1);
    a final partial batch is written after the stream ends.

    Args:
        path: Path of the dump file, opened in binary mode and handed to
            getFileJsonStream.
        subreddit: Name used in progress messages and output file names.
        output_dir: Directory receiving the pickle files.
        without_stopwords: Forwarded to preprocess_text.
        batch_size: Number of kept comments per pickle file.

    Returns:
        None. Returns early (after printing a message) when
        getFileJsonStream yields no stream for the file.
    """
    print(f"Processing file: {path}")

    required_fields = ("body", "created_utc", "author", "id")
    skipped_authors = {"AutoModerator", "election_info_bot"}

    batch_number = 1
    total_count = 0
    comments_batch = []

    with open(path, "rb") as f:
        json_stream = getFileJsonStream(path, f)
        if json_stream is None:
            print(f"Unable to read file {path}")
            return

        for row in tqdm(json_stream, desc=f"Processing {subreddit} comments"):
            if any(field not in row for field in required_fields):
                continue

            author = row["author"]
            if author in skipped_authors:
                continue

            text = row["body"]
            created_timestamp = row["created_utc"]
            # created_utc is an epoch timestamp; convert it explicitly in UTC
            # so the stored calendar date does not depend on the timezone of
            # the machine running the job.
            date = datetime.datetime.fromtimestamp(
                int(created_timestamp), tz=datetime.timezone.utc
            )

            # Process text with optimized functions
            processed_words = preprocess_text(text, lemmatize=True, without_stopwords=without_stopwords)
            if not processed_words:
                continue

            # Save processed comment with metadata
            comments_batch.append({
                "comment_id": row["id"],
                "author": author,
                "date": date.strftime("%Y-%m-%d"),
                "timestamp": created_timestamp,
                "processed_text": processed_words,  # Original order preserved
                "original": text
            })

            # Flush as soon as the batch is full
            if len(comments_batch) >= batch_size:
                print(f"\nReached {batch_size} comments, saving batch {batch_number}...")
                _save_comment_batch(comments_batch, output_dir, subreddit, batch_number)
                total_count += len(comments_batch)

                # Start a fresh batch
                comments_batch = []
                batch_number += 1

    # Write whatever is left after the stream is exhausted
    if comments_batch:
        print(f"\nSaving remaining {len(comments_batch)} comments...")
        _save_comment_batch(comments_batch, output_dir, subreddit, batch_number)
        total_count += len(comments_batch)

    print(f"\nCompleted processing {subreddit} comments!")
    print(f"Total comments saved: {total_count}")


def main():
    """Seed the random generators and preprocess every configured subreddit dump.

    Each entry of ``files`` maps a subreddit label to its dump path; the
    results for a label are written under ``processed_comments_1/<label>``.
    """
    seed = 23
    random.seed(seed)
    np.random.seed(seed)

    # Subreddit label -> dump file. Uncomment entries to include them.
    files = {
        "democrats": r"datasets/democrats_comments.zst",
        "republican": r"datasets/Republican_comments.zst",
        "conservative": r"datasets/Conservative_comments.zst",
        "liberal": r"datasets/Liberal_comments.zst"
        # "vagabond": r"datasets/vagabond_comments.zst",
        # "backpacking": r"datasets/backpacking_comments.zst"
        # "cooking": r"datasets/Cooking_comments.zst",
        # "travel": r"datasets/travel_comments.zst",
        # "books": r"datasets/books_comments.zst",
        # "gaming": r"datasets/gaming_comments.zst",
        # "movies": r"datasets/movies_comments.zst",
        # "technology": r"datasets/technology_comments.zst",
        # "personalfinance": r"datasets/personalfinance_comments.zst"
    }

    # Every configured subreddit is processed, in declaration order.
    for subreddit, dump_path in files.items():
        output_dir = f"processed_comments_1/{subreddit}"
        os.makedirs(output_dir, exist_ok=True)
        print(f"\nProcessing subreddit: {subreddit}")
        process_and_save_comments(
            dump_path,
            subreddit,
            output_dir,
            without_stopwords=True,
            batch_size=1000000
        )


if __name__ == "__main__":
    main()


Processing subreddit: cooking
Processing file: datasets/Cooking_comments.zst


Processing cooking comments: 1006830it [00:38, 32467.49it/s]


Reached 1000000 comments, saving batch 1...


Processing cooking comments: 1013040it [00:41, 5333.07it/s] 

Saved 1000000 comments to processed_comments_1/cooking/cooking_batch1.pkl


Processing cooking comments: 2015722it [01:15, 31711.77it/s]


Reached 1000000 comments, saving batch 2...


Processing cooking comments: 2021941it [01:17, 5463.85it/s] 

Saved 1000000 comments to processed_comments_1/cooking/cooking_batch2.pkl


Processing cooking comments: 3025678it [01:50, 31632.48it/s]


Reached 1000000 comments, saving batch 3...


Processing cooking comments: 3031428it [01:53, 5258.44it/s] 

Saved 1000000 comments to processed_comments_1/cooking/cooking_batch3.pkl


Processing cooking comments: 4033826it [02:27, 31439.81it/s]


Reached 1000000 comments, saving batch 4...


Processing cooking comments: 4039945it [02:30, 5254.98it/s] 

Saved 1000000 comments to processed_comments_1/cooking/cooking_batch4.pkl


Processing cooking comments: 5043172it [03:04, 30189.22it/s]


Reached 1000000 comments, saving batch 5...


Processing cooking comments: 5048933it [03:06, 5299.45it/s] 

Saved 1000000 comments to processed_comments_1/cooking/cooking_batch5.pkl


Processing cooking comments: 6052131it [03:41, 31336.73it/s]


Reached 1000000 comments, saving batch 6...


Processing cooking comments: 6057904it [03:43, 5582.45it/s] 

Saved 1000000 comments to processed_comments_1/cooking/cooking_batch6.pkl


Processing cooking comments: 7058735it [04:17, 30955.51it/s]


Reached 1000000 comments, saving batch 7...


Processing cooking comments: 7064828it [04:20, 5548.08it/s] 

Saved 1000000 comments to processed_comments_1/cooking/cooking_batch7.pkl


Processing cooking comments: 8068783it [04:54, 32782.38it/s]


Reached 1000000 comments, saving batch 8...


Processing cooking comments: 8075208it [04:56, 5567.02it/s] 

Saved 1000000 comments to processed_comments_1/cooking/cooking_batch8.pkl


Processing cooking comments: 9077577it [05:29, 30912.01it/s]


Reached 1000000 comments, saving batch 9...


Processing cooking comments: 9084092it [05:31, 5585.33it/s] 

Saved 1000000 comments to processed_comments_1/cooking/cooking_batch9.pkl


Processing cooking comments: 9795085it [05:56, 27509.66it/s]



Saving remaining 708440 comments...
Saved 708440 comments to processed_comments_1/cooking/cooking_batch10.pkl

Completed processing cooking comments!
Total comments saved: 9708440

Processing subreddit: travel
Processing file: datasets/travel_comments.zst


Processing travel comments: 1010162it [00:43, 26765.34it/s]


Reached 1000000 comments, saving batch 1...


Processing travel comments: 1012853it [00:46, 3025.71it/s] 

Saved 1000000 comments to processed_comments_1/travel/travel_batch1.pkl


Processing travel comments: 2039959it [01:25, 29214.26it/s]


Reached 1000000 comments, saving batch 2...
Saved 1000000 comments to processed_comments_1/travel/travel_batch2.pkl


Processing travel comments: 3082749it [02:05, 27563.85it/s]


Reached 1000000 comments, saving batch 3...


Processing travel comments: 3087723it [02:08, 4333.19it/s] 

Saved 1000000 comments to processed_comments_1/travel/travel_batch3.pkl


Processing travel comments: 4125023it [02:46, 28702.84it/s]


Reached 1000000 comments, saving batch 4...


Processing travel comments: 4130639it [02:49, 4771.55it/s] 

Saved 1000000 comments to processed_comments_1/travel/travel_batch4.pkl


Processing travel comments: 5159877it [03:28, 30086.57it/s]


Reached 1000000 comments, saving batch 5...


Processing travel comments: 5165935it [03:31, 4816.04it/s] 

Saved 1000000 comments to processed_comments_1/travel/travel_batch5.pkl


Processing travel comments: 6213499it [04:09, 30265.40it/s]


Reached 1000000 comments, saving batch 6...


Processing travel comments: 6219111it [04:12, 4643.82it/s] 

Saved 1000000 comments to processed_comments_1/travel/travel_batch6.pkl


Processing travel comments: 6341252it [04:17, 24647.16it/s]



Saving remaining 119901 comments...
Saved 119901 comments to processed_comments_1/travel/travel_batch7.pkl

Completed processing travel comments!
Total comments saved: 6119901

Processing subreddit: books
Processing file: datasets/books_comments.zst


Processing books comments: 1005806it [00:35, 31998.56it/s]


Reached 1000000 comments, saving batch 1...
Saved 1000000 comments to processed_comments_1/books/books_batch1.pkl


Processing books comments: 2020458it [01:14, 26803.26it/s]


Reached 1000000 comments, saving batch 2...


Processing books comments: 2026417it [01:17, 4864.06it/s] 

Saved 1000000 comments to processed_comments_1/books/books_batch2.pkl


Processing books comments: 3044965it [01:52, 30589.16it/s]


Reached 1000000 comments, saving batch 3...


Processing books comments: 3051112it [01:55, 4943.90it/s] 

Saved 1000000 comments to processed_comments_1/books/books_batch3.pkl


Processing books comments: 4075859it [02:30, 32508.86it/s]


Reached 1000000 comments, saving batch 4...


Processing books comments: 4082384it [02:32, 5734.94it/s] 

Saved 1000000 comments to processed_comments_1/books/books_batch4.pkl


Processing books comments: 5097062it [03:08, 30100.18it/s]


Reached 1000000 comments, saving batch 5...
Saved 1000000 comments to processed_comments_1/books/books_batch5.pkl


Processing books comments: 6127460it [03:48, 32104.94it/s]


Reached 1000000 comments, saving batch 6...
Saved 1000000 comments to processed_comments_1/books/books_batch6.pkl


Processing books comments: 7162916it [04:28, 27543.41it/s]


Reached 1000000 comments, saving batch 7...


Processing books comments: 7168521it [04:30, 4681.34it/s] 

Saved 1000000 comments to processed_comments_1/books/books_batch7.pkl


Processing books comments: 8192066it [05:10, 26241.63it/s]


Reached 1000000 comments, saving batch 8...


Processing books comments: 8196602it [05:13, 4048.52it/s] 

Saved 1000000 comments to processed_comments_1/books/books_batch8.pkl


Processing books comments: 9218650it [05:53, 29219.39it/s]


Reached 1000000 comments, saving batch 9...
Saved 1000000 comments to processed_comments_1/books/books_batch9.pkl


Processing books comments: 10243366it [06:36, 29742.64it/s]


Reached 1000000 comments, saving batch 10...
Saved 1000000 comments to processed_comments_1/books/books_batch10.pkl


Processing books comments: 11271639it [07:18, 27033.07it/s]


Reached 1000000 comments, saving batch 11...
Saved 1000000 comments to processed_comments_1/books/books_batch11.pkl


Processing books comments: 12323052it [08:02, 27756.06it/s]


Reached 1000000 comments, saving batch 12...


Processing books comments: 12328575it [08:05, 4212.08it/s] 

Saved 1000000 comments to processed_comments_1/books/books_batch12.pkl


Processing books comments: 12394572it [08:08, 25374.43it/s]



Saving remaining 65429 comments...
Saved 65429 comments to processed_comments_1/books/books_batch13.pkl

Completed processing books comments!
Total comments saved: 12065429

Processing subreddit: gaming
Processing file: datasets/gaming_comments.zst


Processing gaming comments: 1009415it [00:32, 35981.97it/s]


Reached 1000000 comments, saving batch 1...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch1.pkl


Processing gaming comments: 2023622it [01:07, 9552.39it/s] 


Reached 1000000 comments, saving batch 2...


Processing gaming comments: 2029106it [01:09, 4494.93it/s]

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch2.pkl


Processing gaming comments: 3034624it [01:39, 39781.93it/s]


Reached 1000000 comments, saving batch 3...


Processing gaming comments: 3042257it [01:41, 6842.23it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch3.pkl


Processing gaming comments: 4050791it [02:12, 40275.36it/s]


Reached 1000000 comments, saving batch 4...


Processing gaming comments: 4054832it [02:15, 5032.05it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch4.pkl


Processing gaming comments: 5063123it [02:45, 33959.41it/s]


Reached 1000000 comments, saving batch 5...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch5.pkl


Processing gaming comments: 6080332it [03:16, 42229.62it/s]


Reached 1000000 comments, saving batch 6...


Processing gaming comments: 6084569it [03:18, 5685.15it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch6.pkl


Processing gaming comments: 7094020it [03:47, 38121.63it/s]


Reached 1000000 comments, saving batch 7...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch7.pkl


Processing gaming comments: 8112269it [04:16, 45242.33it/s]


Reached 1000000 comments, saving batch 8...


Processing gaming comments: 8120943it [04:19, 7910.26it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch8.pkl


Processing gaming comments: 9128902it [04:47, 42366.04it/s]


Reached 1000000 comments, saving batch 9...


Processing gaming comments: 9136902it [04:49, 7470.16it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch9.pkl


Processing gaming comments: 10145391it [05:16, 41420.03it/s]


Reached 1000000 comments, saving batch 10...


Processing gaming comments: 10153550it [05:19, 7520.23it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch10.pkl


Processing gaming comments: 11165146it [05:46, 39930.52it/s]


Reached 1000000 comments, saving batch 11...


Processing gaming comments: 11173394it [05:49, 7559.44it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch11.pkl


Processing gaming comments: 12253432it [06:16, 39677.82it/s]


Reached 1000000 comments, saving batch 12...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch12.pkl


Processing gaming comments: 13276608it [06:45, 41850.60it/s]


Reached 1000000 comments, saving batch 13...


Processing gaming comments: 13285136it [06:47, 7974.78it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch13.pkl


Processing gaming comments: 14299634it [07:13, 42611.91it/s]


Reached 1000000 comments, saving batch 14...


Processing gaming comments: 14303931it [07:15, 5899.94it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch14.pkl


Processing gaming comments: 15318314it [07:42, 41415.60it/s]


Reached 1000000 comments, saving batch 15...


Processing gaming comments: 15326921it [07:44, 7919.99it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch15.pkl


Processing gaming comments: 16339987it [08:11, 44243.99it/s]


Reached 1000000 comments, saving batch 16...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch16.pkl


Processing gaming comments: 17359504it [08:40, 45828.87it/s]


Reached 1000000 comments, saving batch 17...


Processing gaming comments: 17367686it [08:42, 8019.48it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch17.pkl


Processing gaming comments: 18382169it [09:11, 45971.30it/s]


Reached 1000000 comments, saving batch 18...


Processing gaming comments: 18390305it [09:13, 8109.39it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch18.pkl


Processing gaming comments: 19404606it [09:41, 41005.87it/s]


Reached 1000000 comments, saving batch 19...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch19.pkl


Processing gaming comments: 20420606it [10:12, 44998.26it/s]


Reached 1000000 comments, saving batch 20...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch20.pkl


Processing gaming comments: 21444597it [10:40, 47361.22it/s]


Reached 1000000 comments, saving batch 21...


Processing gaming comments: 21454132it [10:43, 9188.88it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch21.pkl


Processing gaming comments: 22462506it [11:09, 48871.07it/s]


Reached 1000000 comments, saving batch 22...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch22.pkl


Processing gaming comments: 23482799it [11:37, 44163.44it/s]


Reached 1000000 comments, saving batch 23...


Processing gaming comments: 23496520it [11:40, 11268.66it/s]

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch23.pkl


Processing gaming comments: 24504926it [12:07, 46279.31it/s]


Reached 1000000 comments, saving batch 24...


Processing gaming comments: 24513977it [12:09, 8394.41it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch24.pkl


Processing gaming comments: 25524168it [12:35, 41122.68it/s]


Reached 1000000 comments, saving batch 25...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch25.pkl


Processing gaming comments: 26542069it [13:03, 46889.12it/s]


Reached 1000000 comments, saving batch 26...


Processing gaming comments: 26550319it [13:06, 8124.73it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch26.pkl


Processing gaming comments: 27566882it [13:32, 47577.13it/s]


Reached 1000000 comments, saving batch 27...


Processing gaming comments: 27575106it [13:34, 8352.37it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch27.pkl


Processing gaming comments: 28584185it [14:01, 47417.53it/s]


Reached 1000000 comments, saving batch 28...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch28.pkl


Processing gaming comments: 29606733it [14:30, 42870.47it/s]


Reached 1000000 comments, saving batch 29...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch29.pkl


Processing gaming comments: 30625848it [15:00, 16995.13it/s]


Reached 1000000 comments, saving batch 30...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch30.pkl


Processing gaming comments: 31647033it [15:28, 44245.20it/s]


Reached 1000000 comments, saving batch 31...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch31.pkl


Processing gaming comments: 32669024it [15:58, 42191.51it/s]


Reached 1000000 comments, saving batch 32...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch32.pkl


Processing gaming comments: 33690933it [16:27, 43697.65it/s]


Reached 1000000 comments, saving batch 33...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch33.pkl


Processing gaming comments: 34715424it [16:56, 42202.88it/s]


Reached 1000000 comments, saving batch 34...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch34.pkl


Processing gaming comments: 35735503it [17:24, 44247.39it/s]


Reached 1000000 comments, saving batch 35...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch35.pkl


Processing gaming comments: 36756161it [17:54, 41918.30it/s]


Reached 1000000 comments, saving batch 36...


Processing gaming comments: 36765019it [17:56, 8739.32it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch36.pkl


Processing gaming comments: 37778342it [18:22, 48204.75it/s]


Reached 1000000 comments, saving batch 37...


Processing gaming comments: 37787210it [18:24, 9131.32it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch37.pkl


Processing gaming comments: 38797916it [18:51, 43595.41it/s]


Reached 1000000 comments, saving batch 38...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch38.pkl


Processing gaming comments: 39822673it [19:19, 45887.88it/s]


Reached 1000000 comments, saving batch 39...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch39.pkl


Processing gaming comments: 40844362it [19:46, 47667.19it/s]


Reached 1000000 comments, saving batch 40...


Processing gaming comments: 40853554it [19:48, 9570.71it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch40.pkl


Processing gaming comments: 41864561it [20:15, 47798.07it/s]


Reached 1000000 comments, saving batch 41...


Processing gaming comments: 41872939it [20:17, 8526.35it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch41.pkl


Processing gaming comments: 42886489it [20:43, 42844.23it/s]


Reached 1000000 comments, saving batch 42...


Processing gaming comments: 42895062it [20:45, 8575.33it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch42.pkl


Processing gaming comments: 43914191it [21:12, 21746.47it/s]


Reached 1000000 comments, saving batch 43...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch43.pkl


Processing gaming comments: 44935702it [21:39, 42118.47it/s]


Reached 1000000 comments, saving batch 44...


Processing gaming comments: 44944698it [21:42, 8541.74it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch44.pkl


Processing gaming comments: 45958875it [22:10, 43049.97it/s]


Reached 1000000 comments, saving batch 45...


Processing gaming comments: 45966788it [22:12, 7731.88it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch45.pkl


Processing gaming comments: 46985760it [22:39, 44671.28it/s]


Reached 1000000 comments, saving batch 46...


Processing gaming comments: 46990400it [22:41, 6793.52it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch46.pkl


Processing gaming comments: 48014004it [23:08, 41868.00it/s]


Reached 1000000 comments, saving batch 47...


Processing gaming comments: 48018354it [23:10, 6405.07it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch47.pkl


Processing gaming comments: 49038385it [23:36, 32933.88it/s]


Reached 1000000 comments, saving batch 48...


Processing gaming comments: 49051545it [23:38, 11230.66it/s]

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch48.pkl


Processing gaming comments: 50067778it [24:03, 45004.58it/s]


Reached 1000000 comments, saving batch 49...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch49.pkl


Processing gaming comments: 51098799it [24:32, 43913.15it/s]


Reached 1000000 comments, saving batch 50...


Processing gaming comments: 51107066it [24:34, 8543.03it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch50.pkl


Processing gaming comments: 52133516it [25:01, 42507.70it/s]


Reached 1000000 comments, saving batch 51...


Processing gaming comments: 52142378it [25:03, 8726.73it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch51.pkl


Processing gaming comments: 53169755it [25:29, 38996.94it/s]


Reached 1000000 comments, saving batch 52...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch52.pkl


Processing gaming comments: 54202830it [26:00, 16026.30it/s]


Reached 1000000 comments, saving batch 53...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch53.pkl


Processing gaming comments: 55237730it [26:27, 46379.70it/s]


Reached 1000000 comments, saving batch 54...


Processing gaming comments: 55246364it [26:29, 9116.37it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch54.pkl


Processing gaming comments: 56265556it [26:55, 48224.68it/s]


Reached 1000000 comments, saving batch 55...


Processing gaming comments: 56275136it [26:58, 9223.72it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch55.pkl


Processing gaming comments: 57299102it [27:25, 48092.01it/s]


Reached 1000000 comments, saving batch 56...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch56.pkl


Processing gaming comments: 58329022it [27:53, 47467.09it/s]


Reached 1000000 comments, saving batch 57...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch57.pkl


Processing gaming comments: 59358852it [28:22, 43806.47it/s]


Reached 1000000 comments, saving batch 58...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch58.pkl


Processing gaming comments: 60389773it [28:51, 16735.89it/s]


Reached 1000000 comments, saving batch 59...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch59.pkl


Processing gaming comments: 61418296it [29:19, 45297.07it/s]


Reached 1000000 comments, saving batch 60...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch60.pkl


Processing gaming comments: 62450683it [29:48, 46902.98it/s]


Reached 1000000 comments, saving batch 61...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch61.pkl


Processing gaming comments: 63484949it [30:18, 24885.50it/s]


Reached 1000000 comments, saving batch 62...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch62.pkl


Processing gaming comments: 64517779it [30:45, 45358.08it/s]


Reached 1000000 comments, saving batch 63...


Processing gaming comments: 64526401it [30:47, 8786.36it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch63.pkl


Processing gaming comments: 65545760it [31:15, 44172.95it/s]


Reached 1000000 comments, saving batch 64...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch64.pkl


Processing gaming comments: 66569158it [31:44, 43183.72it/s]


Reached 1000000 comments, saving batch 65...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch65.pkl


Processing gaming comments: 67602094it [32:13, 37970.18it/s]


Reached 1000000 comments, saving batch 66...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch66.pkl


Processing gaming comments: 68627400it [32:42, 12684.05it/s]


Reached 1000000 comments, saving batch 67...


Processing gaming comments: 68635295it [32:45, 6643.49it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch67.pkl


Processing gaming comments: 69650517it [33:11, 44601.65it/s]


Reached 1000000 comments, saving batch 68...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch68.pkl


Processing gaming comments: 70677541it [33:41, 45283.45it/s]


Reached 1000000 comments, saving batch 69...


Processing gaming comments: 70686228it [33:43, 8330.34it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch69.pkl


Processing gaming comments: 71698996it [34:11, 44736.73it/s]


Reached 1000000 comments, saving batch 70...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch70.pkl


Processing gaming comments: 72721522it [34:42, 12308.36it/s]


Reached 1000000 comments, saving batch 71...


Processing gaming comments: 72729107it [34:44, 6238.59it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch71.pkl


Processing gaming comments: 73738544it [35:11, 40975.49it/s]


Reached 1000000 comments, saving batch 72...


Processing gaming comments: 73745858it [35:13, 7645.57it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch72.pkl


Processing gaming comments: 74759290it [35:42, 42741.21it/s]


Reached 1000000 comments, saving batch 73...


Processing gaming comments: 74766993it [35:44, 8211.48it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch73.pkl


Processing gaming comments: 75784715it [36:14, 11162.85it/s]


Reached 1000000 comments, saving batch 74...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch74.pkl


Processing gaming comments: 76805138it [36:44, 43533.82it/s]


Reached 1000000 comments, saving batch 75...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch75.pkl


Processing gaming comments: 77829377it [37:15, 43623.26it/s]


Reached 1000000 comments, saving batch 76...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch76.pkl


Processing gaming comments: 78844669it [37:47, 11560.56it/s]


Reached 1000000 comments, saving batch 77...


Processing gaming comments: 78851559it [37:49, 5590.72it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch77.pkl


Processing gaming comments: 79870187it [38:17, 41364.63it/s]


Reached 1000000 comments, saving batch 78...


Processing gaming comments: 79878288it [38:20, 7835.82it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch78.pkl


Processing gaming comments: 80887233it [38:50, 37609.39it/s]


Reached 1000000 comments, saving batch 79...


Processing gaming comments: 80893818it [38:52, 6907.05it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch79.pkl


Processing gaming comments: 81901960it [39:22, 41917.05it/s]


Reached 1000000 comments, saving batch 80...


Processing gaming comments: 81910654it [39:25, 8056.25it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch80.pkl


Processing gaming comments: 82920224it [39:56, 10590.42it/s]


Reached 1000000 comments, saving batch 81...


Processing gaming comments: 82925674it [39:58, 4785.34it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch81.pkl


Processing gaming comments: 83937889it [40:28, 38692.35it/s]


Reached 1000000 comments, saving batch 82...


Processing gaming comments: 83944860it [40:31, 7281.94it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch82.pkl


Processing gaming comments: 84956379it [41:01, 40165.33it/s]


Reached 1000000 comments, saving batch 83...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch83.pkl


Processing gaming comments: 85973852it [41:34, 40258.31it/s]


Reached 1000000 comments, saving batch 84...


Processing gaming comments: 85977910it [41:37, 5879.25it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch84.pkl


Processing gaming comments: 86990185it [42:08, 35832.30it/s]


Reached 1000000 comments, saving batch 85...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch85.pkl


Processing gaming comments: 88007284it [42:43, 35074.55it/s]


Reached 1000000 comments, saving batch 86...


Processing gaming comments: 88017084it [42:45, 7995.47it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch86.pkl


Processing gaming comments: 89046122it [43:18, 36656.27it/s]


Reached 1000000 comments, saving batch 87...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch87.pkl


Processing gaming comments: 90073938it [43:55, 32137.02it/s]


Reached 1000000 comments, saving batch 88...


Processing gaming comments: 90077180it [43:57, 3963.58it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch88.pkl


Processing gaming comments: 91101998it [44:33, 31549.12it/s]


Reached 1000000 comments, saving batch 89...


Processing gaming comments: 91107995it [44:35, 5484.04it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch89.pkl


Processing gaming comments: 92131958it [45:11, 31458.41it/s]


Reached 1000000 comments, saving batch 90...


Processing gaming comments: 92137548it [45:14, 5072.76it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch90.pkl


Processing gaming comments: 93167997it [45:49, 34568.84it/s]


Reached 1000000 comments, saving batch 91...


Processing gaming comments: 93174326it [45:52, 5261.49it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch91.pkl


Processing gaming comments: 94201213it [46:29, 35072.45it/s]


Reached 1000000 comments, saving batch 92...


Processing gaming comments: 94207665it [46:31, 5862.34it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch92.pkl


Processing gaming comments: 95235961it [47:07, 31892.72it/s]


Reached 1000000 comments, saving batch 93...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch93.pkl


Processing gaming comments: 96272867it [47:45, 30382.13it/s]


Reached 1000000 comments, saving batch 94...


Processing gaming comments: 96278385it [47:48, 4757.96it/s] 

Saved 1000000 comments to processed_comments_1/gaming/gaming_batch94.pkl


Processing gaming comments: 97313071it [48:24, 32707.82it/s]


Reached 1000000 comments, saving batch 95...
Saved 1000000 comments to processed_comments_1/gaming/gaming_batch95.pkl


Processing gaming comments: 97317330it [48:27, 33474.64it/s]



Saving remaining 876 comments...
Saved 876 comments to processed_comments_1/gaming/gaming_batch96.pkl

Completed processing gaming comments!
Total comments saved: 95000876

Processing subreddit: movies
Processing file: datasets/movies_comments.zst


Processing movies comments: 1008364it [00:32, 37845.52it/s]


Reached 1000000 comments, saving batch 1...
Saved 1000000 comments to processed_comments_1/movies/movies_batch1.pkl


Processing movies comments: 2023125it [01:06, 37629.66it/s]


Reached 1000000 comments, saving batch 2...


Processing movies comments: 2030902it [01:08, 6781.50it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch2.pkl


Processing movies comments: 3034335it [01:40, 40160.23it/s]


Reached 1000000 comments, saving batch 3...


Processing movies comments: 3042256it [01:42, 6725.18it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch3.pkl


Processing movies comments: 4051701it [02:13, 39787.13it/s]


Reached 1000000 comments, saving batch 4...


Processing movies comments: 4055691it [02:16, 4967.04it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch4.pkl


Processing movies comments: 5062606it [02:48, 35564.60it/s]


Reached 1000000 comments, saving batch 5...


Processing movies comments: 5069907it [02:51, 6139.33it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch5.pkl


Processing movies comments: 6077314it [03:21, 41593.16it/s]


Reached 1000000 comments, saving batch 6...
Saved 1000000 comments to processed_comments_1/movies/movies_batch6.pkl


Processing movies comments: 7094243it [03:54, 37948.05it/s]


Reached 1000000 comments, saving batch 7...


Processing movies comments: 7101775it [03:56, 6427.65it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch7.pkl


Processing movies comments: 8108868it [04:27, 40541.16it/s]


Reached 1000000 comments, saving batch 8...
Saved 1000000 comments to processed_comments_1/movies/movies_batch8.pkl


Processing movies comments: 9122585it [05:01, 35749.96it/s]


Reached 1000000 comments, saving batch 9...


Processing movies comments: 9126177it [05:03, 4704.66it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch9.pkl


Processing movies comments: 10140395it [05:33, 38325.04it/s]


Reached 1000000 comments, saving batch 10...


Processing movies comments: 10147731it [05:36, 6504.55it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch10.pkl


Processing movies comments: 11155137it [06:07, 24903.85it/s]


Reached 1000000 comments, saving batch 11...
Saved 1000000 comments to processed_comments_1/movies/movies_batch11.pkl


Processing movies comments: 12169249it [06:39, 38615.89it/s]


Reached 1000000 comments, saving batch 12...


Processing movies comments: 12175884it [06:42, 6028.82it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch12.pkl


Processing movies comments: 13184513it [07:13, 26231.36it/s]


Reached 1000000 comments, saving batch 13...


Processing movies comments: 13191296it [07:16, 5719.35it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch13.pkl


Processing movies comments: 14200589it [07:45, 60283.67it/s]


Reached 1000000 comments, saving batch 14...


Processing movies comments: 14211100it [07:48, 9908.66it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch14.pkl


Processing movies comments: 15219898it [08:20, 36438.56it/s]


Reached 1000000 comments, saving batch 15...


Processing movies comments: 15223551it [08:22, 4871.77it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch15.pkl


Processing movies comments: 16232772it [08:53, 34461.41it/s]


Reached 1000000 comments, saving batch 16...


Processing movies comments: 16239431it [08:56, 6025.34it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch16.pkl


Processing movies comments: 17248434it [09:27, 38942.22it/s]


Reached 1000000 comments, saving batch 17...
Saved 1000000 comments to processed_comments_1/movies/movies_batch17.pkl


Processing movies comments: 18265856it [10:01, 13831.53it/s]


Reached 1000000 comments, saving batch 18...
Saved 1000000 comments to processed_comments_1/movies/movies_batch18.pkl


Processing movies comments: 19279596it [10:34, 45656.12it/s]


Reached 1000000 comments, saving batch 19...


Processing movies comments: 19287904it [10:36, 8164.82it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch19.pkl


Processing movies comments: 20293781it [11:07, 39275.86it/s]


Reached 1000000 comments, saving batch 20...


Processing movies comments: 20301284it [11:10, 6734.99it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch20.pkl


Processing movies comments: 21307001it [11:42, 17959.48it/s]


Reached 1000000 comments, saving batch 21...
Saved 1000000 comments to processed_comments_1/movies/movies_batch21.pkl


Processing movies comments: 22322605it [12:15, 38111.00it/s]


Reached 1000000 comments, saving batch 22...


Processing movies comments: 22329821it [12:17, 6748.27it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch22.pkl


Processing movies comments: 23338796it [12:50, 27068.14it/s]


Reached 1000000 comments, saving batch 23...


Processing movies comments: 23345227it [12:53, 5503.03it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch23.pkl


Processing movies comments: 24350185it [13:25, 39494.66it/s]


Reached 1000000 comments, saving batch 24...
Saved 1000000 comments to processed_comments_1/movies/movies_batch24.pkl


Processing movies comments: 25365921it [14:00, 37739.37it/s]


Reached 1000000 comments, saving batch 25...


Processing movies comments: 25373295it [14:02, 6727.53it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch25.pkl


Processing movies comments: 26379475it [14:34, 35642.35it/s]


Reached 1000000 comments, saving batch 26...


Processing movies comments: 26386283it [14:37, 6271.85it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch26.pkl


Processing movies comments: 27394671it [15:10, 34621.95it/s]


Reached 1000000 comments, saving batch 27...


Processing movies comments: 27401741it [15:12, 5973.65it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch27.pkl


Processing movies comments: 28409962it [15:45, 35744.89it/s]


Reached 1000000 comments, saving batch 28...


Processing movies comments: 28416976it [15:47, 6023.63it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch28.pkl


Processing movies comments: 29423359it [16:19, 35038.63it/s]


Reached 1000000 comments, saving batch 29...
Saved 1000000 comments to processed_comments_1/movies/movies_batch29.pkl


Processing movies comments: 30441248it [16:53, 35190.28it/s]


Reached 1000000 comments, saving batch 30...
Saved 1000000 comments to processed_comments_1/movies/movies_batch30.pkl


Processing movies comments: 31455417it [17:29, 33177.11it/s]


Reached 1000000 comments, saving batch 31...
Saved 1000000 comments to processed_comments_1/movies/movies_batch31.pkl


Processing movies comments: 32468129it [18:04, 38624.14it/s]


Reached 1000000 comments, saving batch 32...
Saved 1000000 comments to processed_comments_1/movies/movies_batch32.pkl


Processing movies comments: 33481501it [18:40, 35447.45it/s]


Reached 1000000 comments, saving batch 33...


Processing movies comments: 33487795it [18:42, 5757.58it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch33.pkl


Processing movies comments: 34494643it [19:16, 11445.88it/s]


Reached 1000000 comments, saving batch 34...
Saved 1000000 comments to processed_comments_1/movies/movies_batch34.pkl


Processing movies comments: 35510296it [19:50, 35350.05it/s]


Reached 1000000 comments, saving batch 35...
Saved 1000000 comments to processed_comments_1/movies/movies_batch35.pkl


Processing movies comments: 36522233it [20:25, 39318.20it/s]


Reached 1000000 comments, saving batch 36...


Processing movies comments: 36526169it [20:28, 4886.52it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch36.pkl


Processing movies comments: 37532962it [21:01, 37893.27it/s]


Reached 1000000 comments, saving batch 37...
Saved 1000000 comments to processed_comments_1/movies/movies_batch37.pkl


Processing movies comments: 38546999it [21:38, 10474.32it/s]


Reached 1000000 comments, saving batch 38...


Processing movies comments: 38551774it [21:40, 4141.24it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch38.pkl


Processing movies comments: 39558337it [22:13, 37095.61it/s]


Reached 1000000 comments, saving batch 39...


Processing movies comments: 39565554it [22:16, 6342.31it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch39.pkl


Processing movies comments: 40568882it [22:49, 34870.61it/s]


Reached 1000000 comments, saving batch 40...


Processing movies comments: 40576129it [22:51, 6137.87it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch40.pkl


Processing movies comments: 41582680it [23:26, 31754.40it/s]


Reached 1000000 comments, saving batch 41...
Saved 1000000 comments to processed_comments_1/movies/movies_batch41.pkl


Processing movies comments: 42590357it [24:04, 8715.36it/s] 


Reached 1000000 comments, saving batch 42...
Saved 1000000 comments to processed_comments_1/movies/movies_batch42.pkl


Processing movies comments: 43600029it [24:40, 33499.92it/s]


Reached 1000000 comments, saving batch 43...
Saved 1000000 comments to processed_comments_1/movies/movies_batch43.pkl


Processing movies comments: 44615118it [25:16, 47867.70it/s]


Reached 1000000 comments, saving batch 44...


Processing movies comments: 44619930it [25:18, 6535.51it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch44.pkl


Processing movies comments: 45624093it [25:53, 34355.95it/s]


Reached 1000000 comments, saving batch 45...
Saved 1000000 comments to processed_comments_1/movies/movies_batch45.pkl


Processing movies comments: 46643451it [26:30, 32009.64it/s]


Reached 1000000 comments, saving batch 46...
Saved 1000000 comments to processed_comments_1/movies/movies_batch46.pkl


Processing movies comments: 47658799it [27:06, 32097.02it/s]


Reached 1000000 comments, saving batch 47...
Saved 1000000 comments to processed_comments_1/movies/movies_batch47.pkl


Processing movies comments: 48677253it [27:43, 41113.49it/s]


Reached 1000000 comments, saving batch 48...
Saved 1000000 comments to processed_comments_1/movies/movies_batch48.pkl


Processing movies comments: 49690648it [28:19, 35026.78it/s]


Reached 1000000 comments, saving batch 49...
Saved 1000000 comments to processed_comments_1/movies/movies_batch49.pkl


Processing movies comments: 50704747it [28:55, 36522.56it/s]


Reached 1000000 comments, saving batch 50...


Processing movies comments: 50712170it [28:58, 6413.74it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch50.pkl


Processing movies comments: 51725065it [29:31, 34991.08it/s]


Reached 1000000 comments, saving batch 51...


Processing movies comments: 51731438it [29:34, 5703.60it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch51.pkl


Processing movies comments: 52740603it [30:09, 35830.84it/s]


Reached 1000000 comments, saving batch 52...


Processing movies comments: 52747078it [30:11, 5672.51it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch52.pkl


Processing movies comments: 53753577it [30:44, 32767.32it/s]


Reached 1000000 comments, saving batch 53...


Processing movies comments: 53760023it [30:46, 5671.62it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch53.pkl


Processing movies comments: 54771698it [31:20, 33893.52it/s]


Reached 1000000 comments, saving batch 54...
Saved 1000000 comments to processed_comments_1/movies/movies_batch54.pkl


Processing movies comments: 55785089it [31:56, 35810.71it/s]


Reached 1000000 comments, saving batch 55...
Saved 1000000 comments to processed_comments_1/movies/movies_batch55.pkl


Processing movies comments: 56804234it [32:32, 31957.53it/s]


Reached 1000000 comments, saving batch 56...


Processing movies comments: 56810471it [32:35, 5497.88it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch56.pkl


Processing movies comments: 57817508it [33:08, 37162.20it/s]


Reached 1000000 comments, saving batch 57...


Processing movies comments: 57824185it [33:10, 5924.66it/s] 

Saved 1000000 comments to processed_comments_1/movies/movies_batch57.pkl


Processing movies comments: 58828035it [33:45, 32747.90it/s]


Reached 1000000 comments, saving batch 58...
Saved 1000000 comments to processed_comments_1/movies/movies_batch58.pkl


Processing movies comments: 59838973it [34:22, 33811.02it/s]


Reached 1000000 comments, saving batch 59...
Saved 1000000 comments to processed_comments_1/movies/movies_batch59.pkl


Processing movies comments: 60288326it [34:39, 28990.34it/s]



Saving remaining 442515 comments...
Saved 442515 comments to processed_comments_1/movies/movies_batch60.pkl

Completed processing movies comments!
Total comments saved: 59442515

Processing subreddit: technology
Processing file: datasets/technology_comments.zst


Processing technology comments: 1012550it [00:36, 29156.37it/s]


Reached 1000000 comments, saving batch 1...


Processing technology comments: 1015836it [00:39, 3971.61it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch1.pkl


Processing technology comments: 2022356it [01:14, 32589.42it/s]


Reached 1000000 comments, saving batch 2...
Saved 1000000 comments to processed_comments_1/technology/technology_batch2.pkl


Processing technology comments: 3031913it [01:54, 31737.01it/s]


Reached 1000000 comments, saving batch 3...
Saved 1000000 comments to processed_comments_1/technology/technology_batch3.pkl


Processing technology comments: 4045927it [02:34, 31002.06it/s]


Reached 1000000 comments, saving batch 4...
Saved 1000000 comments to processed_comments_1/technology/technology_batch4.pkl


Processing technology comments: 5055807it [03:13, 30452.36it/s]


Reached 1000000 comments, saving batch 5...
Saved 1000000 comments to processed_comments_1/technology/technology_batch5.pkl


Processing technology comments: 6068408it [03:52, 34352.30it/s]


Reached 1000000 comments, saving batch 6...
Saved 1000000 comments to processed_comments_1/technology/technology_batch6.pkl


Processing technology comments: 7087846it [04:32, 34568.59it/s]


Reached 1000000 comments, saving batch 7...


Processing technology comments: 7091316it [04:34, 4072.13it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch7.pkl


Processing technology comments: 8114118it [05:11, 33700.09it/s]


Reached 1000000 comments, saving batch 8...
Saved 1000000 comments to processed_comments_1/technology/technology_batch8.pkl


Processing technology comments: 9137053it [05:50, 33056.08it/s]


Reached 1000000 comments, saving batch 9...
Saved 1000000 comments to processed_comments_1/technology/technology_batch9.pkl


Processing technology comments: 10166097it [06:30, 32630.54it/s]


Reached 1000000 comments, saving batch 10...


Processing technology comments: 10171887it [06:33, 5162.81it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch10.pkl


Processing technology comments: 11198110it [07:09, 30696.95it/s]


Reached 1000000 comments, saving batch 11...
Saved 1000000 comments to processed_comments_1/technology/technology_batch11.pkl


Processing technology comments: 12233125it [07:49, 34375.10it/s]


Reached 1000000 comments, saving batch 12...
Saved 1000000 comments to processed_comments_1/technology/technology_batch12.pkl


Processing technology comments: 13272390it [08:29, 30584.44it/s]


Reached 1000000 comments, saving batch 13...


Processing technology comments: 13278209it [08:32, 4790.86it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch13.pkl


Processing technology comments: 14305791it [09:09, 29561.47it/s]


Reached 1000000 comments, saving batch 14...


Processing technology comments: 14311819it [09:12, 5004.55it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch14.pkl


Processing technology comments: 15337395it [09:48, 33072.23it/s]


Reached 1000000 comments, saving batch 15...


Processing technology comments: 15340733it [09:51, 4094.14it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch15.pkl


Processing technology comments: 16362010it [10:29, 32309.72it/s]


Reached 1000000 comments, saving batch 16...
Saved 1000000 comments to processed_comments_1/technology/technology_batch16.pkl


Processing technology comments: 17385026it [11:08, 33252.87it/s]


Reached 1000000 comments, saving batch 17...
Saved 1000000 comments to processed_comments_1/technology/technology_batch17.pkl


Processing technology comments: 18405364it [11:47, 33702.18it/s]


Reached 1000000 comments, saving batch 18...


Processing technology comments: 18411886it [11:50, 5488.49it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch18.pkl


Processing technology comments: 19424847it [12:26, 31876.47it/s]


Reached 1000000 comments, saving batch 19...
Saved 1000000 comments to processed_comments_1/technology/technology_batch19.pkl


Processing technology comments: 20441680it [13:06, 31215.48it/s]


Reached 1000000 comments, saving batch 20...
Saved 1000000 comments to processed_comments_1/technology/technology_batch20.pkl


Processing technology comments: 21457120it [13:45, 7758.26it/s] 


Reached 1000000 comments, saving batch 21...


Processing technology comments: 21462313it [13:48, 3943.04it/s]

Saved 1000000 comments to processed_comments_1/technology/technology_batch21.pkl


Processing technology comments: 22468414it [14:24, 33489.64it/s]


Reached 1000000 comments, saving batch 22...


Processing technology comments: 22474184it [14:26, 5340.71it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch22.pkl


Processing technology comments: 23481139it [15:04, 29570.71it/s]


Reached 1000000 comments, saving batch 23...
Saved 1000000 comments to processed_comments_1/technology/technology_batch23.pkl


Processing technology comments: 24491783it [15:45, 35630.74it/s]


Reached 1000000 comments, saving batch 24...


Processing technology comments: 24495363it [15:47, 4603.22it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch24.pkl


Processing technology comments: 25503160it [16:27, 7962.80it/s] 


Reached 1000000 comments, saving batch 25...


Processing technology comments: 25510441it [16:30, 4660.34it/s]

Saved 1000000 comments to processed_comments_1/technology/technology_batch25.pkl


Processing technology comments: 26513690it [17:07, 30267.02it/s]


Reached 1000000 comments, saving batch 26...


Processing technology comments: 26519166it [17:10, 4645.21it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch26.pkl


Processing technology comments: 27525737it [17:47, 32975.84it/s]


Reached 1000000 comments, saving batch 27...


Processing technology comments: 27532018it [17:50, 5449.74it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch27.pkl


Processing technology comments: 28539117it [18:28, 32626.73it/s]


Reached 1000000 comments, saving batch 28...


Processing technology comments: 28545625it [18:30, 5288.50it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch28.pkl


Processing technology comments: 29547986it [19:09, 9002.55it/s] 


Reached 1000000 comments, saving batch 29...
Saved 1000000 comments to processed_comments_1/technology/technology_batch29.pkl


Processing technology comments: 30557505it [19:47, 30970.10it/s]


Reached 1000000 comments, saving batch 30...


Processing technology comments: 30563464it [19:49, 5288.67it/s] 

Saved 1000000 comments to processed_comments_1/technology/technology_batch30.pkl


Processing technology comments: 30804312it [19:58, 25701.02it/s]



Saving remaining 241869 comments...
Saved 241869 comments to processed_comments_1/technology/technology_batch31.pkl

Completed processing technology comments!
Total comments saved: 30241869

Processing subreddit: personalfinance
Processing file: datasets/personalfinance_comments.zst


Processing personalfinance comments: 1005286it [00:48, 24937.75it/s]


Reached 1000000 comments, saving batch 1...
Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch1.pkl


Processing personalfinance comments: 2034671it [01:37, 25375.03it/s]


Reached 1000000 comments, saving batch 2...


Processing personalfinance comments: 2039024it [01:40, 3217.71it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch2.pkl


Processing personalfinance comments: 3066504it [02:26, 26574.88it/s]


Reached 1000000 comments, saving batch 3...
Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch3.pkl


Processing personalfinance comments: 4102805it [03:13, 28404.05it/s]


Reached 1000000 comments, saving batch 4...


Processing personalfinance comments: 4107692it [03:16, 4021.02it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch4.pkl


Processing personalfinance comments: 5137798it [03:59, 28451.16it/s]


Reached 1000000 comments, saving batch 5...


Processing personalfinance comments: 5143465it [04:02, 4393.83it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch5.pkl


Processing personalfinance comments: 6173597it [04:45, 26414.16it/s]


Reached 1000000 comments, saving batch 6...


Processing personalfinance comments: 6179068it [04:48, 4323.65it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch6.pkl


Processing personalfinance comments: 7205770it [05:30, 26833.68it/s]


Reached 1000000 comments, saving batch 7...
Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch7.pkl


Processing personalfinance comments: 8235248it [06:15, 26543.53it/s]


Reached 1000000 comments, saving batch 8...


Processing personalfinance comments: 8240144it [06:18, 4001.94it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch8.pkl


Processing personalfinance comments: 9265606it [07:02, 27393.76it/s]


Reached 1000000 comments, saving batch 9...
Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch9.pkl


Processing personalfinance comments: 10298351it [07:48, 28436.04it/s]


Reached 1000000 comments, saving batch 10...


Processing personalfinance comments: 10301204it [07:51, 3293.09it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch10.pkl


Processing personalfinance comments: 11329109it [1:48:34, 25499.79it/s]


Reached 1000000 comments, saving batch 11...
Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch11.pkl


Processing personalfinance comments: 12364453it [1:49:19, 27721.49it/s]


Reached 1000000 comments, saving batch 12...
Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch12.pkl


Processing personalfinance comments: 13407899it [1:50:05, 26849.13it/s]


Reached 1000000 comments, saving batch 13...


Processing personalfinance comments: 13410589it [1:50:08, 3406.59it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch13.pkl


Processing personalfinance comments: 14442134it [1:50:51, 27489.42it/s]


Reached 1000000 comments, saving batch 14...
Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch14.pkl


Processing personalfinance comments: 15470830it [1:51:39, 24994.30it/s]


Reached 1000000 comments, saving batch 15...


Processing personalfinance comments: 15475176it [1:51:42, 3810.92it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch15.pkl


Processing personalfinance comments: 16496018it [1:52:27, 25797.08it/s]


Reached 1000000 comments, saving batch 16...


Processing personalfinance comments: 16501521it [1:52:30, 4298.30it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch16.pkl


Processing personalfinance comments: 17521098it [1:53:15, 24767.56it/s]


Reached 1000000 comments, saving batch 17...
Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch17.pkl


Processing personalfinance comments: 18547880it [1:54:03, 24446.86it/s]


Reached 1000000 comments, saving batch 18...


Processing personalfinance comments: 18550328it [1:54:06, 2892.41it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch18.pkl


Processing personalfinance comments: 19574654it [1:54:50, 25232.28it/s]


Reached 1000000 comments, saving batch 19...


Processing personalfinance comments: 19579665it [1:54:53, 4154.92it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch19.pkl


Processing personalfinance comments: 20610928it [1:55:37, 26726.35it/s]


Reached 1000000 comments, saving batch 20...
Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch20.pkl


Processing personalfinance comments: 21645457it [1:56:25, 26893.94it/s]


Reached 1000000 comments, saving batch 21...


Processing personalfinance comments: 21650227it [1:56:28, 3906.89it/s] 

Saved 1000000 comments to processed_comments_1/personalfinance/personalfinance_batch21.pkl


Processing personalfinance comments: 21709367it [1:56:30, 3105.45it/s] 



Saving remaining 59614 comments...
Saved 59614 comments to processed_comments_1/personalfinance/personalfinance_batch22.pkl

Completed processing personalfinance comments!
Total comments saved: 21059614
