In [2]:
# For streaming
import sys
# Fail fast on interpreters older than Python 3.10.
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
    raise RuntimeError("This script requires Python 3.10 or higher")
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog

# For processing
import gensim
from gensim.models import Word2Vec
import re
import html
import unicodedata
import logging
from tqdm import tqdm
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
    
import datetime
from collections import defaultdict

# NOTE(review): `recursive` is not read by any code in the visible cells — confirm it is still needed.
recursive = False

In [None]:
def preprocess_reddit_text(text):
    """Clean a raw Reddit comment body down to lowercase ASCII words.

    Steps, in order: decode HTML entities; NFKD-normalise and drop combining
    marks; remove markdown images; reduce markdown links to their label;
    remove bare URLs; strip bold/italic markers; remove subreddit and user
    mentions; remove time-of-day markers, day names, month names and time
    units; replace every run of non-letters with a space and lowercase;
    remove single letters other than 'i'.

    Args:
        text: Raw comment body (str).

    Returns:
        str: Lowercase words separated by spaces. The string may contain
        leading, trailing or repeated spaces; callers are expected to
        ``.split()`` it.
    """
    # Decode HTML entities (&amp; becomes &, etc.)
    text = html.unescape(text)

    # Decompose accented characters (é -> e + combining accent) and drop the
    # combining marks; otherwise the letters-only filter below would turn the
    # mark into a space and split the word ("naïve" -> "nai ve").
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))

    # Markdown images and links are handled BEFORE bare URLs: the URL pattern
    # is greedy up to the next whitespace and would otherwise swallow the
    # closing ")" of "[text](http://...)", leaving a dangling "[text](".
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)        # Images/GIFs are dropped entirely
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)    # [text](link) becomes text
    text = re.sub(r'http\S+', '', text)                # Remaining bare URLs

    # Markdown emphasis
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)       # **word** becomes word
    text = re.sub(r'\*(.*?)\*', r'\1', text)           # *word* becomes word

    # Subreddit and user mentions, with or without the leading slash.
    # The lookbehind keeps ordinary "word/word" text intact
    # ("for/against" must not lose "r/against").
    text = re.sub(r'(?<!\w)/?[ru]/\w+', '', text)

    # Time-of-day markers
    text = re.sub(r'\b(?:am|pm|AM|PM|a\.m\.|p\.m\.)\b', '', text)

    # Days of week - both full and abbreviated forms
    days_pattern = (
        r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|'
        r'Mon|Tue|Tues|Wed|Thu|Thurs|Fri|Sat|Sun)\b'
    )
    text = re.sub(days_pattern, '', text, flags=re.IGNORECASE)

    # Month names - both full and abbreviated forms
    months_pattern = (
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December|'
        r'Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\b'
    )
    text = re.sub(months_pattern, '', text, flags=re.IGNORECASE)

    # Time units that often appear in Reddit comments
    time_units = r'\b(?:second|minute|hour|day|week|month|year)s?\b'
    text = re.sub(time_units, '', text, flags=re.IGNORECASE)

    # Keep letters only; everything else becomes a separator
    text = re.sub("[^A-Za-z]+", ' ', text).lower()

    # Remove single letters except 'i'
    text = re.sub(r'\b([a-hj-z])\b', '', text, flags=re.IGNORECASE)

    return text

def build_word2vect_model(path, party, without_stopwords=True, phrases_min_count=5, word2vec_min_count=5,
                          batch_size=1000000, save_interim=True):
    """Stream a comment dump and train one Word2Vec model per time period.

    Each row is cleaned with ``preprocess_reddit_text``, POS-tagged and
    lemmatized, then accumulated per period. Whenever a period has collected
    ``batch_size`` comments, the batch is handed to ``process_batch`` and the
    model it returns is kept so that the next batch updates it.

    Rows are skipped when they lack ``body``, ``created_utc`` or ``author``,
    when the author is ``AutoModerator`` or ``election_info_bot``, when the
    year is after 2024, or when nothing is left after cleaning.

    Args:
        path: Path of the comment file, opened in binary mode and passed to
            ``getFileJsonStream``.
        party: Label used in the saved model file names.
        without_stopwords: If true, English stop words are dropped.
        phrases_min_count: Forwarded to ``process_batch``.
        word2vec_min_count: Forwarded to ``process_batch``.
        batch_size: Number of kept comments per period that triggers a batch.
        save_interim: Accepted for compatibility; not used by this function
            (``process_batch`` is called without it).

    Returns:
        None. Final models are written to
        ``models/model_v4/reddit_word2vec_<phrases>_<w2v>_<party>_<period>.model``.
    """
    print(f"Processing file {path}")
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # First letter of the Penn Treebank tag -> WordNet POS; anything else is treated as a noun
    pos_to_wordnet = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    # NOTE: "before_2016" also contains 2016 itself (see the year test below).
    periods = ("before_2016", "2017_2020", "2021_2024")

    # Per period: word -> set of authors who used it, and author -> list of token lists
    user_words = {period: defaultdict(set) for period in periods}
    user_comments = {period: defaultdict(list) for period in periods}
    counts = {period: 0 for period in periods}

    # Comments accumulated since the last batch, and the model trained so far
    batch_counts = {period: 0 for period in periods}
    models = {period: None for period in periods}

    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return

        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
            author = row["author"]
            if author in {"AutoModerator", "election_info_bot"}:
                continue

            text = row["body"]
            created_timestamp = row["created_utc"]
            # Interpret the timestamp as UTC so the period of a comment does
            # not depend on the time zone of the machine running the notebook.
            year = datetime.datetime.fromtimestamp(
                int(created_timestamp), tz=datetime.timezone.utc
            ).year

            if year <= 2016:
                chunk_key = "before_2016"
            elif 2017 <= year <= 2020:
                chunk_key = "2017_2020"
            elif 2021 <= year <= 2024:
                chunk_key = "2021_2024"
            else:
                continue

            text = preprocess_reddit_text(text)

            words = text.split()
            if not words:
                continue

            processed_words = []
            for word, tag in nltk.pos_tag(words):
                if without_stopwords and word in stop_words:
                    continue
                lemma = lemmatizer.lemmatize(word, pos=pos_to_wordnet.get(tag[:1], 'n'))
                processed_words.append(lemma)
                user_words[chunk_key][lemma].add(author)

            if not processed_words:
                continue

            user_comments[chunk_key][author].append(processed_words)
            counts[chunk_key] += 1
            batch_counts[chunk_key] += 1

            # Only the period that just received a comment can have reached the threshold
            if batch_counts[chunk_key] >= batch_size:
                print(f"\nReached {batch_size} comments for {chunk_key}, processing batch...")
                # Keep the returned model: without this assignment every batch
                # would start a new model and nothing would be saved at the end.
                models[chunk_key] = process_batch(
                    chunk_key, user_words[chunk_key], user_comments[chunk_key],
                    models[chunk_key], phrases_min_count, word2vec_min_count, party)

                # Clear batch data to free memory
                user_words[chunk_key] = defaultdict(set)
                user_comments[chunk_key] = defaultdict(list)
                batch_counts[chunk_key] = 0

    print("\n=== Final Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")

    # Process any remaining comments
    for period in periods:
        if batch_counts[period] > 0:
            print(f"\nProcessing final batch of {batch_counts[period]} comments for {period}...")
            models[period] = process_batch(
                period, user_words[period], user_comments[period],
                models[period], phrases_min_count, word2vec_min_count, party)

    # Save final models
    for period, model in models.items():
        if model is not None:
            final_path = f"models/model_v4/reddit_word2vec_{phrases_min_count}_{word2vec_min_count}_{party}_{period}.model"
            os.makedirs(os.path.dirname(final_path), exist_ok=True)
            model.save(final_path)
            print(f"Final model saved to {final_path}")


def process_batch(period, user_words_dict, user_comments_dict, existing_model,
                 phrases_min_count, word2vec_min_count, party,
                 min_users_per_word=3, save_interim=True):
    """Process a batch of comments and update the model incrementally.

    Words used by fewer than ``min_users_per_word`` distinct authors are
    removed from every comment, bigrams are detected with gensim ``Phrases``
    (NPMI scoring, threshold 0.7), and the result is used either to create a
    new Word2Vec model or to extend the vocabulary of ``existing_model`` and
    continue training it.

    Args:
        period: Period label, used in log messages and the interim file name.
        user_words_dict: Mapping word -> set of authors who used the word.
        user_comments_dict: Mapping author -> list of comments, each a list of words.
        existing_model: Word2Vec model from a previous batch, or None.
        phrases_min_count: ``min_count`` for ``Phrases``.
        word2vec_min_count: ``min_count`` for a newly created ``Word2Vec``.
        party: Label used in the interim file name.
        min_users_per_word: Minimum number of distinct authors a word needs
            in this batch to be kept (default 3, the previous fixed value).
        save_interim: If true (default), save the model after training under
            ``models/model_v4/interim/``.

    Returns:
        The trained/updated model, or ``existing_model`` unchanged when the
        batch contains nothing to train on.
    """
    # Filter words by user count
    valid_words = {w for w, users in user_words_dict.items() if len(users) >= min_users_per_word}
    filtered_comments = []

    for comments in user_comments_dict.values():
        for comment in comments:
            filtered = [w for w in comment if w in valid_words]
            if filtered:
                filtered_comments.append(filtered)

    print(f"{period}: Processing {len(filtered_comments)} comments after filtering words by user count")

    if not filtered_comments:
        print(f"No valid comments for {period} after filtering, skipping batch")
        return existing_model

    # Extract bigrams
    print("Extracting bigrams...")
    phrases = Phrases(filtered_comments,
                      min_count=phrases_min_count,
                      threshold=0.7,
                      scoring='npmi')
    bigram_model = Phraser(phrases)
    bigrammed_comments = [bigram_model[comment] for comment in filtered_comments]

    # Either create a new model or update existing one
    if existing_model is None:
        print(f"Creating new Word2Vec model for {period}")
        model = Word2Vec(
            vector_size=300,
            window=5,
            min_count=word2vec_min_count,
            workers=16
        )
        model.build_vocab(bigrammed_comments)
    else:
        print(f"Updating existing Word2Vec model for {period}")
        model = existing_model
        model.build_vocab(bigrammed_comments, update=True)

    # A small batch can leave no word above min_count; training would then
    # fail, so the batch is skipped and the previous state is returned.
    if len(model.wv.index_to_key) == 0:
        print(f"No vocabulary for {period} with min_count={word2vec_min_count}, skipping batch")
        return existing_model

    print(f"Training model on {len(bigrammed_comments)} comments")
    model.train(
        bigrammed_comments,
        total_examples=len(bigrammed_comments),
        epochs=5
    )

    if save_interim:
        # Save interim model
        interim_path = f"models/model_v4/interim/reddit_word2vec_{phrases_min_count}_{word2vec_min_count}_{party}_{period}_interim.model"
        os.makedirs(os.path.dirname(interim_path), exist_ok=True)
        model.save(interim_path)
        print(f"Interim model saved to {interim_path}")

    return model


def main():
    """Create the model output directories and train models for the selected datasets."""
    for out_dir in ("models/model_v4", "models/model_v4/interim"):
        os.makedirs(out_dir, exist_ok=True)

    # Party label -> compressed comment dump
    dataset_paths = {
        "democrats": r"datasets/democrats_comments.zst",
        "conservatives": r"datasets/Conservative_comments.zst",
        "republican": r"datasets/Republican_comments.zst",
        "backpacking": r"datasets/backpacking_comments.zst",
        "vagabond": r"datasets/vagabond_comments.zst",
        "liberal": r"datasets/Liberal_comments.zst",
    }

    # Add labels from dataset_paths here to train on more datasets
    selected_parties = ["liberal"]

    # One million comments per batch
    comments_per_batch = 1000000

    for party in selected_parties:
        build_word2vect_model(dataset_paths[party], party,
                              phrases_min_count=10, word2vec_min_count=10,
                              batch_size=comments_per_batch)

    print("Done :>")
    
    
# Run the pipeline when this cell (or an exported script) executes as the main module.
if __name__ == "__main__":
    main()

Processing file datasets/Liberal_comments.zst


Processing comments: 497079it [04:02, 2045.93it/s]



=== Final Comment Counts by Period ===
before_2016: 131482 comments
2017_2020: 145788 comments
2021_2024: 213308 comments

Processing final batch of 131482 comments for before_2016...
before_2016: Processing 131273 comments after filtering words by user count
Extracting bigrams...
Creating new Word2Vec model for before_2016
Training model on 131273 comments
Interim model saved to models/model_v4/interim/reddit_word2vec_10_10_liberal_before_2016_interim.model

Processing final batch of 145788 comments for 2017_2020...
2017_2020: Processing 145477 comments after filtering words by user count
Extracting bigrams...
Creating new Word2Vec model for 2017_2020
Training model on 145477 comments
Interim model saved to models/model_v4/interim/reddit_word2vec_10_10_liberal_2017_2020_interim.model

Processing final batch of 213308 comments for 2021_2024...
2021_2024: Processing 212844 comments after filtering words by user count
Extracting bigrams...
Creating new Word2Vec model for 2021_2024
Train

In [None]:
def preprocess_reddit_text(text, lemmatize=True, without_stopwords=True):
    """Clean a raw Reddit comment body and optionally lemmatize it.

    Cleaning, in order: decode HTML entities; NFKD-normalise and drop
    combining marks; remove markdown images; reduce markdown links to their
    label; remove bare URLs; strip bold/italic markers; remove subreddit and
    user mentions; remove time-of-day markers, day names, month names and
    time units; replace every run of non-letters with a space and lowercase;
    remove single letters other than 'i'.

    Args:
        text: Raw comment body (str).
        lemmatize: If true, POS-tag the words and lemmatize them with WordNet.
        without_stopwords: If true, drop English stop words. This is only
            applied together with lemmatization; with ``lemmatize=False``
            the flag has no effect.

    Returns:
        str: With lemmatization, the lemmas joined by single spaces. Without
        it (or when no words are left), the cleaned text, which may contain
        leading, trailing or repeated spaces.
    """
    # Decode HTML entities (&amp; becomes &, etc.)
    text = html.unescape(text)

    # Decompose accented characters (é -> e + combining accent) and drop the
    # combining marks; otherwise the letters-only filter below would turn the
    # mark into a space and split the word ("naïve" -> "nai ve").
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))

    # Markdown images and links are handled BEFORE bare URLs: the URL pattern
    # is greedy up to the next whitespace and would otherwise swallow the
    # closing ")" of "[text](http://...)", leaving a dangling "[text](".
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)        # Images/GIFs are dropped entirely
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)    # [text](link) becomes text
    text = re.sub(r'http\S+', '', text)                # Remaining bare URLs

    # Markdown emphasis
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)       # **word** becomes word
    text = re.sub(r'\*(.*?)\*', r'\1', text)           # *word* becomes word

    # Subreddit and user mentions, with or without the leading slash.
    # The lookbehind keeps ordinary "word/word" text intact
    # ("for/against" must not lose "r/against").
    text = re.sub(r'(?<!\w)/?[ru]/\w+', '', text)

    # Time-of-day markers
    text = re.sub(r'\b(?:am|pm|AM|PM|a\.m\.|p\.m\.)\b', '', text)

    # Days of week - both full and abbreviated forms
    days_pattern = (
        r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|'
        r'Mon|Tue|Tues|Wed|Thu|Thurs|Fri|Sat|Sun)\b'
    )
    text = re.sub(days_pattern, '', text, flags=re.IGNORECASE)

    # Month names - both full and abbreviated forms
    months_pattern = (
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December|'
        r'Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\b'
    )
    text = re.sub(months_pattern, '', text, flags=re.IGNORECASE)

    # Time units that often appear in Reddit comments
    time_units = r'\b(?:second|minute|hour|day|week|month|year)s?\b'
    text = re.sub(time_units, '', text, flags=re.IGNORECASE)

    # Keep letters only; everything else becomes a separator
    text = re.sub("[^A-Za-z]+", ' ', text).lower()

    # Remove single letters except 'i'
    text = re.sub(r'\b([a-hj-z])\b', '', text, flags=re.IGNORECASE)

    if not lemmatize:
        return text

    words = text.split()
    if not words:
        return text

    return ' '.join(_lemmatize_words(words, without_stopwords))


# Shared NLTK resources, created on first use instead of once per comment
_NLP_RESOURCES = {}

# First letter of the Penn Treebank tag -> WordNet POS
# (adjective, verb, noun, adverb); anything else is treated as a noun
_POS_PREFIX_TO_WORDNET = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}


def _cached_stop_words():
    """Return the English stop-word set, loading it on first use."""
    if 'stop_words' not in _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _NLP_RESOURCES['stop_words']


def _cached_lemmatizer():
    """Return the shared WordNet lemmatizer, creating it on first use."""
    if 'lemmatizer' not in _NLP_RESOURCES:
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['lemmatizer']


def _lemmatize_words(words, without_stopwords):
    """POS-tag ``words`` and return their lemmas, optionally skipping stop words."""
    stop_words = _cached_stop_words() if without_stopwords else frozenset()
    lemmatizer = _cached_lemmatizer()

    lemmas = []
    for word, tag in nltk.pos_tag(words):
        if word in stop_words:
            continue
        wordnet_pos = _POS_PREFIX_TO_WORDNET.get(tag[:1], 'n')
        lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmas

In [None]:
def preprocess_and_store_comments(path, party, without_stopwords=True, min_users_per_word=3, batch_size=1000000):
    """
    Preprocess comments and store the results batch by batch for later use.

    Each row is cleaned and lemmatized with ``preprocess_reddit_text`` and
    accumulated per time period. Whenever a period has collected
    ``batch_size`` comments, the batch is handed to ``_process_and_save_batch``
    and the period's buffers are reset.

    NOTE(review): ``_process_and_save_batch`` is not defined in the visible
    cells; it is presumably responsible for filtering by ``min_users_per_word``
    and writing the batch to disk — verify. A later cell also defines a
    function with this same name and a different signature, which replaces
    this one when the notebook is run top to bottom.

    Args:
        path: Path of the comment data file.
        party: Subreddit label.
        without_stopwords: Whether stop words are removed.
        min_users_per_word: Minimum number of users that must have used a
            word; forwarded to ``_process_and_save_batch``.
        batch_size: Number of kept comments per period in one batch.

    Returns:
        None.
    """
    print(f"处理文件 {path}")
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    # Period labels. NOTE: "before_2016" also contains 2016 itself.
    periods = ["before_2016", "2017_2020", "2021_2024"]

    # Per-period counters: comments in the current batch, comments overall, batch index
    batch_counts = {period: 0 for period in periods}
    total_counts = {period: 0 for period in periods}
    batch_number = {period: 1 for period in periods}

    # Output directory
    preprocessed_dir = "preprocessed_data"
    os.makedirs(preprocessed_dir, exist_ok=True)

    print(f"开始预处理来自 {path} 的评论...")
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"跳过未知文件 {path}")
            return

        # Buffers for the current batch of each period:
        # word -> set of authors, and author -> list of token lists
        user_words = {period: defaultdict(set) for period in periods}
        user_comments = {period: defaultdict(list) for period in periods}

        for row in tqdm(jsonStream, desc="处理评论"):
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
            author = row["author"]
            if author in {"AutoModerator", "election_info_bot"}:
                continue

            text = row["body"]
            created_timestamp = row["created_utc"]
            # Interpret the timestamp as UTC so the period of a comment does
            # not depend on the time zone of the machine running the notebook.
            year = datetime.datetime.fromtimestamp(
                int(created_timestamp), tz=datetime.timezone.utc
            ).year

            # Determine the period
            if year <= 2016:
                period = "before_2016"
            elif 2017 <= year <= 2020:
                period = "2017_2020"
            elif 2021 <= year <= 2024:
                period = "2021_2024"
            else:
                continue

            # Cleaning and lemmatization are both done by preprocess_reddit_text
            processed_text = preprocess_reddit_text(text, lemmatize=True, without_stopwords=without_stopwords)
            processed_words = processed_text.split()

            if not processed_words:
                continue

            # Track which authors used which word
            for word in processed_words:
                user_words[period][word].add(author)

            # Keep the processed comment
            user_comments[period][author].append(processed_words)
            batch_counts[period] += 1
            total_counts[period] += 1

            # Only the period that just received a comment can have reached the threshold
            if batch_counts[period] >= batch_size:
                print(f"\n达到 {batch_size} 条 {period} 评论，处理批次 {batch_number[period]}...")

                # Process and save this batch
                _process_and_save_batch(
                    period, user_words[period], user_comments[period],
                    party, batch_number[period], min_users_per_word, preprocessed_dir
                )

                # Reset the batch buffers
                user_words[period] = defaultdict(set)
                user_comments[period] = defaultdict(list)
                batch_counts[period] = 0
                batch_number[period] += 1

    # Report totals, then flush whatever is left in each period's buffer
    print("\n=== 各时间段评论总数 ===")
    for period, count in total_counts.items():
        print(f"{period}: {count} 条评论")

    for period in periods:
        if batch_counts[period] > 0:
            print(f"\n处理剩余的 {batch_counts[period]} 条 {period} 评论...")
            _process_and_save_batch(
                period, user_words[period], user_comments[period],
                party, batch_number[period], min_users_per_word, preprocessed_dir
            )

    print(f"预处理完成! 所有批次都已保存到 {preprocessed_dir} 目录")

In [None]:
def preprocess_and_store_comments(path, party, without_stopwords=True, min_users_per_word=3):
    """Process comments once and save results for future use.

    Streams the comment file, cleans, POS-tags and lemmatizes every comment,
    groups the results by time period, removes words used by fewer than
    ``min_users_per_word`` distinct authors, and pickles the remaining
    comments to ``preprocessed_data/<party>_<period>_filtered.pkl``.

    NOTE(review): an earlier cell defines a function with this same name and
    an extra ``batch_size`` parameter; this definition replaces it when the
    notebook is run top to bottom.

    Args:
        path: Path of the comment file, opened in binary mode and passed to
            ``getFileJsonStream``.
        party: Label used in the output file names.
        without_stopwords: If true, English stop words are dropped.
        min_users_per_word: Minimum number of distinct authors a word needs
            within a period to be kept.

    Returns:
        dict mapping period label -> list of comments (each a list of words),
        or None when ``getFileJsonStream`` does not recognise the file.
    """
    import pickle

    print(f"Processing file {path}")
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # First letter of the Penn Treebank tag -> WordNet POS; anything else is treated as a noun
    pos_to_wordnet = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    # For each period, track comments and user-word usage.
    # NOTE: "before_2016" also contains 2016 itself (see the year test below).
    periods = ["before_2016", "2017_2020", "2021_2024"]
    user_words = {period: defaultdict(set) for period in periods}
    user_comments = {period: defaultdict(list) for period in periods}
    counts = {period: 0 for period in periods}

    print(f"Preprocessing comments from {path}...")
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return

        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
            author = row["author"]
            if author in {"AutoModerator", "election_info_bot"}:
                continue

            text = row["body"]
            created_timestamp = row["created_utc"]
            # Interpret the timestamp as UTC so the period of a comment does
            # not depend on the time zone of the machine running the notebook.
            year = datetime.datetime.fromtimestamp(
                int(created_timestamp), tz=datetime.timezone.utc
            ).year

            if year <= 2016:
                period = "before_2016"
            elif 2017 <= year <= 2020:
                period = "2017_2020"
            elif 2021 <= year <= 2024:
                period = "2021_2024"
            else:
                continue

            # NOTE(review): the notebook defines preprocess_reddit_text twice.
            # If the variant with lemmatize=True as default is the one in
            # scope, the text arrives here already lemmatized and without
            # stop words, and is then tagged and lemmatized again below —
            # confirm which variant is intended.
            text = preprocess_reddit_text(text)
            words = text.split()
            if not words:
                continue

            processed_words = []
            for word, tag in nltk.pos_tag(words):
                if without_stopwords and word in stop_words:
                    continue
                lemma = lemmatizer.lemmatize(word, pos=pos_to_wordnet.get(tag[:1], 'n'))
                processed_words.append(lemma)
                user_words[period][lemma].add(author)

            if processed_words:
                user_comments[period][author].append(processed_words)
                counts[period] += 1

    print("\n=== Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")

    # Filter words by user count and rebuild comments for each period
    filtered_data = {}
    for period in periods:
        print(f"\nFiltering {period} comments by user count (min {min_users_per_word})...")
        valid_words = {w for w, users in user_words[period].items() if len(users) >= min_users_per_word}
        filtered_comments = []

        for comments in user_comments[period].values():
            for comment in comments:
                filtered = [w for w in comment if w in valid_words]
                if filtered:
                    filtered_comments.append(filtered)

        print(f"{period}: {len(filtered_comments)} comments after filtering words by user count")
        filtered_data[period] = filtered_comments

    # Create directory for preprocessed data
    preprocessed_dir = "preprocessed_data"
    os.makedirs(preprocessed_dir, exist_ok=True)

    # Save filtered comments to disk
    for period, comments in filtered_data.items():
        output_path = f"{preprocessed_dir}/{party}_{period}_filtered.pkl"
        print(f"Saving {len(comments)} filtered comments to {output_path}")

        with open(output_path, 'wb') as f:
            pickle.dump(comments, f)

    print(f"Preprocessing completed for {party}!")
    return filtered_data


def train_model_from_preprocessed(party, period, phrases_min_count=5, word2vec_min_count=5):
    """Train a Word2Vec model using preprocessed comments.

    Loads ``preprocessed_data/<party>_<period>_filtered.pkl``, detects bigrams
    with gensim ``Phrases`` (NPMI scoring, threshold 0.7), trains a new
    300-dimensional Word2Vec model for 5 epochs and saves it to
    ``models/model_v4/reddit_word2vec_<phrases>_<w2v>_<party>_<period>.model``.

    Args:
        party: Label of the dataset, as used in the preprocessed file name.
        period: Period label, as used in the preprocessed file name.
        phrases_min_count: ``min_count`` for ``Phrases``.
        word2vec_min_count: ``min_count`` for ``Word2Vec``.

    Returns:
        The trained model, or None when the preprocessed file is missing or
        no word reaches ``word2vec_min_count``.
    """
    import pickle

    preprocessed_path = f"preprocessed_data/{party}_{period}_filtered.pkl"

    if not os.path.exists(preprocessed_path):
        print(f"Error: Preprocessed file {preprocessed_path} not found.")
        return None

    # Load preprocessed comments.
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were written by this notebook's own preprocessing step.
    print(f"Loading preprocessed comments from {preprocessed_path}")
    with open(preprocessed_path, 'rb') as f:
        filtered_comments = pickle.load(f)

    print(f"Loaded {len(filtered_comments)} preprocessed comments")

    # Extract bigrams
    print("Extracting bigrams...")
    phrases = Phrases(filtered_comments,
                      min_count=phrases_min_count,
                      threshold=0.7,
                      scoring='npmi')
    bigram_model = Phraser(phrases)
    bigrammed_comments = [bigram_model[comment] for comment in filtered_comments]

    # Create and train model
    print("Creating new Word2Vec model")
    model = Word2Vec(
        vector_size=300,
        window=5,
        min_count=word2vec_min_count,
        workers=16
    )
    model.build_vocab(bigrammed_comments)
    vocabulary_size = len(model.wv.index_to_key)
    print(f"Vocabulary size: {vocabulary_size}")

    # Training fails on an empty vocabulary; report it and stop instead
    if vocabulary_size == 0:
        print(f"Error: no word reaches min_count={word2vec_min_count} for {party} {period}, nothing to train.")
        return None

    print(f"Training model on {len(bigrammed_comments)} comments")
    model.train(
        bigrammed_comments,
        total_examples=len(bigrammed_comments),
        epochs=5
    )

    # Save model
    model_dir = "models/model_v4"
    os.makedirs(model_dir, exist_ok=True)
    model_path = f"{model_dir}/reddit_word2vec_{phrases_min_count}_{word2vec_min_count}_{party}_{period}.model"
    model.save(model_path)
    print(f"Model saved to {model_path}")

    return model


def main():
    """Preprocess the political datasets once, then train the democrats models per period."""
    # Party label -> compressed comment dump
    dataset_paths = {
        "democrats": r"datasets/democrats_comments.zst",
        "republican": r"datasets/Republican_comments.zst",
        "backpacking": r"datasets/backpacking_comments.zst",
        "vagabond": r"datasets/vagabond_comments.zst",
        "conservatives": r"datasets/Conservative_comments.zst",
        "liberal": r"datasets/Liberal_comments.zst",
    }

    # Step 1: preprocess once per dataset and save the results.
    # Set to False if preprocessed data already exists.
    preprocess = True
    if preprocess:
        print("Starting preprocessing pipeline...")
        for party in ("democrats", "republican", "conservatives", "liberal"):
            preprocess_and_store_comments(dataset_paths[party], party, min_users_per_word=3)

    # Step 2: train from the preprocessed data. Parameters can be changed and
    # this step re-run (or looped over several min_count values) without
    # repeating the preprocessing.
    print("\n\nTraining models using preprocessed data:")
    for period in ("before_2016", "2017_2020", "2021_2024"):
        train_model_from_preprocessed("democrats", period, phrases_min_count=10, word2vec_min_count=10)

    print("Done :>")

In [None]:
import pickle

def inspect_pkl_file(file_path, num_examples=5):
    """
    Print a short summary of the object stored in a pickle file.

    The type of the object is always printed. For a list, its length and the
    first ``num_examples`` items follow; dict items are printed key by key,
    with a "processed_text" list longer than ten entries cut to its first
    ten. Any other object is printed as a whole.

    Args:
        file_path: Path to the pickle file
        num_examples: Number of examples to show (default 5)
    """
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)

    print(f"Data type: {type(payload)}")

    # Anything that is not a list is shown in one piece
    if not isinstance(payload, list):
        print("Data is not a list. Structure:", payload)
        return

    total = len(payload)
    print(f"Number of items: {total}")

    print(f"\nShowing first {min(num_examples, total)} examples:")
    for position, record in enumerate(payload[:num_examples], start=1):
        print(f"\nExample {position}:")

        if not isinstance(record, dict):
            print(record)
            continue

        for field, content in record.items():
            # Long token lists are abbreviated to keep the output readable
            is_long_token_list = (
                field == "processed_text"
                and isinstance(content, list)
                and len(content) > 10
            )
            if is_long_token_list:
                print(f"  {field}: {content[:10]} ... (total: {len(content)} words)")
            else:
                print(f"  {field}: {content}")

# Example usage
# NOTE(review): this relative path differs from the "preprocessed_data" directory the
# preprocessing cells write to — confirm the file exists before running this cell.
inspect_pkl_file("processed_comments/democrats/democrats_batch1.pkl")

Data type: <class 'list'>
Number of items: 1000000

Showing first 5 examples:

Example 1:
  comment_id: c07p2u0
  author: Garak
  date: 2009-02-16
  timestamp: 1234791099
  processed_text: ['allow', 'legend', 'grow', 'ill', 'mythical', 'proportion', 'lie', 'fund', 'acorn', 'nowhere'] ... (total: 49 words)

Example 2:
  comment_id: c0883zo
  author: Garak
  date: 2009-03-13
  timestamp: 1236991154
  processed_text: ['sadden', 'read', 'time', 'yesterday', 'water', 'bill', 'maher', 'recently', 'seem', 'pretty'] ... (total: 15 words)

Example 3:
  comment_id: c091f56
  author: [deleted]
  date: 2009-04-22
  timestamp: 1240434783
  processed_text: ['speaker', 'pelosi', 'culture', 'corruption', 'washington', 'party', 'bad', 'republican']

Example 4:
  comment_id: c095pfx
  author: [deleted]
  date: 2009-04-27
  timestamp: 1240881225
  processed_text: ['congresswoman', 'nancy', 'pelosi', 'call', 'washington', 'culture', 'corruption', 'house', 'speaker', 'nancy'] ... (total: 13 words)

Example