In [1]:
import os
import pickle
import datetime
import glob
import numpy as np
import random
from gensim.models.phrases import Phraser
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

def get_date_from_comment(comment):
    """Extract a ``datetime.date`` from a comment dictionary.

    The ``"date"`` field (``YYYY-MM-DD`` string) is tried first; if it is
    missing or unusable, the ``"timestamp"`` field (epoch seconds) is used.

    Args:
        comment: Mapping that may contain ``"date"`` and/or ``"timestamp"``.

    Returns:
        The parsed ``datetime.date``, or ``None`` when neither field yields
        a valid date.
    """
    try:
        return datetime.datetime.strptime(comment["date"], "%Y-%m-%d").date()
    except (KeyError, TypeError, ValueError):
        # TypeError covers a non-string "date" value (e.g. None).
        pass

    try:
        # NOTE: fromtimestamp() without tz converts to the machine's local time.
        return datetime.datetime.fromtimestamp(int(comment["timestamp"])).date()
    except (KeyError, TypeError, ValueError, OverflowError, OSError):
        # TypeError: None/unsupported value; OverflowError/OSError: timestamp
        # outside the range the platform can represent.
        return None

def get_period(date):
    """Map a date to the label of the time period it falls in.

    Returns ``"before_2016"`` for years up to and including 2016,
    ``"2017_2020"`` or ``"2021_2024"`` for the matching ranges, and ``None``
    when ``date`` is ``None`` or the year is after 2024.
    """
    if date is None:
        return None

    year = date.year
    if year <= 2016:
        return "before_2016"

    bounded_periods = (
        (2017, 2020, "2017_2020"),
        (2021, 2024, "2021_2024"),
    )
    for first_year, last_year, label in bounded_periods:
        if first_year <= year <= last_year:
            return label
    return None

def build_bigram_model(comments):
    """Fit a gensim phrase (bigram) detector on the comments' token lists.

    Only comments that carry a ``"processed_text"`` entry contribute; the
    fitted ``Phrases`` object is wrapped in a ``Phraser`` before returning.
    """
    token_lists = [
        entry["processed_text"] for entry in comments if "processed_text" in entry
    ]
    detector = Phrases(token_lists, min_count=10, threshold=10)
    return Phraser(detector)

def apply_bigrams(comments, bigram_model):
    """Run every comment's ``"processed_text"`` through ``bigram_model``.

    Comments without a ``"processed_text"`` entry are skipped, so the
    returned list may be shorter than ``comments``.
    """
    return [
        bigram_model[entry["processed_text"]]
        for entry in comments
        if "processed_text" in entry
    ]

def create_or_update_model(period, comments, vector_size, window, min_count, workers, sg, epochs, existing_model=None):
    """Train a Word2Vec model on ``comments``, creating it first if needed.

    When ``existing_model`` is ``None`` a fresh model is built from the given
    hyper-parameters and its vocabulary is built from ``comments``; otherwise
    the existing model's vocabulary is extended with ``update=True``. In both
    cases the vocabulary size is printed and the model is then trained for
    ``epochs`` passes over ``comments``. The trained model is returned.
    """
    starting_fresh = existing_model is None

    if starting_fresh:
        model = Word2Vec(
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            workers=workers,
            sg=sg,
            seed=23
        )
        model.build_vocab(comments)
    else:
        model = existing_model
        model.build_vocab(comments, update=True)

    vocabulary_size = len(model.wv.index_to_key)
    print(f"{period} vocabulary size: {vocabulary_size}")

    model.train(comments, total_examples=len(comments), epochs=epochs)
    return model

def save_model(model, subreddit, period, model_dir, is_interim=False):
    """Write ``model`` to disk under ``model_dir``.

    Interim checkpoints go to ``{model_dir}/interim/`` with an ``_interim``
    suffix; final models go directly into ``model_dir``.
    """
    target = (
        f"{model_dir}/interim/{subreddit}_{period}_interim.model"
        if is_interim
        else f"{model_dir}/{subreddit}_{period}.model"
    )
    model.save(target)

def build_models_for_subreddit(
    subreddit,
    base_data_dir,
    model_dir,
    vector_size=300,
    window=5,
    min_count=5,
    epochs=5,
    workers=16,
    sg=0,
    min_comments_to_train=10000,
    chunk_size=1000000,
    global_bigram_path=None
):
    """Train one Word2Vec model per time period for a subreddit.

    Comments are read from ``{base_data_dir}/{subreddit}/{subreddit}_batch*.pkl``,
    bucketed by period, and trained on incrementally: every time a period has
    accumulated ``chunk_size`` comments the chunk is trained on and an interim
    model is saved. Leftover comments are trained on at the end and the final
    model of every period is saved to ``model_dir``.

    Args:
        subreddit: Subreddit name, used in the input and output file names.
        base_data_dir: Directory that contains one sub-directory per subreddit.
        model_dir: Output directory (``{model_dir}/interim`` is used for
            interim checkpoints); both are expected to exist already.
        vector_size, window, min_count, epochs, workers, sg: Word2Vec
            hyper-parameters forwarded to ``create_or_update_model``.
        min_comments_to_train: A batch is trained on only if it has more
            comments than this.
        chunk_size: Number of comments per incremental training chunk.
        global_bigram_path: Path to a saved ``Phraser``. When it is ``None``
            or does not exist, a bigram model is fitted on each batch instead.

    Returns:
        None. Models are written to disk through ``save_model``.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the chunking loop
            would otherwise never terminate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    time_periods = ["before_2016", "2017_2020", "2021_2024"]
    models = {period: None for period in time_periods}

    # Load the global bigram model if one is available. A missing path (or the
    # default None) falls back to per-batch bigram models, as the message says,
    # instead of aborting or crashing in os.path.exists(None).
    global_bigram_model = None
    if global_bigram_path is not None and os.path.exists(global_bigram_path):
        print(f"Loading global bigram model from {global_bigram_path}")
        global_bigram_model = Phraser.load(global_bigram_path)
    else:
        print(f"Global bigram model not found at {global_bigram_path}, will train on each chunk.")

    def to_sentences(batch):
        """Return the batch's token lists with bigrams merged."""
        bigram_model = global_bigram_model
        if bigram_model is None:
            bigram_model = build_bigram_model(batch)
        return apply_bigrams(batch, bigram_model)

    # Find all pickle files
    pattern = f"{base_data_dir}/{subreddit}/{subreddit}_batch*.pkl"
    pickle_files = sorted(glob.glob(pattern))
    if not pickle_files:
        print(f"No pickle files found for {subreddit} in {base_data_dir}/{subreddit}/")
        return

    comments_by_period = {period: [] for period in time_periods}

    for file_path in pickle_files:
        try:
            # SECURITY: pickle.load can execute arbitrary code; only point this
            # at batch files produced by a trusted preprocessing step.
            with open(file_path, 'rb') as f:
                comments = pickle.load(f)
            print(f"Loaded {len(comments)} comments from {file_path}")
        except Exception as e:
            # Best effort: a single unreadable batch should not stop the run.
            print(f"Error loading {file_path}: {e}")
            continue

        for comment in comments:
            period = get_period(get_date_from_comment(comment))
            if period:
                comments_by_period[period].append(comment)

        for period in time_periods:
            pending = comments_by_period[period]
            consumed = 0
            while len(pending) - consumed >= chunk_size:
                print(f"Processing chunk of {chunk_size} comments for {period}")
                chunk = pending[consumed:consumed + chunk_size]
                consumed += chunk_size

                processed_chunk = to_sentences(chunk)
                if len(processed_chunk) > min_comments_to_train:
                    model = create_or_update_model(
                        period, processed_chunk, vector_size, window, min_count, workers, sg, epochs, models[period]
                    )
                    models[period] = model
                    save_model(model, subreddit, period, model_dir, is_interim=True)
                else:
                    print(
                        f"Skipping chunk for {period}: only {len(processed_chunk)} usable comments "
                        f"(less than minimum required)"
                    )
            if consumed:
                # Keep only the comments that did not fill a whole chunk.
                comments_by_period[period] = pending[consumed:]

    # Process any remaining comments
    for period, remaining_comments in comments_by_period.items():
        if len(remaining_comments) > min_comments_to_train:
            print(f"Processing final {len(remaining_comments)} comments for {period}")
            processed_chunk = to_sentences(remaining_comments)
            if not processed_chunk:
                # No comment carried "processed_text"; nothing to train on.
                print(f"Skipping final comments for {period} (no processed text found)")
                continue
            models[period] = create_or_update_model(
                period, processed_chunk, vector_size, window, min_count, workers, sg, epochs, models[period]
            )
        else:
            print(f"Skipping final {len(remaining_comments)} comments for {period} (less than minimum required)")

    # Save final models (once per period, whether last updated by a chunk or
    # by the remaining comments).
    for period, model in models.items():
        if model is not None:
            save_model(model, subreddit, period, model_dir, is_interim=False)
    print(f"Model saved to {model_dir}")
    print(f"Completed building models for {subreddit}")

def main():
    """Build period models for the democrats and republican subreddits.

    Creates the output directories, seeds the ``random`` and NumPy RNGs with
    23, and runs ``build_models_for_subreddit`` with the same settings for
    each subreddit.
    """
    model_dir = "models/chunk_1"
    global_bigram_path = "models/bigram/political_bigram_1.phr"

    for directory in (model_dir, f"{model_dir}/interim"):
        os.makedirs(directory, exist_ok=True)

    random.seed(23)
    np.random.seed(23)

    training_options = dict(
        base_data_dir="processed_comments_1",
        model_dir=model_dir,
        vector_size=300,
        window=5,
        min_count=10,
        epochs=5,
        workers=16,
        sg=0,
        min_comments_to_train=10000,
        chunk_size=1000000,
        global_bigram_path=global_bigram_path,
    )
    for subreddit in ("democrats", "republican"):
        build_models_for_subreddit(subreddit, **training_options)

# Entry point: run main() only when this code executes as the main module.
if __name__ == "__main__":
    main()

# Changes made:
# Using global bigram model, set min_count=10

Loading global bigram model from models/bigram/political_bigram_1.phr
Loaded 1000000 comments from processed_comments_1/democrats\democrats_batch1.pkl
Loaded 933011 comments from processed_comments_1/democrats\democrats_batch2.pkl
Processing chunk of 1000000 comments for 2021_2024
2021_2024 vocabulary size: 28457
Processing final 127161 comments for before_2016
before_2016 vocabulary size: 11534
Processing final 472651 comments for 2017_2020
2017_2020 vocabulary size: 19525
Processing final 333199 comments for 2021_2024
2021_2024 vocabulary size: 28704
Model saved to models/chunk_1
Completed building models for democrats
Loading global bigram model from models/bigram/political_bigram_1.phr
Loaded 1000000 comments from processed_comments_1/republican\republican_batch1.pkl
Loaded 290701 comments from processed_comments_1/republican\republican_batch2.pkl
Processing final 263564 comments for before_2016
before_2016 vocabulary size: 17932
Processing final 414712 comments for 2017_2020
2017_

In [None]:
# For streaming
import sys
# Fail fast on interpreters older than Python 3.10.
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
    raise RuntimeError("This script requires Python 3.10 or higher")
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog

# For processing
import gensim
from gensim.models import Word2Vec
import re
import logging
from tqdm import tqdm
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
    
import datetime
import random
import numpy as np
import html
import unicodedata

# # Download necessary NLTK resources
# nltk.download('stopwords', quiet=True)
# nltk.download('wordnet', quiet=True)
# nltk.download('punkt', quiet=True)
# nltk.download('averaged_perceptron_tagger', quiet=True)  # For POS tagging

# NOTE(review): not referenced anywhere in the visible code — possibly a
# leftover flag; confirm before removing.
recursive = False


# Patterns used by _clean_comment_text, compiled once instead of per comment.
_MD_IMAGE_RE = re.compile(r'!\[.*?\]\(.*?\)')
_MD_LINK_RE = re.compile(r'\[(.*?)\]\(.*?\)')
_URL_RE = re.compile(r'http\S+')
_MD_BOLD_RE = re.compile(r'\*\*(.*?)\*\*')
_MD_ITALIC_RE = re.compile(r'\*(.*?)\*')
# The lookbehind keeps "r/" or "u/" inside ordinary words (e.g. "career/job")
# from being mistaken for a subreddit/user reference.
_REDDIT_REF_RE = re.compile(r'(?<!\w)/?[ru]/\w+')
_NON_ALPHA_RE = re.compile(r'[^A-Za-z]+')
_SINGLE_LETTER_RE = re.compile(r'\b([a-hj-z])\b')

# Authors whose comments are skipped.
_IGNORED_AUTHORS = frozenset({"AutoModerator", "election_info_bot"})


def _period_for_year(year):
    """Return the period label for *year*, or None if it is after 2024."""
    if year <= 2016:
        return "before_2016"
    if 2017 <= year <= 2020:
        return "2017_2020"
    if 2021 <= year <= 2024:
        return "2021_2024"
    return None


def _clean_comment_text(text):
    """Reduce raw comment text to lowercase ASCII letters separated by spaces."""
    text = html.unescape(text)
    text = unicodedata.normalize('NFKD', text)

    # Markdown images/links are resolved before bare URLs are stripped:
    # removing the URL first also removes the closing ")" of "[text](url)",
    # which lets the link pattern run on and swallow unrelated text.
    text = _MD_IMAGE_RE.sub('', text)
    text = _MD_LINK_RE.sub(r'\1', text)
    text = _URL_RE.sub('', text)
    text = _MD_BOLD_RE.sub(r'\1', text)
    text = _MD_ITALIC_RE.sub(r'\1', text)

    # Remove subreddit and user references (/r/x, r/x, /u/x, u/x).
    text = _REDDIT_REF_RE.sub('', text)

    # Keep letters only, then drop single letters (except 'i').
    text = _NON_ALPHA_RE.sub(' ', text).lower()
    return _SINGLE_LETTER_RE.sub('', text)


def _to_wordnet_pos(tag):
    """Map a POS tag to a WordNet POS letter, defaulting to noun."""
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('V'):
        return 'v'
    if tag.startswith('R'):
        return 'r'
    return 'n'


def processFile(path, party, output_dir, without_stopwords=True):
    """Stream comments from *path* and train one Word2Vec model per period.

    Each comment is cleaned, optionally stripped of English stop words,
    POS-tagged and lemmatized, then bucketed by the year of its
    ``created_utc`` timestamp. For every non-empty period a bigram model is
    fitted and applied, a Word2Vec model is trained, and the model is saved
    as ``{output_dir}/{party}_{period}.model``.

    Args:
        path: Path of the comment dump, opened in binary mode and handed to
            ``getFileJsonStream``.
        party: Label used as the prefix of the saved model file names.
        output_dir: Directory the models are written to.
        without_stopwords: When true, English stop words are dropped before
            tagging and lemmatization.

    Returns:
        None. Returns early if ``getFileJsonStream`` yields no stream.
    """
    print(f"Processing file {path}")
    # Initialize lemmatizer and stop words
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Token lists collected for each time period
    chunks = {
        "before_2016": [],
        "2017_2020": [],
        "2021_2024": [],
    }

    # Track counts
    counts = {period: 0 for period in chunks}

    # Per-word caches; a word keeps the tag it received the first time it was
    # seen, trading context sensitivity for speed.
    pos_cache = {}
    lemma_cache = {}

    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row:
                continue

            # A missing author is treated as "not ignored" rather than an error.
            if row.get("author") in _IGNORED_AUTHORS:
                continue

            # Convert timestamp to year
            # NOTE(review): fromtimestamp() without tz uses the machine's local
            # time although the field is named created_utc — confirm intent.
            year = datetime.datetime.fromtimestamp(int(row["created_utc"])).year

            # Comments outside every period (after 2024) are skipped up front;
            # previously they reached chunks[None] and raised KeyError.
            chunk_key = _period_for_year(year)
            if chunk_key is None:
                continue

            # Clean and tokenize
            words = _clean_comment_text(row["body"]).split()

            # Skip empty comments
            if not words:
                continue

            # Filter stop words
            if without_stopwords:
                words_to_tag = [word for word in words if word not in stop_words]
            else:
                words_to_tag = words[:]

            if not words_to_tag:
                continue

            # Tag only the words that have not been tagged before
            uncached_words = [word for word in words_to_tag if word not in pos_cache]
            if uncached_words:
                for word, tag in nltk.pos_tag(uncached_words):
                    pos_cache[word] = tag

            processed_words = []
            for word in words_to_tag:
                wordnet_pos = _to_wordnet_pos(pos_cache[word])

                lemma_key = (word, wordnet_pos)
                lemma = lemma_cache.get(lemma_key)
                if lemma is None:
                    lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
                    lemma_cache[lemma_key] = lemma

                processed_words.append(lemma)

            if processed_words:
                chunks[chunk_key].append(processed_words)
                counts[chunk_key] += 1

    # Print statistics
    print("\n=== Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")

    # Build bigrams and train a Word2Vec model for each time period
    for period, comments in chunks.items():
        if not comments:
            continue

        # Build the bigram model and apply it to every comment
        phrases = Phrases(comments, min_count=5, threshold=10)
        bigram_model = Phraser(phrases)
        sentences = [bigram_model[comment] for comment in comments]

        # Initialize and train model
        model = Word2Vec(
            vector_size=300,
            window=5,
            min_count=5,
            workers=16,
            seed=23
        )

        # Build vocabulary
        model.build_vocab(sentences)
        print(f"{period} vocabulary size: {len(model.wv.index_to_key)}")

        # Train the model
        model.train(
            sentences,
            total_examples=len(sentences),
            epochs=5
        )

        # Save the model
        model_path = f"{output_dir}/{party}_{period}.model"
        model.save(model_path)
        print(f"Model saved to {model_path}")
            
        
def main():
    """Train period models for each enabled comment dump.

    Seeds the ``random`` and NumPy RNGs with 23, makes sure the output
    directory exists, and runs ``processFile`` on every enabled dataset.
    """
    random.seed(23)
    np.random.seed(23)

    output_dir = "models/final_full"
    os.makedirs(output_dir, exist_ok=True)

    datasets = [
        # (r"datasets/democrats_comments.zst", "democrats"),
        # (r"datasets/Republican_comments.zst", "republican"),
        (r"datasets/Conservative_comments.zst", "conservative"),
        (r"datasets/Liberal_comments.zst", "liberal"),
        (r"datasets/vagabond_comments.zst", "vagabond"),
        (r"datasets/backpacking_comments.zst", "backpacking"),
    ]
    for file_path, party in datasets:
        processFile(file_path, party, output_dir)

    print("Done :>")

# Entry point: run main() only when this code executes as the main module.
if __name__ == "__main__":
    main()
