In [1]:
# For streaming
import sys
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
    raise RuntimeError("This script requires Python 3.10 or higher")
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog

# For processing
import gensim
from gensim.models import Word2Vec
import re
import html
import unicodedata
import logging
from tqdm import tqdm
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
    
import datetime
from collections import defaultdict

recursive = False

In [None]:
def preprocess_reddit_text(text):
    """Normalize a raw Reddit comment body into lowercase, letters-only text.

    Steps, in order: HTML-unescape, Unicode-normalize and drop accents,
    strip markdown images/links/URLs and emphasis markers, drop subreddit/user
    references, drop time-related vocabulary (am/pm, day names, month names,
    time units), replace every non-ASCII-letter run with a space, lowercase,
    and drop single letters other than 'i'.

    Args:
        text: Raw comment body (str).

    Returns:
        The cleaned string. Tokens are separated by one or more spaces, so
        callers are expected to use ``str.split()`` to tokenize.
    """
    # Handle HTML entities
    text = html.unescape(text)                         # &amp; becomes &, etc.

    # Handle Unicode normalization: NFKD splits "é" into "e" + combining accent.
    text = unicodedata.normalize('NFKD', text)
    # Drop the combining marks themselves; otherwise the letters-only filter
    # below turns them into spaces and splits words ("naïve" -> "nai ve").
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))

    # Markdown images and links must be handled BEFORE bare URLs: stripping
    # "http\S+" first also eats the closing ")" of "[text](http://...)", after
    # which the link pattern can swallow unrelated text up to a later ")".
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)        # Images/GIFs
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)    # [text](link) becomes text
    text = re.sub(r'http\S+', '', text)                # Remaining bare URLs

    # Handle markdown formatting (bold, italics)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)       # **word** becomes word
    text = re.sub(r'\*(.*?)\*', r'\1', text)           # *word* becomes word

    # Handle subreddit and user references, with or without a leading slash.
    # The lookbehind keeps the "r/" or "u/" from matching inside ordinary
    # words such as "for/against" or "you/me".
    text = re.sub(r'(?<!\w)/?[ru]/\w+', '', text)      # /r/politics, r/politics, /u/name, u/name

    # Remove time-related terms that create noise
    # Time markers
    text = re.sub(r'\b(?:am|pm|AM|PM|a\.m\.|p\.m\.)\b', '', text)

    # Days of week - both full and abbreviated forms
    days_pattern = r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|'
    days_pattern += r'Mon|Tue|Tues|Wed|Thu|Thurs|Fri|Sat|Sun)\b'
    text = re.sub(days_pattern, '', text, flags=re.IGNORECASE)

    # Month names - both full and abbreviated forms
    # NOTE: case-insensitive matching also removes the ordinary words
    # "may" and "march"; kept as in the original design.
    months_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December|'
    months_pattern += r'Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\b'
    text = re.sub(months_pattern, '', text, flags=re.IGNORECASE)

    # Time units that often appear in Reddit comments
    time_units = r'\b(?:second|minute|hour|day|week|month|year)s?\b'
    text = re.sub(time_units, '', text, flags=re.IGNORECASE)

    # Basic text cleaning: anything that is not an ASCII letter becomes a space
    text = re.sub("[^A-Za-z]+", ' ', text).lower()

    # Remove single letters except 'i'
    text = re.sub(r'\b([a-hj-z])\b', '', text, flags=re.IGNORECASE)

    return text

# First letter of the POS tag returned by nltk.pos_tag -> WordNet POS code.
_WORDNET_POS_BY_TAG_PREFIX = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}


def _wordnet_pos(tag):
    """Return the WordNet POS code for a POS tag, defaulting to noun ('n')."""
    return _WORDNET_POS_BY_TAG_PREFIX.get(tag[:1], 'n')


def _period_for_year(year):
    """Return the period key for a calendar year, or None if it is after 2024."""
    if year <= 2016:
        # NOTE: despite the key name, 2016 itself falls in this bucket.
        return "before_2016"
    if year <= 2020:
        return "2017_2020"
    if year <= 2024:
        return "2021_2024"
    return None


def build_word2vect_model(path, party, without_stopwords=True, phrases_min_count=5, word2vec_min_count=5, 
                          batch_size=1000000, save_interim=True):
    """Stream a comment dump and train one Word2Vec model per time period.

    Each row of the JSON stream is bucketed by the year of ``created_utc``
    into "before_2016", "2017_2020" or "2021_2024". The body is cleaned with
    ``preprocess_reddit_text``, POS-tagged and lemmatized, and accumulated
    per period. Whenever a period has collected ``batch_size`` comments the
    batch is handed to ``process_batch`` and the accumulated data for that
    period is cleared; leftovers are processed after the stream ends. The
    resulting models are saved under ``models/model_v4/``.

    Args:
        path: Path to the comment dump; passed to ``getFileJsonStream``.
        party: Label used in the saved model file names.
        without_stopwords: If true, NLTK English stopwords are dropped.
        phrases_min_count: ``min_count`` forwarded to bigram extraction.
        word2vec_min_count: ``min_count`` forwarded to Word2Vec.
        batch_size: Number of kept comments per period that triggers a batch.
        save_interim: Currently unused; ``process_batch`` always writes an
            interim checkpoint. Kept for interface compatibility.

    Returns:
        None. Returns early (after printing a message) if
        ``getFileJsonStream`` does not recognise the file.
    """
    print(f"Processing file {path}")
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    ignored_authors = {"AutoModerator", "election_info_bot"}

    periods = ("before_2016", "2017_2020", "2021_2024")

    # Per period: lemma -> set of authors who used it (for the user-count filter)
    user_words = {period: defaultdict(set) for period in periods}
    # Per period: author -> list of tokenized comments
    user_comments = {period: defaultdict(list) for period in periods}
    # Total kept comments per period (never reset) and per current batch (reset)
    counts = {period: 0 for period in periods}
    batch_counts = {period: 0 for period in periods}
    models = {period: None for period in periods}

    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return

        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
            author = row["author"]
            if author in ignored_authors:
                continue

            # created_utc is a UTC epoch; convert in UTC so the year bucket
            # does not depend on the timezone of the machine running this.
            year = datetime.datetime.fromtimestamp(
                int(row["created_utc"]), tz=datetime.timezone.utc
            ).year
            chunk_key = _period_for_year(year)
            if chunk_key is None:
                continue

            words = preprocess_reddit_text(row["body"]).split()
            if not words:
                continue

            processed_words = []
            for word, tag in nltk.pos_tag(words):
                if without_stopwords and word in stop_words:
                    continue
                lemma = lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
                processed_words.append(lemma)
                user_words[chunk_key][lemma].add(author)

            if not processed_words:
                continue

            user_comments[chunk_key][author].append(processed_words)
            counts[chunk_key] += 1
            batch_counts[chunk_key] += 1

            # Only the period that just received a comment can have reached
            # the batch size, so there is no need to check the others.
            if batch_counts[chunk_key] >= batch_size:
                print(f"\nReached {batch_size} comments for {chunk_key}, processing batch...")
                # Keep the returned model: without this assignment every batch
                # would start a fresh model and nothing would be saved at the end.
                models[chunk_key] = process_batch(
                    chunk_key, user_words[chunk_key], user_comments[chunk_key],
                    models[chunk_key], phrases_min_count, word2vec_min_count, party)

                # Clear batch data to free memory
                user_words[chunk_key] = defaultdict(set)
                user_comments[chunk_key] = defaultdict(list)
                batch_counts[chunk_key] = 0

    print("\n=== Final Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")

    # Process any remaining comments
    for period in periods:
        if batch_counts[period] > 0:
            print(f"\nProcessing final batch of {batch_counts[period]} comments for {period}...")
            models[period] = process_batch(
                period, user_words[period], user_comments[period],
                models[period], phrases_min_count, word2vec_min_count, party)

    # Save final models
    for period, model in models.items():
        if model is not None:
            final_path = f"models/model_v4/reddit_word2vec_{phrases_min_count}_{word2vec_min_count}_{party}_{period}.model"
            # Do not rely on the caller having created the output directory.
            os.makedirs(os.path.dirname(final_path), exist_ok=True)
            model.save(final_path)
            print(f"Final model saved to {final_path}")


def process_batch(period, user_words_dict, user_comments_dict, existing_model, 
                 phrases_min_count, word2vec_min_count, party):
    """Train (or incrementally update) the Word2Vec model for one period on a batch.

    Args:
        period: Period label; used in log messages and the checkpoint name.
        user_words_dict: Mapping word -> collection of users who used it.
            Only the size of each collection is read.
        user_comments_dict: Mapping user -> list of tokenized comments.
        existing_model: Model to update, or None to create a new one.
        phrases_min_count: ``min_count`` for bigram detection.
        word2vec_min_count: ``min_count`` for a newly created Word2Vec model.
        party: Label used in the checkpoint file name.

    Returns:
        The trained model, or ``existing_model`` unchanged when no comment
        survives the word filter.
    """
    # Keep only words used by at least 3 distinct users.
    kept_vocab = set()
    for word, authors in user_words_dict.items():
        if len(authors) >= 3:
            kept_vocab.add(word)

    # Re-tokenize every comment against the kept vocabulary; drop empty ones.
    sentences = [
        kept
        for author_comments in user_comments_dict.values()
        for tokens in author_comments
        if (kept := [tok for tok in tokens if tok in kept_vocab])
    ]

    print(f"{period}: Processing {len(sentences)} comments after filtering words by user count")

    if len(sentences) == 0:
        print(f"No valid comments for {period} after filtering, skipping batch")
        return existing_model

    # Detect bigrams on this batch and rewrite the sentences with them.
    print("Extracting bigrams...")
    bigram_model = Phraser(
        Phrases(sentences, min_count=phrases_min_count, threshold=0.7, scoring='npmi')
    )
    corpus = [bigram_model[sentence] for sentence in sentences]

    # A fresh model builds its vocabulary from scratch; an existing one extends it.
    starting_fresh = existing_model is None
    if starting_fresh:
        print(f"Creating new Word2Vec model for {period}")
        model = Word2Vec(vector_size=300, window=5, min_count=word2vec_min_count, workers=16)
    else:
        print(f"Updating existing Word2Vec model for {period}")
        model = existing_model
    model.build_vocab(corpus, update=not starting_fresh)

    n_sentences = len(corpus)
    print(f"Training model on {n_sentences} comments")
    model.train(corpus, total_examples=n_sentences, epochs=5)

    # Checkpoint after every batch so progress survives an interrupted run.
    interim_path = f"models/model_v4/interim/reddit_word2vec_{phrases_min_count}_{word2vec_min_count}_{party}_{period}_interim.model"
    checkpoint_dir = os.path.dirname(interim_path)
    os.makedirs(checkpoint_dir, exist_ok=True)
    model.save(interim_path)
    print(f"Interim model saved to {interim_path}")

    return model


def main():
    """Train Word2Vec models for the currently selected comment dumps.

    Creates the output directories, then runs ``build_word2vect_model`` for
    each selected dataset with phrase and Word2Vec minimum counts of 10.
    """
    # Create output directories
    for directory in ("models/model_v4", "models/model_v4/interim"):
        os.makedirs(directory, exist_ok=True)

    # All known datasets, keyed by the label used in model file names.
    dataset_paths = {
        "democrats": r"datasets/democrats_comments.zst",
        "republican": r"datasets/Republican_comments.zst",
        "backpacking": r"datasets/backpacking_comments.zst",
        "vagabond": r"datasets/vagabond_comments.zst",
        "conservative": r"datasets/Conservative_comments.zst",
        "liberal": r"datasets/Liberal_comments.zst",
    }

    # Datasets trained in this run, in order; add labels from dataset_paths
    # here to train the others.
    selected = ("backpacking", "vagabond")

    # Set batch size to 1 million comments
    batch_size = 1000000

    for party in selected:
        build_word2vect_model(
            dataset_paths[party],
            party,
            phrases_min_count=10,
            word2vec_min_count=10,
            batch_size=batch_size,
        )

    print("Done :>")
    
    
# Entry point: run the training pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

Processing file datasets/backpacking_comments.zst


Processing comments: 866545it [07:35, 1901.82it/s]



=== Final Comment Counts by Period ===
before_2016: 160382 comments
2017_2020: 225690 comments
2021_2024: 428375 comments

Processing final batch of 160382 comments for before_2016...
before_2016: Processing 160121 comments after filtering words by user count
Extracting bigrams...
Creating new Word2Vec model for before_2016
Training model on 160121 comments
Interim model saved to models/model_v4/interim/reddit_word2vec_10_10_backpacking_before_2016_interim.model

Processing final batch of 225690 comments for 2017_2020...
2017_2020: Processing 224936 comments after filtering words by user count
Extracting bigrams...
Creating new Word2Vec model for 2017_2020
Training model on 224936 comments
Interim model saved to models/model_v4/interim/reddit_word2vec_10_10_backpacking_2017_2020_interim.model

Processing final batch of 428375 comments for 2021_2024...
2021_2024: Processing 427362 comments after filtering words by user count
Extracting bigrams...
Creating new Word2Vec model for 2021_20

Processing comments: 513168it [04:19, 1980.32it/s]



=== Final Comment Counts by Period ===
before_2016: 39300 comments
2017_2020: 156891 comments
2021_2024: 292050 comments

Processing final batch of 39300 comments for before_2016...
before_2016: Processing 39147 comments after filtering words by user count
Extracting bigrams...
Creating new Word2Vec model for before_2016
Training model on 39147 comments
Interim model saved to models/model_v4/interim/reddit_word2vec_10_10_vagabond_before_2016_interim.model

Processing final batch of 156891 comments for 2017_2020...
2017_2020: Processing 156248 comments after filtering words by user count
Extracting bigrams...
Creating new Word2Vec model for 2017_2020
Training model on 156248 comments
Interim model saved to models/model_v4/interim/reddit_word2vec_10_10_vagabond_2017_2020_interim.model

Processing final batch of 292050 comments for 2021_2024...
2021_2024: Processing 290866 comments after filtering words by user count
Extracting bigrams...
Creating new Word2Vec model for 2021_2024
Trainin