In [4]:
# For streaming
import sys
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
    raise RuntimeError("This script requires Python 3.10 or higher")
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog

# For processing
import gensim
from gensim.models import Word2Vec
import re
import html
import unicodedata
import logging
from tqdm import tqdm
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
    
import datetime
from collections import defaultdict


filePathforDemocrats = r"datasets/democrats_comments.zst"
filePathforRepublican = r"datasets/Republican_comments.zst"
recursive = False

In [None]:


def build_word2vect_model(path, party, without_stopwords=True, phrases_min_count=5, word2vec_min_count=5):
    print(f"Processing file {path}")
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # For each period, track comments and user-word usage
    chunks = {
        "before_2016": [],
        "2017_2020": [],
        "2021_2024": [],
    }
    user_words = {
        "before_2016": defaultdict(set),
        "2017_2020": defaultdict(set),
        "2021_2024": defaultdict(set),
    }
    user_comments = {
        "before_2016": defaultdict(list),
        "2017_2020": defaultdict(list),
        "2021_2024": defaultdict(list),
    }
    counts = {period: 0 for period in chunks.keys()}

    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
            author = row["author"]
            if author in {"AutoModerator", "election_info_bot"}:
                continue
            text = row["body"]
            created_timestamp = row["created_utc"]
            year = datetime.datetime.fromtimestamp(int(created_timestamp)).year
            # if year <= 2016:
            #     chunk_key = "before_2016"
            # elif 2017 <= year <= 2020:
            #     chunk_key = "2017_2020"
            # elif 2021 <= year <= 2024:
            #     chunk_key = "2021_2024"
            # else:
            #     continue
            if 2021 <= year <= 2024:
                chunk_key = "2021_2024"
            else:
                continue
            
            # Handle HTML entities
            text = html.unescape(text)                         # &amp; becomes &, etc.
            
            # Handle Unicode normalization
            text = unicodedata.normalize('NFKD', text)         # converts café with a single combined "é" character to "e" with an accent
            
            # Remove all URLs
            text = re.sub(r'http\S+', '', text)                # Remove URLs
            text = re.sub(r'!\[.*?\]\(.*?\)', '', text)        # Images/GIFs
            
            # Handle Reddit's link format
            text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)    # [text](link) becomes text
            
            # Handle markdown formatting (bold, italics)
            text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)       # **word** becomes word
            text = re.sub(r'\*(.*?)\*', r'\1', text)           # *word* becomes word
            
            # Handle subreddit and user references
            text = re.sub(r'/r/(\w+)', r'subreddit_\1', text)  # /r/politics becomes subreddit_politics
            text = re.sub(r'/u/(\w+)', r'user_\1', text)       # /u/username becomes user_username
            
            text = re.sub("[^A-Za-z']+", ' ', text).lower()

            words = text.split()
            if not words:
                continue
            tagged_words = nltk.pos_tag(words)
            processed_words = []
            for word, tag in tagged_words:
                if without_stopwords and word in stop_words:
                    continue
                if tag.startswith('J'):
                    wordnet_pos = 'a'
                elif tag.startswith('V'):
                    wordnet_pos = 'v'
                elif tag.startswith('N'):
                    wordnet_pos = 'n'
                elif tag.startswith('R'):
                    wordnet_pos = 'r'
                else:
                    wordnet_pos = 'n'
                lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
                processed_words.append(lemma)
                user_words[chunk_key][lemma].add(author)
            if processed_words:
                user_comments[chunk_key][author].append(processed_words)
                counts[chunk_key] += 1

    print("\n=== Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")

    # Filter words by user count and rebuild comments for each period
    for period in chunks.keys():
        valid_words = {w for w, users in user_words[period].items() if len(users) >= 5}
        filtered_comments = []
        for comments in user_comments[period].values():
            for comment in comments:
                filtered = [w for w in comment if w in valid_words]
                if filtered:
                    filtered_comments.append(filtered)
        print(f"{period}: {len(filtered_comments)} comments after filtering words by user count")
        if filtered_comments:
            print(f"\nExtracting bigrams for {period}...")
            phrases = Phrases(filtered_comments, 
                              min_count=phrases_min_count, 
                              threshold=0.7,
                              scoring='npmi')
            bigram_model = Phraser(phrases)
            bigrammed_comments = [bigram_model[comment] for comment in filtered_comments]
            chunks[period] = bigrammed_comments
        else:
            chunks[period] = []

    # Train a Word2Vec model for each time period
    for period, comments in chunks.items():
        if len(comments) > 0:
            print(f"\n=== Training Word2Vec for {period} ({len(comments)} comments) ===")
            model = Word2Vec(
                vector_size=300,
                window=5,
                min_count=word2vec_min_count,
                workers=16
            )
            model.build_vocab(comments)
            print(f"Vocabulary size: {len(model.wv.index_to_key)}")
            model.train(
                comments,
                total_examples=len(comments),
                epochs=5
            )
            model_path = f"models/model_v3/reddit_word2vec_{phrases_min_count}_{word2vec_min_count}_{party}_{period}.model"
            model.save(model_path)
            print(f"Model saved to {model_path}")

def main():
    # build_word2vect_model(filePathforDemocrats, "democrats", without_stopwords=False, 
    #                       phrases_min_count=5, word2vec_min_count=5)
    # build_word2vect_model(filePathforRepublican, "republican", without_stopwords=False, 
    #                       phrases_min_count=5, word2vec_min_count=5)
    build_word2vect_model(filePathforDemocrats, "democrats", without_stopwords=False, 
                          phrases_min_count=10, word2vec_min_count=10)
    build_word2vect_model(filePathforRepublican, "republican", without_stopwords=False, 
                          phrases_min_count=10, word2vec_min_count=10)
    # build_word2vect_model(filePathforDemocrats, "democrats", without_stopwords=False, 
    #                       phrases_min_count=20, word2vec_min_count=20)
    # build_word2vect_model(filePathforRepublican, "republican", without_stopwords=False, 
    #                       phrases_min_count=20, word2vec_min_count=20)
    # build_word2vect_model(filePathforDemocrats, "democrats", without_stopwords=False, 
    #                       phrases_min_count=50, word2vec_min_count=50)
    # build_word2vect_model(filePathforRepublican, "republican", without_stopwords=False, 
    #                       phrases_min_count=50, word2vec_min_count=50)
    print("Done :>")

if __name__ == "__main__":
    main()

Processing file datasets/democrats_comments.zst


Processing comments: 2011525it [07:20, 4564.13it/s] 



=== Comment Counts by Period ===
before_2016: 0 comments
2017_2020: 0 comments
2021_2024: 1341626 comments
before_2016: 0 comments after filtering words by user count
2017_2020: 0 comments after filtering words by user count
2021_2024: 1339874 comments after filtering words by user count

Extracting bigrams for 2021_2024...

=== Training Word2Vec for 2021_2024 (1339874 comments) ===
Vocabulary size: 26047
Model saved to models/reddit_word2vec_10_10_filtered_processed_democrats_2021_2024.model
Done :>
