In [1]:
# For streaming
import sys
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
    raise RuntimeError("This script requires Python 3.10 or higher")
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog

# For processing
import gensim
from gensim.models import Word2Vec
import re
import logging
from tqdm import tqdm
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
    
import datetime


# Directory holding the per-subreddit comment dumps, relative to the notebook's
# working directory. Built with os.path.join so the separator matches the host
# OS (the previous hard-coded backslashes only resolved on Windows).
DATA_DIR = os.path.join("..", "..", "datasets", "Reddit", "reddit", "subreddits24")

# Zstandard-compressed comment dumps, one per subreddit.
filePathforDemocrats = os.path.join(DATA_DIR, "democrats_comments.zst")
filePathforRepublican = os.path.join(DATA_DIR, "Republican_comments.zst")

# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
recursive = False

In [3]:
def _period_for_timestamp(created_utc):
    """Return the time-period key for a comment's ``created_utc`` epoch value.

    The calendar year is resolved in UTC so the bucket does not depend on the
    time zone of the machine running the notebook.

    Returns:
        "before_2016" (every year up to and including 2016), "2017_2020",
        "2021_2024", or None when the year is later than 2024.
    """
    year = datetime.datetime.fromtimestamp(
        int(created_utc), tz=datetime.timezone.utc
    ).year
    if year <= 2016:
        return "before_2016"
    if year <= 2020:
        return "2017_2020"
    if year <= 2024:
        return "2021_2024"
    return None


def processFile(path, party, without_stopwords=True):
    """Train one Word2Vec model per time period from a comment dump.

    Each row needs a "body" and a "created_utc" field. The body is stripped of
    URLs, lower-cased, split on whitespace, POS-tagged and lemmatized; the
    resulting token list is stored under the period its timestamp falls in.
    For every non-empty period, bigrams are detected and merged, a Word2Vec
    model is trained and saved as ``reddit_word2vec_{party}_{period}.model``
    in the current working directory, and a few nearest-neighbour examples
    are printed.

    Args:
        path: Path of the dump; opened in binary mode and handed to
            ``getFileJsonStream``.
        party: Label embedded in the saved model file names.
        without_stopwords: When True, English stop words are dropped before
            lemmatization; when False they are kept.

    Returns:
        None. Returns early (after printing a notice) when
        ``getFileJsonStream`` does not recognise the file.
    """
    print(f"Processing file {path}")

    # Download necessary NLTK resources
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)  # For POS tagging

    # Initialize lemmatizer and stop words
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # First letter of a Penn Treebank tag -> WordNet POS; any other tag is
    # lemmatized as a noun.
    wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    # Tokenized comments collected per time period
    chunks = {
        "before_2016": [],
        "2017_2020": [],
        "2021_2024": [],
    }

    # Track counts
    counts = {period: 0 for period in chunks}
    skipped_out_of_range = 0

    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row:
                continue

            # Decide the bucket first so comments outside every period are
            # skipped instead of failing later with a KeyError on chunks[None].
            chunk_key = _period_for_timestamp(row["created_utc"])
            if chunk_key is None:
                skipped_out_of_range += 1
                continue

            # Remove URLs (body coerced to str, matching process_entire_dataset)
            txt = re.sub(r'http\S+', '', str(row["body"]))

            # Collapse everything except letters, digits and apostrophes to a space, lowercase
            txt = re.sub("[^A-Za-z0-9']+", ' ', txt).lower()

            # Apostrophes survived the previous step; split contractions on them
            txt = re.sub(r"['\-_]", ' ', txt)

            # Tokenize
            words = txt.split()

            # Skip empty comments before paying for POS tagging
            if not words:
                continue

            processed_words = []
            for word, tag in nltk.pos_tag(words):
                if without_stopwords and word in stop_words:
                    continue

                # Lemmatize with the POS derived from the tagger output
                wordnet_pos = wordnet_pos_by_prefix.get(tag[:1], 'n')
                processed_words.append(lemmatizer.lemmatize(word, pos=wordnet_pos))

            # Add to appropriate chunk if it has words
            if processed_words:
                chunks[chunk_key].append(processed_words)
                counts[chunk_key] += 1

    # Print statistics
    print("\n=== Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")
    if skipped_out_of_range:
        print(f"Skipped {skipped_out_of_range} comments dated after 2024")

    # Extract bigrams from each time period
    for period, comments in chunks.items():
        if len(comments) > 0:
            print(f"\nExtracting bigrams for {period}...")
            # Build bigram model
            phrases = Phrases(comments, min_count=5, threshold=10)
            bigram_model = Phraser(phrases)

            # Replace the period's comments with their bigram-merged versions
            # (re-assigning an existing key is safe while iterating the dict)
            chunks[period] = [bigram_model[comment] for comment in comments]

    # Train a Word2Vec model for each time period
    for period, comments in chunks.items():
        if len(comments) > 0:
            print(f"\n=== Training Word2Vec for {period} ({len(comments)} comments) ===")

            # Initialize and train model
            model = Word2Vec(
                vector_size=300,
                window=5,
                min_count=5,
                workers=16
            )

            # Build vocabulary
            model.build_vocab(comments)
            print(f"Vocabulary size: {len(model.wv.index_to_key)}")

            # Train the model
            model.train(
                comments,
                total_examples=len(comments),
                epochs=5
            )

            # Save the model
            model_path = f"reddit_word2vec_{party}_{period}.model"
            model.save(model_path)
            print(f"Model saved to {model_path}")

            # Show example results
            print("\n=== Example Results ===")
            test_words = ["democracy", "president", "policy", "vote", "climate"]
            for word in test_words:
                try:
                    print(f"\nWords similar to '{word}':")
                    similar = model.wv.most_similar(word, topn=3)
                    for similar_word, similarity in similar:
                        print(f"  {similar_word}: {similarity:.4f}")
                except KeyError:
                    print(f"  '{word}' not found in {period} vocabulary")
        
def main():
    """Run the per-period pipeline on both subreddit dumps, keeping stop words."""
    jobs = (
        (filePathforDemocrats, "democrats_with_stopwords"),
        (filePathforRepublican, "republican_with_stopwords"),
    )
    for dump_path, party_label in jobs:
        processFile(dump_path, party_label, without_stopwords=False)

    print("Done :>")

# Runs the per-period pipeline when this cell is executed as the main module;
# the recorded output below shows each dump taking over ten minutes to stream.
if __name__ == "__main__":
    main()


Processing file ..\..\datasets\Reddit\reddit\subreddits24\democrats_comments.zst


Processing comments: 2011525it [13:12, 2537.19it/s]



=== Comment Counts by Period ===
before_2016: 129718 comments
2017_2020: 492295 comments
2021_2024: 1379685 comments

Extracting bigrams for before_2016...

Extracting bigrams for 2017_2020...

Extracting bigrams for 2021_2024...

=== Training Word2Vec for before_2016 (129718 comments) ===
Vocabulary size: 21814
Model saved to reddit_word2vec_democrats_with_stopwords_before_2016.model

=== Example Results ===

Words similar to 'democracy':
  capitalism: 0.6880
  society: 0.6871
  nation: 0.6373

Words similar to 'president':
  potus: 0.6785
  presidency: 0.6247
  leader: 0.5840

Words similar to 'policy':
  economic_policy: 0.6963
  platform: 0.6567
  agenda: 0.6520

Words similar to 'vote':
  voting: 0.6867
  support: 0.5513
  voter: 0.5493

Words similar to 'climate':
  structural: 0.6787
  austerity: 0.6605
  technology: 0.6572

=== Training Word2Vec for 2017_2020 (492295 comments) ===
Vocabulary size: 34115
Model saved to reddit_word2vec_democrats_with_stopwords_2017_2020.model

=

Processing comments: 1405486it [11:45, 1993.47it/s]



=== Comment Counts by Period ===
before_2016: 265370 comments
2017_2020: 452285 comments
2021_2024: 680869 comments

Extracting bigrams for before_2016...

Extracting bigrams for 2017_2020...

Extracting bigrams for 2021_2024...

=== Training Word2Vec for before_2016 (265370 comments) ===
Vocabulary size: 30882
Model saved to reddit_word2vec_republican_with_stopwords_before_2016.model

=== Example Results ===

Words similar to 'democracy':
  republic: 0.7965
  tyranny: 0.7113
  dictatorship: 0.6685

Words similar to 'president':
  potus: 0.7824
  presidency: 0.6026
  vice_president: 0.5950

Words similar to 'policy':
  economic_policy: 0.6946
  fiscal_policy: 0.6153
  policy_proposal: 0.5841

Words similar to 'vote':
  voting: 0.7103
  ballot: 0.5388
  stay_home: 0.5329

Words similar to 'climate':
  global_climate: 0.6726
  paradigm: 0.6414
  warming: 0.6055

=== Training Word2Vec for 2017_2020 (452285 comments) ===
Vocabulary size: 32958
Model saved to reddit_word2vec_republican_wit

In [19]:
def process_entire_dataset(path, party):
    """Train a single Word2Vec model on every comment in a dump.

    Each row's "body" is stripped of URLs, lower-cased, split on whitespace,
    POS-tagged, filtered of English stop words and lemmatized. Bigrams are
    then detected and merged, a Word2Vec model is trained on the result and
    saved as ``reddit_word2vec_{party}_all_periods.model`` in the current
    working directory, and nearest neighbours of a few probe words are
    printed.

    Args:
        path: Path of the dump; opened in binary mode and handed to
            ``getFileJsonStream``.
        party: Label embedded in the saved model file name.

    Returns:
        The trained Word2Vec model, or None when ``getFileJsonStream`` does
        not recognise the file or no comment yields any token.
    """
    print(f"Processing entire dataset from {path}")

    # Download necessary NLTK resources
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)  # For POS tagging

    # Initialize lemmatizer and stop words
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # First letter of a Penn Treebank tag -> WordNet POS; any other tag is
    # lemmatized as a noun.
    wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    # Storage for all processed comments
    all_comments = []
    # Counts every row read from the stream, including rows skipped below
    total_comments = 0

    # Process the dataset
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return

        for row in tqdm(jsonStream, desc="Processing comments"):
            total_comments += 1
            if "body" not in row:
                continue

            # Remove URLs
            txt = re.sub(r'http\S+', '', str(row["body"]))

            # Collapse everything except letters, digits and apostrophes to a space, lowercase
            txt = re.sub(r"[^A-Za-z0-9']+", ' ', txt).lower()

            # Apostrophes survived the previous step; split contractions on them
            txt = re.sub(r"['\-_]", ' ', txt)

            # Tokenize
            words = txt.split()

            # Skip empty comments
            if not words:
                continue

            processed_words = []
            for word, tag in nltk.pos_tag(words):
                if word in stop_words:
                    continue

                # Lemmatize with the POS derived from the tagger output
                wordnet_pos = wordnet_pos_by_prefix.get(tag[:1], 'n')
                processed_words.append(lemmatizer.lemmatize(word, pos=wordnet_pos))

            # Add to comments if it has words
            if processed_words:
                all_comments.append(processed_words)

    print(f"Total comments processed: {total_comments}")

    # Nothing to train on: bail out the same way the unknown-file path does,
    # instead of letting training fail on an empty corpus.
    if not all_comments:
        print(f"No usable comments found in {path}; skipping training")
        return None

    # Extract bigrams from the dataset
    print("\nExtracting bigrams...")
    phrases = Phrases(all_comments, min_count=5, threshold=10)
    bigram_model = Phraser(phrases)

    # Apply bigram model to create comments with bigrams
    bigrammed_comments = [bigram_model[comment] for comment in all_comments]

    # Train Word2Vec model
    print("\n=== Training Word2Vec model on entire dataset ===")
    model = Word2Vec(
        vector_size=300,
        window=5,
        min_count=5,
        workers=16
    )

    # Build vocabulary
    model.build_vocab(bigrammed_comments)
    print(f"Vocabulary size: {len(model.wv.index_to_key)}")

    # Train the model
    model.train(
        bigrammed_comments,
        total_examples=len(bigrammed_comments),
        epochs=5
    )

    # Save the model
    model_path = f"reddit_word2vec_{party}_all_periods.model"
    model.save(model_path)
    print(f"Model saved to {model_path}")

    # Show example results
    print("\n=== Example Results ===")
    test_words = ["democracy", "president", "policy", "trump", "biden"]
    for word in test_words:
        try:
            print(f"\nWords similar to '{word}':")
            similar = model.wv.most_similar(word, topn=10)
            for similar_word, similarity in similar:
                print(f"  {similar_word}: {similarity:.4f}")
        except KeyError:
            print(f"  '{word}' not found in vocabulary")

    return model

def main_entire_dataset():
    """Train one all-periods model for each of the two subreddit dumps."""
    for dump_path, party_label in (
        (filePathforDemocrats, "democrats"),
        (filePathforRepublican, "republican"),
    ):
        process_entire_dataset(dump_path, party_label)

    print("Done processing both datasets :)")

# Runs the all-periods pipeline when this cell is executed as the main module;
# the models returned by process_entire_dataset are not kept, only the saved files.
if __name__ == "__main__":
    main_entire_dataset()

Processing entire dataset from ..\..\datasets\Reddit\reddit\subreddits24\democrats_comments.zst


Processing comments: 2011525it [14:54, 2249.90it/s]


Total comments processed: 2011525

Extracting bigrams...

=== Training Word2Vec model on entire dataset ===
Vocabulary size: 80120
Model saved to reddit_word2vec_democrats_all_periods.model

=== Example Results ===

Words similar to 'democracy':
  republic: 0.7175
  democratic_republic: 0.6618
  representative_democracy: 0.6310
  fascism: 0.5854
  democratic_institution: 0.5639
  autocracy: 0.5592
  dictatorship: 0.5455
  american_experiment: 0.5292
  oligarchy: 0.5245
  tyranny: 0.5199

Words similar to 'president':
  potus: 0.7967
  vice_president: 0.5898
  presidency: 0.5754
  prez: 0.5734
  pres: 0.5542
  presidential_candidate: 0.5444
  commander_chief: 0.5376
  second_term: 0.5131
  presidential: 0.5073
  vp: 0.5063

Words similar to 'policy':
  economic_policy: 0.7203
  policy_proposal: 0.6181
  immigration_policy: 0.6073
  agenda: 0.5583
  proposal: 0.5234
  legislation: 0.4938
  platform: 0.4934
  fiscal_policy: 0.4893
  domestic_policy: 0.4774
  environmental_policy: 0.4755



Processing comments: 1405486it [13:25, 1745.87it/s]


Total comments processed: 1405486

Extracting bigrams...

=== Training Word2Vec model on entire dataset ===
Vocabulary size: 70486
Model saved to reddit_word2vec_republican_all_periods.model

=== Example Results ===

Words similar to 'democracy':
  republic: 0.7347
  constitutional_republic: 0.6772
  democratic_republic: 0.6603
  dictatorship: 0.6052
  democratic_process: 0.6036
  representative_democracy: 0.6028
  direct_democracy: 0.5999
  pure_democracy: 0.5857
  monarchy: 0.5649
  mob_rule: 0.5537

Words similar to 'president':
  potus: 0.7545
  pres: 0.6406
  presidency: 0.6232
  vice_president: 0.5959
  former_president: 0.5942
  commander_chief: 0.5740
  vp: 0.5697
  presidential: 0.5693
  administration: 0.5362
  presidential_candidate: 0.5287

Words similar to 'policy':
  economic_policy: 0.6785
  immigration_policy: 0.5804
  fiscal_policy: 0.5629
  foreign_policy: 0.5073
  policy_proposal: 0.4965
  agenda: 0.4739
  legislation: 0.4579
  proposal: 0.4391
  stance: 0.4347
  ini