In [3]:
# For streaming
import sys
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
    raise RuntimeError("This script requires Python 3.10 or higher")
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog

# For processing
import gensim
from gensim.models import Word2Vec
import re
import logging
from tqdm import tqdm
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
    
import datetime


filePathforDemocrats = r"datasets/democrats_comments.zst"
filePathforRepublican = r"datasets/Republican_comments.zst"
recursive = False

In [None]:
def build_word2vect_model(path, party, without_stopwords=True):
    print(f"Processing file {path}")
    # Download necessary NLTK resources
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)  # For POS tagging
    
    # Initialize lemmatizer and stop words
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Create empty lists for each time period
    chunks = {
        "before_2016": [],
        "2017_2020": [],
        "2021_2024": [],
    }
    
    # Track counts
    counts = {period: 0 for period in chunks.keys()}
    
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row:
                continue
            
            # Get the comment text and timestamp
            text = row["body"]
            created_timestamp = row["created_utc"]
            
            # Convert timestamp to year
            year = datetime.datetime.fromtimestamp(int(created_timestamp)).year
            
            # Determine which chunk this comment belongs to
            chunk_key = None
            if year <= 2016:
                chunk_key = "before_2016"
            elif 2017 <= year <= 2020:
                chunk_key = "2017_2020"
            elif 2021 <= year <= 2024:
                chunk_key = "2021_2024"
            
            # Process text
            # Remove URLs
            txt = re.sub(r'http\S+', '', text)  # Remove URLs
            
            # Remove non-alphanumeric characters and convert to lowercase
            txt = re.sub("[^A-Za-z0-9']+", ' ', txt).lower()
            
            # Replace special characters with spaces
            txt = re.sub(r"['\-_]", ' ', txt)
            
            # Tokenize
            words = txt.split()
            
            # Tag words with parts of speech for better lemmatization
            tagged_words = nltk.pos_tag(words)
            processed_words = []
            
            # Skip empty comments
            if not words:
                continue
            
            for word, tag in tagged_words:
                if without_stopwords:
                    if word in stop_words:
                        continue
                
                # Convert Penn Treebank tag to WordNet tag
                if tag.startswith('J'):
                    wordnet_pos = 'a'  # adjective
                elif tag.startswith('V'):
                    wordnet_pos = 'v'  # verb
                elif tag.startswith('N'):
                    wordnet_pos = 'n'  # noun
                elif tag.startswith('R'):
                    wordnet_pos = 'r'  # adverb
                else:
                    wordnet_pos = 'n'  # default to noun
                    
                # Lemmatize with the correct POS
                lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
                processed_words.append(lemma)
            
            # Add to appropriate chunk if it has words
            if processed_words:
                chunks[chunk_key].append(processed_words)
                counts[chunk_key] += 1
                
    # Print statistics
    print("\n=== Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")
        
    # Extract bigrams from each time period
    for period, comments in chunks.items():
        if len(comments) > 0:
            print(f"\nExtracting bigrams for {period}...")
            # Build bigram model
            phrases = Phrases(comments, min_count=5, threshold=10)
            bigram_model = Phraser(phrases)
            
            # Apply bigram model to create comments with bigrams
            bigrammed_comments = [bigram_model[comment] for comment in comments]
            chunks[period] = bigrammed_comments
        
    # Train a Word2Vec model for each time period
    for period, comments in chunks.items():
        if len(comments) > 0:
            print(f"\n=== Training Word2Vec for {period} ({len(comments)} comments) ===")
            
            # Initialize and train model
            model = Word2Vec(
                vector_size=300,
                window=5,
                min_count=5,
                workers=16
            )
            
            # Build vocabulary
            model.build_vocab(comments)
            print(f"Vocabulary size: {len(model.wv.index_to_key)}")
            
            # Train the model
            model.train(
                comments, 
                total_examples=len(comments), 
                epochs=5
            )
            
            # Save the model
            model_path = f"reddit_word2vec_{party}_{period}.model"
            model.save(model_path)
            print(f"Model saved to {model_path}")
            
            # Show example results
            print("\n=== Example Results ===")
            test_words = ["democracy", "president", "policy", "vote", "climate"]
            for word in test_words:
                try:
                    print(f"\nWords similar to '{word}':")
                    similar = model.wv.most_similar(word, topn=3)
                    for similar_word, similarity in similar:
                        print(f"  {similar_word}: {similarity:.4f}")
                except KeyError:
                    print(f"  '{word}' not found in {period} vocabulary")
        
def main():
    build_word2vect_model(filePathforDemocrats, "democrats_with_stopwords", without_stopwords=False)
    build_word2vect_model(filePathforRepublican, "republican_with_stopwords", without_stopwords=False)
    
    print("Done :>")

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd

# Load your data (replace with your actual file path)
df = pd.read_json('datasets/democrats_comments.zst', lines=True, compression='infer')

# Count the number of comments per user
user_comment_counts = df['author'].value_counts().reset_index()
user_comment_counts.columns = ['author', 'comment_count']

In [4]:
# Display the table of user comment frequencies
user_comment_counts.head(20)  # Show top 20 users

Unnamed: 0,author,comment_count
0,[deleted],440324
1,AutoModerator,49681
2,VegaThePunisher,21638
3,backpackwayne,11345
4,kopskey1,9504
5,michaelconfoy,6573
6,Gsteel11,6286
7,kerryfinchelhillary,6180
8,raistlin65,4895
9,therecordcorrected,4476


In [None]:
# Count how many users have fewer than 5 comments
num_users_less_than_5 = (user_comment_counts['comment_count'] < 3).sum()
print(f'Number of users with fewer than 5 comments: {num_users_less_than_5}')

Number of users with fewer than 5 comments: 85820


In [None]:
from collections import defaultdict

def build_word2vect_model(path, party, without_stopwords=True, phrases_min_count=5, word2vec_min_count=5):
    print(f"Processing file {path}")
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # For each period, track comments and user-word usage
    chunks = {
        "before_2016": [],
        "2017_2020": [],
        "2021_2024": [],
    }
    user_words = {
        "before_2016": defaultdict(set),
        "2017_2020": defaultdict(set),
        "2021_2024": defaultdict(set),
    }
    user_comments = {
        "before_2016": defaultdict(list),
        "2017_2020": defaultdict(list),
        "2021_2024": defaultdict(list),
    }
    counts = {period: 0 for period in chunks.keys()}

    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
            author = row["author"]
            if author in {"AutoModerator", "election_info_bot"}:
                continue
            text = row["body"]
            created_timestamp = row["created_utc"]
            year = datetime.datetime.fromtimestamp(int(created_timestamp)).year
            # if year <= 2016:
            #     chunk_key = "before_2016"
            # elif 2017 <= year <= 2020:
            #     chunk_key = "2017_2020"
            # elif 2021 <= year <= 2024:
            #     chunk_key = "2021_2024"
            # else:
            #     continue
            if 2021 <= year <= 2024:
                chunk_key = "2021_2024"
            else:
                continue

            txt = re.sub(r'http\S+', '', text)
            txt = re.sub("[^A-Za-z']+", ' ', txt).lower()
            txt = re.sub(r"['\-]", ' ', txt)
            words = txt.split()
            if not words:
                continue
            tagged_words = nltk.pos_tag(words)
            processed_words = []
            for word, tag in tagged_words:
                if without_stopwords and word in stop_words:
                    continue
                if tag.startswith('J'):
                    wordnet_pos = 'a'
                elif tag.startswith('V'):
                    wordnet_pos = 'v'
                elif tag.startswith('N'):
                    wordnet_pos = 'n'
                elif tag.startswith('R'):
                    wordnet_pos = 'r'
                else:
                    wordnet_pos = 'n'
                lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
                processed_words.append(lemma)
                user_words[chunk_key][lemma].add(author)
            if processed_words:
                user_comments[chunk_key][author].append(processed_words)
                counts[chunk_key] += 1

    print("\n=== Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")

    # Filter words by user count and rebuild comments for each period
    for period in chunks.keys():
        valid_words = {w for w, users in user_words[period].items() if len(users) >= 5}
        filtered_comments = []
        for comments in user_comments[period].values():
            for comment in comments:
                filtered = [w for w in comment if w in valid_words]
                if filtered:
                    filtered_comments.append(filtered)
        print(f"{period}: {len(filtered_comments)} comments after filtering words by user count")
        if filtered_comments:
            print(f"\nExtracting bigrams for {period}...")
            phrases = Phrases(filtered_comments, 
                              min_count=phrases_min_count, 
                              threshold=100)
            bigram_model = Phraser(phrases)
            bigrammed_comments = [bigram_model[comment] for comment in filtered_comments]
            chunks[period] = bigrammed_comments
        else:
            chunks[period] = []

    # Train a Word2Vec model for each time period
    for period, comments in chunks.items():
        if len(comments) > 0:
            print(f"\n=== Training Word2Vec for {period} ({len(comments)} comments) ===")
            model = Word2Vec(
                vector_size=300,
                window=5,
                min_count=word2vec_min_count,
                workers=16
            )
            model.build_vocab(comments)
            print(f"Vocabulary size: {len(model.wv.index_to_key)}")
            model.train(
                comments,
                total_examples=len(comments),
                epochs=5
            )
            model_path = f"models/reddit_word2vec_{phrases_min_count}_{word2vec_min_count}_filterd_{party}_{period}.model"
            model.save(model_path)
            print(f"Model saved to {model_path}")

def main():
    build_word2vect_model(filePathforDemocrats, "democrats", without_stopwords=False, 
                          phrases_min_count=5, word2vec_min_count=5)
    build_word2vect_model(filePathforRepublican, "republican", without_stopwords=False, 
                          phrases_min_count=5, word2vec_min_count=5)
    build_word2vect_model(filePathforDemocrats, "democrats", without_stopwords=False, 
                          phrases_min_count=10, word2vec_min_count=10)
    build_word2vect_model(filePathforRepublican, "republican", without_stopwords=False, 
                          phrases_min_count=10, word2vec_min_count=10)
    build_word2vect_model(filePathforDemocrats, "democrats", without_stopwords=False, 
                          phrases_min_count=20, word2vec_min_count=20)
    build_word2vect_model(filePathforRepublican, "republican", without_stopwords=False, 
                          phrases_min_count=20, word2vec_min_count=20)
    build_word2vect_model(filePathforDemocrats, "democrats", without_stopwords=False, 
                          phrases_min_count=50, word2vec_min_count=50)
    build_word2vect_model(filePathforRepublican, "republican", without_stopwords=False, 
                          phrases_min_count=50, word2vec_min_count=50)
    print("Done :>")

if __name__ == "__main__":
    main()

Processing file datasets/democrats_comments.zst


Processing comments: 2011525it [07:46, 4314.33it/s] 



=== Comment Counts by Period ===
before_2016: 0 comments
2017_2020: 0 comments
2021_2024: 1343920 comments
before_2016: 0 comments after filtering words by user count
2017_2020: 0 comments after filtering words by user count
2021_2024: 1342207 comments after filtering words by user count

Extracting bigrams for 2021_2024...

=== Training Word2Vec for 2021_2024 (1342207 comments) ===
Vocabulary size: 28032
Model saved to models/reddit_word2vec_10_10_democrats_2021_2024.model
Processing file datasets/Republican_comments.zst


Processing comments: 1405486it [03:54, 6005.26it/s] 



=== Comment Counts by Period ===
before_2016: 0 comments
2017_2020: 0 comments
2021_2024: 617471 comments
before_2016: 0 comments after filtering words by user count
2017_2020: 0 comments after filtering words by user count
2021_2024: 616459 comments after filtering words by user count

Extracting bigrams for 2021_2024...

=== Training Word2Vec for 2021_2024 (616459 comments) ===
Vocabulary size: 20147
Model saved to models/reddit_word2vec_10_10_republican_2021_2024.model
Done :>
