In [None]:
# For streaming
import sys
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
    raise RuntimeError("This script requires Python 3.10 or higher")
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog

# For processing
import gensim
from gensim.models import Word2Vec
import re
import logging
from tqdm import tqdm
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
    
import datetime
import random
import numpy as np


# Input comment dumps, one .zst file per subreddit. main() passes each of the
# four paths to processFile together with a short label for that subreddit.
filePathforDemocrats = r"datasets/democrats_comments.zst"
filePathforRepublican = r"datasets/Republican_comments.zst"
filePathforBackpacking = r"datasets/backpacking_comments.zst"
filePathforvagabond = r"datasets/vagabond_comments.zst"
# NOTE(review): `recursive` is not referenced anywhere in the visible code.
recursive = False

In [10]:
def _period_for_year(year):
    """Return the time-period bucket for a calendar year, or None if it fits no bucket."""
    if year <= 2016:
        return "before_2016"
    if year <= 2020:
        return "2017_2020"
    if year <= 2024:
        return "2021_2024"
    return None


def _wordnet_pos(tag):
    """Map a Penn Treebank POS tag to the WordNet POS letter; unknown tags become nouns."""
    return {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}.get(tag[:1], 'n')


def processFile(path, party, without_stopwords=True):
    """Train and save one Word2Vec model per time period from a comment dump.

    Each row of the dump is cleaned (URLs stripped, lowercased, punctuation
    removed), POS-tagged, lemmatized and placed in one of three buckets
    according to the year of its ``created_utc`` timestamp. For every
    non-empty bucket a bigram model is applied and a Word2Vec model is trained
    and written to
    ``models/models_distribution/reddit_word2vec_{party}_{period}.model``.

    Args:
        path: Path of the comment dump; it is opened in binary mode and handed
            to ``getFileJsonStream``.
        party: Label embedded in the saved model file names.
        without_stopwords: When true, NLTK English stop words are dropped
            before lemmatization.

    Returns:
        None. The function returns early (after printing a message) when
        ``getFileJsonStream`` yields no stream for ``path``.
    """
    # Set seeds for reproducibility
    random.seed(23)
    np.random.seed(23)

    print(f"Processing file {path}")

    # Download necessary NLTK resources
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)  # For POS tagging

    # Initialize lemmatizer and stop words
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Compile the cleaning patterns once instead of once per comment
    url_pattern = re.compile(r'http\S+')
    non_alnum_pattern = re.compile("[^A-Za-z0-9']+")
    separator_pattern = re.compile(r"['\-_]")

    # Tokenized comments for each time period
    chunks = {
        "before_2016": [],
        "2017_2020": [],
        "2021_2024": [],
    }

    # Track counts
    counts = {period: 0 for period in chunks}
    skipped_out_of_range = 0

    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row:
                continue

            # Interpret the epoch timestamp in UTC so the year (and therefore
            # the bucket) does not depend on the machine's local timezone.
            year = datetime.datetime.fromtimestamp(
                int(row["created_utc"]), tz=datetime.timezone.utc
            ).year

            # Comments newer than the last bucket have nowhere to go; skip
            # them instead of failing later with a KeyError on chunks[None].
            chunk_key = _period_for_year(year)
            if chunk_key is None:
                skipped_out_of_range += 1
                continue

            # Remove URLs
            txt = url_pattern.sub('', row["body"])

            # Remove non-alphanumeric characters and convert to lowercase
            txt = non_alnum_pattern.sub(' ', txt).lower()

            # Replace the remaining apostrophes (and any '-'/'_') with spaces
            txt = separator_pattern.sub(' ', txt)

            # Tokenize
            words = txt.split()

            # Skip empty comments before paying for POS tagging
            if not words:
                continue

            # Tag words with parts of speech for better lemmatization
            processed_words = []
            for word, tag in nltk.pos_tag(words):
                if without_stopwords and word in stop_words:
                    continue
                # Lemmatize with the POS derived from the Penn Treebank tag
                processed_words.append(
                    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
                )

            # Add to appropriate chunk if it has words
            if processed_words:
                chunks[chunk_key].append(processed_words)
                counts[chunk_key] += 1

    # Print statistics
    print("\n=== Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")
    if skipped_out_of_range:
        print(f"Skipped {skipped_out_of_range} comments dated after 2024")

    # Extract bigrams from each time period
    for period, comments in chunks.items():
        if comments:
            print(f"\nExtracting bigrams for {period}...")
            # Build bigram model
            phrases = Phrases(comments, min_count=5, threshold=10)
            bigram_model = Phraser(phrases)

            # Replace the bucket with its bigram-merged version (only the value
            # of an existing key changes, which is safe while iterating)
            chunks[period] = [bigram_model[comment] for comment in comments]

    # Train a Word2Vec model for each time period
    for period, comments in chunks.items():
        if comments:
            print(f"\n=== Training Word2Vec for {period} ({len(comments)} comments) ===")

            # NOTE(review): the gensim documentation states that a fully
            # reproducible run also needs workers=1 (and a fixed
            # PYTHONHASHSEED); with 16 workers the seed alone does not
            # guarantee identical vectors between runs.
            model = Word2Vec(
                vector_size=300,
                window=5,
                min_count=5,
                workers=16,
                seed=23
            )

            # Build vocabulary
            model.build_vocab(comments)
            print(f"Vocabulary size: {len(model.wv.index_to_key)}")

            # Train the model
            model.train(
                comments,
                total_examples=len(comments),
                epochs=5
            )

            # Save the model, creating the output directory on first use
            model_path = f"models/models_distribution/reddit_word2vec_{party}_{period}.model"
            os.makedirs(os.path.dirname(model_path), exist_ok=True)
            model.save(model_path)
            print(f"Model saved to {model_path}")

In [11]:
def main():
    """Run processFile over every configured comment dump, then report completion."""
    jobs = (
        (filePathforDemocrats, "democrats"),
        (filePathforRepublican, "republican"),
        (filePathforBackpacking, "backpacking"),
        (filePathforvagabond, "vagabond"),
    )
    # Dumps are processed one after another, in the order listed above
    for dump_path, label in jobs:
        processFile(dump_path, label)
    print("Done :>")


if __name__ == "__main__":
    main()

NameError: name 'np' is not defined

In [None]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

def compare_party_embeddings_by_period(dem_model, rep_model, time_period, output_file=None):
    """Compare two word2vec models word by word after aligning their vector spaces.

    The vectors of the words present in both models are aligned with an
    orthogonal Procrustes rotation that maps ``rep_model``'s space onto
    ``dem_model``'s space; the cosine similarity between each word's two
    aligned vectors is then reported.

    Args:
        dem_model: Model exposing ``wv.index_to_key``, ``wv[word]`` and
            ``wv.most_similar(word, topn=...)``.
        rep_model: Second model with the same interface.
        time_period: Label stored in the ``time_period`` column of the result.
        output_file: Optional CSV path; when given, the result is written
            there without the index.

    Returns:
        A DataFrame with columns ``word``, ``similarity``, ``dem_context``,
        ``rep_context`` and ``time_period``, sorted by ascending similarity
        (most different words first). It is empty when the two models share
        no vocabulary.
    """
    # Find common vocabulary; sorting makes the row order independent of
    # set iteration order
    common_vocab = sorted(
        set(dem_model.wv.index_to_key) & set(rep_model.wv.index_to_key)
    )

    if common_vocab:
        # Extract embeddings for common words
        vectors_dem = np.array([dem_model.wv[word] for word in common_vocab])
        vectors_rep = np.array([rep_model.wv[word] for word in common_vocab])

        # Orthogonal Procrustes: the rotation R minimising
        # ||vectors_rep @ R - vectors_dem|| is U @ Vt for the SVD of
        # vectors_rep.T @ vectors_dem. (Using vectors_dem.T @ vectors_rep
        # instead yields the inverse rotation, which maps dem onto rep.)
        u, _, vt = np.linalg.svd(vectors_rep.T @ vectors_dem)
        vectors_rep_aligned = vectors_rep @ (u @ vt)

        # Row-wise cosine similarity; zero-length vectors get similarity 0
        dots = np.sum(vectors_dem * vectors_rep_aligned, axis=1)
        norms = (np.linalg.norm(vectors_dem, axis=1)
                 * np.linalg.norm(vectors_rep_aligned, axis=1))
        norms[norms == 0] = 1.0
        similarities = dots / norms
    else:
        # No shared words: nothing to align, return an empty table
        similarities = np.array([], dtype=float)

    df = pd.DataFrame({'word': common_vocab, 'similarity': similarities})

    def nearest_neighbours(model, word):
        """Return the ten nearest neighbours of word as a comma-separated string."""
        try:
            neighbours = model.wv.most_similar(word, topn=10)
        except KeyError:
            # Best effort: a word the model cannot look up gets no context
            return ""
        return ", ".join(w for w, _ in neighbours)

    # Add context data - most similar words in each model
    df['dem_context'] = [nearest_neighbours(dem_model, word) for word in common_vocab]
    df['rep_context'] = [nearest_neighbours(rep_model, word) for word in common_vocab]
    df['time_period'] = time_period

    # Sort by similarity (most different words first)
    df = df.sort_values('similarity')

    # Save to file if requested
    if output_file:
        df.to_csv(output_file, index=False)
        print(f"Results saved to {output_file}")

    return df


In [None]:
# Load the saved per-period models of both parties
model_democrats_before_2016 = Word2Vec.load(
    "models/models_distribution/reddit_word2vec_democrats_before_2016.model"
)
model_republican_before_2016 = Word2Vec.load(
    "models/models_distribution/reddit_word2vec_republican_before_2016.model"
)
model_democrats_2017_2020 = Word2Vec.load(
    "models/models_distribution/reddit_word2vec_democrats_2017_2020.model"
)
model_republican_2017_2020 = Word2Vec.load(
    "models/models_distribution/reddit_word2vec_republican_2017_2020.model"
)
model_democrats_2021_2024 = Word2Vec.load(
    "models/models_distribution/reddit_word2vec_democrats_2021_2024.model"
)
model_republican_2021_2024 = Word2Vec.load(
    "models/models_distribution/reddit_word2vec_republican_2021_2024.model"
)

# Compare the two parties within each period and write one CSV per period

# Before 2016
df_before_2016 = compare_party_embeddings_by_period(
    dem_model=model_democrats_before_2016,
    rep_model=model_republican_before_2016,
    time_period="before_2016",
    output_file="output/output_distribution/party_comparison_before_2016.csv",
)

# 2017-2020
df_2017_2020 = compare_party_embeddings_by_period(
    dem_model=model_democrats_2017_2020,
    rep_model=model_republican_2017_2020,
    time_period="2017_2020",
    output_file="output/output_distribution/party_comparison_2017_2020.csv",
)

# 2021-2024
df_2021_2024 = compare_party_embeddings_by_period(
    dem_model=model_democrats_2021_2024,
    rep_model=model_republican_2021_2024,
    time_period="2021_2024",
    output_file="output/output_distribution/party_comparison_2021_2024.csv",
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the three per-period comparison tables from disk, oldest period first
df_before_2016, df_2017_2020, df_2021_2024 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "output/output_distribution/party_comparison_before_2016.csv",
        "output/output_distribution/party_comparison_2017_2020.csv",
        "output/output_distribution/party_comparison_2021_2024.csv",
    )
)



def analyze_similarity_distributions():
    """Plot and summarise how the cosine-similarity distribution changes over time.

    Reads the module-level DataFrames ``df_before_2016``, ``df_2017_2020`` and
    ``df_2021_2024`` (each must have a ``similarity`` column), draws one
    histogram per period plus a combined overlay, shows the figure, and prints
    summary statistics for each period.

    Returns:
        A dict mapping each period name to a dict with the keys ``mean``,
        ``median``, ``std``, ``min``, ``max``, ``n_words``, ``n_below_0`` and
        ``n_below_minus_0_1`` - the same figures that are printed.
    """
    # Prepare data
    dataframes = [df_before_2016, df_2017_2020, df_2021_2024]
    period_names = ['Before 2016', '2017-2020', '2021-2024']
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

    # Create the plot
    plt.figure(figsize=(18, 12))

    # Main distribution plots: one panel per period on the top row
    for i, (df, period, color) in enumerate(zip(dataframes, period_names, colors)):
        plt.subplot(2, 3, i+1)

        # Create histogram
        plt.hist(df['similarity'], bins=50, alpha=0.7, density=True,
                 color=color)

        # Mark the mean and median on the histogram
        mean_sim = df['similarity'].mean()
        median_sim = df['similarity'].median()

        plt.axvline(mean_sim, color='red', linestyle='--', linewidth=2,
                    label=f'Mean: {mean_sim:.3f}')
        plt.axvline(median_sim, color='purple', linestyle=':', linewidth=2,
                    label=f'Median: {median_sim:.3f}')

        plt.title(f'Similarity Distribution: {period}', fontsize=14, fontweight='bold')
        plt.xlabel('Cosine Similarity', fontsize=12)
        plt.ylabel('Density', fontsize=12)
        plt.legend()
        plt.grid(True, alpha=0.3)

    # Combined comparison plot: all periods overlaid in the first panel of row two
    plt.subplot(2, 3, 4)
    for df, period, color in zip(dataframes, period_names, colors):
        plt.hist(df['similarity'], bins=30, alpha=0.6, density=True,
                 label=period, color=color)

    plt.title('Similarity Distributions Comparison', fontsize=14, fontweight='bold')
    plt.xlabel('Cosine Similarity', fontsize=12)
    plt.ylabel('Density', fontsize=12)
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print detailed statistics and collect them for the caller
    print("\n" + "="*60)
    print("COSINE SIMILARITY DISTRIBUTION ANALYSIS")

    stats = {}
    for df, period in zip(dataframes, period_names):
        sims = df['similarity']
        below_zero = sims < 0.0
        below_minus_tenth = sims < -0.1

        print(f"\n{period} Statistics:")
        print(f"  Mean similarity: {sims.mean():.4f}")
        print(f"  Median similarity: {sims.median():.4f}")
        print(f"  Std similarity: {sims.std():.4f}")
        print(f"  Min similarity: {sims.min():.4f}")
        print(f"  Max similarity: {sims.max():.4f}")
        print(f"  Words with similarity < 0.0: {below_zero.sum()}/{len(df)} ({below_zero.mean()*100:.1f}%)")
        print(f"  Words with similarity < -0.1: {below_minus_tenth.sum()}/{len(df)} ({below_minus_tenth.mean()*100:.1f}%)")

        stats[period] = {
            'mean': sims.mean(),
            'median': sims.median(),
            'std': sims.std(),
            'min': sims.min(),
            'max': sims.max(),
            'n_words': len(df),
            'n_below_0': int(below_zero.sum()),
            'n_below_minus_0_1': int(below_minus_tenth.sum()),
        }

    # Previously nothing was returned, so the caller's `similarity_stats` was None
    return stats


# Run the analysis: shows the histogram figure and prints the per-period summary.
# Whatever the function returns is kept in similarity_stats.
similarity_stats = analyze_similarity_distributions()