In [6]:
import numpy as np
import pandas as pd
import gensim
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import os

def compare_subreddit_embeddings_by_period(model1, model2, subreddit1_name, subreddit2_name, time_period, output_file=None):
    """Compare two word2vec models word by word after aligning their spaces.

    The vectors of the vocabulary shared by both models are extracted,
    model2's vectors are rotated onto model1's with orthogonal Procrustes,
    and the cosine similarity between each word's two vectors is computed.

    Args:
        model1: Reference model. Its ``wv`` attribute must expose
            ``index_to_key`` and word lookup via ``wv[word]``; an object
            exposing those two directly (no ``wv`` attribute) also works.
        model2: Model whose vectors are rotated onto model1's space; same
            requirements as ``model1``.
        subreddit1_name: Label for model1, used only in the printed summary.
        subreddit2_name: Label for model2, used only in the printed summary.
        time_period: Label for the period, used only in the printed summary.
        output_file: Optional CSV path. Missing parent directories are
            created; a bare file name is written to the current directory.

    Returns:
        A DataFrame with columns ``word`` and ``similarity``, sorted by
        ascending similarity (most different words first). It is empty when
        the models share no vocabulary.
    """
    wv1 = getattr(model1, "wv", model1)
    wv2 = getattr(model2, "wv", model2)

    # Find common vocabulary; sorted so that row order (and the order of
    # ties after sorting by similarity) is reproducible between runs.
    common_vocab = sorted(set(wv1.index_to_key) & set(wv2.index_to_key))

    print(f"{time_period}: Common vocabulary between {subreddit1_name} and {subreddit2_name}: {len(common_vocab)} words")

    if common_vocab:
        # Extract embeddings for common words (rows follow common_vocab order)
        vectors1 = np.array([wv1[word] for word in common_vocab], dtype=np.float64)
        vectors2 = np.array([wv2[word] for word in common_vocab], dtype=np.float64)

        # Orthogonal Procrustes: the rotation R minimising
        # ||vectors2 @ R - vectors1||_F is U @ Vt, where U, S, Vt is the SVD
        # of vectors2.T @ vectors1. Taking the SVD of vectors1.T @ vectors2
        # instead would give the inverse rotation (model1 -> model2).
        u, _, vt = np.linalg.svd(vectors2.T @ vectors1)
        vectors2_aligned = vectors2 @ (u @ vt)

        # Row-wise cosine similarity; a zero vector yields similarity 0
        # rather than a division by zero.
        dots = np.einsum("ij,ij->i", vectors1, vectors2_aligned)
        norms = np.linalg.norm(vectors1, axis=1) * np.linalg.norm(vectors2_aligned, axis=1)
        similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
    else:
        # No shared words: nothing to align, return an empty result
        similarities = np.array([], dtype=np.float64)

    df = pd.DataFrame({'word': common_vocab, 'similarity': similarities})

    # Sort by similarity (most different words first); stable sort keeps the
    # alphabetical order among equal similarities.
    df = df.sort_values('similarity', kind='mergesort')

    # Save to file if requested
    if output_file:
        # os.makedirs("") raises, so only create a directory when the path
        # actually has a directory component.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"Results saved to {output_file}")

    return df

# Load models
print("Loading models...")

MODEL_DIR = "models/yearly_models"
YEARS = range(2018, 2025)  # 2018 through 2024 inclusive

REPUBLICAN_MODEL_PREFIX = "republican"
# NOTE(review): the models bound to the democrats_* names are loaded from
# conservative_<year>.model files, while the printed labels and the output
# file names say "democrats". The paths are kept exactly as they were;
# confirm whether this prefix or the labels below should change.
DEMOCRATS_MODEL_PREFIX = "conservative"


def _load_yearly_models(prefix):
    """Load one saved Word2Vec model per year in YEARS, keyed by year."""
    return {
        year: gensim.models.Word2Vec.load(f"{MODEL_DIR}/{prefix}_{year}.model")
        for year in YEARS
    }


# Democrats vs Republicans
republican_models = _load_yearly_models(REPUBLICAN_MODEL_PREFIX)
democrats_models = _load_yearly_models(DEMOCRATS_MODEL_PREFIX)

# Keep the individual per-year names bound, in case other cells use them.
(republican_2018, republican_2019, republican_2020, republican_2021,
 republican_2022, republican_2023, republican_2024) = (
    republican_models[year] for year in YEARS
)
(democrats_2018, democrats_2019, democrats_2020, democrats_2021,
 democrats_2022, democrats_2023, democrats_2024) = (
    democrats_models[year] for year in YEARS
)

# Define output directory
output_dir = "output/subreddit_comparisons_yearly"
os.makedirs(output_dir, exist_ok=True)

print("\nComparing models:")

# One comparison (and one CSV) per year, keyed by year.
comparisons = {
    year: compare_subreddit_embeddings_by_period(
        democrats_models[year],
        republican_models[year],
        "democrats",
        "republicans",
        str(year),
        output_file=f"{output_dir}/democrats_vs_republican_{year}.csv",
    )
    for year in YEARS
}

# Keep the individual per-year result names bound as well.
(df_2018, df_2019, df_2020, df_2021,
 df_2022, df_2023, df_2024) = (comparisons[year] for year in YEARS)

# Report the real number of comparisons instead of a hard-coded count
# (the previous message said 8 although 7 comparisons are run).
print(f"\nAnalysis complete. Generated {len(comparisons)} comparison files.")

Loading models...

Comparing models:
2018: Common vocabulary between democrats and republicans: 9090 words
Results saved to output/subreddit_comparisons_yearly/democrats_vs_republican_2018.csv
2019: Common vocabulary between democrats and republicans: 8257 words
Results saved to output/subreddit_comparisons_yearly/democrats_vs_republican_2019.csv
2020: Common vocabulary between democrats and republicans: 12961 words
Results saved to output/subreddit_comparisons_yearly/democrats_vs_republican_2020.csv
2021: Common vocabulary between democrats and republicans: 13131 words
Results saved to output/subreddit_comparisons_yearly/democrats_vs_republican_2021.csv
2022: Common vocabulary between democrats and republicans: 12715 words
Results saved to output/subreddit_comparisons_yearly/democrats_vs_republican_2022.csv
2023: Common vocabulary between democrats and republicans: 10852 words
Results saved to output/subreddit_comparisons_yearly/democrats_vs_republican_2023.csv
2024: Common vocabulary