In [1]:
import gensim

model_rep = gensim.models.Word2Vec.load("../data/models/republican_2017_2020.model")
model_dem = gensim.models.Word2Vec.load("../data/models/democrats_2017_2020.model")
target_words = ["freedom", "illegal", "gun", "border"] # Add the words you want to show

print(f"{'Word':<15} | {'Republican Neighbors':<30} | {'Democrat Neighbors':<30}")
print("-" * 80)

for word in target_words:
    try:
        # Get top 3 neighbors
        rep_sims = model_rep.wv.most_similar(word, topn=3)
        dem_sims = model_dem.wv.most_similar(word, topn=3)
        
        # Extract just the words
        rep_neighbors = ", ".join([w[0] for w in rep_sims])
        dem_neighbors = ", ".join([w[0] for w in dem_sims])
        
        print(f"{word:<15} | {rep_neighbors:<30} | {dem_neighbors:<30}")
    except KeyError:
        print(f"{word} not found in vocabulary")

Word            | Republican Neighbors           | Democrat Neighbors            
--------------------------------------------------------------------------------
freedom         | liberty, freedom_speech, freedom_religion | liberty, freedom_speech, free_speech
illegal         | legal, unlawful, possession    | legal, felony, legalize       
gun             | firearm, weapon, rifle         | firearm, rifle, weapon        
border          | southern_border, boarder, border_illegally | boarder, southern_border, border_illegally


In [5]:
import pickle

# Adjust the path if your file is in a different location
file_path = "../data/processed_comments/wikipedia/wikipedia.pkl"

try:
    with open(file_path, "rb") as f:
        data = pickle.load(f)
        
    print(f"Total entries: {len(data)}")
    print("Type of data:", type(data))
    
    # Check the first 5 examples
    print("\n--- First 5 Examples ---")
    for i, example in enumerate(data[:5]):
        print(f"Entry {i}: {example}")

except FileNotFoundError:
    print(f"File not found at: {file_path}")

Total entries: 1000000
Type of data: <class 'list'>

--- First 5 Examples ---
Entry 0: ['the', 'los', 'angeles', 'film', 'critic', 'association', 'award', 'announce', 'december', 'and', 'give', 'january', 'winner', 'best', 'picture', 'bugsy', 'runner', 'the', 'fisher', 'king', 'best', 'director', 'barry', 'levinson', 'bugsy', 'runner', 'terry', 'gilliam', 'the', 'fisher', 'king', 'best', 'actor', 'nick', 'nolte', 'the', 'prince', 'tide', 'runner', 'warren', 'beatty', 'bugsy', 'best', 'actress', 'mercedes', 'ruehl', 'the', 'fisher', 'king', 'runner', 'jodie', 'foster', 'the', 'silence', 'the', 'lamb', 'best', 'support', 'actor', 'michael', 'lerner', 'barton', 'fink', 'runner', 'robert', 'duvall', 'rambling', 'rise', 'best', 'support', 'actress', 'jane', 'horrocks', 'life', 'sweet', 'runner', 'amanda', 'plummer', 'the', 'fisher', 'king', 'best', 'screenplay', 'jam', 'toback', 'bugsy', 'runner', 'richard', 'lagravenese', 'the', 'fisher', 'king', 'best', 'cinematography', 'roger', 'deakins

In [6]:
from collections import Counter
import statistics

list_lengths = [len(item) for item in data if isinstance(item, (list, tuple))]
if not list_lengths:
    print("No inner lists/tuples found in `data`.")
else:
    print(f"count: {len(list_lengths)}")
    print(f"avg length: {statistics.mean(list_lengths):.2f}")
    print(f"median length: {statistics.median(list_lengths)}")
    print(f"min length: {min(list_lengths)}, max length: {max(list_lengths)}")
    print("top length counts:", Counter(list_lengths).most_common(10))

count: 1000000
avg length: 368.42
median length: 163.0
min length: 0, max length: 36981
top length counts: [(34, 6135), (20, 6003), (35, 5831), (31, 5815), (32, 5796), (33, 5794), (36, 5762), (26, 5753), (28, 5737), (29, 5726)]


In [7]:
import pickle

# Adjust the path if your file is in a different location
file_path = "../data/processed_comments/democrats/democrats_batch1.pkl"

try:
    with open(file_path, "rb") as f:
        data = pickle.load(f)
        
    print(f"Total entries: {len(data)}")
    print("Type of data:", type(data))
    
    # Check the first 5 examples
    print("\n--- First 5 Examples ---")
    for i, example in enumerate(data[:5]):
        print(f"Entry {i}: {example}")

except FileNotFoundError:
    print(f"File not found at: {file_path}")

Total entries: 1000000
Type of data: <class 'list'>

--- First 5 Examples ---
Entry 0: {'comment_id': 'c07p2u0', 'author': 'Garak', 'date': '2009-02-16', 'timestamp': '1234791099', 'processed_text': ['and', 'they', 'have', 'allow', 'legend', 'grow', 'ill', 'and', 'mythical', 'proportion', 'lie', 'about', 'funding', 'for', 'acorn', 'which', 'nowhere', 'mention', 'the', 'bill', 'gross', 'and', 'unanswered', 'mccain', 'about', 'the', 'honey', 'bee', 'insurance', 'provision', 'this', 'great', 'point', 'democrat', 'have', 'this', 'habit', 'let', 'the', 'republican', 'not', 'only', 'control', 'the', 'conversation', 'but', 'reduce', 'fourth', 'grade', 'level', 'honey', 'bee', 'insurance', 'insurance', 'for', 'livestock', 'producer', 'general', 'include', 'honeybee', 'because', 'they', 'very', 'important', 'and', 'have', 'have', 'rough', 'few', 'year', 'fruit', 'fly', 'research', 'genetics', 'research'], 'original': '&gt;  And they have allowed its legend to grow to ill and mythical proportion