In [None]:
import os
import pickle
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from gensim.models.phrases import Phraser
# from collections import Counter

# The two subreddit groups whose vocabularies are compared.
political = [
    "conservative",
    "liberal",
    "republican",
    "democrats",
]
non_political = [
    "books",
    "cooking",
    "gaming",
    "movies",
    "personalfinance",
    "travel",
    "technology",
]

# One phrase (bigram) model, loaded once and applied to every subreddit.
bigram_model_path = "../../models/bigram/all_bigram_1.phr"
bigram = Phraser.load(bigram_model_path)

def save_or_load_subreddit_doc(subreddit, base_dir="../../processed_comments/processed_comments_1", out_dir="bigdocs"):
    """Build (or reuse) the single-file text document for one subreddit.

    Every ``{subreddit}_batch*.pkl`` file under ``base_dir/subreddit`` is
    unpickled, each comment's ``processed_text`` token list is passed through
    the module-level ``bigram`` model, and the resulting tokens are written
    space-separated into ``out_dir/{subreddit}.txt``.

    If that output file already exists it is treated as a cache and returned
    without re-reading any batch.

    Args:
        subreddit: Subreddit name; used for the input folder, the batch file
            prefix and the output file name.
        base_dir: Directory holding one sub-folder of pickled batches per
            subreddit.
        out_dir: Directory for the aggregated documents (created if missing).

    Returns:
        Path of the aggregated text file.

    Raises:
        Whatever ``open`` / ``pickle.load`` / the bigram model raise. On any
        failure no output file is left behind, so the next call retries
        instead of reusing a truncated document.
    """
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{subreddit}.txt")
    if os.path.exists(out_path):
        print(f"Found existing {out_path}, skipping aggregation.")
        return out_path

    pattern = os.path.join(base_dir, subreddit, f"{subreddit}_batch*.pkl")
    files = sorted(glob(pattern))
    if not files:
        # Still produce the (empty) document as before, but make the likely
        # misconfiguration visible instead of failing silently downstream.
        print(f"Warning: no batch files matched {pattern}; {out_path} will be empty.")

    # Write to a temporary sibling and rename at the end: the existence of
    # out_path is the cache test above, so it must never be a partial file.
    tmp_path = out_path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf8") as fout:
            for file in files:
                # NOTE(security): pickle.load can execute arbitrary code;
                # only point base_dir at batches produced by a trusted pipeline.
                with open(file, "rb") as f:
                    batch = pickle.load(f)
                for comment in batch:
                    tokens = comment.get("processed_text", [])
                    bigram_tokens = bigram[tokens]
                    fout.write(" ".join(bigram_tokens) + " ")
        os.replace(tmp_path, out_path)
    except BaseException:
        # Remove the partial temp file, then let the original error propagate.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return out_path

# Political subreddits come first, then the non-political ones; doc_files
# is filled in this same order.
subreddits = [*political, *non_political]
doc_files = []
for sub in subreddits:
    print(f"Handling {sub}...")
    doc_files += [save_or_load_subreddit_doc(sub)]

# total_counter = Counter()
# for doc_file in doc_files:
#     with open(doc_file, encoding="utf8") as f:
#         tokens = f.read().split()
#         print(f"{doc_file} has {len(tokens)} tokens.")
#         total_counter.update(tokens)

# min_total_freq = 100 
# vocab = [word for word, freq in total_counter.items() if freq >= min_total_freq]

# Fit a single TF-IDF model over the per-subreddit documents. With
# input='filename' the vectorizer reads each path in doc_files itself.
vectorizer = TfidfVectorizer(input='filename', min_df=5, max_features=20000)
tfidf_matrix = vectorizer.fit_transform(doc_files)
words = vectorizer.get_feature_names_out()

# Positions of each group's subreddits within `subreddits`, used below to
# select the matching rows of the TF-IDF matrix.
political_idx = list(map(subreddits.index, political))
non_political_idx = list(map(subreddits.index, non_political))

# Per-term mean weight inside each group, flattened to a 1-D array.
political_rows = tfidf_matrix[political_idx]
non_political_rows = tfidf_matrix[non_political_idx]
mean_tfidf_political = political_rows.mean(axis=0).A1
mean_tfidf_nonpolitical = non_political_rows.mean(axis=0).A1

# Positive values: the term weighs more, on average, in the political group.
diff = mean_tfidf_political - mean_tfidf_nonpolitical
df = pd.DataFrame({"word": words, "tfidf_diff": diff}).sort_values(
    "tfidf_diff", ascending=False
)

# Persist the full ranking and show the 50 highest-scoring terms.
os.makedirs("output", exist_ok=True)
df.to_csv("output/political_words_tfidf_diff.csv", index=False)

print(df.head(50))

Handling conservative...
Found existing bigdocs\conservative.txt, skipping aggregation.
Handling liberal...
Found existing bigdocs\liberal.txt, skipping aggregation.
Handling republican...
Found existing bigdocs\republican.txt, skipping aggregation.
Handling democrats...
Found existing bigdocs\democrats.txt, skipping aggregation.
Handling books...
Found existing bigdocs\books.txt, skipping aggregation.
Handling cooking...
Found existing bigdocs\cooking.txt, skipping aggregation.
Handling gaming...
Found existing bigdocs\gaming.txt, skipping aggregation.
Handling movies...
Found existing bigdocs\movies.txt, skipping aggregation.
Handling personalfinance...
Found existing bigdocs\personalfinance.txt, skipping aggregation.
Handling travel...
Found existing bigdocs\travel.txt, skipping aggregation.
Handling technology...
Found existing bigdocs\technology.txt, skipping aggregation.
bigdocs\conservative.txt has 213976957 tokens.
bigdocs\liberal.txt has 7990796 tokens.
bigdocs\republican.txt 