In [None]:
import os
import pickle
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from gensim.models.phrases import Phraser

# Subreddit names split into the two groups whose TF-IDF profiles are compared.
political = [
    "conservative",
    "liberal",
    "republican",
    "democrats",
]
non_political = [
    "books",
    "cooking",
    "gaming",
    "movies",
    "personalfinance",
    "travel",
    "technology",
]

# The phrase (bigram) model is loaded once at module level;
# save_or_load_subreddit_doc reads `bigram` as a global.
bigram_model_path = "models/bigram/political_bigram_1.phr"
bigram = Phraser.load(bigram_model_path)

def save_or_load_subreddit_doc(subreddit, base_dir="processed_comments_1", out_dir="bigdocs"):
    """Aggregate all comments of one subreddit into a single text file.

    Every ``{base_dir}/{subreddit}/{subreddit}_batch*.pkl`` file is unpickled,
    each comment's ``"processed_text"`` tokens are passed through the
    module-level ``bigram`` model, and the resulting tokens are appended to
    ``{out_dir}/{subreddit}.txt`` separated by single spaces.

    If the output file already exists it is reused as a cache and nothing is
    recomputed. The file is written to a temporary sibling first and only
    renamed into place once aggregation has finished, so an interrupted or
    failed run never leaves a truncated document that a later run would
    mistake for a complete one.

    Args:
        subreddit: Subreddit name; used for both the input sub-directory and
            the output file name.
        base_dir: Directory containing one sub-directory of pickled batches
            per subreddit.
        out_dir: Directory where the aggregated document is stored (created
            if missing).

    Returns:
        Path of the aggregated text file.

    Raises:
        Whatever ``open`` / ``pickle.load`` raise for an unreadable or corrupt
        batch file; in that case no output file is left behind.
    """
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{subreddit}.txt")
    if os.path.exists(out_path):
        print(f"Found existing {out_path}, skipping aggregation.")
        return out_path

    pattern = os.path.join(base_dir, subreddit, f"{subreddit}_batch*.pkl")
    files = sorted(glob(pattern))
    if not files:
        # An empty document is still produced (as before), but make it visible.
        print(f"Warning: no batch files match {pattern}; {out_path} will be empty.")

    # Write next to the final location so os.replace stays on one filesystem.
    tmp_path = f"{out_path}.partial"
    try:
        with open(tmp_path, "w", encoding="utf8") as fout:
            for file in files:
                # NOTE: pickle.load executes arbitrary code from the file;
                # only point base_dir at batches produced by a trusted pipeline.
                with open(file, "rb") as f:
                    batch = pickle.load(f)
                for comment in batch:
                    # Presumably a list of string tokens; missing key -> no tokens.
                    tokens = comment.get("processed_text", [])
                    bigram_tokens = bigram[tokens]
                    fout.write(" ".join(bigram_tokens) + " ")
        # Publish the finished document in one step.
        os.replace(tmp_path, out_path)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial file, then re-raise.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
        raise
    return out_path

# Build (or reuse) one aggregated document per subreddit, political ones first.
subreddits = political + non_political
doc_files = []
for sub in subreddits:
    print(f"Saving {sub}...")
    doc_files += [save_or_load_subreddit_doc(sub)]

# Each subreddit file is one document; the vectorizer reads them from disk.
vectorizer = TfidfVectorizer(input="filename", min_df=3, max_features=20000)
tfidf_matrix = vectorizer.fit_transform(doc_files)
words = vectorizer.get_feature_names_out()

# Row positions of each group inside tfidf_matrix (rows follow `subreddits`).
political_idx = list(map(subreddits.index, political))
non_political_idx = list(map(subreddits.index, non_political))

# Per-term mean TF-IDF within each group, flattened to 1-D arrays.
mean_tfidf_political = tfidf_matrix[political_idx].mean(axis=0).A1
mean_tfidf_nonpolitical = tfidf_matrix[non_political_idx].mean(axis=0).A1

# Positive values mean the term has a higher mean TF-IDF in political subreddits.
diff = mean_tfidf_political - mean_tfidf_nonpolitical
df = pd.DataFrame({"word": words, "tfidf_diff": diff}).sort_values(
    "tfidf_diff", ascending=False
)

os.makedirs("output", exist_ok=True)
df.to_csv("output/political_words_tfidf_diff.csv", index=False)

print(df.head(50))