In [1]:
import glob
import pickle
from gensim.models.phrases import Phrases, Phraser
import os

def train_and_save_global_bigram_model(subreddits, base_data_dir, output_path, min_count=10, threshold=10):
    phrases = Phrases(min_count=min_count, threshold=threshold)
    total_sentences = 0

    for subreddit in subreddits:
        pattern = f"{base_data_dir}/{subreddit}/{subreddit}_batch*.pkl"
        files = sorted(glob.glob(pattern))
        print(f"Pattern: {files}")
        print(f"Loading {len(files)} files for subreddit: {subreddit}")
        for file_path in files:
            try:
                with open(file_path, "rb") as f:
                    comments = pickle.load(f)
                batch_sentences = [
                    comment["processed_text"]
                    for comment in comments
                    if "processed_text" in comment
                ]
                phrases.add_vocab(batch_sentences)
                total_sentences += len(batch_sentences)
            except Exception as e:
                print(f"Error loading {file_path}: {e}")

    print(f"Total sentences for bigram training: {total_sentences}")
    bigram_model = Phraser(phrases)
    bigram_model.save(output_path)
    print(f"Global bigram model saved to {output_path}")

subreddits = ["democrats", "republican", "conservative", "liberal", "technology", "cooking", "movies", "books", "personalfinance", "travel"]

output_dir = "models/bigram"
os.makedirs(output_dir, exist_ok=True)
output_path = f"{output_dir}/all_bigram_1.phr"
base_data_dir = "processed_comments_1"

train_and_save_global_bigram_model(
    subreddits,
    base_data_dir=base_data_dir,
    output_path=output_path,
    min_count=10,
    threshold=10
)

Pattern: ['processed_comments_1/democrats\\democrats_batch1.pkl', 'processed_comments_1/democrats\\democrats_batch2.pkl']
Loading 2 files for subreddit: democrats
Pattern: ['processed_comments_1/republican\\republican_batch1.pkl', 'processed_comments_1/republican\\republican_batch2.pkl']
Loading 2 files for subreddit: republican
Pattern: ['processed_comments_1/conservative\\conservative_batch1.pkl', 'processed_comments_1/conservative\\conservative_batch10.pkl', 'processed_comments_1/conservative\\conservative_batch11.pkl', 'processed_comments_1/conservative\\conservative_batch12.pkl', 'processed_comments_1/conservative\\conservative_batch13.pkl', 'processed_comments_1/conservative\\conservative_batch14.pkl', 'processed_comments_1/conservative\\conservative_batch15.pkl', 'processed_comments_1/conservative\\conservative_batch16.pkl', 'processed_comments_1/conservative\\conservative_batch17.pkl', 'processed_comments_1/conservative\\conservative_batch18.pkl', 'processed_comments_1/conserva