In [2]:
import re
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from fileStreams import getFileJsonStream

# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocess text the same way as in model training
def preprocess_text(text, without_stopwords=False):
    # Remove URLs
    txt = re.sub(r'http\S+', '', text)
    
    # Remove non-alphanumeric characters and convert to lowercase
    txt = re.sub("[^A-Za-z']+", ' ', txt).lower()
    
    # Replace special characters with spaces
    txt = re.sub(r"['\-_]", ' ', txt)
    
    # Tokenize
    words = txt.split()
    
    # Tag words with parts of speech for better lemmatization
    tagged_words = nltk.pos_tag(words)
    processed_words = []
    
    for word, tag in tagged_words:
        if without_stopwords and word in stop_words:
            continue
        
        # Convert Penn Treebank tag to WordNet tag
        if tag.startswith('J'):
            wordnet_pos = 'a'  # adjective
        elif tag.startswith('V'):
            wordnet_pos = 'v'  # verb
        elif tag.startswith('N'):
            wordnet_pos = 'n'  # noun
        elif tag.startswith('R'):
            wordnet_pos = 'r'  # adverb
        else:
            wordnet_pos = 'n'  # default to noun
                
        # Lemmatize with the correct POS
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        processed_words.append(lemma)
    
    return processed_words

def find_comments_with_single_word(data_path, target_words, max_results=3):
    """Find comments containing a single target word after preprocessing"""
    results = {}
    
    # Initialize results dictionary
    for word in target_words:
        results[word] = []
    
    # Process comments
    with open(data_path, "rb") as f:
        jsonStream = getFileJsonStream(data_path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {data_path}")
            return results
        
        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row:
                continue
                
            # Check if we have enough results for all words
            if all(len(results[word]) >= max_results for word in target_words):
                break
            
            # Get the comment text
            text = row["body"]
            
            # Process text
            processed_words = preprocess_text(text)
            processed_text = " ".join(processed_words)
            
            # Check for each target word
            for word in target_words:
                # Skip if we already have enough examples for this word
                if len(results[word]) >= max_results:
                    continue
                
                # For words with underscores, check both versions
                search_variations = [word, word.replace('_', ' ')]
                
                # Also check for component parts (for bigrams)
                components = word.split('_')
                components_present = all(comp in processed_text for comp in components)
                
                # Check if the word is in the processed text or all components are present
                if any(var in processed_text for var in search_variations) or components_present:
                    results[word].append({
                        'original_comment': text,
                        'processed_comment': processed_text,
                        'author': row.get('author', 'unknown'),
                        'created_utc': row.get('created_utc', 'unknown')
                    })
    
    return results

# Example usage
def find_single_words():
    # Load the top 50 words from dem_10_10_list.csv
    df = pd.read_csv('output/dem_10_10_list.csv')
    
    # Get the top 50 source words (preserving original order)
    target_words = df['source'].head(50).tolist()
    
    print(f"Searching for {len(target_words)} words from dem_10_10_list.csv")
    print(f"Words to search for: {target_words}")
    
    # Find comments containing these words
    data_path = "datasets/democrats_comments.zst"
    results = find_comments_with_single_word(data_path, target_words, max_results=3)
    
    # Print results
    print("\n=== Comments Containing Target Words ===\n")
    
    found_count = 0
    for word in target_words:  # Iterate in original order
        comments = results[word]
        print(f"\n## Word: {word} ##\n")
        
        if not comments:
            print("No comments found with this word.")
            continue
        
        found_count += 1
        for i, comment in enumerate(comments):
            print(f"\nExample {i+1}:")
            print(f"Original: {comment['original_comment'][:200]}..." if len(comment['original_comment']) > 200 
                  else f"Original: {comment['original_comment']}")
            print(f"Processed: {comment['processed_comment'][:200]}..." if len(comment['processed_comment']) > 200
                  else f"Processed: {comment['processed_comment']}")
            print(f"Author: {comment['author']}")
    
    print(f"\nFound examples for {found_count} out of {len(target_words)} words")
    
    # Save results to file
    with open("output/top50_word_comments.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    
    print("\nResults saved to output/top50_word_comments.json")

# Run the single word search
find_single_words()

Searching for 50 words from dem_10_10_list.csv
Words to search for: ['j_fzim', 'vobu_q', 'aug_pm', 'pm_edt', 'loosen_arctic', 'drilling_restriction', 'webster_bbc', 'ukraine_merriam', 'catherine_cortez', 'masto', 'aug_pm', 'pm_pm', 'pm_edt', 'pm_pm', 'aldarondo', 'misla', 'lukens', 'buz', 'immoral_mob', 'insecure_pathological', 'dr_martin', 'luther_king', 'fri_aug', 'edt_tue', 'fri_aug', 'pm_pm', 'jerry_moran', 'john_hoeven', 'j_fzim', 'xt', 'o_bstruct', 'p_roject', 'agedlikemilk', 'noshitsherlock', 'noshitsherlock', 'lostredditors', 'paywall_workaround', 'firewall_workaround', 'ecuador_indict', 'alleged_bribery', 'cynthia', 'lummis', 'p_roject', 'g_aslight', 'climate_resiliency', 'coastal_resiliency', 'vobu_q', 'xt', 'edt_tue', 'pm_pm']


Processing comments: 1670395it [13:06, 2124.88it/s]


=== Comments Containing Target Words ===


## Word: j_fzim ##


Example 1:
Original: ![gif](giphy|J8FZIm9VoBU6Q)
Processed: gif giphy j fzim vobu q
Author: fatjumboshrimp

Example 2:
Original: ![gif](giphy|J8FZIm9VoBU6Q)
Processed: gif giphy j fzim vobu q
Author: fatjumboshrimp

Example 3:
Original: ![gif](giphy|J8FZIm9VoBU6Q)
Processed: gif giphy j fzim vobu q
Author: NemoLeeGreen

## Word: vobu_q ##


Example 1:
Original: ![gif](giphy|J8FZIm9VoBU6Q)
Processed: gif giphy j fzim vobu q
Author: fatjumboshrimp

Example 2:
Original: ![gif](giphy|J8FZIm9VoBU6Q)
Processed: gif giphy j fzim vobu q
Author: fatjumboshrimp

Example 3:
Original: ![gif](giphy|J8FZIm9VoBU6Q)
Processed: gif giphy j fzim vobu q
Author: NemoLeeGreen

## Word: aug_pm ##


Example 1:
Original: What is so stunning is that in violence we may see it end in a slaughter and just keep on trying. This is a faith element in some way. Such faith is being demonstrated in the two wars right now. Furth...
Processed: what be so st




In [None]:
from collections import defaultdict

def build_word2vect_model(path, party, without_stopwords=True, phrases_min_count=5, word2vec_min_count=5):
    print(f"Processing file {path}")
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # For each period, track comments and user-word usage
    chunks = {
        "before_2016": [],
        "2017_2020": [],
        "2021_2024": [],
    }
    user_words = {
        "before_2016": defaultdict(set),
        "2017_2020": defaultdict(set),
        "2021_2024": defaultdict(set),
    }
    user_comments = {
        "before_2016": defaultdict(list),
        "2017_2020": defaultdict(list),
        "2021_2024": defaultdict(list),
    }
    counts = {period: 0 for period in chunks.keys()}

    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        for row in tqdm(jsonStream, desc="Processing comments"):
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
            author = row["author"]
            if author in {"AutoModerator", "election_info_bot"}:
                continue
            text = row["body"]
            created_timestamp = row["created_utc"]
            year = datetime.datetime.fromtimestamp(int(created_timestamp)).year
            # if year <= 2016:
            #     chunk_key = "before_2016"
            # elif 2017 <= year <= 2020:
            #     chunk_key = "2017_2020"
            # elif 2021 <= year <= 2024:
            #     chunk_key = "2021_2024"
            # else:
            #     continue
            if 2021 <= year <= 2024:
                chunk_key = "2021_2024"
            else:
                continue

            txt = re.sub(r'http\S+', '', text)
            txt = re.sub("[^A-Za-z']+", ' ', txt).lower()
            txt = re.sub(r"['\-_]", ' ', txt)
            words = txt.split()
            if not words:
                continue
            tagged_words = nltk.pos_tag(words)
            processed_words = []
            for word, tag in tagged_words:
                if without_stopwords and word in stop_words:
                    continue
                if tag.startswith('J'):
                    wordnet_pos = 'a'
                elif tag.startswith('V'):
                    wordnet_pos = 'v'
                elif tag.startswith('N'):
                    wordnet_pos = 'n'
                elif tag.startswith('R'):
                    wordnet_pos = 'r'
                else:
                    wordnet_pos = 'n'
                lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
                processed_words.append(lemma)
                user_words[chunk_key][lemma].add(author)
            if processed_words:
                user_comments[chunk_key][author].append(processed_words)
                counts[chunk_key] += 1

    print("\n=== Comment Counts by Period ===")
    for period, count in counts.items():
        print(f"{period}: {count} comments")

    # Filter words by user count and rebuild comments for each period
    for period in chunks.keys():
        valid_words = {w for w, users in user_words[period].items() if len(users) >= 5}
        filtered_comments = []
        for comments in user_comments[period].values():
            for comment in comments:
                filtered = [w for w in comment if w in valid_words]
                if filtered:
                    filtered_comments.append(filtered)
        print(f"{period}: {len(filtered_comments)} comments after filtering words by user count")
        if filtered_comments:
            print(f"\nExtracting bigrams for {period}...")
            phrases = Phrases(filtered_comments, 
                              min_count=phrases_min_count, 
                              threshold=100)
            bigram_model = Phraser(phrases)
            bigrammed_comments = [bigram_model[comment] for comment in filtered_comments]
            chunks[period] = bigrammed_comments
        else:
            chunks[period] = []

    # Train a Word2Vec model for each time period
    for period, comments in chunks.items():
        if len(comments) > 0:
            print(f"\n=== Training Word2Vec for {period} ({len(comments)} comments) ===")
            model = Word2Vec(
                vector_size=300,
                window=5,
                min_count=word2vec_min_count,
                workers=16
            )
            model.build_vocab(comments)
            print(f"Vocabulary size: {len(model.wv.index_to_key)}")
            model.train(
                comments,
                total_examples=len(comments),
                epochs=5
            )
            model_path = f"models/reddit_word2vec_{phrases_min_count}_{word2vec_min_count}_filterd_{party}_{period}.model"
            model.save(model_path)
            print(f"Model saved to {model_path}")

def main():
    build_word2vect_model(filePathforDemocrats, "democrats", without_stopwords=False, 
                          phrases_min_count=10, word2vec_min_count=10)
    print("Done :>")

if __name__ == "__main__":
    main()

In [3]:
import re
import json
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from fileStreams import getFileJsonStream
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import os

# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Ensure output directory exists
os.makedirs("output/gibberish_analysis", exist_ok=True)

def preprocess_text(text, without_stopwords=False):
    """Process text the same way as in model training"""
    # Remove URLs
    txt = re.sub(r'http\S+', '', text)
    
    # Remove non-alphanumeric characters and convert to lowercase
    txt = re.sub("[^A-Za-z']+", ' ', txt).lower()
    
    # Replace special characters with spaces
    txt = re.sub(r"['\-_]", ' ', txt)
    
    # Tokenize
    words = txt.split()
    
    # Tag words with parts of speech for better lemmatization
    tagged_words = nltk.pos_tag(words)
    processed_words = []
    
    for word, tag in tagged_words:
        if without_stopwords and word in stop_words:
            continue
        
        # Convert Penn Treebank tag to WordNet tag
        if tag.startswith('J'):
            wordnet_pos = 'a'  # adjective
        elif tag.startswith('V'):
            wordnet_pos = 'v'  # verb
        elif tag.startswith('N'):
            wordnet_pos = 'n'  # noun
        elif tag.startswith('R'):
            wordnet_pos = 'r'  # adverb
        else:
            wordnet_pos = 'n'  # default to noun
                
        # Lemmatize with the correct POS
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        processed_words.append(lemma)
    
    return processed_words

def analyze_gibberish_words():
    # Load the model to check vocabulary
    model_path = "models/reddit_word2vec_10_10_filterd_democrats_2021_2024.model"
    print(f"Loading model from {model_path}")
    model = Word2Vec.load(model_path)
    
    # Load the top 50 source words
    df = pd.read_csv('output/dem_10_10_list.csv')
    target_words = df['source'].head(50).tolist()
    
    print(f"Analyzing {len(target_words)} words from dem_10_10_list.csv")
    
    # Check which words are in the model vocabulary
    words_in_model = []
    words_not_in_model = []
    
    for word in target_words:
        if word in model.wv:
            words_in_model.append(word)
        else:
            words_not_in_model.append(word)
    
    print(f"Words in model: {len(words_in_model)} out of {len(target_words)}")
    if words_not_in_model:
        print(f"Words not found in model: {words_not_in_model}")
    
    # Process sample comments to create bigrams
    data_path = "datasets/democrats_comments.zst"
    print(f"Processing comments from {data_path}")
    
    # Store comments and their processing stages
    all_comments = []
    word_occurrences = {word: [] for word in target_words}
    bigram_components = {}
    
    # For multi-word terms, track their components
    for word in target_words:
        if '_' in word:
            components = word.split('_')
            bigram_components[word] = components
    
    # Process comments
    with open(data_path, "rb") as f:
        jsonStream = getFileJsonStream(data_path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {data_path}")
            return
        
        # Process up to 100,000 comments to find examples
        for idx, row in enumerate(tqdm(jsonStream, desc="Processing comments")):
            if idx > 100000:  # Limit for performance
                break
                
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
                
            # Get the comment text
            text = row["body"]
            author = row.get('author', 'unknown')
            created_utc = row.get('created_utc', 'unknown')
            
            # 1. Original text
            original_text = text
            
            # 2. After URL removal and lowercase
            cleaned_text = re.sub(r'http\S+', '', text)
            cleaned_text = re.sub("[^A-Za-z']+", ' ', cleaned_text).lower()
            cleaned_text = re.sub(r"['\-_]", ' ', cleaned_text)
            
            # 3. After lemmatization
            processed_words = preprocess_text(text, without_stopwords=False)
            lemmatized_text = " ".join(processed_words)
            
            # Create a comment record
            comment_record = {
                "original_text": original_text,
                "cleaned_text": cleaned_text,
                "lemmatized_text": lemmatized_text,
                "processed_words": processed_words,
                "author": author,
                "created_utc": created_utc
            }
            
            # Check if any target word components are in this comment
            for word, components in bigram_components.items():
                # Check if all components appear in the lemmatized text
                if all(comp.lower() in lemmatized_text for comp in components):
                    if len(word_occurrences[word]) < 5:  # Limit to 5 examples per word
                        word_occurrences[word].append(comment_record)
            
            # Also check for exact word matches
            for word in target_words:
                # Skip if we already have enough examples
                if len(word_occurrences[word]) >= 5:
                    continue
                    
                # Check if the word is in the lemmatized text
                if word in lemmatized_text or word.replace('_', ' ') in lemmatized_text:
                    word_occurrences[word].append(comment_record)
            
            # Keep track of all comments for further analysis
            all_comments.append(comment_record)
            
            # Break if we found examples for all words
            if all(len(examples) >= 5 for examples in word_occurrences.values()):
                break
    
    # Prepare results for output
    results = {
        "model_info": {
            "path": model_path,
            "vocabulary_size": len(model.wv.index_to_key),
            "target_words_in_model": words_in_model,
            "target_words_not_in_model": words_not_in_model
        },
        "word_examples": {}
    }
    
    # For each target word, show the comments and processing steps
    for word in target_words:
        examples = word_occurrences[word]
        
        word_results = {
            "word": word,
            "is_bigram": '_' in word,
            "components": word.split('_') if '_' in word else [word],
            "found_in_comments": len(examples) > 0,
            "examples": []
        }
        
        for example in examples:
            # Find where the word components appear in the processed words
            if '_' in word:
                components = word.split('_')
                component_indices = []
                
                for comp in components:
                    indices = [i for i, w in enumerate(example["processed_words"]) if w == comp.lower()]
                    if indices:
                        component_indices.append((comp, indices))
                
                word_results["examples"].append({
                    "original_text": example["original_text"],
                    "cleaned_text": example["cleaned_text"],
                    "lemmatized_text": example["lemmatized_text"],
                    "component_locations": component_indices,
                    "author": example["author"],
                    "created_utc": example["created_utc"]
                })
            else:
                # For single words
                indices = [i for i, w in enumerate(example["processed_words"]) if w == word.lower()]
                
                word_results["examples"].append({
                    "original_text": example["original_text"],
                    "cleaned_text": example["cleaned_text"],
                    "lemmatized_text": example["lemmatized_text"],
                    "word_locations": indices,
                    "author": example["author"],
                    "created_utc": example["created_utc"]
                })
        
        results["word_examples"][word] = word_results
    
    # Save the analysis results
    with open("output/gibberish_analysis/gibberish_word_analysis.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    
    # Generate a readable report
    with open("output/gibberish_analysis/gibberish_word_report.txt", "w", encoding="utf-8") as f:
        f.write("ANALYSIS OF GIBBERISH WORDS\n")
        f.write("==========================\n\n")
        
        f.write(f"Model path: {model_path}\n")
        f.write(f"Vocabulary size: {len(model.wv.index_to_key)}\n")
        f.write(f"Target words in model: {len(words_in_model)} out of {len(target_words)}\n")
        if words_not_in_model:
            f.write(f"Words not found in model: {', '.join(words_not_in_model)}\n\n")
        
        # Create a summary of findings
        words_with_examples = sum(1 for w in results["word_examples"].values() if w["found_in_comments"])
        f.write(f"Words with examples found: {words_with_examples} out of {len(target_words)}\n\n")
        
        # Report on each word
        for word, word_data in results["word_examples"].items():
            f.write(f"WORD: {word}\n")
            f.write(f"{'=' * (len(word) + 6)}\n")
            f.write(f"Is bigram: {word_data['is_bigram']}\n")
            f.write(f"Components: {', '.join(word_data['components'])}\n")
            f.write(f"Found in comments: {word_data['found_in_comments']}\n")
            f.write(f"Number of examples: {len(word_data['examples'])}\n\n")
            
            # Show examples
            for i, example in enumerate(word_data["examples"]):
                f.write(f"Example {i+1}:\n")
                f.write(f"Original text: {example['original_text'][:200]}...\n")
                f.write(f"Cleaned text: {example['cleaned_text'][:200]}...\n")
                f.write(f"Lemmatized text: {example['lemmatized_text'][:200]}...\n")
                
                if word_data["is_bigram"]:
                    f.write("Component locations:\n")
                    for comp, indices in example.get("component_locations", []):
                        f.write(f"  - {comp}: positions {indices}\n")
                else:
                    f.write(f"Word locations: positions {example.get('word_locations', [])}\n")
                
                f.write(f"Author: {example['author']}\n")
                f.write(f"Timestamp: {example['created_utc']}\n\n")
            
            f.write("\n" + "-" * 80 + "\n\n")
    
    print(f"Analysis complete. Results saved to output/gibberish_analysis/")
    print(f"Words with examples found: {words_with_examples} out of {len(target_words)}")

if __name__ == "__main__":
    analyze_gibberish_words()

Loading model from models/reddit_word2vec_10_10_filterd_democrats_2021_2024.model
Analyzing 50 words from dem_10_10_list.csv
Words in model: 50 out of 50
Processing comments from datasets/democrats_comments.zst


Processing comments: 100001it [01:00, 1640.16it/s]

Analysis complete. Results saved to output/gibberish_analysis/
Words with examples found: 17 out of 50





In [4]:
import re
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from fileStreams import getFileJsonStream
from gensim.models import Word2Vec
import os
import datetime

# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Ensure output directory exists
os.makedirs("output/exact_word_matches", exist_ok=True)

def preprocess_text(text, without_stopwords=True):
    """Process text the same way as in model training"""
    # Remove URLs
    txt = re.sub(r'http\S+', '', text)
    
    # Remove non-alphanumeric characters and convert to lowercase
    txt = re.sub("[^A-Za-z']+", ' ', txt).lower()
    
    # Replace special characters with spaces
    txt = re.sub(r"['\-_]", ' ', txt)
    
    # Tokenize
    words = txt.split()
    
    # Tag words with parts of speech for better lemmatization
    tagged_words = nltk.pos_tag(words)
    processed_words = []
    
    for word, tag in tagged_words:
        if without_stopwords and word in stop_words:
            continue
        
        # Convert Penn Treebank tag to WordNet tag
        if tag.startswith('J'):
            wordnet_pos = 'a'  # adjective
        elif tag.startswith('V'):
            wordnet_pos = 'v'  # verb
        elif tag.startswith('N'):
            wordnet_pos = 'n'  # noun
        elif tag.startswith('R'):
            wordnet_pos = 'r'  # adverb
        else:
            wordnet_pos = 'n'  # default to noun
                
        # Lemmatize with the correct POS
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        processed_words.append(lemma)
    
    return processed_words

def check_exact_word_match(word, processed_words):
    """Check if a word appears exactly in processed words (not as part of another word)"""
    if '_' in word:
        # For bigrams, check if components appear as separate words
        components = word.split('_')
        return all(comp in processed_words for comp in components)
    else:
        # For single words, check for exact match
        return word in processed_words

def find_exact_word_matches():
    # Load the model to check vocabulary
    model_path = "models/reddit_word2vec_10_10_filterd_democrats_2021_2024.model"
    print(f"Loading model from {model_path}")
    model = Word2Vec.load(model_path)
    
    # Load the top 50 source words
    df = pd.read_csv('output/dem_10_10_list.csv')
    target_words = df['source'].head(50).tolist()
    
    print(f"Analyzing {len(target_words)} words from dem_10_10_list.csv")
    
    # Dictionary to store results
    results = {
        "model_info": {
            "path": model_path,
            "vocabulary_size": len(model.wv.index_to_key),
            "top50_words": target_words
        },
        "word_analysis": {}
    }
    
    # Initialize analysis for each word
    for word in target_words:
        results["word_analysis"][word] = {
            "word": word,
            "in_model_vocab": word in model.wv,
            "exact_matches": [],
            "partial_matches": [],
            "components": word.split('_') if '_' in word else [word]
        }
    
    # Process comments to find exact and partial matches
    data_path = "datasets/democrats_comments.zst"
    print(f"Processing comments from {data_path}")
    
    # Set maximum comments to process and examples to find per word
    max_comments = 500000
    max_examples = 5
    
    # Track if we have found enough examples for all words
    all_found_exact = False
    
    # Process comments
    with open(data_path, "rb") as f:
        jsonStream = getFileJsonStream(data_path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {data_path}")
            return
        
        for idx, row in enumerate(tqdm(jsonStream, desc="Processing comments")):
            if idx >= max_comments:
                break
                
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
                
            # Get the comment text
            text = row["body"]
            author = row.get('author', 'unknown')
            created_utc = row.get('created_utc', 'unknown')
            year = datetime.datetime.fromtimestamp(int(created_utc)).year if isinstance(created_utc, (int, float)) else "unknown"
            
            # Process text
            processed_words = preprocess_text(text, without_stopwords=False)
            processed_text = " ".join(processed_words)
            
            # Check for each target word
            for word in target_words:
                word_data = results["word_analysis"][word]
                
                # Check for exact match as a whole token
                if len(word_data["exact_matches"]) < max_examples:
                    exact_match = check_exact_word_match(word, processed_words)
                    if exact_match:
                        word_data["exact_matches"].append({
                            "original_text": text,
                            "processed_text": processed_text,
                            "processed_words": processed_words,
                            "author": author,
                            "created_utc": created_utc,
                            "year": year
                        })
                
                # Check for partial match (substring)
                if len(word_data["partial_matches"]) < max_examples:
                    # For words with underscores, check each component separately
                    if '_' in word:
                        components = word.split('_')
                        partial_match = all(any(comp in w for w in processed_words) for comp in components)
                    else:
                        # For single words, check if it's a substring of any processed word
                        partial_match = any(word in w and word != w for w in processed_words)
                    
                    if partial_match and not exact_match:  # Only count as partial if not already an exact match
                        word_data["partial_matches"].append({
                            "original_text": text,
                            "processed_text": processed_text,
                            "processed_words": processed_words,
                            "author": author,
                            "created_utc": created_utc,
                            "year": year
                        })
            
            # Check if we have enough examples for all words
            all_found_exact = all(len(results["word_analysis"][word]["exact_matches"]) >= max_examples for word in target_words)
            
            # If we have found enough examples for all words, stop processing
            if all_found_exact:
                print(f"Found enough exact matches for all words after processing {idx+1} comments.")
                break
    
    # Analyze and summarize results
    exact_match_counts = {word: len(data["exact_matches"]) for word, data in results["word_analysis"].items()}
    partial_match_counts = {word: len(data["partial_matches"]) for word, data in results["word_analysis"].items()}
    
    words_with_exact_matches = sum(1 for count in exact_match_counts.values() if count > 0)
    words_with_partial_matches = sum(1 for count in partial_match_counts.values() if count > 0)
    
    print(f"Words with exact matches: {words_with_exact_matches} out of {len(target_words)}")
    print(f"Words with only partial matches: {words_with_partial_matches} out of {len(target_words)}")
    
    # Add summary to results
    results["summary"] = {
        "words_with_exact_matches": words_with_exact_matches,
        "words_with_partial_matches": words_with_partial_matches,
        "words_with_no_matches": len(target_words) - words_with_exact_matches - words_with_partial_matches
    }
    
    # Save results as JSON
    with open("output/exact_word_matches/word_match_analysis.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    
    # Generate a readable report
    with open("output/exact_word_matches/word_match_report.txt", "w", encoding="utf-8") as f:
        f.write("ANALYSIS OF TOP 50 WORDS - EXACT VS PARTIAL MATCHES\n")
        f.write("=================================================\n\n")
        
        f.write(f"Model path: {model_path}\n")
        f.write(f"Vocabulary size: {len(model.wv.index_to_key)}\n\n")
        
        f.write("SUMMARY\n")
        f.write("=======\n")
        f.write(f"Words with exact matches: {words_with_exact_matches} out of {len(target_words)}\n")
        f.write(f"Words with only partial matches: {words_with_partial_matches} out of {len(target_words)}\n")
        f.write(f"Words with no matches: {len(target_words) - words_with_exact_matches - words_with_partial_matches} out of {len(target_words)}\n\n")
        
        # Words without any matches
        no_match_words = [word for word in target_words if len(results["word_analysis"][word]["exact_matches"]) == 0 and 
                           len(results["word_analysis"][word]["partial_matches"]) == 0]
        if no_match_words:
            f.write("WORDS WITH NO MATCHES\n")
            f.write("====================\n")
            for word in no_match_words:
                f.write(f"- {word}\n")
            f.write("\n")
        
        # Words that appear only as part of other words
        partial_only_words = [word for word in target_words if len(results["word_analysis"][word]["exact_matches"]) == 0 and 
                               len(results["word_analysis"][word]["partial_matches"]) > 0]
        if partial_only_words:
            f.write("WORDS APPEARING ONLY AS PART OF OTHER WORDS\n")
            f.write("========================================\n")
            for word in partial_only_words:
                f.write(f"- {word}\n")
            f.write("\n")
        
        # Words with exact matches
        exact_match_words = [word for word in target_words if len(results["word_analysis"][word]["exact_matches"]) > 0]
        if exact_match_words:
            f.write("WORDS WITH EXACT MATCHES\n")
            f.write("=======================\n")
            for word in exact_match_words:
                f.write(f"- {word} ({len(results['word_analysis'][word]['exact_matches'])} examples)\n")
            f.write("\n")
        
        # Detailed examples for each word
        f.write("DETAILED EXAMPLES\n")
        f.write("================\n\n")
        
        for word in target_words:
            word_data = results["word_analysis"][word]
            f.write(f"WORD: {word}\n")
            f.write(f"{'=' * (len(word) + 6)}\n")
            f.write(f"In model vocabulary: {word_data['in_model_vocab']}\n")
            f.write(f"Components: {', '.join(word_data['components'])}\n")
            f.write(f"Exact matches: {len(word_data['exact_matches'])}\n")
            f.write(f"Partial matches: {len(word_data['partial_matches'])}\n\n")
            
            # Show exact match examples
            if word_data["exact_matches"]:
                f.write("EXACT MATCHES:\n")
                f.write("-------------\n")
                for i, example in enumerate(word_data["exact_matches"]):
                    f.write(f"Example {i+1} (Year: {example['year']}):\n")
                    f.write(f"Original: {example['original_text'][:200]}...\n" if len(example['original_text']) > 200 
                           else f"Original: {example['original_text']}\n")
                    f.write(f"Processed: {example['processed_text'][:200]}...\n" if len(example['processed_text']) > 200 
                           else f"Processed: {example['processed_text']}\n")
                    f.write(f"Tokens: {example['processed_words']}\n")
                    f.write(f"Author: {example['author']}\n\n")
            
            # Show partial match examples
            if word_data["partial_matches"]:
                f.write("PARTIAL MATCHES:\n")
                f.write("---------------\n")
                for i, example in enumerate(word_data["partial_matches"]):
                    f.write(f"Example {i+1} (Year: {example['year']}):\n")
                    f.write(f"Original: {example['original_text'][:200]}...\n" if len(example['original_text']) > 200 
                           else f"Original: {example['original_text']}\n")
                    f.write(f"Processed: {example['processed_text'][:200]}...\n" if len(example['processed_text']) > 200 
                           else f"Processed: {example['processed_text']}\n")
                    f.write(f"Tokens: {example['processed_words']}\n")
                    f.write(f"Author: {example['author']}\n\n")
            
            f.write("\n" + "-" * 80 + "\n\n")
    
    print(f"Analysis complete. Results saved to output/exact_word_matches/")

if __name__ == "__main__":
    find_exact_word_matches()

Loading model from models/reddit_word2vec_10_10_filterd_democrats_2021_2024.model
Analyzing 50 words from dem_10_10_list.csv
Processing comments from datasets/democrats_comments.zst


Processing comments: 500000it [04:15, 1959.81it/s]

Words with exact matches: 23 out of 50
Words with only partial matches: 16 out of 50
Analysis complete. Results saved to output/exact_word_matches/





In [5]:
import re
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from fileStreams import getFileJsonStream
from gensim.models import Word2Vec
import os
import datetime
import html
import unicodedata

# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Ensure output directory exists
os.makedirs("output/exact_word_matches", exist_ok=True)

def preprocess_reddit_text(text):
    """Specialized preprocessing for Reddit content before regular text processing"""
    # Handle HTML entities
    text = html.unescape(text)
    
    # Handle Unicode normalization
    text = unicodedata.normalize('NFKD', text)
    
    # Remove all URLs and images
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    
    # Handle Reddit's link format
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)
    
    # Handle markdown formatting (bold, italics)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    
    # Handle Reddit single character bold/emphasis that causes g_aslight issue
    text = re.sub(r'\*\*([A-Za-z])\*\*([A-Za-z]+)', r'\1\2', text)
    
    # Handle subreddit and user references
    text = re.sub(r'/r/(\w+)', r'subreddit_\1', text)
    text = re.sub(r'/u/(\w+)', r'user_\1', text)
    
    # Handle quote markers and other Reddit-specific formatting
    text = re.sub(r'&gt;', ' ', text)  # Quote marker
    text = re.sub(r'\^', ' ', text)     # Superscript marker
    
    return text

def preprocess_text(text, without_stopwords=True):
    """Process text the same way as in model training"""
    # Apply Reddit-specific preprocessing first
    txt = preprocess_reddit_text(text)
    
    # Remove non-alphanumeric characters and convert to lowercase
    txt = re.sub("[^A-Za-z']+", ' ', txt).lower()
    
    # Tokenize
    words = txt.split()
    
    # Tag words with parts of speech for better lemmatization
    tagged_words = nltk.pos_tag(words)
    processed_words = []
    
    for word, tag in tagged_words:
        if without_stopwords and word in stop_words:
            continue
        
        # Convert Penn Treebank tag to WordNet tag
        if tag.startswith('J'):
            wordnet_pos = 'a'  # adjective
        elif tag.startswith('V'):
            wordnet_pos = 'v'  # verb
        elif tag.startswith('N'):
            wordnet_pos = 'n'  # noun
        elif tag.startswith('R'):
            wordnet_pos = 'r'  # adverb
        else:
            wordnet_pos = 'n'  # default to noun
                
        # Lemmatize with the correct POS
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        processed_words.append(lemma)
    
    return processed_words

def check_exact_word_match(word, processed_words):
    """Check if a word appears exactly in processed words (not as part of another word)"""
    if '_' in word:
        # For bigrams, check if components appear as separate words
        components = word.split('_')
        return all(comp in processed_words for comp in components)
    else:
        # For single words, check for exact match
        return word in processed_words

def find_exact_word_matches():
    # Load the model to check vocabulary
    model_path = "models/model_v3/reddit_word2vec_10_10_democrats_2021_2024.model"
    print(f"Loading model from {model_path}")
    model = Word2Vec.load(model_path)
    
    # Load the top 50 source words
    df = pd.read_csv('output/v3/dem_10_10_v3_list.csv')
    target_words = df['source'].head(50).tolist()
    
    print(f"Analyzing {len(target_words)} words from dem_10_10_list.csv")
    
    # Dictionary to store results
    results = {
        "model_info": {
            "path": model_path,
            "vocabulary_size": len(model.wv.index_to_key),
            "top50_words": target_words
        },
        "word_analysis": {}
    }
    
    # Initialize analysis for each word
    for word in target_words:
        results["word_analysis"][word] = {
            "word": word,
            "in_model_vocab": word in model.wv,
            "exact_matches": [],
            "partial_matches": [],
            "components": word.split('_') if '_' in word else [word]
        }
    
    # Process comments to find exact and partial matches
    data_path = "datasets/democrats_comments.zst"
    print(f"Processing comments from {data_path}")
    
    # Set maximum comments to process and examples to find per word
    max_comments = 500000
    max_examples = 5
    
    # Track if we have found enough examples for all words
    all_found_exact = False
    
    # Process comments
    with open(data_path, "rb") as f:
        jsonStream = getFileJsonStream(data_path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {data_path}")
            return
        
        for idx, row in enumerate(tqdm(jsonStream, desc="Processing comments")):
            if idx >= max_comments:
                break
                
            if "body" not in row or "created_utc" not in row or "author" not in row:
                continue
                
            # Get the comment text
            text = row["body"]
            author = row.get('author', 'unknown')
            created_utc = row.get('created_utc', 'unknown')
            year = datetime.datetime.fromtimestamp(int(created_utc)).year if isinstance(created_utc, (int, float)) else "unknown"
            
            # Process text
            processed_words = preprocess_text(text, without_stopwords=False)
            processed_text = " ".join(processed_words)
            
            # Check for each target word
            for word in target_words:
                word_data = results["word_analysis"][word]
                
                # Check for exact match as a whole token
                if len(word_data["exact_matches"]) < max_examples:
                    exact_match = check_exact_word_match(word, processed_words)
                    if exact_match:
                        word_data["exact_matches"].append({
                            "original_text": text,
                            "processed_text": processed_text,
                            "processed_words": processed_words,
                            "author": author,
                            "created_utc": created_utc,
                            "year": year
                        })
                
                # Check for partial match (substring)
                if len(word_data["partial_matches"]) < max_examples:
                    # For words with underscores, check each component separately
                    if '_' in word:
                        components = word.split('_')
                        partial_match = all(any(comp in w for w in processed_words) for comp in components)
                    else:
                        # For single words, check if it's a substring of any processed word
                        partial_match = any(word in w and word != w for w in processed_words)
                    
                    if partial_match and not exact_match:  # Only count as partial if not already an exact match
                        word_data["partial_matches"].append({
                            "original_text": text,
                            "processed_text": processed_text,
                            "processed_words": processed_words,
                            "author": author,
                            "created_utc": created_utc,
                            "year": year
                        })
            
            # Check if we have enough examples for all words
            all_found_exact = all(len(results["word_analysis"][word]["exact_matches"]) >= max_examples for word in target_words)
            
            # If we have found enough examples for all words, stop processing
            if all_found_exact:
                print(f"Found enough exact matches for all words after processing {idx+1} comments.")
                break
    
    # Analyze and summarize results
    exact_match_counts = {word: len(data["exact_matches"]) for word, data in results["word_analysis"].items()}
    partial_match_counts = {word: len(data["partial_matches"]) for word, data in results["word_analysis"].items()}
    
    words_with_exact_matches = sum(1 for count in exact_match_counts.values() if count > 0)
    words_with_partial_matches = sum(1 for count in partial_match_counts.values() if count > 0)
    
    print(f"Words with exact matches: {words_with_exact_matches} out of {len(target_words)}")
    print(f"Words with only partial matches: {words_with_partial_matches} out of {len(target_words)}")
    
    # Add summary to results
    results["summary"] = {
        "words_with_exact_matches": words_with_exact_matches,
        "words_with_partial_matches": words_with_partial_matches,
        "words_with_no_matches": len(target_words) - words_with_exact_matches - words_with_partial_matches
    }
    
    # Save results as JSON
    with open("output/exact_word_matches/word_match_analysis_v3.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    
    # Generate a readable report
    with open("output/exact_word_matches/word_match_report_v3.txt", "w", encoding="utf-8") as f:
        f.write("ANALYSIS OF TOP 50 WORDS - EXACT VS PARTIAL MATCHES\n")
        f.write("=================================================\n\n")
        
        f.write(f"Model path: {model_path}\n")
        f.write(f"Vocabulary size: {len(model.wv.index_to_key)}\n\n")
        
        f.write("SUMMARY\n")
        f.write("=======\n")
        f.write(f"Words with exact matches: {words_with_exact_matches} out of {len(target_words)}\n")
        f.write(f"Words with only partial matches: {words_with_partial_matches} out of {len(target_words)}\n")
        f.write(f"Words with no matches: {len(target_words) - words_with_exact_matches - words_with_partial_matches} out of {len(target_words)}\n\n")
        
        # Words without any matches
        no_match_words = [word for word in target_words if len(results["word_analysis"][word]["exact_matches"]) == 0 and 
                           len(results["word_analysis"][word]["partial_matches"]) == 0]
        if no_match_words:
            f.write("WORDS WITH NO MATCHES\n")
            f.write("====================\n")
            for word in no_match_words:
                f.write(f"- {word}\n")
            f.write("\n")
        
        # Words that appear only as part of other words
        partial_only_words = [word for word in target_words if len(results["word_analysis"][word]["exact_matches"]) == 0 and 
                               len(results["word_analysis"][word]["partial_matches"]) > 0]
        if partial_only_words:
            f.write("WORDS APPEARING ONLY AS PART OF OTHER WORDS\n")
            f.write("========================================\n")
            for word in partial_only_words:
                f.write(f"- {word}\n")
            f.write("\n")
        
        # Words with exact matches
        exact_match_words = [word for word in target_words if len(results["word_analysis"][word]["exact_matches"]) > 0]
        if exact_match_words:
            f.write("WORDS WITH EXACT MATCHES\n")
            f.write("=======================\n")
            for word in exact_match_words:
                f.write(f"- {word} ({len(results['word_analysis'][word]['exact_matches'])} examples)\n")
            f.write("\n")
        
        # Detailed examples for each word
        f.write("DETAILED EXAMPLES\n")
        f.write("================\n\n")
        
        for word in target_words:
            word_data = results["word_analysis"][word]
            f.write(f"WORD: {word}\n")
            f.write(f"{'=' * (len(word) + 6)}\n")
            f.write(f"In model vocabulary: {word_data['in_model_vocab']}\n")
            f.write(f"Components: {', '.join(word_data['components'])}\n")
            f.write(f"Exact matches: {len(word_data['exact_matches'])}\n")
            f.write(f"Partial matches: {len(word_data['partial_matches'])}\n\n")
            
            # Show exact match examples
            if word_data["exact_matches"]:
                f.write("EXACT MATCHES:\n")
                f.write("-------------\n")
                for i, example in enumerate(word_data["exact_matches"]):
                    f.write(f"Example {i+1} (Year: {example['year']}):\n")
                    f.write(f"Original: {example['original_text'][:200]}...\n" if len(example['original_text']) > 200 
                           else f"Original: {example['original_text']}\n")
                    f.write(f"Processed: {example['processed_text'][:200]}...\n" if len(example['processed_text']) > 200 
                           else f"Processed: {example['processed_text']}\n")
                    f.write(f"Tokens: {example['processed_words']}\n")
                    f.write(f"Author: {example['author']}\n\n")
            
            # Show partial match examples
            if word_data["partial_matches"]:
                f.write("PARTIAL MATCHES:\n")
                f.write("---------------\n")
                for i, example in enumerate(word_data["partial_matches"]):
                    f.write(f"Example {i+1} (Year: {example['year']}):\n")
                    f.write(f"Original: {example['original_text'][:200]}...\n" if len(example['original_text']) > 200 
                           else f"Original: {example['original_text']}\n")
                    f.write(f"Processed: {example['processed_text'][:200]}...\n" if len(example['processed_text']) > 200 
                           else f"Processed: {example['processed_text']}\n")
                    f.write(f"Tokens: {example['processed_words']}\n")
                    f.write(f"Author: {example['author']}\n\n")
            
            f.write("\n" + "-" * 80 + "\n\n")
    
    print(f"Analysis complete. Results saved to output/exact_word_matches/")

if __name__ == "__main__":
    find_exact_word_matches()

Loading model from models/model_v3/reddit_word2vec_10_10_democrats_2021_2024.model
Analyzing 50 words from dem_10_10_list.csv
Processing comments from datasets/democrats_comments.zst


Processing comments: 500000it [04:28, 1862.11it/s]

Words with exact matches: 30 out of 50
Words with only partial matches: 19 out of 50
Analysis complete. Results saved to output/exact_word_matches/



