In [2]:
import pandas as pd
from nltk.corpus import wordnet as wn
from tqdm import tqdm
import nltk

# Fetch the NLTK resources used below: the WordNet corpus, its multilingual
# companion data, and the Punkt sentence-tokenizer models. nltk.download()
# reports "already up-to-date" for anything that is installed.
# NOTE(review): newer NLTK releases appear to look up "punkt_tab" rather than
# "punkt" for sent_tokenize — confirm against the installed version.
for _nltk_resource in ("wordnet", "omw-1.4", "punkt"):
    nltk.download(_nltk_resource)

from nltk.tokenize import word_tokenize, sent_tokenize

# Read the article dataset into a DataFrame.
df = pd.read_csv(r"C:\Users\saman\Downloads\preprocessed_fnspid_10k.csv")  # replace with your dataset path

def count_semantic_ambiguity(text, min_meanings=2):
    """
    Count the sentences in ``text`` that contain a semantically ambiguous word.

    A word counts as ambiguous when WordNet lists at least ``min_meanings``
    synsets for it. No part-of-speech filter is applied, so senses from every
    part of speech are counted together. A sentence is flagged as soon as one
    of its tokens qualifies.

    Args:
        text (str): Article text. Any non-string value (for example ``None``
            or a float NaN standing in for a missing cell) and any blank
            string is treated as an article with no sentences.
        min_meanings (int): Minimum number of WordNet senses to consider a
            word ambiguous.

    Returns:
        dict: ``count`` (number of flagged sentences), ``percentage``
        (flagged / total * 100, ``0.0`` when there are no sentences),
        ``flagged_sentences`` (the flagged sentence strings, in document
        order) and ``total_sentences``.
    """
    # Guard before tokenizing: sent_tokenize() raises on non-string input, and
    # a single bad row would otherwise abort a whole batch run.
    if not isinstance(text, str) or not text.strip():
        return {
            "count": 0,
            "percentage": 0.0,
            "flagged_sentences": [],
            "total_sentences": 0,
        }

    sentences = sent_tokenize(text)

    # any() stops at the first ambiguous token, matching "flag the sentence
    # if any word is ambiguous" without scanning the rest of the sentence.
    flagged_sentences = [
        sent
        for sent in sentences
        if any(
            len(wn.synsets(word)) >= min_meanings
            for word in word_tokenize(sent)
        )
    ]

    total_sentences = len(sentences)
    count = len(flagged_sentences)
    percentage = (count / total_sentences) * 100 if total_sentences else 0.0

    return {
        "count": count,
        "percentage": percentage,
        "flagged_sentences": flagged_sentences,
        "total_sentences": total_sentences,
    }

# Score every article in the 'Article' column, with a tqdm progress bar.
results = [
    count_semantic_ambiguity(article_text)
    for article_text in tqdm(df['Article'], desc="Quantifying semantic ambiguity")
]

# One row per article, in the same order as df; written without the index column.
semantic_results = pd.DataFrame(results)
semantic_results.to_csv("semantic_ambiguity_results.csv", index=False)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Quantifying semantic ambiguity: 100%|██████████| 10000/10000 [00:59<00:00, 169.22it/s]
