In [6]:
import pandas as pd
from tqdm import tqdm
import spacy
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
import nltk

# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load spaCy model for NER
nlp = spacy.load("en_core_web_sm")

# -----------------------------
# 1️⃣ Load CSV files
# -----------------------------
articles_df = pd.read_csv(r"C:\Users\saman\Downloads\preprocessed_fnspid_10k.csv")  # Original articles
pragmatic_df = pd.read_csv(r"F:\College\SEM 7\19CSE453-Natural Language Processing\Case Study\Ambiguities\sarcasm_results_rule_based.csv")  # Flagged sentences

# Add IDs to both files for mapping
articles_df['article_id'] = range(len(articles_df))
pragmatic_df['article_id'] = range(len(pragmatic_df))

# -----------------------------
# 2️⃣ Add [AMBIGUOUS] tags for pragmatic ambiguity
# -----------------------------
def add_ambiguous_tags(article_text, flagged_sentences):
    sentences = nltk.sent_tokenize(article_text)
    flagged = set(pragmatic_df['flagged_sentences'])
    
    tagged_sentences = [
        "[AMBIGUOUS] " + s if s in flagged else s
        for s in sentences
    ]
    return " ".join(tagged_sentences)

tqdm.pandas(desc="Adding pragmatic tags")
articles_df['text_pragmatic'] = articles_df.progress_apply(
    lambda row: add_ambiguous_tags(
        row['Article'], 
        pragmatic_df[pragmatic_df['article_id']==row['article_id']]['flagged_sentences'].tolist()
    ),
    axis=1
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Adding pragmatic tags: 100%|██████████| 10000/10000 [00:13<00:00, 740.91it/s]


In [8]:
articles_df['text_pragmatic']

0       Amid the ongoing trade tensions between the Un...
1       Semiconductor stocks were battered by the rece...
2       Pfizer PFE announced disappointing overall sur...
3       Microsoft is getting ready to launch the next ...
4       Camden Property Trust ( CPT ) will begin tradi...
                              ...                        
9995    Momentum investing revolves around the idea of...
9996    The proven Zacks Rank system focuses on earnin...
9997    Growth investors focus on stocks that are seei...
9998    The Nasdaq is home to many of the best growth ...
9999    Tesla TSLA is recalling more than 2 million ve...
Name: text_pragmatic, Length: 10000, dtype: object

In [None]:
import pandas as pd
import spacy

# --- Load spaCy model ---
nlp = spacy.load("en_core_web_sm")  # small model (upgrade to transformer if needed)

# --- Core Pipeline Functions ---

def safe_ner_replace(text):
    """
    1) Run spaCy NER.
    2) Replace entities with [ENTITY_LABEL].
    """
    doc = nlp(text)
    new_tokens = []
    for token in doc:
        if token.ent_type_:
            new_tokens.append(f"[{token.ent_type_}]")
        else:
            new_tokens.append(token.text)
    return " ".join(new_tokens)


# --- Full Dataset Processing ---

def process_and_save_articles(input_csv, output_csv, text_column="article"):
    """
    input_csv: path to raw dataset with at least a column 'article'
    output_csv: where to save final dataset with NER replacements
    text_column: column name of text to process
    """
    df = pd.read_csv(input_csv)

    processed_articles = []
    for i, row in df.iterrows():
        text = str(row[text_column])
        processed = safe_ner_replace(text)
        processed_articles.append(processed)

        if i % 50 == 0:
            print(f"Processed {i} articles...")

    df["processed_article"] = processed_articles
    df.to_csv(output_csv, index=False)
    print(f"✅ Saved processed dataset to {output_csv}")


# --- Example Run ---
if __name__ == "__main__":
    # Change these paths to your dataset
    input_csv = r"F:\College\SEM 7\19CSE453-Natural Language Processing\Case Study\Ambiguities\fnspid_tagged_ambiguous.csv"  # must contain column 'article'
    output_csv = "articles_processed.csv"

    process_and_save_articles(input_csv, output_csv, text_column="tagged_article")


Processing articles: 100%|██████████| 10000/10000 [47:06<00:00,  3.54it/s] 



✅ Saved processed dataset to articles_processed.csv


In [24]:
df = pd.read_csv(r"F:\College\SEM 7\19CSE453-Natural Language Processing\Case Study\Ambiguities\articles_processed.csv")
# Replace any broken ambiguous tag with the correct one
if 'processed_article' in df.columns:
    df['processed_article'] = df['processed_article'].str.replace(r"[_ ]*AMBIGUOUS[_ ]*TAG[_ ]*", "[AMBIGUOUS]", regex=True)
    df.to_csv("articles_processed_fixed_only_ner_label.csv", index=False)
    print("✅ Saved fixed file as articles_processed_fixed.csv")


✅ Saved fixed file as articles_processed_fixed.csv


In [23]:
df['processed_article'][0]

'[AMBIGUOUS]Amid the ongoing trade tensions between [GPE] [GPE] [GPE] and [GPE] , the [GPE] stock market continues to enjoy the longest bull run since [EVENT] [EVENT] [EVENT] by avoiding [PERCENT] [PERCENT] or more decline . The large - cap S&P 500 , the Nasdaq Composite Index , the small - cap Russell 2000 Index and the mid - cap S&P 400 Mid Cap Index all touched new highs in [DATE] \'s trading session and the [ORG] [ORG] reclaimed its [CARDINAL] mark , reflecting a broad - based rally .[AMBIGUOUS]Strong rebound in large - capitalization technology and Internet stocks as well as strong corporate earnings led to investors \' appetite for more risk in [DATE] [DATE] .[AMBIGUOUS]Double - digit earnings growth in [DATE] [DATE] [DATE] is expected to be the highest [DATE] growth pace in [DATE] [DATE] [DATE] . For [DATE] [DATE] [DATE] [DATE] , total earnings of the S&P 500 Index are expected to be up [PERCENT] [PERCENT] on [PERCENT] [PERCENT] higher revenues . Additionally , the trade agreeme

In [2]:
import re
import pandas as pd
import spacy
from tqdm import tqdm

# ---------- configure ----------
INPUT_CSV = r"F:\College\SEM 7\19CSE453-Natural Language Processing\Case Study\Ambiguities\fnspid_tagged_ambiguous.csv"
OUTPUT_CSV = "articles_processed_with_ner.csv"
TEXT_COL = "tagged_article"   # column that already contains [AMBIGUOUS] markers
AMBIG_TAG = "[AMBIGUOUS]"     # exact text to preserve
# unique marker unlikely to appear in text (no brackets, one token)
AMBIG_MARKER = "__AMBIGUOUS_MARKER__"
# -------------------------------

# load spaCy
nlp = spacy.load("en_core_web_sm")

def ner_hybrid_preserve_ambig(text: str,
                              ambig_tag: str = AMBIG_TAG,
                              ambig_marker: str = AMBIG_MARKER) -> str:
    """
    Replace ambig_tag with a one-token marker, run spaCy NER, then reconstruct text
    by inserting ' entity_text [LABEL]' for each entity. Finally replace marker
    back to the exact ambig_tag string (no added spaces).
    """
    if not isinstance(text, str):
        return text

    # 1) Replace all exact occurrences of [AMBIGUOUS] with the single-token marker
    text_marker = text.replace(ambig_tag, ambig_marker)

    # 2) Run spaCy NER on the modified text
    doc = nlp(text_marker)

    # 3) Use entity char offsets to rebuild text with hybrid labels
    ents = list(doc.ents)
    if not ents:
        # no entities -> just revert marker and return
        return text_marker.replace(ambig_marker, ambig_tag)

    result_parts = []
    ptr = 0
    for ent in ents:
        start_char = ent.start_char
        end_char = ent.end_char

        # append text before this entity (keeps original spacing/punctuation)
        if ptr < start_char:
            result_parts.append(text_marker[ptr:start_char])

        # append entity surface form + space + [LABEL]
        ent_text = text_marker[start_char:end_char]
        result_parts.append(f"{ent_text} [{ent.label_}]")

        ptr = end_char

    # append the remaining tail
    if ptr < len(text_marker):
        result_parts.append(text_marker[ptr:])

    reconstructed = "".join(result_parts)

    # 4) revert ambig marker back to exact "[AMBIGUOUS]" (no added spaces)
    final = reconstructed.replace(ambig_marker, ambig_tag)

    return final


def process_file(input_csv: str, output_csv: str, text_col: str = TEXT_COL):
    df = pd.read_csv(input_csv)
    if text_col not in df.columns:
        raise ValueError(f"Column '{text_col}' not found in {input_csv}. Available cols: {list(df.columns)}")

    processed = []
    for i, txt in enumerate(tqdm(df[text_col].fillna("").astype(str), desc="Processing articles")):
        try:
            out = ner_hybrid_preserve_ambig(txt)
        except Exception as e:
            # if something fails for a row, keep original and log (you can expand logging)
            print(f"Warning processing row {i}: {e}")
            out = txt
        processed.append(out)

    df["processed_article"] = processed
    df.to_csv(output_csv, index=False)
    print(f"✅ Saved processed file to: {output_csv}")
    return df


if __name__ == "__main__":
    process_file(INPUT_CSV, OUTPUT_CSV, TEXT_COL)


Processing articles:  10%|█         | 1020/10000 [02:10<19:07,  7.83it/s] 


KeyboardInterrupt: 