In [1]:
import spacy

In [2]:
# Load spaCy's small English pipeline once; the functions defined below read this
# module-level `nlp` instead of loading a model themselves.
nlp = spacy.load("en_core_web_sm")

In [4]:
import spacy
import pandas as pd
import logging
from tqdm import tqdm

In [5]:
# Dependency labels (token.dep_) that are counted as clause markers when scoring
# sentences and used as cut points when splitting them.
# NOTE(review): "prep" presumably marks a prepositional modifier rather than a clause,
# so preposition-heavy sentences also count toward the threshold — confirm intended.
CLAUSE_TYPES = {"prep", "ccomp", "advcl", "relcl"}

In [29]:
def quantify_syntactic_ambiguity(article_text, max_sentences=50, threshold=5):
    """
    Quantify syntactic ambiguity in a single article without weights.

    The text is parsed with the module-level ``nlp`` pipeline. A sentence is
    flagged when the number of its tokens whose dependency label is in
    ``CLAUSE_TYPES`` is at least ``threshold``.

    Args:
        article_text: Article text to parse. ``None`` or a float (the ``NaN``
            pandas uses for missing cells) is treated as an article with no
            sentences instead of being sent to the parser.
        max_sentences: Only the first ``max_sentences`` sentences are scored.
        threshold: Minimum clause-marker count for a sentence to be flagged.

    Returns:
        dict with the keys ``count`` (number of flagged sentences),
        ``percentage`` (float; flagged share of the scored sentences, 0-100),
        ``flagged_sentences`` (list of flagged sentence texts) and
        ``total_sentences`` (number of sentences scored).

    Raises:
        Nothing. Any error raised while parsing is logged as a warning and the
        all-zero result is returned.
    """
    def _empty_result():
        # Build a fresh dict (and list) per call so a caller mutating one
        # result can never affect another.
        return {
            "count": 0,
            "percentage": 0.0,
            "flagged_sentences": [],
            "total_sentences": 0
        }

    # Missing cells read from a CSV arrive as float NaN; handle them explicitly
    # rather than relying on the parser to raise.
    if article_text is None or isinstance(article_text, float):
        logging.warning(
            f"Skipping article with missing text (got {type(article_text).__name__})"
        )
        return _empty_result()

    try:
        sentences = list(nlp(article_text).sents)[:max_sentences]
        flagged_sentences = [
            sent.text
            for sent in sentences
            # Count complex clauses
            if sum(1 for token in sent if token.dep_ in CLAUSE_TYPES) >= threshold
        ]
    except Exception as e:
        # Deliberate best-effort: one bad article must not abort a dataset-wide run.
        logging.warning(f"Error processing article: {e}")
        return _empty_result()

    total_sentences = len(sentences)
    count = len(flagged_sentences)
    # Always a float, so the normal path and the fallback path agree on type.
    percentage = (count / total_sentences) * 100 if total_sentences else 0.0

    return {
        "count": count,
        "percentage": percentage,
        "flagged_sentences": flagged_sentences,
        "total_sentences": total_sentences
    }


In [7]:
def analyze_syntactic_dataset(df, text_column="Article", label_column=None, max_sentences=50, threshold=3):
    """
    Analyze syntactic ambiguity for an entire dataset.

    Runs ``quantify_syntactic_ambiguity`` on every row of ``df``, prints an
    overall summary (plus a per-label breakdown when ``label_column`` is given)
    and returns the per-article results.

    Args:
        df: DataFrame with one article per row.
        text_column: Name of the column holding the article text.
        label_column: Optional column name; when truthy, flagged and total
            sentence counts are also aggregated and printed per distinct value.
        max_sentences: Forwarded to ``quantify_syntactic_ambiguity``.
        threshold: Forwarded to ``quantify_syntactic_ambiguity``. The default
            here (3) is what applies when this function is called without it.

    Returns:
        DataFrame with one row per input row, in input order, with the columns
        ``count``, ``percentage``, ``flagged_sentences`` and ``total_sentences``.
        The index is a fresh positional index, not ``df``'s index.
    """
    # Naming the columns up front keeps them present even when `df` has no
    # rows; without this an empty dataset fails on the column lookups below.
    result_columns = ["count", "percentage", "flagged_sentences", "total_sentences"]
    results = []
    sentiment_summary = {}

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing articles"):
        res = quantify_syntactic_ambiguity(row[text_column], max_sentences, threshold)
        results.append(res)

        if label_column:
            # One running tally per distinct label value.
            stats = sentiment_summary.setdefault(
                row[label_column], {"count": 0, "total_sentences": 0}
            )
            stats["count"] += res["count"]
            stats["total_sentences"] += res["total_sentences"]

    df_results = pd.DataFrame(results, columns=result_columns)
    total_sentences = df_results['total_sentences'].sum()
    total_flagged = df_results['count'].sum()
    overall_percentage = (total_flagged / total_sentences) * 100 if total_sentences else 0

    print("=== Dataset Syntactic Ambiguity Analysis ===")
    print(f"Total articles processed: {len(df)}")
    print(f"Total sentences: {total_sentences}")
    print(f"Total flagged as syntactically ambiguous: {total_flagged}")
    print(f"Overall syntactic ambiguity: {overall_percentage:.2f}%\n")

    if label_column:
        print("=== Per-Sentiment Breakdown ===")
        for label, stats in sentiment_summary.items():
            # Guard against labels whose articles produced no sentences at all.
            perc = (stats["count"] / stats["total_sentences"])*100 if stats["total_sentences"] else 0
            print(f"{label}: {stats['count']} flagged sentences / {stats['total_sentences']} total sentences ({perc:.2f}%)")

    return df_results

In [8]:
# Load the preprocessed article dataset that the baseline analysis runs on.
# NOTE(review): absolute, user-specific Windows path — this cell only runs as-is on
# the author's machine.
df = pd.read_csv(r"C:\Users\saman\Downloads\preprocessed_fnspid_10k.csv")
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,Article_title,Stock_symbol,Article,Textrank_summary,Preprocessed_Summary,Sentiment,Confidence
0,5 Growth Stocks at New Highs with Room for Mor...,ADM,Amid the ongoing trade tensions between the Un...,"Click to get this free report NetApp, Inc. (NT...","click to get this free report netapp, inc. nta...",Neutral,0.744207
1,3 Strong Buy Semiconductor Stocks to Consider Now,AMAT,Semiconductor stocks were battered by the rece...,Click to get this free report Apple Inc. (AAPL...,click to get this free report apple inc. aapl ...,Positive,0.633728
2,Pfizer's Breast Cancer Drug Misses Overall Sur...,ANIP,Pfizer PFE announced disappointing overall sur...,Click to get this free report AstraZeneca PLC ...,click to get this free report astrazeneca plc ...,Positive,0.504592
3,Microsoft to Launch Second-Gen Hololens AR Hea...,AR,Microsoft is getting ready to launch the next ...,Microsoft is getting ready to launch the next ...,microsoft is getting ready to launch the next ...,Neutral,0.947672
4,Camden Property Trust (CPT) Ex-Dividend Date S...,AMT,Camden Property Trust ( CPT ) will begin tradi...,"CPT is a part of the Consumer Services sector,...","cpt is a part of the consumer services sector,...",Negative,0.547460
...,...,...,...,...,...,...,...
9995,What Makes Jakks Pacific (JAKK) a Strong Momen...,AFGD,Momentum investing revolves around the idea of...,Our research shows that stocks rated Zacks Ran...,our research shows that stocks rated zacks ran...,Neutral,0.930927
9996,Is Bunge Limited (BG) Stock Undervalued Right ...,BG,The proven Zacks Rank system focuses on earnin...,Click to get this free report Bunge Limited (B...,click to get this free report bunge limited bg...,Neutral,0.925172
9997,Is Wingstop (WING) a Solid Growth Stock? 3 Rea...,ARGD,Growth investors focus on stocks that are seei...,"However, the task of finding cutting-edge grow...","however, the task of finding cutting edge grow...",Neutral,0.758138
9998,3 Nasdaq Stocks That Have Generated 10x Return...,AAPL,The Nasdaq is home to many of the best growth ...,Some of them have generated life-changing retu...,some of them have generated life changing retu...,Positive,0.738722


In [9]:
# Baseline run on the raw "Article" text, grouped by "Sentiment"; max_sentences and
# threshold are not passed, so analyze_syntactic_dataset's defaults (50 and 3) apply.
df_results = analyze_syntactic_dataset(df, text_column="Article", label_column="Sentiment")

Processing articles: 100%|██████████| 10000/10000 [20:23<00:00,  8.17it/s]

=== Dataset Syntactic Ambiguity Analysis ===
Total articles processed: 10000
Total sentences: 300131
Total flagged as syntactically ambiguous: 161323
Overall syntactic ambiguity: 53.75%

=== Per-Sentiment Breakdown ===
Neutral: 78010 flagged sentences / 146885 total sentences (53.11%)
Positive: 47824 flagged sentences / 89931 total sentences (53.18%)
Negative: 35489 flagged sentences / 63315 total sentences (56.05%)





In [10]:
def split_sentence_by_clause(sentence_text):
    """
    Split a sentence into smaller segments based on clause-level dependencies.

    The text is parsed with the module-level ``nlp`` pipeline. Every parsed
    sentence is cut immediately before each token whose dependency label is in
    ``CLAUSE_TYPES``; a sentence without such tokens is kept as one segment.

    Args:
        sentence_text: Text to split. ``None`` or a float (the ``NaN`` pandas
            uses for missing cells) yields an empty list instead of being sent
            to the parser.

    Returns:
        List of whitespace-stripped, non-empty segment strings in their
        original order.
    """
    # Missing cells read from a CSV arrive as float NaN; there is nothing to split.
    if sentence_text is None or isinstance(sentence_text, float):
        return []

    doc = nlp(sentence_text)
    split_segments = []

    for sent in doc.sents:
        # Cut points as doc-level token offsets: sentence start, each
        # clause-marker token, sentence end. With no markers this is just
        # [start, end], i.e. the whole sentence.
        boundaries = [sent.start]
        boundaries.extend(token.i for token in sent if token.dep_ in CLAUSE_TYPES)
        boundaries.append(sent.end)

        for seg_start, seg_end in zip(boundaries, boundaries[1:]):
            segment = doc[seg_start:seg_end].text.strip()
            # Empty spans (marker at the sentence start, whitespace-only
            # text) are dropped uniformly for every sentence.
            if segment:
                split_segments.append(segment)

    return split_segments

In [12]:
# Input locations: the main article dataset and the per-row flagged-sentence results.
# NOTE(review): the flagged file is named "sarcasm_results_rule_based.csv" although this
# notebook deals with syntactic ambiguity — confirm it is the intended input.
articles_file = r"C:\Users\saman\Downloads\preprocessed_fnspid_10k.csv"
flagged_file = r"F:\College\SEM 7\19CSE453-Natural Language Processing\Case Study\Ambiguities\sarcasm_results_rule_based.csv"

# -----------------------------
# 2. Load both tables and tag every row with an incremental Article_ID
# -----------------------------
# Row position is the only link between the two tables, which assumes the flagged
# file lists its rows in the same order as the article file.
articles_df = pd.read_csv(articles_file).assign(
    Article_ID=lambda frame: range(len(frame))
)
flagged_df = pd.read_csv(flagged_file).assign(
    Article_ID=lambda frame: range(len(frame))
)


In [14]:
# Register tqdm's progress-bar variants (progress_apply) on pandas objects.
tqdm.pandas(desc="Splitting flagged sentences")
# Store, per row, the list of segments split_sentence_by_clause returns for that
# row's 'flagged_sentences' value.
flagged_df['split_sentences'] = flagged_df['flagged_sentences'].progress_apply(split_sentence_by_clause)


Splitting flagged sentences: 100%|██████████| 10000/10000 [07:20<00:00, 22.72it/s]


In [19]:
handled_articles = []

# Index the articles by Article_ID once, so each group below is a direct lookup
# instead of two full boolean-mask scans of articles_df per article.
# drop_duplicates keeps the first row per ID, matching the former `.values[0]`.
articles_by_id = articles_df.drop_duplicates('Article_ID').set_index('Article_ID')

for article_id, group in flagged_df.groupby('Article_ID'):
    # Raises KeyError if flagged_df contains an Article_ID that articles_df lacks.
    original_article = articles_by_id.at[article_id, 'Article']

    # Replace flagged sentences with their split versions
    new_article_text = original_article
    if isinstance(original_article, str):  # missing articles (NaN) pass through untouched
        for flagged_sentence, segments in zip(group['flagged_sentences'], group['split_sentences']):
            # Skip missing/empty needles: str.replace needs a string, and an
            # empty needle would match between every character.
            if not isinstance(flagged_sentence, str) or not flagged_sentence:
                continue
            # NOTE(review): joining the segments with a single space reassembles
            # (up to whitespace) the text that was just split, so this replacement
            # presumably leaves the article unchanged; the re-analysis output
            # recorded further down is identical to the baseline. Confirm the
            # intended separator and the format of 'flagged_sentences'.
            new_article_text = new_article_text.replace(flagged_sentence, " ".join(segments))

    handled_articles.append({
        'Article_ID': article_id,
        'Original_Article': original_article,
        'Handled_Article': new_article_text,
        'Sentiment': articles_by_id.at[article_id, 'Sentiment']
    })

handled_df = pd.DataFrame(handled_articles)

# -----------------------------
# 5. Save the handled dataset
# -----------------------------
# Written relative to the notebook's current working directory.
handled_df.to_csv("articles_handled_syntactic.csv", index=False)
print("✅ Handled syntactic ambiguity dataset saved as 'articles_handled_syntactic.csv'")

✅ Handled syntactic ambiguity dataset saved as 'articles_handled_syntactic.csv'


In [22]:
# Reload the handled dataset for re-analysis.
# NOTE(review): this rebinds `df` (previously the raw dataset) and reads from an
# absolute path, whereas the save step wrote "articles_handled_syntactic.csv"
# relative to the working directory — confirm both refer to the same file.
df = pd.read_csv(r"F:\College\SEM 7\19CSE453-Natural Language Processing\Case Study\Ambiguities\articles_handled_syntactic.csv")  # Replace with your dataset path

In [23]:
# Re-run the analysis on the "Handled_Article" text with the same defaults as the
# baseline run; this rebinds `df_results`, discarding the baseline frame.
# NOTE(review): the output recorded below matches the baseline exactly (300131
# sentences, 161323 flagged), which suggests the handling step did not change the
# text — verify before drawing conclusions from this comparison.
df_results = analyze_syntactic_dataset(df, text_column="Handled_Article", label_column="Sentiment")

Processing articles: 100%|██████████| 10000/10000 [24:27<00:00,  6.81it/s] 


=== Dataset Syntactic Ambiguity Analysis ===
Total articles processed: 10000
Total sentences: 300131
Total flagged as syntactically ambiguous: 161323
Overall syntactic ambiguity: 53.75%

=== Per-Sentiment Breakdown ===
Neutral: 78010 flagged sentences / 146885 total sentences (53.11%)
Positive: 47824 flagged sentences / 89931 total sentences (53.18%)
Negative: 35489 flagged sentences / 63315 total sentences (56.05%)
