In [2]:
# Mount Google Drive only when the notebook is running on Colab. Outside Colab
# the `google.colab` package is not installed, and the later cells of this
# notebook read their data from a local path, so the mount is simply skipped
# instead of aborting the run with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 1.1 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 1.2 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 1.2 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 1.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 1.2 MB/s  0:00:01
Downloading click-8.3.0-py3-none-any.whl (107 kB)
Installing collected packages: click, nltk

   ---------------------------------------- 0/2 [click]
   ---------------------------------------- 0/2 [cl

In [1]:
import nltk
import pandas as pd
import numpy as np
from transformers import pipeline
import torch
from tqdm import tqdm
from multiprocessing import Pool, cpu_count, set_start_method
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook-wide logging: INFO and above, each record prefixed with a timestamp
# and its level name.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

In [3]:
# Fetch the 'punkt_tab' NLTK data package (a no-op when it is already up to date).
# Presumably this is the sentence-tokenizer data needed by the
# nltk.sent_tokenize call in quantify_pragmatic_model -- confirm for the
# installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\Shyam
[nltk_data]     Anand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Build the first sarcasm classifier as a Hugging Face text-classification
# pipeline. A load failure is logged and then re-raised, because every later
# cell depends on this object.
#
# NOTE(review): the load log printed for this cell reports that the
# classification_head weights were "newly initialized" rather than read from
# the checkpoint, and recommends training the model before using it for
# inference. Confirm that this checkpoint is suitable for the
# "text-classification" task before trusting its scores.
SARCASM_MODEL_NAME = "mrm8488/t5-base-finetuned-sarcasm-twitter"

try:
    # Device index 0 selects the first CUDA GPU; -1 keeps the pipeline on CPU.
    if torch.cuda.is_available():
        detector_device = 0
    else:
        detector_device = -1
    sarcasm_detector = pipeline(
        "text-classification",
        model=SARCASM_MODEL_NAME,
        device=detector_device,
    )
except Exception as load_error:
    logging.error(f"Failed to load model: {load_error}")
    raise

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at mrm8488/t5-base-finetuned-sarcasm-twitter and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0


In [4]:
# Install/upgrade blobfile with the pip that belongs to the interpreter running
# this kernel (sys.executable), so the package lands in the kernel's own
# environment.
# NOTE(review): no cell visible in this notebook imports blobfile directly;
# presumably it is an indirect requirement of one of the libraries used here
# -- confirm it is still needed.
import sys
!"{sys.executable}" -m pip install blobfile --upgrade



In [None]:
def quantify_pragmatic_model(article_text, threshold=0.5, max_sentences=50,
                             detector=None, sarcasm_labels=('SARCASM', 'LABEL_1'),
                             batch_size=32, sentence_splitter=None):
    """
    Quantify pragmatic ambiguity (sarcasm) for a single article.

    The article is split into sentences, at most ``max_sentences`` of them are
    classified in batches, and the per-sentence sarcasm probabilities are
    summarised.

    Args:
        article_text (str): Text of the article. Anything that is not a string
            (e.g. NaN from a pandas column) yields an all-zero result.
        threshold (float): A sentence is flagged when its sarcasm probability
            is strictly greater than this value (default: 0.5).
        max_sentences (int): Max sentences to classify per article
            (default: 50).
        detector (callable, optional): Classifier called as
            ``detector(list_of_sentences, truncation=True)`` and returning one
            ``{'label': ..., 'score': ...}`` dict per sentence. Defaults to the
            notebook-level ``sarcasm_detector`` pipeline.
        sarcasm_labels (iterable of str): Label names (compared
            case-insensitively) that denote the sarcastic class. ``'LABEL_1'``
            is included because checkpoints that ship without label names
            report ``LABEL_0``/``LABEL_1``; it is assumed here that index 1 is
            the sarcastic class -- confirm against the model card of the
            checkpoint in use.
        batch_size (int): Number of sentences sent to the detector per call
            (default: 32).
        sentence_splitter (callable, optional): Function mapping the article
            text to a list of sentences. Defaults to ``nltk.sent_tokenize``.

    Returns:
        dict: with keys
            ``count``               number of flagged sentences,
            ``percentage``          flagged / successfully classified * 100,
            ``avg_prob``            mean sarcasm probability of the classified
                                    sentences (0.0 when none were classified),
            ``flagged_sentences``   the flagged sentence texts,
            ``total_sentences``     sentences found in the article (uncapped),
            ``processed_sentences`` sentences that were actually classified.

    Raises:
        NameError: if ``detector`` is omitted and no global
            ``sarcasm_detector`` exists (a setup error that should not be
            hidden behind all-zero results).
    """
    def _build_result(flagged=(), avg_prob=0.0, total=0, processed=0):
        # Single place that defines the shape of the returned record.
        flagged = list(flagged)
        return {
            'count': len(flagged),
            'percentage': (len(flagged) / processed) * 100 if processed else 0.0,
            'avg_prob': avg_prob,
            'flagged_sentences': flagged,
            'total_sentences': total,
            'processed_sentences': processed
        }

    # Resolve collaborators outside the try block so that a missing model is
    # reported immediately instead of being turned into empty results.
    classify = sarcasm_detector if detector is None else detector
    split_sentences = nltk.sent_tokenize if sentence_splitter is None else sentence_splitter

    if not isinstance(article_text, str):
        logging.warning(
            f"Error processing article: expected str, got {type(article_text).__name__}"
        )
        return _build_result()

    try:
        if isinstance(sarcasm_labels, str):
            sarcasm_labels = (sarcasm_labels,)
        positive_labels = {str(label).upper() for label in sarcasm_labels}
        step = max(1, int(batch_size))

        all_sentences = list(split_sentences(article_text))
        total_sentences = len(all_sentences)  # Track total for reporting
        sentences = all_sentences[:max_sentences]  # Cap for processing
        if not sentences:
            return _build_result(total=total_sentences)

        # One sarcasm probability per sentence; None marks a sentence whose
        # batch could not be classified.
        probabilities = []
        for start in range(0, len(sentences), step):
            batch = sentences[start:start + step]
            try:
                # truncation=True keeps over-long sentences within the model's
                # maximum input length instead of failing the whole batch.
                outputs = classify(batch, truncation=True)
            except Exception as e:
                logging.warning(f"Error processing batch in article: {e}")
                probabilities.extend([None] * len(batch))
                continue
            for res in outputs:
                score = float(res['score'])
                # The detector reports the score of its predicted label only;
                # for the other class of a binary model the sarcasm
                # probability is the complement.
                is_sarcasm_label = str(res['label']).upper() in positive_labels
                probabilities.append(score if is_sarcasm_label else 1.0 - score)

        # Unclassified sentences are excluded from every statistic rather than
        # being counted as (non-)sarcastic.
        scored = [(sent, prob) for sent, prob in zip(sentences, probabilities)
                  if prob is not None]
        flagged_sentences = [sent for sent, prob in scored if prob > threshold]
        avg_prob = float(np.mean([prob for _, prob in scored])) if scored else 0.0

        return _build_result(
            flagged=flagged_sentences,
            avg_prob=avg_prob,
            total=total_sentences,
            processed=len(scored)
        )
    except Exception as e:
        # Best effort: one bad article must not abort a long dataset run.
        logging.warning(f"Error processing article: {e}")
        return _build_result()


In [16]:
def process_dataset(articles, threshold=0.5, max_sentences=1, save_interval=1000, detector=None):
    """
    Process dataset for pragmatic ambiguity (single-threaded).

    Each article is passed to ``quantify_pragmatic_model``; the per-article
    records are collected into a DataFrame and aggregated.

    Args:
        articles (list or pandas.Series): Article texts. Must support ``len``.
        threshold (float): Sarcasm probability threshold.
        max_sentences (int): Max sentences classified per article. Note the
            default here is 1, unlike ``quantify_pragmatic_model``'s default.
        save_interval (int): Write a partial-results CSV every N articles.
            ``0``, ``None`` or a negative value disables partial saves.
        detector (callable, optional): Classifier forwarded to
            ``quantify_pragmatic_model`` as ``detector=``. When omitted nothing
            is forwarded and that function's own default is used.

    Returns:
        dict: with keys
            ``results``                     per-article DataFrame,
            ``dataset_avg_percentage``      mean of the per-article percentages,
            ``dataset_total_sarcastic``     total flagged sentences,
            ``dataset_total_sentences``     total sentences found (uncapped),
            ``dataset_processed_sentences`` total sentences actually classified
                                            (equals ``dataset_total_sentences``
                                            when the per-article records carry
                                            no ``processed_sentences`` field),
            ``dataset_incidence``           flagged / classified * 100.
    """
    logging.info(f"Processing {len(articles)} articles single-threaded")

    # Forward `detector` only when one was supplied, so this function also works
    # with a quantify_pragmatic_model that does not accept that keyword.
    extra_kwargs = {} if detector is None else {'detector': detector}
    saving_enabled = bool(save_interval) and save_interval > 0

    results = []
    for position, text in enumerate(tqdm(articles, desc="Processing articles"), start=1):
        logging.debug(f"Starting article {position}")
        results.append(quantify_pragmatic_model(text, threshold, max_sentences, **extra_kwargs))

        # Save intermediate results so a crash does not lose a multi-hour run.
        if saving_enabled and position % save_interval == 0:
            pd.DataFrame(results).to_csv(f'sarcasm_results_partial_{position}_new.csv', index=False)
            logging.info(f"Saved partial results at article {position}")

    if not results:
        # pd.DataFrame([]) has no columns; return a well-formed empty summary
        # instead of failing on the column look-ups below.
        empty_columns = ['count', 'percentage', 'avg_prob', 'flagged_sentences', 'total_sentences']
        return {
            'results': pd.DataFrame(columns=empty_columns),
            'dataset_avg_percentage': 0.0,
            'dataset_total_sarcastic': 0,
            'dataset_total_sentences': 0,
            'dataset_processed_sentences': 0,
            'dataset_incidence': 0
        }

    df = pd.DataFrame(results)
    total_sentences = int(df['total_sentences'].sum())
    total_sarcastic = int(df['count'].sum())

    # Incidence is measured against the sentences that were actually
    # classified; sentences beyond the per-article cap were never scored.
    if 'processed_sentences' in df.columns:
        processed_sentences = int(df['processed_sentences'].sum())
    else:
        processed_sentences = total_sentences
    dataset_incidence = (total_sarcastic / processed_sentences) * 100 if processed_sentences else 0

    return {
        'results': df,
        'dataset_avg_percentage': float(df['percentage'].mean()),
        'dataset_total_sarcastic': total_sarcastic,
        'dataset_total_sentences': total_sentences,
        'dataset_processed_sentences': processed_sentences,
        'dataset_incidence': dataset_incidence
    }


In [6]:
# Load the pre-processed article table and display it.
# NOTE(review): this is an absolute path on one particular machine; the
# notebook will not run elsewhere until it is pointed at a local copy.
DATASET_CSV = r"C:\Users\Shyam Anand\Documents\Mangal\NLP\preprocessed_fnspid_10k.csv"
df = pd.read_csv(DATASET_CSV)
df

Unnamed: 0,Article_title,Stock_symbol,Article,Textrank_summary,Preprocessed_Summary,Sentiment,Confidence
0,5 Growth Stocks at New Highs with Room for Mor...,ADM,Amid the ongoing trade tensions between the Un...,"Click to get this free report NetApp, Inc. (NT...","click to get this free report netapp, inc. nta...",Neutral,0.744207
1,3 Strong Buy Semiconductor Stocks to Consider Now,AMAT,Semiconductor stocks were battered by the rece...,Click to get this free report Apple Inc. (AAPL...,click to get this free report apple inc. aapl ...,Positive,0.633728
2,Pfizer's Breast Cancer Drug Misses Overall Sur...,ANIP,Pfizer PFE announced disappointing overall sur...,Click to get this free report AstraZeneca PLC ...,click to get this free report astrazeneca plc ...,Positive,0.504592
3,Microsoft to Launch Second-Gen Hololens AR Hea...,AR,Microsoft is getting ready to launch the next ...,Microsoft is getting ready to launch the next ...,microsoft is getting ready to launch the next ...,Neutral,0.947672
4,Camden Property Trust (CPT) Ex-Dividend Date S...,AMT,Camden Property Trust ( CPT ) will begin tradi...,"CPT is a part of the Consumer Services sector,...","cpt is a part of the consumer services sector,...",Negative,0.547460
...,...,...,...,...,...,...,...
9995,What Makes Jakks Pacific (JAKK) a Strong Momen...,AFGD,Momentum investing revolves around the idea of...,Our research shows that stocks rated Zacks Ran...,our research shows that stocks rated zacks ran...,Neutral,0.930927
9996,Is Bunge Limited (BG) Stock Undervalued Right ...,BG,The proven Zacks Rank system focuses on earnin...,Click to get this free report Bunge Limited (B...,click to get this free report bunge limited bg...,Neutral,0.925172
9997,Is Wingstop (WING) a Solid Growth Stock? 3 Rea...,ARGD,Growth investors focus on stocks that are seei...,"However, the task of finding cutting-edge grow...","however, the task of finding cutting edge grow...",Neutral,0.758138
9998,3 Nasdaq Stocks That Have Generated 10x Return...,AAPL,The Nasdaq is home to many of the best growth ...,Some of them have generated life-changing retu...,some of them have generated life changing retu...,Positive,0.738722


In [7]:
# Full article bodies, one per row; this is the input to process_dataset.
articles = df.loc[:, 'Article']
articles.shape

(10000,)

In [8]:
# Select the 'spawn' multiprocessing start method; force=True replaces any
# method that was already chosen. Success and failure are both logged.
# NOTE(review): no cell shown in this notebook creates a Pool, and
# process_dataset runs single-threaded, so this setting may be a leftover.
try:
    set_start_method('spawn', force=True)
except RuntimeError as start_error:
    logging.warning(f"Failed to set start method to 'spawn': {start_error}")
    logging.info("Continuing with default start method, may cause CUDA issues")
else:
    logging.info("Multiprocessing start method set to 'spawn'")

2025-09-28 11:44:09,557 - INFO - Multiprocessing start method set to 'spawn'


In [9]:
# Run the first detector over every article, classifying at most 50 sentences
# per article. Long-running: the timestamps logged by this cell and the
# reporting cell after it span roughly two and a half hours.
result = process_dataset(articles, threshold=0.5, max_sentences=50)

2025-09-28 11:44:13,284 - INFO - Processing 10000 articles single-threaded
Processing articles:   0%|          | 6/10000 [00:04<2:11:52,  1.26it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing articles:  10%|▉         | 999/10000 [13:26<2:29:45,  1.00it/s]2025-09-28 11:57:39,937 - INFO - Saved partial results at article 1000
Processing articles:  18%|█▊        | 1826/10000 [30:50<2:03:24,  1.10it/s] Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors
Processing articles:  20%|█▉        | 1999/10000 [33:03<1:16:52,  1.73it/s]2025-09-28 12:17:17,860 - INFO - Saved partial results at article 2000
Processing articles:  30%|██▉       | 2999/10000 [48:14<1:36:22,  1.21it/s]2025-09-28 12:32:28,238 - INFO - Saved partial results at article 3000
Processing articles:  40%|███▉      | 3999/10000 

In [10]:
# --- Dataset-level summary for the first run ---
per_article_results = result['results']

print(f"\nModel-Based Sarcasm Detection Results:")
print(f"Dataset avg pragmatic ambiguity: {result['dataset_avg_percentage']:.2f}%")
print(
    f"Total sarcastic sentences: {result['dataset_total_sarcastic']} "
    f"({result['dataset_incidence']:.2f}% of {result['dataset_total_sentences']} sentences)"
)

# --- Persist the per-article table ---
per_article_results.to_csv('sarcasm_results_large.csv', index=False)
logging.info("Results saved to sarcasm_results_large.csv")

# --- Spot-check the first five articles ---
print("\nSample Article Results (first 5):")
for i, row in per_article_results.iloc[:5].iterrows():
    print(
        f"Article {i+1}: {row['count']} sarcastic sentences ({row['percentage']:.2f}%), "
        f"Avg prob: {row['avg_prob']:.2f}, Total sentences: {row['total_sentences']}"
    )
    print(f"Flagged sentences (first 2): {row['flagged_sentences'][:2]}")

2025-09-28 14:09:56,881 - INFO - Results saved to sarcasm_results_large.csv



Model-Based Sarcasm Detection Results:
Dataset avg pragmatic ambiguity: 0.00%
Total sarcastic sentences: 0 (0.00% of 418149 sentences)

Sample Article Results (first 5):
Article 1: 0 sarcastic sentences (0.00%), Avg prob: 0.20, Total sentences: 36
Flagged sentences (first 2): []
Article 2: 0 sarcastic sentences (0.00%), Avg prob: 0.20, Total sentences: 37
Flagged sentences (first 2): []
Article 3: 0 sarcastic sentences (0.00%), Avg prob: 0.19, Total sentences: 27
Flagged sentences (first 2): []
Article 4: 0 sarcastic sentences (0.00%), Avg prob: 0.22, Total sentences: 12
Flagged sentences (first 2): []
Article 5: 0 sarcastic sentences (0.00%), Avg prob: 0.20, Total sentences: 17
Flagged sentences (first 2): []


In [11]:
# Article headlines, one per row.
# NOTE(review): `title` is not referenced by any later cell shown in this
# notebook -- confirm whether it is still needed.
title = df.loc[:, 'Article_title']
title.shape

(10000,)

In [13]:

# Build the second sarcasm classifier for comparison with the first one, again
# as a Hugging Face text-classification pipeline. A load failure is logged and
# then re-raised.
NEW_SARCASM_MODEL_NAME = "helinivan/english-sarcasm-detector"

try:
    # Device index 0 selects the first CUDA GPU; -1 keeps the pipeline on CPU.
    if torch.cuda.is_available():
        new_detector_device = 0
    else:
        new_detector_device = -1
    sarcasm_detector_new = pipeline(
        "text-classification",
        model=NEW_SARCASM_MODEL_NAME,
        device=new_detector_device,
    )
except Exception as load_error:
    logging.error(f"Failed to load model: {load_error}")
    raise

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cuda:0


In [17]:
# Second full run, meant to evaluate the second detector with the same settings.
# NOTE(review): this call does not pass a detector, and the sarcasm call inside
# quantify_pragmatic_model reads the global `sarcasm_detector`. Confirm that the
# intended model (`sarcasm_detector_new`) is really the one used here; on a
# fresh Restart & Run All nothing visible rebinds that global.
result_new = process_dataset(articles, threshold=0.5, max_sentences=50)

2025-09-28 19:36:02,683 - INFO - Processing 10000 articles single-threaded
Processing articles:  10%|▉         | 999/10000 [03:25<39:01,  3.84it/s]2025-09-28 19:39:28,036 - INFO - Saved partial results at article 1000
Processing articles:  20%|█▉        | 1999/10000 [06:53<20:55,  6.37it/s]2025-09-28 19:42:56,446 - INFO - Saved partial results at article 2000
Processing articles:  30%|██▉       | 2999/10000 [10:49<22:04,  5.29it/s]2025-09-28 19:46:52,773 - INFO - Saved partial results at article 3000
Processing articles:  40%|███▉      | 3998/10000 [13:40<21:50,  4.58it/s]2025-09-28 19:49:43,729 - INFO - Saved partial results at article 4000
2025-09-28 19:52:34,652 - INFO - Saved partial results at article 5000
Processing articles:  60%|█████▉    | 5999/10000 [19:31<14:31,  4.59it/s]2025-09-28 19:55:34,128 - INFO - Saved partial results at article 6000
Processing articles:  70%|██████▉   | 6998/10000 [22:41<08:56,  5.60it/s]2025-09-28 19:58:44,840 - INFO - Saved partial results at arti

In [18]:
# Dataset-level summary for the second detector run (`result_new`).
print(f"\nModel-Based Sarcasm Detection Results:")
print(f"Dataset avg pragmatic ambiguity: {result_new['dataset_avg_percentage']:.2f}%")
print(f"Total sarcastic sentences: {result_new['dataset_total_sarcastic']} "
      f"({result_new['dataset_incidence']:.2f}% of {result_new['dataset_total_sentences']} sentences)")

# Save results under a name of their own: the first run was written to
# 'sarcasm_results_large.csv', and reusing that name here would overwrite it.
NEW_RESULTS_CSV = 'sarcasm_results_large_new.csv'
result_new['results'].to_csv(NEW_RESULTS_CSV, index=False)
logging.info(f"Results saved to {NEW_RESULTS_CSV}")

# Sample results
print("\nSample Article Results (first 5):")
for i, row in result_new['results'].head(5).iterrows():
    print(f"Article {i+1}: {row['count']} sarcastic sentences ({row['percentage']:.2f}%), "
          f"Avg prob: {row['avg_prob']:.2f}, Total sentences: {row['total_sentences']}")
    print(f"Flagged sentences (first 2): {row['flagged_sentences'][:2]}")

2025-09-28 20:57:29,721 - INFO - Results saved to sarcasm_results_large.csv



Model-Based Sarcasm Detection Results:
Dataset avg pragmatic ambiguity: 0.00%
Total sarcastic sentences: 0 (0.00% of 418149 sentences)

Sample Article Results (first 5):
Article 1: 0 sarcastic sentences (0.00%), Avg prob: 0.07, Total sentences: 36
Flagged sentences (first 2): []
Article 2: 0 sarcastic sentences (0.00%), Avg prob: 0.05, Total sentences: 37
Flagged sentences (first 2): []
Article 3: 0 sarcastic sentences (0.00%), Avg prob: 0.03, Total sentences: 27
Flagged sentences (first 2): []
Article 4: 0 sarcastic sentences (0.00%), Avg prob: 0.02, Total sentences: 12
Flagged sentences (first 2): []
Article 5: 0 sarcastic sentences (0.00%), Avg prob: 0.03, Total sentences: 17
Flagged sentences (first 2): []
