In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Neel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Neel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#Loading dataframe

df = pd.read_csv(r'C:\Users\Neel\OneDrive\Desktop\University\Y4S2\IR\Information-Retrieval\data\cleaned_data.csv')
# Ensure text is in string format
df['cleaned_text'] = df['cleaned_text'].astype(str)

In [6]:
# Tokenize the cleaned_text column to process each post
df['tokens'] = df['cleaned_text'].apply(word_tokenize)
all_tokens = [item for sublist in df['tokens'].tolist() for item in sublist]

# Utilize CountVectorizer to find phrases that appear frequently
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
X = vectorizer.fit_transform([' '.join(all_tokens)])
features = vectorizer.get_feature_names_out()

In [9]:
# Sum up the counts of each term to find the most common ones
sums = X.sum(axis=0)
data = []
for col, term in enumerate(features):
    data.append((term, sums[0,col]))

ranking = pd.DataFrame(data, columns=['term','freq'])
ranking = ranking.sort_values('freq', ascending=False)

# Print the top 20 concepts/terms
print(ranking.head(20))

                term   freq
470870         https  16821
166373         click  14008
980013         stock  11440
295963      earnings   7430
613212        market   7009
184617       company   5793
1127020         week   5608
468962          http   5576
154657         chart   5330
1050912        today   5117
166392   click chart   4288
1061520      trading   4066
1150015         year   3762
130165      calendar   3740
568155          like   3127
580292          live   3110
675239           new   2852
984956        stocks   2785
1045176         time   2778
929062        shares   2745


## Word Sense Disambiguation from GPT: ( useless )

Word Sense Disambiguation (WSD) is the process of identifying which sense of a word is used in a sentence, when the word has multiple meanings. This is a more complex task in NLP and often requires understanding the context in which a word is used. One common approach to WSD is to use supervised learning models trained on annotated corpora, but for simpler applications or demonstrations, we can use pre-built libraries like NLTK that come with some capabilities for WSD.

For this example, let's use NLTK's Lesk algorithm, which is a simple and classical algorithm for WSD. The Lesk algorithm works by comparing the words in the context of the target word with the words in the definitions of the target word's various senses, and selecting the sense with the highest overlap of words.

Here's how you can implement a basic WSD using NLTK's Lesk algorithm for a given sentence and target word within that sentence:

In [10]:
from nltk.wsd import lesk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Neel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Neel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Neel\AppData\Roaming\nltk_data...


True

In [12]:
# Define a function to apply WSD on a piece of text
def disambiguate_word_sense(row, target_word):
    tokens = word_tokenize(row['cleaned_text'])
    sense = lesk(tokens, target_word)
    if sense:
        return sense.definition()
    else:
        return "No sense found"

In [15]:
# Choose your target word
target_word = "stocks"

# Apply WSD to each row for the target word
df['disambiguated_sense'] = df.apply(disambiguate_word_sense, target_word=target_word, axis=1)

# Optionally, save the DataFrame to a new CSV file
df.to_csv('disambiguated_text.csv', index=False)

# Show the DataFrame
print(df[['cleaned_text', 'disambiguated_sense']])

                                           cleaned_text  \
0     lately seen big drops capacity utilization eur...   
1     make sure wsb discord https check earnings thr...   
2     convinced mom give grand trade split profit pu...   
3     aggressive advertising strategy allocating sub...   
4     nvidia amd first movers ai revolution let assu...   
...                                                 ...   
7231  anyone update occurred https anyone created su...   
7232  long story short xrp enthusiast traders coin k...   
7233                                           conflict   
7234                                 strongest argument   
7235  newb buyer xrp buying buy one xrp know ripple ...   

                                    disambiguated_sense  
0        a supply of something available for future use  
1        a supply of something available for future use  
2     a former instrument of punishment consisting o...  
3        a supply of something available for future use  
4

In [16]:
df.head(-5)

Unnamed: 0.1,Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,cleaned_text,tokens,disambiguated_sense
0,0,Are we in a recession or not?,2,1azytar,wallstreetbets,https://i.redd.it/8xp7l2atpskc1.jpeg,3,Lately I've seen big drops in Capacity Utiliza...,25/2/2024,lately seen big drops capacity utilization eur...,"[lately, seen, big, drops, capacity, utilizati...",a supply of something available for future use
1,1,"What Are Your Moves Tomorrow, February 26, 2024",1,1azyssf,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,28,Make sure you're in the [WSB Discord](https://...,25/2/2024,make sure wsb discord https check earnings thr...,"[make, sure, wsb, discord, https, check, earni...",a supply of something available for future use
2,2,Thank you Nancy Pelosi,113,1azxsze,wallstreetbets,https://i.redd.it/symyx8lniskc1.jpeg,20,"I’m 18, convinced my mom to give me a grand to...",25/2/2024,convinced mom give grand trade split profit pu...,"[convinced, mom, give, grand, trade, split, pr...",a former instrument of punishment consisting o...
3,3,Imitator's Reckoning: $AI Anticipating a Downf...,17,1azx26v,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,9,"\nC3.ai's aggressive advertising strategy, ...",25/2/2024,aggressive advertising strategy allocating sub...,"[aggressive, advertising, strategy, allocating...",a supply of something available for future use
4,4,What are the AI derivative plays?,9,1azvuxx,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,38,If NVIDIA and AMD are the first movers in the ...,25/2/2024,nvidia amd first movers ai revolution let assu...,"[nvidia, amd, first, movers, ai, revolution, l...",a supply of something available for future use
...,...,...,...,...,...,...,...,...,...,...,...,...
7226,10193,WTF,0,15geh8u,XRP,https://www.reddit.com/r/XRP/comments/15geh8u/...,76,Wtf is up why does ot look like the floor just...,2/8/2023,wtf ot look like floor droped xrp di hit spot ...,"[wtf, ot, look, like, floor, droped, xrp, di, ...",a supply of something available for future use
7227,10194,Is this smart?,24,15gdbss,XRP,https://www.reddit.com/r/XRP/comments/15gdbss/...,56,"Hi guys, first time crypto buyer here ,and I w...",2/8/2023,hi guys first time crypto buyer would like kno...,"[hi, guys, first, time, crypto, buyer, would, ...",a supply of something available for future use
7228,10195,A new court case?,48,15g6iex,XRP,https://www.reddit.com/r/XRP/comments/15g6iex/...,31,"Wait, so a federal judge that was not associat...",2/8/2023,wait federal judge associated ripple case like...,"[wait, federal, judge, associated, ripple, cas...",a supply of something available for future use
7229,10197,08/02/23 [Join XRPLounge Discord] - discord.co...,5,15fy6ik,XRP,https://www.reddit.com/r/XRP/comments/15fy6ik/...,1,# XRPLounge Discord\n\n\>>> **Invite Link:** [...,2/8/2023,xrplounge discord invite link https official d...,"[xrplounge, discord, invite, link, https, offi...",a special variety of domesticated animals with...


## Sarcasm Detection

In [17]:
import pandas as pd
from textblob import TextBlob

In [19]:
# Define a function to detect sarcasm based on sentiment analysis
def detect_sarcasm(text):
    analysis = TextBlob(text)
    # Extremely positive or negative sentiment might indicate sarcasm
    # This threshold is arbitrary and for demonstration; adjust based on your observations
    if analysis.sentiment.polarity > 0.8 or analysis.sentiment.polarity < -0.8:
        return "Potential sarcasm"
    else:
        return "Not sarcasm"

In [20]:
# Apply sarcasm detection on the cleaned_text column
df['sarcasm'] = df['cleaned_text'].apply(detect_sarcasm)

# Display or save your results
print(df[['cleaned_text', 'sarcasm']])

                                           cleaned_text      sarcasm
0     lately seen big drops capacity utilization eur...  Not sarcasm
1     make sure wsb discord https check earnings thr...  Not sarcasm
2     convinced mom give grand trade split profit pu...  Not sarcasm
3     aggressive advertising strategy allocating sub...  Not sarcasm
4     nvidia amd first movers ai revolution let assu...  Not sarcasm
...                                                 ...          ...
7231  anyone update occurred https anyone created su...  Not sarcasm
7232  long story short xrp enthusiast traders coin k...  Not sarcasm
7233                                           conflict  Not sarcasm
7234                                 strongest argument  Not sarcasm
7235  newb buyer xrp buying buy one xrp know ripple ...  Not sarcasm

[7236 rows x 2 columns]


In [21]:
df.head(-10)

Unnamed: 0.1,Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,cleaned_text,tokens,disambiguated_sense,sarcasm
0,0,Are we in a recession or not?,2,1azytar,wallstreetbets,https://i.redd.it/8xp7l2atpskc1.jpeg,3,Lately I've seen big drops in Capacity Utiliza...,25/2/2024,lately seen big drops capacity utilization eur...,"[lately, seen, big, drops, capacity, utilizati...",a supply of something available for future use,Not sarcasm
1,1,"What Are Your Moves Tomorrow, February 26, 2024",1,1azyssf,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,28,Make sure you're in the [WSB Discord](https://...,25/2/2024,make sure wsb discord https check earnings thr...,"[make, sure, wsb, discord, https, check, earni...",a supply of something available for future use,Not sarcasm
2,2,Thank you Nancy Pelosi,113,1azxsze,wallstreetbets,https://i.redd.it/symyx8lniskc1.jpeg,20,"I’m 18, convinced my mom to give me a grand to...",25/2/2024,convinced mom give grand trade split profit pu...,"[convinced, mom, give, grand, trade, split, pr...",a former instrument of punishment consisting o...,Not sarcasm
3,3,Imitator's Reckoning: $AI Anticipating a Downf...,17,1azx26v,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,9,"\nC3.ai's aggressive advertising strategy, ...",25/2/2024,aggressive advertising strategy allocating sub...,"[aggressive, advertising, strategy, allocating...",a supply of something available for future use,Not sarcasm
4,4,What are the AI derivative plays?,9,1azvuxx,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,38,If NVIDIA and AMD are the first movers in the ...,25/2/2024,nvidia amd first movers ai revolution let assu...,"[nvidia, amd, first, movers, ai, revolution, l...",a supply of something available for future use,Not sarcasm
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7221,10188,Don't give in! HODL!!!,126,15hayxs,XRP,https://www.reddit.com/r/XRP/comments/15hayxs/...,130,Don't let these numbers convince you. Nothing ...,3/8/2023,let numbers convince nothing bad happening bad...,"[let, numbers, convince, nothing, bad, happeni...",a supply of something available for future use,Not sarcasm
7222,10189,Xrp/Twitter,12,15h5nsh,XRP,https://www.reddit.com/r/XRP/comments/15h5nsh/...,12,Man the new Twitter logo be loading me off eve...,3/8/2023,man new twitter logo loading every time get no...,"[man, new, twitter, logo, loading, every, time...",a supply of something available for future use,Not sarcasm
7223,10190,Whats happening,67,15gy5wu,XRP,https://www.reddit.com/r/XRP/comments/15gy5wu/...,142,Why the drop? Time to dca?,3/8/2023,drop time dca,"[drop, time, dca]",a supply of something available for future use,Not sarcasm
7224,10191,08/03/23 [Join XRPLounge Discord] - discord.co...,5,15gtejd,XRP,https://www.reddit.com/r/XRP/comments/15gtejd/...,1,# XRPLounge Discord\n\n\>>> **Invite Link:** [...,3/8/2023,xrplounge discord invite link https official d...,"[xrplounge, discord, invite, link, https, offi...",a special variety of domesticated animals with...,Not sarcasm


In [22]:
# Filter rows where sarcasm column is 'Potential sarcasm'
sarcasm_detected = df[df['sarcasm'] == 'Potential sarcasm']
print(sarcasm_detected)

     Unnamed: 0                                              title score  \
252         410          Is there a "Mount Rushmore" of loss porn?    20   
2690       3213                    $EQ Equillium head-spinning day     4   
3325       3894                                Ballsy GMBL play…..     2   
3456       4049       Best Bitcoin or Cryptocurrency Documentaries     1   
3596       4250       Here's my honest question about BTC security     0   
3742       4489                              Question for you guys     0   
3812       4584  Ladies and Gentlemen I would like to announce ...     4   
3837       4618                                   Best way to DCA.     5   
3901       4693  How can I convert my  Pension into Bitcoin? (E...    15   
4085       4936  What is your preferred method to buy BTC in th...     2   
4308       5719  Buy Shiba3000! Shiba2999 was a rug pull but Sh...     3   
5413       7642  Best Stocks To Buy Now: 19 Airline Stocks That...     1   
5905       8

In [29]:
df_potential_sarcasm = df[df['sarcasm'] == 'Potential sarcasm']

df_potential_sarcasm.head(-5)

Unnamed: 0.1,Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,cleaned_text,tokens,disambiguated_sense,sarcasm
252,410,"Is there a ""Mount Rushmore"" of loss porn?",20,1awnavf,wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,25,I've got some good losses saved on here... but...,21/2/2024,got good losses saved need want greatest close...,"[got, good, losses, saved, need, want, greates...",a supply of something available for future use,Potential sarcasm
2690,3213,$EQ Equillium head-spinning day,4,1arp0ol,pennystocks,https://www.reddit.com/r/pennystocks/comments/...,16,"Equillium dropped 20 percent today. No news, n...",15/2/2024,equillium dropped percent today news nada hung...,"[equillium, dropped, percent, today, news, nad...",a supply of something available for future use,Potential sarcasm
3325,3894,Ballsy GMBL play…..,2,18maogn,pennystocks,https://i.redd.it/uzmduipv2b7c1.jpeg,20,Hoping for the best with the one folks . 🤞,19/12/2023,hoping best one folks,"[hoping, best, one, folks]",the descendants of one individual,Potential sarcasm
3456,4049,Best Bitcoin or Cryptocurrency Documentaries,1,1azncij,Bitcoin,https://www.reddit.com/r/Bitcoin/comments/1azn...,3,What are the best Bitcoin/Crypto documentaries...,25/2/2024,best documentaries watch,"[best, documentaries, watch]",a supply of something available for future use,Potential sarcasm
3596,4250,Here's my honest question about BTC security,0,1axotyd,Bitcoin,https://www.reddit.com/r/Bitcoin/comments/1axo...,34,"What would we do if Bitcoin miners ""unionized""...",23/2/2024,would bitcoin miners unionized sense decided c...,"[would, bitcoin, miners, unionized, sense, dec...",a supply of something available for future use,Potential sarcasm
3742,4489,Question for you guys,0,1avqmii,Bitcoin,https://www.reddit.com/r/Bitcoin/comments/1avq...,10,I want to buy some bitcoin a 500 or a 1000 $ a...,20/2/2024,want buy bitcoin wondering best month buy halv...,"[want, buy, bitcoin, wondering, best, month, b...",a supply of something available for future use,Potential sarcasm
3812,4584,Ladies and Gentlemen I would like to announce ...,4,1aupowh,Bitcoin,https://www.reddit.com/r/Bitcoin/comments/1aup...,2,I have been a YUGE fan of Wasabi Wallet from d...,19/2/2024,yuge fan wasabi wallet day ca tell happy execu...,"[yuge, fan, wasabi, wallet, day, ca, tell, hap...",a supply of something available for future use,Potential sarcasm
3837,4618,Best way to DCA.,5,1aujeey,Bitcoin,https://www.reddit.com/r/Bitcoin/comments/1auj...,19,I have about $500 a month for BTC.\n\nWhat is ...,19/2/2024,month btc best way buy monthly fortnightly weekly,"[month, btc, best, way, buy, monthly, fortnigh...",a supply of something available for future use,Potential sarcasm
3901,4693,How can I convert my Pension into Bitcoin? (E...,15,1atq5fk,Bitcoin,https://www.reddit.com/r/Bitcoin/comments/1atq...,7,I have a pension in Ireland with Mercer. I cal...,18/2/2024,pension ireland mercer called saying way get b...,"[pension, ireland, mercer, called, saying, way...",a plant or stem onto which a graft is made; es...,Potential sarcasm
4085,4936,What is your preferred method to buy BTC in th...,2,1arnorj,Bitcoin,https://www.reddit.com/r/Bitcoin/comments/1arn...,10,"I have a DCA set through River, but I would li...",15/2/2024,dca set river would like make chunk purchases ...,"[dca, set, river, would, like, make, chunk, pu...",a supply of something available for future use,Potential sarcasm


In [30]:
df_potential_sarcasm.to_csv('potential_sarcasm_only.csv', index=False)

## Fine-tuned Sarcasm Detection Model

In [23]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [25]:
class FinanceDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        item = {key: val.squeeze() for key, val in inputs.items()}
        item['labels'] = torch.tensor(label)
        return item


In [26]:
#Loading dataset
texts = df['cleaned_text'].tolist()
labels = df['label'].tolist() # Assuming 1 for sarcastic and 0 for non-sarcastic


KeyError: 'label'

In [27]:
# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Prepare the dataset
dataset = FinanceDataset(texts, labels, tokenizer)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

Downloading tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 48.1kB/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'labels' is not defined

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
# Train the model
trainer.train()