In [None]:
# --- 1. Import Libraries & Load Data ---------------------------------------
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Single analyzer instance, reused by the VADER scoring cell further down.
analyzer = SentimentIntensityAnalyzer()

# Fetch the stop-word corpus into the current user's ~/nltk_data instead of a
# machine-specific absolute path, so the notebook also runs on other machines.
# NOTE(review): ~/nltk_data is expected to be on NLTK's default search path.
nltk.download('stopwords', download_dir=str(Path.home() / 'nltk_data'))

# Raw inputs, read relative to the notebook's working directory.
stock_tweets_df = pd.read_csv("stock_tweets.csv")
stock_yfin_df = pd.read_csv("stock_yfinance_data.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/indranili/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# --- 2. Clean and Preprocess Data -----------------------------------------
# Reduce both 'Date' columns to plain calendar dates so the two frames share
# a comparable join key.
stock_tweets_df['Date'] = pd.to_datetime(stock_tweets_df['Date']).dt.date
stock_yfin_df['Date'] = pd.to_datetime(stock_yfin_df['Date']).dt.date

# Upper-case the ticker on both sides so the join does not depend on case.
stock_tweets_df['Stock Name'] = stock_tweets_df['Stock Name'].str.upper()
stock_yfin_df['Stock Name'] = stock_yfin_df['Stock Name'].str.upper()

stock_tweets_df = stock_tweets_df.sort_values(["Stock Name", "Date"])
stock_yfin_df = stock_yfin_df.sort_values(["Stock Name", "Date"])

# Left join: every tweet row is kept, and rows without a matching
# (Date, Stock Name) entry in the yfinance frame get NaN in its columns.
# The merge is performed once; it used to be computed twice with equivalent
# arguments, with the second result simply overwriting the first.
merged_df = pd.merge(
    stock_tweets_df,
    stock_yfin_df,
    on=["Date", "Stock Name"],
    how="left",
)

In [None]:
# --- 3. Clean Tweet Text ---------------------------------------------------
# English stop words, kept in a set for fast membership checks.
stop = set(stopwords.words('english'))


def clean_text(text):
    """Normalise a tweet into lower-case, space-separated alphanumeric tokens.

    Parameters
    ----------
    text : object
        The raw tweet. Anything that is not a ``str`` (for example a missing
        value) produces an empty string.

    Returns
    -------
    str
        The lower-cased text with links, @mentions, dollar signs and every
        other non-alphanumeric character removed, whitespace collapsed, and
        tokens found in ``stop`` dropped.
    """
    if not isinstance(text, str):
        return ""

    # Applied in this order: links and mentions must go before the generic
    # character filter, which would otherwise strip the ':' '/' '@' they need.
    substitutions = (
        (r"http\S+", ""),           # links
        (r"@\w+", ""),              # user mentions
        (r"\$", ""),                # dollar signs (e.g. cashtags)
        (r"[^a-zA-Z0-9\s]", ""),    # anything not alphanumeric or whitespace
        (r"\s+", " "),              # collapse runs of whitespace
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept_words = (word for word in cleaned.strip().split() if word not in stop)
    return " ".join(kept_words)


merged_df["clean_text"] = merged_df['Tweet'].apply(clean_text)

In [None]:
# --- 4. Apply VADER Sentiment ---------------------------------------------
def vader_sentiment(text):
    """Return VADER's compound polarity score for ``text``.

    Parameters
    ----------
    text : object
        Text to score. Anything that is not a ``str`` (for example a missing
        value) is scored as 0.0.

    Returns
    -------
    float
        The ``"compound"`` entry of ``analyzer.polarity_scores(text)``.
    """
    if not isinstance(text, str):
        return 0.0
    return analyzer.polarity_scores(text)["compound"]


# Score the original tweet rather than ``clean_text``. VADER's rules rely on
# negations ("not", "no"), intensifiers, capitalisation and punctuation such
# as "!", all of which the stop-word / lower-case / punctuation cleaning
# removes -- e.g. "not good" would be scored as "good".
# Only links and @mentions are stripped here, since they carry no sentiment
# but can add stray punctuation or lexicon words (a handle like "@happy").
vader_input = merged_df["Tweet"].str.replace(r"http\S+|@\w+", "", regex=True)
merged_df["vader_score"] = vader_input.apply(vader_sentiment)

In [None]:
# --- 5. Convert compound score to discrete sentiment label -----------------
def vader_label(score, threshold=0.05):
    """Map a VADER compound score to a discrete sentiment label.

    The cut-offs are inclusive, following the thresholds recommended in the
    VADER documentation: ``score >= 0.05`` is positive and ``score <= -0.05``
    is negative. Strict comparisons would label scores sitting exactly on a
    boundary as neutral.

    Parameters
    ----------
    score : float
        Compound polarity score.
    threshold : float, optional
        Symmetric cut-off around zero; defaults to 0.05.

    Returns
    -------
    int
        1 for positive, -1 for negative, 0 for neutral. NaN compares false
        against both cut-offs and is therefore labelled 0.
    """
    if score >= threshold:
        return 1
    if score <= -threshold:
        return -1
    return 0


# Translate every compound score into its discrete label, element by element.
merged_df['vader_label'] = merged_df['vader_score'].map(vader_label)

In [None]:
# --- 6. Final Merged Dataset Preview ---------------------------------------
# Show the raw tweet next to its cleaned text and both sentiment outputs.
preview_columns = ["Tweet", "clean_text", "vader_score", "vader_label"]
merged_df[preview_columns].head()


Unnamed: 0,Tweet,clean_text,vader_score,vader_label
0,I bought my first $AAPL stock in 2010. \n\nSin...,bought first aapl stock 2010 since seen 137229...,0.3182,1
1,The media is really pushing hard for their big...,media really pushing hard big money clients ts...,0.1585,1
2,"In 2020, Tim Cook, CEO of $AAPL, earned $265,0...",2020 tim cook ceo aapl earned 265000000 total ...,0.0,0
3,This thread is just a broad overview of the ba...,thread broad overview balance sheet want speci...,0.4767,1
4,"All about this trendline now on $AAPL, continu...",trendline aapl continuing reject,-0.4019,-1
