In [None]:
!pip install opendatasets
!pip install pandas
!pip install nltk emoji contractions vaderSentiment


In [None]:
# Analysis performed using the dataset linked below (credit to theriley106), integrated with the Reddit comments processed in reddit_sentiment.ipynb.

import opendatasets as od
import pandas

# Fetch the WallStreetBets comments dataset from Kaggle.
od.download("https://www.kaggle.com/datasets/theriley106/wallstreetbetscomments")


In [None]:
# Relative path of the WallStreetBets comments JSON file.
input_file = 'wallstreetbetscomments/wsbData.json'


In [None]:
import nltk

# Download required resources
for nltk_resource in (
    'stopwords',
    'punkt',
    'wordnet',
    'omw-1.4',  # Optional: for lemmatization
    'punkt_tab',
):
    nltk.download(nltk_resource)


In [None]:
# Install dependencies
!pip install kagglehub pandas transformers scikit-learn nltk spacy

# Import necessary libraries
import re
import pandas as pd
import json
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import kagglehub


# Download dataset using kagglehub
# NOTE(review): dataset_path is only printed here; the chunked processing loop
# further down in this cell reads `input_file` (set in an earlier cell), not
# this download location — confirm which copy of the data is intended.
dataset_path = kagglehub.dataset_download("theriley106/wallstreetbetscomments")
print(f"Dataset downloaded to: {dataset_path}")

# Paths

# File that receives one JSON record per line for every comment kept below.
output_file = "/content/relevant_wsb_comments.json"

# Initialize resources
# stop_words and lemmatizer are used by advanced_cleaning; nlp is used by
# extract_financial_entities.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")
# NOTE(review): sentiment_model is not referenced anywhere else in the visible
# part of this notebook — confirm it is still needed before paying the load cost.
sentiment_model = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

# Ticker list
# Symbols used both for the mention check and as the TF-IDF reference document.
tickers = [
    "AAPL", "ABBV", "ABT", "ACN", "ADBE", "ADP", "AMAT", "AMD", "AMGN", "AMT", "AMZN", "APH", "AVGO", "AZO", "BA",
    "BAC", "BDX", "BMY", "C", "CB", "CHTR", "CMCSA", "COP", "COST", "CSCO", "CVS", "CVX", "DELL", "DHR", "DIS", "DUK",
    "ED", "GE", "GILD", "GIS", "GOOGL", "HD", "IBM", "INTC", "INTU", "ISRG", "JNJ", "JPM", "KHC", "KO", "KR", "LLY",
    "LMT", "MA", "MDLZ", "MDT", "META", "MMM", "MO", "MRK", "MSFT", "MSI", "NEE", "NEM", "NFLX", "NKE", "NOC", "NVDA",
    "ORCL", "PAYX", "PEP", "PFE", "PG", "PGR", "PM", "PSA", "PYPL", "QCOM", "TMO", "TMUS", "TSLA", "TXN", "UNH", "UPS",
    "V", "VRTX", "VZ", "WCN", "WFC", "WMT", "XOM", "YUM"
]

# Advanced cleaning function
def advanced_cleaning(text):
    """Lowercase, strip and lemmatize a comment for relevance matching.

    Args:
        text: Raw comment text (must be a str).

    Returns:
        str: Space-joined lemmas of the tokens that are not stop words.
    """
    normalized = text.lower()
    # Applied in order: drop URLs, drop everything that is not a letter, digit
    # or whitespace, then squeeze whitespace runs into single spaces.
    substitutions = (
        (r'http\S+', ''),
        (r'[^a-zA-Z0-9\s]', ''),
        (r'(\s+)', ' '),
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)

    kept_lemmas = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)

# Check ticker relevance
def is_ticker_mentioned(text, tickers):
    """Return True when `text` contains any of `tickers` as a standalone token.

    Matching is case-insensitive. A ticker only counts when it is not embedded
    in a longer alphanumeric run, so "C" does not match every comment that
    contains the letter "c" and "MA" does not match "market". Tickers that are
    also ordinary words (e.g. "ed", "pm") can still match when they appear as
    whole words.

    Args:
        text: Comment text to search.
        tickers: Iterable of ticker symbols; empty symbols are ignored.

    Returns:
        bool: True if at least one ticker appears as a whole token.
    """
    symbols = [re.escape(ticker) for ticker in tickers if ticker]
    if not symbols:
        return False
    # Lookarounds instead of \b so the boundary rule is explicit: the character
    # on either side of the symbol must not be a letter or digit.
    pattern = r'(?<![A-Za-z0-9])(?:' + '|'.join(symbols) + r')(?![A-Za-z0-9])'
    return re.search(pattern, text, flags=re.IGNORECASE) is not None

# Extract financial entities
def extract_financial_entities(text):
    """Return the text of every entity labelled ORG or PRODUCT in `text`.

    Args:
        text: Text to run through the module-level spaCy pipeline `nlp`.

    Returns:
        list[str]: Entity strings in the order the pipeline reports them.
    """
    wanted_labels = ("ORG", "PRODUCT")
    entities = []
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels:
            entities.append(entity.text)
    return entities

# Calculate relevance using TF-IDF
def calculate_relevance(text, tickers, vectorizer, tfidf_matrix):
    """Return the highest cosine similarity between `text` and the reference rows.

    Args:
        text: Text to score.
        tickers: Accepted for call compatibility; not used by the computation.
        vectorizer: Fitted vectorizer used to transform `text`.
        tfidf_matrix: Reference matrix the transformed text is compared against.

    Returns:
        The maximum entry of the flattened similarity array.
    """
    return max(
        cosine_similarity(vectorizer.transform([text]), tfidf_matrix).flatten()
    )

# TF-IDF setup
# The vectorizer is fitted on a single document made of all ticker symbols
# joined by spaces, so its vocabulary can only contain tokens from that list.
# NOTE(review): TfidfVectorizer's default token pattern keeps tokens of two or
# more characters, so single-letter tickers such as "C" and "V" presumably never
# enter the vocabulary and cannot contribute to the relevance score — confirm.
ticker_context = ' '.join(tickers)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([ticker_context])

# Process JSON in chunks
chunk_size = 10_000  # Adjust based on memory
# Both files are opened as UTF-8 explicitly. Without it, open() falls back to
# the platform default encoding, which can fail on comments containing emoji or
# other non-ASCII text when the notebook runs outside a UTF-8 locale.
with open(input_file, 'r', encoding='utf-8') as infile, \
        open(output_file, 'w', encoding='utf-8') as outfile:
    for chunk in pd.read_json(infile, lines=True, chunksize=chunk_size):  # Read JSON lines
        relevant_data = []
        for _, row in chunk.iterrows():
            try:
                text = row.get('body', '')  # Replace 'body' with the correct key in your JSON
                # Skip missing, non-string and whitespace-only bodies.
                if not isinstance(text, str) or not text.strip():
                    continue

                cleaned_text = advanced_cleaning(text)
                if is_ticker_mentioned(cleaned_text, tickers):
                    financial_entities = extract_financial_entities(cleaned_text)
                    relevance_score = calculate_relevance(cleaned_text, tickers, vectorizer, tfidf_matrix)

                    if relevance_score > 0.0:  # Adjust threshold as needed
                        relevant_data.append({
                            'Original_Text': text,
                            'Cleaned_Text': cleaned_text,
                            'Financial_Entities': financial_entities,
                            'Relevance_Score': relevance_score,
                        })
            except Exception as e:
                # Best effort: a bad row is reported and skipped so one
                # malformed comment does not abort the whole run.
                print(f"Error processing row: {e}")

        # Write relevant data to output file as JSON lines
        for record in relevant_data:
            outfile.write(json.dumps(record) + '\n')

print(f"Relevant comments saved to {output_file}")


In [None]:
import os
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import string
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import contractions

# ---------------------------- #
# Step 1: Enhanced Text Cleaning
# ---------------------------- #

# Initialize NLTK resources
# Both objects are read by advanced_cleaning below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def remove_emojis(text):
    """Return `text` with every emoji replaced by an empty string.

    NOTE(review): only emoji.replace_emoji is called here, so other non-ASCII
    characters are presumably left in place — confirm if they must be removed.
    """
    return emoji.replace_emoji(text, replace='')

def expand_contractions(text):
    """Expand contractions (e.g. "don't" -> "do not") via contractions.fix."""
    return contractions.fix(text)

def advanced_cleaning(text):
    """Clean a comment body before it is scored with VADER.

    Steps, in order: lowercase, expand contractions, strip URLs, e-mail
    addresses, @mentions/#hashtags, emoji, punctuation and digits, normalize
    whitespace, then tokenize, drop stop words and single-character tokens, and
    lemmatize.

    Negation words ("no", "nor", "not") are kept even though they are in the
    stop-word list: VADER relies on them to flip polarity, and because
    contractions are expanded first ("don't" -> "do not"), dropping them would
    turn "not good" into "good" and score it as positive.

    Args:
        text: Raw comment body. Non-string values (e.g. NaN) are accepted.

    Returns:
        str: The cleaned, space-joined tokens; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Tokens that must survive stop-word removal so negation is still visible
    # to the sentiment scorer.
    negation_words = {'no', 'nor', 'not'}

    # Convert to lowercase
    text = text.lower()

    # Expand contractions
    text = expand_contractions(text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove emojis
    text = remove_emojis(text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize, remove stopwords (except negations), and lemmatize
    words = word_tokenize(text)
    cleaned_words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word in negation_words or (word not in stop_words and len(word) > 1)
    ]

    return ' '.join(cleaned_words)

# ---------------------------- #
# Step 2: VADER Customization
# ---------------------------- #

# Initialize VADER sentiment analyzer
vader = SentimentIntensityAnalyzer()

# Add custom WallStreetBets words to the VADER lexicon.
# Keys are lowercase tokens; values are the valence assigned to each token.
wsb_words = {
    'citron': -4.0,
    # 'hidenburg' was a misspelling, so mentions of "hindenburg" never matched.
    # The correctly spelled key is added; the old key is kept so nothing that
    # relied on it changes.
    'hindenburg': -4.0,
    'hidenburg': -4.0,
    'moon': 4.0,
    'highs': 2.0,
    'mooning': 4.0,
    'long': 2.0,
    'short': -2.0,
    'call': 4.0,
    'calls': 4.0,
    'put': -4.0,
    'puts': -4.0,
    'break': 2.0,
    'tendie': 2.0,
    'tendies': 2.0,
    'town': 2.0,
    'overvalued': -3.0,
    'undervalued': 3.0,
    'buy': 4.0,
    'sell': -4.0,
    'gone': -1.0,
    'gtfo': -1.7,
    'paper': -1.7,
    'bullish': 3.7,
    'bearish': -3.7,
    'bagholder': -1.7,
    'stonk': 1.9,
    'green': 1.9,
    'money': 1.2,
    'print': 2.2,
    'rocket': 2.2,
    'bull': 2.9,
    'bear': -2.9,
    'pumping': -1.0,
    'sus': -3.0,
    'offering': -2.3,
    'rip': -4.0,
    'downgrade': -3.0,
    'upgrade': 3.0,
    'maintain': 1.0,
    'pump': 1.9,
    'hot': 1.5,
    'drop': -2.5,
    'rebound': 1.5,
    'crack': 2.5,
}
# Entries here override any existing lexicon entry with the same key.
vader.lexicon.update(wsb_words)

# ---------------------------- #
# Step 3: Apply Cleaning + Sentiment
# ---------------------------- #

# Input and output directories
input_dir = "/content/drive/MyDrive/FYP/reddit part/json_ticker_csvs"

output_dir = "/content/drive/MyDrive/FYP/reddit part/json_ticker_sentiment_csvs"
os.makedirs(output_dir, exist_ok=True)


def _label_for_score(score):
    """Map a compound score to 'Positive', 'Negative' or 'Neutral'."""
    if score > 0.05:
        return 'Positive'
    if score < -0.05:
        return 'Negative'
    return 'Neutral'


# Score every CSV in the input directory and write it to the output directory
# under the same file name.
for filename in os.listdir(input_dir):
    if not filename.endswith(".csv"):
        continue

    print(f"Processing {filename}...")

    filepath = os.path.join(input_dir, filename)
    df = pd.read_csv(filepath)

    # Files without a 'body' column cannot be scored.
    if 'body' not in df.columns:
        print(f"Skipping {filename}: 'body' column not found.")
        continue

    # Clean the text, score it, then bucket the compound score into a label.
    df['cleaned_body'] = df['body'].apply(advanced_cleaning)
    df['sentiment_score'] = df['cleaned_body'].apply(
        lambda cleaned: vader.polarity_scores(str(cleaned))['compound']
    )
    df['sentiment_label'] = df['sentiment_score'].apply(_label_for_score)

    output_filepath = os.path.join(output_dir, filename)
    df.to_csv(output_filepath, index=False)
    print(f"Finished processing {filename}. Saved to {output_filepath}.")

print(f"Sentiment analysis completed. Results saved in '{output_dir}' directory.")


In [None]:
import pandas as pd

# Load the dataset
file_path = "/content/drive/MyDrive/FYP/json_articles_with_reddit_sentiment_all.csv"
df = pd.read_csv(file_path)

# Coerce both sentiment columns to numeric first (unparseable values become
# NaN), then drop incomplete rows in a single pass. Building the frame with
# assign() avoids assigning columns onto a dropna() result, a pattern that can
# raise SettingWithCopyWarning.
sentiment_columns = ['Reddit_Vader_Sentiment', 'Sentiment_Score']
filtered_df = df.assign(
    **{column: pd.to_numeric(df[column], errors='coerce') for column in sentiment_columns}
).dropna(subset=sentiment_columns)

# Calculate the Pearson correlation coefficient
correlation = filtered_df['Sentiment_Score'].corr(filtered_df['Reddit_Vader_Sentiment'])

print(f"Correlation between Sentiment_Score and Reddit_Vader_Sentiment: {correlation}")


In [None]:
# Integrating in the 50GB dataset we preprocessed in reddit_sentiment.ipynb
import os
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import string
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import contractions

# ---------------------------- #
# Step 1: Enhanced Text Cleaning
# ---------------------------- #

# Initialize NLTK resources
# Both objects are read by advanced_cleaning below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def remove_emojis(text):
    """Return `text` with every emoji replaced by an empty string.

    NOTE(review): only emoji.replace_emoji is called here, so other non-ASCII
    characters are presumably left in place — confirm if they must be removed.
    """
    return emoji.replace_emoji(text, replace='')

def expand_contractions(text):
    """Expand contractions (e.g. "don't" -> "do not") via contractions.fix."""
    return contractions.fix(text)

def advanced_cleaning(text):
    """Clean a comment body before it is scored with VADER.

    Steps, in order: lowercase, expand contractions, strip URLs, e-mail
    addresses, @mentions/#hashtags, emoji, punctuation and digits, normalize
    whitespace, then tokenize, drop stop words and single-character tokens, and
    lemmatize.

    Negation words ("no", "nor", "not") are kept even though they are in the
    stop-word list: VADER relies on them to flip polarity, and because
    contractions are expanded first ("don't" -> "do not"), dropping them would
    turn "not good" into "good" and score it as positive.

    Args:
        text: Raw comment body. Non-string values (e.g. NaN) are accepted.

    Returns:
        str: The cleaned, space-joined tokens; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Tokens that must survive stop-word removal so negation is still visible
    # to the sentiment scorer.
    negation_words = {'no', 'nor', 'not'}

    # Convert to lowercase
    text = text.lower()

    # Expand contractions
    text = expand_contractions(text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove emojis
    text = remove_emojis(text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize, remove stopwords (except negations), and lemmatize
    words = word_tokenize(text)
    cleaned_words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word in negation_words or (word not in stop_words and len(word) > 1)
    ]

    return ' '.join(cleaned_words)

# ---------------------------- #
# Step 2: VADER Customization
# ---------------------------- #

# Initialize VADER sentiment analyzer
vader = SentimentIntensityAnalyzer()

# Add custom WallStreetBets words to the VADER lexicon.
# Keys are lowercase tokens; values are the valence assigned to each token.
wsb_words = {
    'citron': -4.0,
    # 'hidenburg' was a misspelling, so mentions of "hindenburg" never matched.
    # The correctly spelled key is added; the old key is kept so nothing that
    # relied on it changes.
    'hindenburg': -4.0,
    'hidenburg': -4.0,
    'moon': 4.0,
    'highs': 2.0,
    'mooning': 4.0,
    'long': 2.0,
    'short': -2.0,
    'call': 4.0,
    'calls': 4.0,
    'put': -4.0,
    'puts': -4.0,
    'break': 2.0,
    'tendie': 2.0,
    'tendies': 2.0,
    'town': 2.0,
    'overvalued': -3.0,
    'undervalued': 3.0,
    'buy': 4.0,
    'sell': -4.0,
    'gone': -1.0,
    'gtfo': -1.7,
    'paper': -1.7,
    'bullish': 3.7,
    'bearish': -3.7,
    'bagholder': -1.7,
    'stonk': 1.9,
    'green': 1.9,
    'money': 1.2,
    'print': 2.2,
    'rocket': 2.2,
    'bull': 2.9,
    'bear': -2.9,
    'pumping': -1.0,
    'sus': -3.0,
    'offering': -2.3,
    'rip': -4.0,
    'downgrade': -3.0,
    'upgrade': 3.0,
    'maintain': 1.0,
    'pump': 1.9,
    'hot': 1.5,
    'drop': -2.5,
    'rebound': 1.5,
    'crack': 2.5,
}
# Entries here override any existing lexicon entry with the same key.
vader.lexicon.update(wsb_words)

# ---------------------------- #
# Step 3: Apply Cleaning + Sentiment
# ---------------------------- #

# Input and output directories
input_dir = "/content/drive/MyDrive/FYP/reddit part/ticker_csvs"

output_dir = "/content/drive/MyDrive/FYP/reddit part/v2_ticker_sentiment_csvs"
os.makedirs(output_dir, exist_ok=True)


def _label_for_score(score):
    """Map a compound score to 'Positive', 'Negative' or 'Neutral'."""
    if score > 0.05:
        return 'Positive'
    if score < -0.05:
        return 'Negative'
    return 'Neutral'


# Score every CSV in the input directory and write it to the output directory
# under the same file name.
for filename in os.listdir(input_dir):
    if not filename.endswith(".csv"):
        continue

    print(f"Processing {filename}...")

    filepath = os.path.join(input_dir, filename)
    df = pd.read_csv(filepath)

    # Files without a 'body' column cannot be scored.
    if 'body' not in df.columns:
        print(f"Skipping {filename}: 'body' column not found.")
        continue

    # Clean the text, score it, then bucket the compound score into a label.
    df['cleaned_body'] = df['body'].apply(advanced_cleaning)
    df['sentiment_score'] = df['cleaned_body'].apply(
        lambda cleaned: vader.polarity_scores(str(cleaned))['compound']
    )
    df['sentiment_label'] = df['sentiment_score'].apply(_label_for_score)

    output_filepath = os.path.join(output_dir, filename)
    df.to_csv(output_filepath, index=False)
    print(f"Finished processing {filename}. Saved to {output_filepath}.")

print(f"Sentiment analysis completed. Results saved in '{output_dir}' directory.")


In [None]:
import os
import pandas as pd
from tqdm import tqdm

# File paths
articles_file = "/content/drive/MyDrive/FYP/merged_articles_with_reddit_with_options_data.csv"
reddit_sentiment_dir = "/content/drive/MyDrive/FYP/reddit part/v2_ticker_sentiment_csvs"
output_file = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"

# Load the articles dataset
articles_df = pd.read_csv(articles_file)

# Drop the legacy column when present. errors='ignore' keeps the cell runnable
# on inputs that no longer carry it instead of raising KeyError.
articles_df = articles_df.drop(columns='reddit vader sentiment', errors='ignore')

# Ensure the 'Reddit_Vader_Sentiment' column exists. NaN (not '') marks "no
# value yet" so the column stays numeric.
if 'Reddit_Vader_Sentiment' not in articles_df.columns:
    articles_df['Reddit_Vader_Sentiment'] = float('nan')

# Load existing progress if output file exists
if os.path.exists(output_file):
    updated_articles_df = pd.read_csv(output_file)
    # Every saved file holds the rows of ALL tickers, so the set of tickers in
    # the file says nothing about progress. A ticker counts as processed only
    # when at least one of its rows already has a sentiment value. Tickers
    # that produced no matching dates are simply recomputed on resume.
    # NOTE(review): assumes the input articles file does not already ship
    # populated 'Reddit_Vader_Sentiment' values — confirm.
    has_sentiment = updated_articles_df['Reddit_Vader_Sentiment'].notna()
    processed_tickers = set(updated_articles_df.loc[has_sentiment, 'ticker'].unique())
else:
    updated_articles_df = articles_df.copy()
    processed_tickers = set()

# List all Reddit sentiment files
reddit_files = [f for f in os.listdir(reddit_sentiment_dir) if f.endswith(".csv")]


def safe_parse_date(date):
    """Return `date` formatted as dd/mm/YYYY, or None when it cannot be parsed."""
    try:
        return pd.to_datetime(date, format='mixed', dayfirst=True).strftime('%d/%m/%Y')
    except Exception:
        return None


# Process each Reddit CSV
for reddit_file in tqdm(reddit_files, desc="Processing Reddit Sentiment Files"):
    # Extract the ticker from the filename
    ticker = os.path.splitext(reddit_file)[0]

    # Skip already processed tickers
    if ticker in processed_tickers:
        print(f"Skipping already processed ticker: {ticker}")
        continue

    print(f"Processing ticker: {ticker}...")

    # Load the Reddit sentiment data for the ticker
    reddit_df = pd.read_csv(os.path.join(reddit_sentiment_dir, reddit_file))

    # Parse and filter valid dates
    reddit_df['Formatted_Date'] = reddit_df['comment_date'].apply(safe_parse_date)
    reddit_df = reddit_df[reddit_df['Formatted_Date'].notna()]

    # Filter articles for the current ticker. The explicit copy makes the
    # column assignments below write to an independent frame rather than a
    # slice of articles_df.
    ticker_articles_df = articles_df[articles_df['ticker'] == ticker].copy()

    # Convert articles dates to match the same format
    ticker_articles_df['Formatted_Date'] = pd.to_datetime(
        ticker_articles_df['Formatted_Date'], format='mixed', dayfirst=True
    ).dt.strftime('%d/%m/%Y')

    # Find common dates
    common_dates = set(ticker_articles_df['Formatted_Date']).intersection(set(reddit_df['Formatted_Date']))

    # Filter Reddit dataset to keep only rows with common dates
    reddit_common_df = reddit_df[reddit_df['Formatted_Date'].isin(common_dates)]

    # Calculate the average sentiment score for each common date
    average_sentiments = reddit_common_df.groupby('Formatted_Date')['sentiment_score'].mean().reset_index()

    # Map the average sentiment scores to the articles dataset
    ticker_articles_df['Reddit_Vader_Sentiment'] = ticker_articles_df['Formatted_Date'].map(
        average_sentiments.set_index('Formatted_Date')['sentiment_score']
    )

    # Update the main DataFrame. The right-hand Series is aligned on the row
    # index, which both frames inherit from reading the same article rows.
    updated_articles_df.loc[updated_articles_df['ticker'] == ticker, 'Reddit_Vader_Sentiment'] = ticker_articles_df[
        'Reddit_Vader_Sentiment'
    ]

    # Save progress after processing each ticker
    updated_articles_df.to_csv(output_file, index=False)
    print(f"Progress saved for ticker: {ticker}")

print(f"Sentiment analysis completed for all tickers. Final data saved to: {output_file}")


In [None]:
import pandas as pd

# Load the dataset
file_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
df = pd.read_csv(file_path)

# Coerce both sentiment columns to numeric first (unparseable values become
# NaN), then drop incomplete rows in a single pass. Building the frame with
# assign() avoids assigning columns onto a dropna() result, a pattern that can
# raise SettingWithCopyWarning.
sentiment_columns = ['Reddit_Vader_Sentiment', 'Sentiment_Score']
filtered_df = df.assign(
    **{column: pd.to_numeric(df[column], errors='coerce') for column in sentiment_columns}
).dropna(subset=sentiment_columns)

# Calculate the Pearson correlation coefficient
correlation = filtered_df['Sentiment_Score'].corr(filtered_df['Reddit_Vader_Sentiment'])

print(f"Correlation between Sentiment_Score and Reddit_Vader_Sentiment: {correlation}")


In [None]:
import pandas as pd

# Load the dataset
file_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"  # Update with the correct file path
df = pd.read_csv(file_path)

# Columns that take part in the correlation matrix
numeric_columns = [
    'Sentiment_Score',
    'Daily Trading Volume',
    'Monthly Average Volume',
    'Trading Volume',
    '% Spike',
    'Options Volume',
    'Options Average Daily Volume',
    'Options % Spike',
    'Reddit_Vader_Sentiment'
]

# Coerce every listed column to numeric in one step; values that cannot be
# parsed become NaN.
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Pairwise Pearson correlation over the numeric columns
correlation_matrix = df[numeric_columns].corr(method='pearson')

# Display the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)

# Optionally, save the correlation matrix to a CSV file
correlation_matrix.to_csv("correlation_matrix.csv")


In [None]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"  # Update with the correct file path
df = pd.read_csv(file_path)

# Columns related to trading volume
volume_columns = [
    'Daily Trading Volume',
    'Monthly Average Volume',
    'Trading Volume',
    'Options Volume',
    'Options Average Daily Volume'
]

# Relevant numerical columns for correlation
numeric_columns = [
    'Sentiment_Score',
    '% Spike',
    'Options % Spike',
    'Reddit_Vader_Sentiment'
] + volume_columns

# Convert columns to numeric, handling non-numeric values
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Create new columns with log-transformed trading volume data.
# log1p(0) is 0, so zero volumes are safe. Negative values are not valid
# volumes and log1p would turn them into -inf/NaN (values <= -1) or misleading
# negative logs, so they are masked to NaN first and drop out of the
# correlation instead of corrupting it.
for col in volume_columns:
    non_negative_volume = df[col].where(df[col] >= 0)
    df[f'Log_{col}'] = np.log1p(non_negative_volume)

# Add the log-transformed columns to the list for correlation
log_columns = [f'Log_{col}' for col in volume_columns]
all_numeric_columns = numeric_columns + log_columns

# Compute the correlation matrix
correlation_matrix = df[all_numeric_columns].corr(method='pearson')

# Display the correlation matrix
print("Correlation Matrix with Log-Transformed Volume Columns:")
print(correlation_matrix)

# Optionally, save the correlation matrix to a CSV file
correlation_matrix.to_csv("correlation_matrix_with_logs.csv")


Topic Modelling with LDA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK resources
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the dataset
file_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
df = pd.read_csv(file_path)

# Parse dates and coerce sentiment to numeric (unparseable scores become NaN)
df['Date'] = pd.to_datetime(df['Date'])
df['Sentiment_Score'] = pd.to_numeric(df['Sentiment_Score'], errors='coerce')

# Rows without a sentiment score cannot be aggregated; remove them in place
df.dropna(subset=['Sentiment_Score'], inplace=True)

# ----------------------------
# 1. Temporal Sentiment Aggregation
# ----------------------------

def aggregate_sentiment(df, freq):
    """Mean sentiment per ticker and time bucket.

    Args:
        df: Frame with 'ticker', a datetime 'Date' and 'Sentiment_Score'.
        freq: pandas offset alias for the bucket size (D=daily, W=weekly,
            M=monthly).

    Returns:
        DataFrame with columns 'ticker', 'Date' and the mean 'Sentiment_Score'.
    """
    group_keys = ['ticker', pd.Grouper(key='Date', freq=freq)]
    mean_scores = df.groupby(group_keys)['Sentiment_Score'].mean()
    return mean_scores.reset_index()

# One aggregate per granularity: daily, weekly, monthly.
daily_sentiment, weekly_sentiment, monthly_sentiment = (
    aggregate_sentiment(df, freq) for freq in ('D', 'W', 'M')
)

# Plot sentiment trend for a specific company (e.g., AAPL)
def plot_sentiment_trend(agg_df, ticker, freq):
    """Plot the aggregated sentiment of one ticker over time.

    Args:
        agg_df: Aggregated frame with 'ticker', 'Date' and 'Sentiment_Score'.
        ticker: Ticker symbol whose rows are plotted.
        freq: Label for the aggregation level, used in the title.
    """
    ticker_rows = agg_df.loc[agg_df['ticker'] == ticker]
    dates = ticker_rows['Date']
    scores = ticker_rows['Sentiment_Score']

    plt.figure(figsize=(12, 6))
    plt.plot(dates, scores, marker='o')
    plt.title(f'{ticker} {freq} Sentiment Trend')
    plt.xlabel('Date')
    plt.ylabel('Average Sentiment')
    plt.grid(True)
    plt.show()

# Show the daily trend for AAPL as an example.
plot_sentiment_trend(daily_sentiment, 'AAPL', 'Daily')

# ----------------------------
# 2. Topic Modeling with LDA
# ----------------------------

# Text Preprocessing
# stop_words and lemmatizer are read by clean_text below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Lowercase `text`, keep purely alphabetic non-stop-word tokens, lemmatize.

    Tokens come from whitespace splitting, so a word with punctuation attached
    (e.g. "earnings,") fails the isalpha() check and is dropped entirely.

    Args:
        text: Title text (str).

    Returns:
        str: Space-joined lemmas of the kept tokens.
    """
    lemmas = []
    for word in text.lower().split():
        if not word.isalpha() or word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

# Clean every title (cast to str first so missing titles do not break cleaning)
titles_as_text = df['Title'].astype(str)
df['cleaned_Title'] = titles_as_text.apply(clean_text)

# Bag-of-words counts for the cleaned titles
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
doc_term_matrix = vectorizer.fit_transform(df['cleaned_Title'])

# Fit a 5-topic LDA model with a fixed seed
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(doc_term_matrix)

# Display top words for each topic
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic, strongest first.

    Args:
        model: Fitted topic model exposing `components_` (one weight row per
            topic).
        feature_names: Sequence mapping a column index to its word.
        no_top_words: How many words to print per topic; values <= 0 print an
            empty word line.
    """
    word_count = max(no_top_words, 0)
    for idx, topic in enumerate(model.components_):
        # argsort() is ascending, so reverse it before taking the first N.
        # Slicing with [-N:] listed the words weakest-first and, for N == 0,
        # returned the whole vocabulary.
        top_indices = topic.argsort()[::-1][:word_count]
        print(f"Topic {idx}:")
        print(" ".join(feature_names[i] for i in top_indices))

display_topics(lda_model, vectorizer.get_feature_names_out(), 10)

# Assign dominant topic to each article (index of the largest topic weight)
topic_results = lda_model.transform(doc_term_matrix)
df['dominant_topic'] = topic_results.argmax(axis=1)

# ----------------------------
# 3. Combining Sentiment and Topics
# ----------------------------

# Mean sentiment of the articles assigned to each topic
topic_sentiment = df.groupby('dominant_topic')['Sentiment_Score'].mean().reset_index()
print("\nAverage Sentiment by Topic:\n", topic_sentiment)

# ----------------------------
# 4. Export Results
# ----------------------------

for export_frame, export_path in (
    (daily_sentiment, "/content/drive/MyDrive/FYP/daily_sentiment.csv"),
    (weekly_sentiment, "/content/drive/MyDrive/FYP/weekly_sentiment.csv"),
    (monthly_sentiment, "/content/drive/MyDrive/FYP/monthly_sentiment.csv"),
    (df, "/content/drive/MyDrive/FYP/articles_with_topics.csv"),
):
    export_frame.to_csv(export_path, index=False)

print("Aggregation and topic modeling completed. Files saved.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the sentiment data.
# 'Date' is parsed as datetime at read time: a CSV round-trip turns it into
# plain strings, and plotting strings gives an unordered categorical x-axis
# instead of a real time axis.
daily_sentiment = pd.read_csv("/content/drive/MyDrive/FYP/daily_sentiment.csv", parse_dates=['Date'])
weekly_sentiment = pd.read_csv("/content/drive/MyDrive/FYP/weekly_sentiment.csv", parse_dates=['Date'])
monthly_sentiment = pd.read_csv("/content/drive/MyDrive/FYP/monthly_sentiment.csv", parse_dates=['Date'])


# Plotting function for sentiment trends
def plot_sentiment_trend(agg_df, ticker, freq):
    """Plot the aggregated sentiment of one ticker over time, with a legend.

    Args:
        agg_df: Aggregated frame with 'ticker', 'Date' and 'Sentiment_Score'.
        ticker: Ticker symbol whose rows are plotted.
        freq: Label for the aggregation level, used in the title and legend.
    """
    plt.figure(figsize=(12, 6))
    ticker_rows = agg_df.loc[agg_df['ticker'] == ticker]
    plt.plot(
        ticker_rows['Date'],
        ticker_rows['Sentiment_Score'],
        marker='o',
        label=f'{freq} Sentiment',
    )
    plt.title(f'{ticker} {freq} Sentiment Trend')
    plt.xlabel('Date')
    plt.ylabel('Average Sentiment')
    plt.grid(True)
    plt.legend()
    plt.show()

# Plot for a specific company (e.g., AAPL)
ticker = 'AAPL'  # Replace with any ticker symbol you'd like to analyze

# One figure per aggregation level: daily, weekly, then monthly
for trend_frame, trend_label in (
    (daily_sentiment, 'Daily'),
    (weekly_sentiment, 'Weekly'),
    (monthly_sentiment, 'Monthly'),
):
    plot_sentiment_trend(trend_frame, ticker, trend_label)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

# -------------------------------------
# 1. Load the Dataset
# -------------------------------------
file_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
df = pd.read_csv(file_path)

# Ensure proper datetime format and numeric types.
# The cell that builds this file parses 'Formatted_Date' with format='mixed'
# and dayfirst=True; the same convention is used here so that day-first dates
# are not read month-first or coerced to NaT.
# NOTE(review): assumes the stored dates are day-first, as that cell treats
# them — confirm against the CSV.
df['Formatted_Date'] = pd.to_datetime(
    df['Formatted_Date'], format='mixed', dayfirst=True, errors='coerce'
)
df['Options Average Daily Volume'] = pd.to_numeric(df['Options Average Daily Volume'], errors='coerce')
df['Sentiment_Score'] = pd.to_numeric(df['Sentiment_Score'], errors='coerce')

# Drop rows with missing values in key columns
df.dropna(subset=['Sentiment_Score', 'Options Average Daily Volume'], inplace=True)

# -------------------------------------
# 2. Aggregate Sentiment and Options Volume (Monthly & Yearly)
# -------------------------------------

def _mean_by_period(column, freq):
    """Mean of `column` per (ticker, period) from the module-level `df`."""
    keys = ['ticker', pd.Grouper(key='Formatted_Date', freq=freq)]
    return df.groupby(keys)[column].mean().reset_index()


# Monthly Aggregation
monthly_sentiment = _mean_by_period('Sentiment_Score', 'M')
monthly_options_volume = _mean_by_period('Options Average Daily Volume', 'M')

# Yearly Aggregation
yearly_sentiment = _mean_by_period('Sentiment_Score', 'Y')
yearly_options_volume = _mean_by_period('Options Average Daily Volume', 'Y')

# -------------------------------------
# 3. Merge Sentiment with Options Volume
# -------------------------------------

merge_keys = ['ticker', 'Formatted_Date']

# Monthly Merge
monthly_merged = monthly_sentiment.merge(
    monthly_options_volume, on=merge_keys, suffixes=('_sentiment', '_options')
)

# Yearly Merge
yearly_merged = yearly_sentiment.merge(
    yearly_options_volume, on=merge_keys, suffixes=('_sentiment', '_options')
)

# -------------------------------------
# 4. Correlation Analysis
# -------------------------------------

def correlation_analysis(merged_df, level='Monthly'):
    """Print the Pearson correlation between sentiment and options volume.

    Rows with a missing value in either column are ignored. When fewer than
    two complete rows remain, pearsonr cannot be evaluated, so a notice is
    printed instead of raising.

    Args:
        merged_df: Frame with 'Sentiment_Score' and
            'Options Average Daily Volume' columns.
        level: Label for the aggregation level, used in the printed header.
    """
    columns = ['Sentiment_Score', 'Options Average Daily Volume']
    clean_df = merged_df.dropna(subset=columns)

    # pearsonr needs at least two observations.
    if len(clean_df) < 2:
        print(f"\n⚠️ {level} Correlation cannot be computed due to insufficient valid data.")
        return

    # Pearson Correlation
    correlation, p_value = pearsonr(clean_df[columns[0]], clean_df[columns[1]])

    print(f"\n📈 {level} Correlation between Sentiment Score and Options Average Daily Volume:")
    print(f"Pearson Correlation Coefficient: {correlation:.4f}")
    print(f"P-value: {p_value:.4f}")

# Monthly correlation first, then yearly
for corr_level, corr_frame in (('Monthly', monthly_merged), ('Yearly', yearly_merged)):
    correlation_analysis(corr_frame, corr_level)

# -------------------------------------
# 5. Visualization of the Relationships
# -------------------------------------

def plot_correlation(merged_df, level='Monthly'):
    """Scatter sentiment against options volume with a fitted regression line.

    Args:
        merged_df: Frame with 'Sentiment_Score' and
            'Options Average Daily Volume' columns.
        level: Label for the aggregation level, used in the title.
    """
    volume_column = 'Options Average Daily Volume'

    plt.figure(figsize=(8, 6))
    sns.regplot(
        data=merged_df,
        x='Sentiment_Score',
        y=volume_column,
        scatter_kws=dict(alpha=0.5),
        line_kws=dict(color='red'),
    )
    plt.title(f'{level} Sentiment Score vs. Options Average Daily Volume')
    plt.xlabel('Average Sentiment Score')
    plt.ylabel(volume_column)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Monthly plot first, then yearly
for plot_level, plot_frame in (('Monthly', monthly_merged), ('Yearly', yearly_merged)):
    plot_correlation(plot_frame, plot_level)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import numpy as np

# -------------------------------------
# 1. Load the Dataset
# -------------------------------------
file_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
df = pd.read_csv(file_path)

# Ensure proper datetime format and numeric types.
# The cell that builds this file parses 'Formatted_Date' with format='mixed'
# and dayfirst=True; the same convention is used here so that day-first dates
# are not read month-first or coerced to NaT.
# NOTE(review): assumes the stored dates are day-first, as that cell treats
# them — confirm against the CSV.
df['Formatted_Date'] = pd.to_datetime(
    df['Formatted_Date'], format='mixed', dayfirst=True, errors='coerce'
)
df['Sentiment_Score'] = pd.to_numeric(df['Sentiment_Score'], errors='coerce')
df['Reddit_Vader_Sentiment'] = pd.to_numeric(df['Reddit_Vader_Sentiment'], errors='coerce')

# Drop rows with missing values in key columns
df.dropna(subset=['Sentiment_Score', 'Reddit_Vader_Sentiment'], inplace=True)

# -------------------------------------
# 2. Filter Data for AAPL Only
# -------------------------------------
aapl_df = df[df['ticker'] == 'AAPL']

# -------------------------------------
# 3. Aggregate Sentiment and Reddit Vader Sentiment (Monthly & Yearly)
# -------------------------------------


def _aapl_period_mean(column, freq):
    """Mean of `column` per period of 'Formatted_Date' in the AAPL rows."""
    by_period = aapl_df.groupby(pd.Grouper(key='Formatted_Date', freq=freq))
    return by_period[column].mean().reset_index()


# Monthly Aggregation for AAPL
monthly_sentiment_aapl = _aapl_period_mean('Sentiment_Score', 'M')
monthly_reddit_aapl = _aapl_period_mean('Reddit_Vader_Sentiment', 'M')

# Yearly Aggregation for AAPL
yearly_sentiment_aapl = _aapl_period_mean('Sentiment_Score', 'Y')
yearly_reddit_aapl = _aapl_period_mean('Reddit_Vader_Sentiment', 'Y')

# -------------------------------------
# 4. Merge Sentiment with Reddit Vader Sentiment
# -------------------------------------

# Monthly Merge
monthly_merged_aapl = monthly_sentiment_aapl.merge(
    monthly_reddit_aapl, on='Formatted_Date', suffixes=('_news', '_reddit')
)

# Yearly Merge
yearly_merged_aapl = yearly_sentiment_aapl.merge(
    yearly_reddit_aapl, on='Formatted_Date', suffixes=('_news', '_reddit')
)

# Debug: Print column names to verify suffixes
print("Monthly Merged Columns:", monthly_merged_aapl.columns)
print("Yearly Merged Columns:", yearly_merged_aapl.columns)

# -------------------------------------
# 5. Correlation Analysis (with NaN/Inf Removal)
# -------------------------------------

def correlation_analysis(merged_df, level='Monthly'):
    """Print the Pearson correlation between news and Reddit sentiment for AAPL.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame holding one column whose name contains 'Sentiment_Score' and one
        whose name contains 'Reddit_Vader_Sentiment' (the first match of each
        is used).
    level : str, default 'Monthly'
        Aggregation label used only in the printed output.

    Returns
    -------
    None
        The coefficient and p-value are printed, not returned.

    Raises
    ------
    IndexError
        If either expected column is absent from ``merged_df``.
    """
    # Identify correct column names dynamically (they may carry merge suffixes)
    sentiment_col = [col for col in merged_df.columns if 'Sentiment_Score' in col][0]
    reddit_col = [col for col in merged_df.columns if 'Reddit_Vader_Sentiment' in col][0]

    # Remove NaN and Infinite values
    clean_df = merged_df.replace([np.inf, -np.inf], np.nan).dropna(subset=[sentiment_col, reddit_col])

    # pearsonr needs at least two observations; a single surviving row (easy to
    # hit at the yearly level) would otherwise raise ValueError.
    if len(clean_df) < 2:
        print(f"\n⚠️ {level} Correlation cannot be computed due to insufficient valid data.")
        return

    # Pearson Correlation
    correlation, p_value = pearsonr(clean_df[sentiment_col], clean_df[reddit_col])
    print(f"\n📈 {level} Correlation between News Sentiment and Reddit Vader Sentiment for AAPL:")
    print(f"Pearson Correlation Coefficient: {correlation:.4f}")
    print(f"P-value: {p_value:.4f}")

# Report the AAPL correlation at each aggregation level (monthly, then yearly)
for level_label, merged_frame in (
    ('Monthly', monthly_merged_aapl),
    ('Yearly', yearly_merged_aapl),
):
    correlation_analysis(merged_frame, level_label)

# -------------------------------------
# 6. Visualization of the Relationships
# -------------------------------------

def plot_correlation(merged_df, level='Monthly'):
    """Draw a regression scatter of news sentiment against Reddit sentiment.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame with one column whose name contains 'Sentiment_Score' and one
        whose name contains 'Reddit_Vader_Sentiment'.
    level : str, default 'Monthly'
        Aggregation label used in the title and in the warning message.

    Returns
    -------
    None
        Shows the figure, or prints a warning when no finite rows remain.
    """
    # Resolve the two column names (first substring match of each)
    column_names = list(merged_df.columns)
    sentiment_col = [name for name in column_names if 'Sentiment_Score' in name][0]
    reddit_col = [name for name in column_names if 'Reddit_Vader_Sentiment' in name][0]

    # Treat +/-inf as missing, then keep only rows where both values exist
    finite_df = merged_df.replace([np.inf, -np.inf], np.nan)
    clean_df = finite_df.dropna(subset=[sentiment_col, reddit_col])

    if clean_df.empty:
        print(f"\n⚠️ {level} plot cannot be generated due to insufficient valid data.")
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.regplot(
        x=sentiment_col,
        y=reddit_col,
        data=clean_df,
        scatter_kws={'alpha': 0.5},
        line_kws={'color': 'red'},
        ax=ax,
    )
    ax.set_title(f'{level} News Sentiment vs. Reddit Vader Sentiment for AAPL')
    ax.set_xlabel('Average News Sentiment Score')
    ax.set_ylabel('Average Reddit Vader Sentiment')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Plot the AAPL relationship at each aggregation level (monthly, then yearly)
for level_label, merged_frame in (
    ('Monthly', monthly_merged_aapl),
    ('Yearly', yearly_merged_aapl),
):
    plot_correlation(merged_frame, level_label)


Generalising the correlation analysis to all tickers, after processing just Apple (AAPL) above

In [None]:
import pandas as pd
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib.pyplot as plt

# Read the combined articles / Reddit sentiment table
file_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
df = pd.read_csv(file_path)

# Coerce types; values that cannot be parsed become NaT/NaN
df['Formatted_Date'] = pd.to_datetime(df['Formatted_Date'], errors='coerce')
for numeric_column in (
    'Sentiment_Score',
    'Reddit_Vader_Sentiment',
    'Daily Trading Volume',
    'Options Volume',
    'Options Average Daily Volume',
    'Options % Spike',
):
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

# Keep only fully populated rows.
# NOTE(review): this drops a row when ANY column is missing, not just the
# columns converted above - confirm that is intended.
df = df.dropna()

# ---------------------------
# 1. Aggregate Data (Monthly & Yearly)
# ---------------------------

def aggregate_data(df, freq):
    """Aggregate sentiment and volume columns per ticker and time period.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ticker', a datetime 'Formatted_Date', and the six
        value columns listed below.
    freq : str
        pandas offset alias passed to ``pd.Grouper`` (e.g. 'M' or 'Y').

    Returns
    -------
    pandas.DataFrame
        One row per (ticker, period) with 'ticker' and 'Formatted_Date' as
        ordinary columns; sentiment and ratio columns are averaged, raw
        volume columns are summed.
    """
    # Insertion order fixes the column order of the result
    how_to_aggregate = dict([
        ('Sentiment_Score', 'mean'),
        ('Reddit_Vader_Sentiment', 'mean'),
        ('Daily Trading Volume', 'sum'),
        ('Options Volume', 'sum'),
        ('Options Average Daily Volume', 'mean'),
        ('Options % Spike', 'mean'),
    ])
    period_key = pd.Grouper(key='Formatted_Date', freq=freq)
    per_ticker_period = df.groupby(['ticker', period_key])
    return per_ticker_period.agg(how_to_aggregate).reset_index()

# Month-end ('M') and year-end ('Y') aggregates, built in that order
monthly_agg, yearly_agg = [aggregate_data(df, period_alias) for period_alias in ('M', 'Y')]

# ---------------------------
# 2. Correlation Analysis
# ---------------------------

def correlation_analysis(agg_df, level):
    """Print Pearson correlations between sentiment and options-activity columns.

    Parameters
    ----------
    agg_df : pandas.DataFrame
        Aggregated frame containing 'Sentiment_Score',
        'Reddit_Vader_Sentiment', 'Options Volume',
        'Options Average Daily Volume' and 'Options % Spike'.
    level : str
        Aggregation label used in the printed header.

    Returns
    -------
    None
        One line is printed per column pair.
    """
    correlation_pairs = [
        ('Sentiment_Score', 'Options Volume'),
        ('Sentiment_Score', 'Options Average Daily Volume'),
        ('Sentiment_Score', 'Options % Spike'),

        ('Reddit_Vader_Sentiment', 'Options Volume'),
        ('Reddit_Vader_Sentiment', 'Options Average Daily Volume'),
        ('Reddit_Vader_Sentiment', 'Options % Spike'),

        ('Sentiment_Score', 'Reddit_Vader_Sentiment')
    ]

    print(f"\n📈 {level} Correlation Analysis:\n" + "-"*40)

    for x, y in correlation_pairs:
        # NaNs are dropped per pair so one sparse column does not shrink the others
        valid_data = agg_df[[x, y]].dropna()

        # pearsonr raises ValueError for fewer than two observations, so a
        # single remaining row has to be reported as insufficient as well.
        if len(valid_data) < 2:
            print(f"⚠ Not enough data for {x} and {y}")
            continue

        correlation, p_value = pearsonr(valid_data[x], valid_data[y])
        print(f"➡ Correlation between {x} and {y}: {correlation:.4f} (P-value: {p_value:.4f})")

# Report correlations at each aggregation level (monthly, then yearly)
for level_label, aggregated in (('Monthly', monthly_agg), ('Yearly', yearly_agg)):
    correlation_analysis(aggregated, level_label)

# ---------------------------
# 3. Visualization
# ---------------------------

def plot_correlation(agg_df, x, y, level):
    """Show a regression scatter of column `y` against column `x`.

    Parameters
    ----------
    agg_df : pandas.DataFrame
        Aggregated frame holding both columns.
    x, y : str
        Column names for the horizontal and vertical axes; they are also
        used as the axis labels.
    level : str
        Aggregation label shown in the title.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.regplot(
        x=x,
        y=y,
        data=agg_df,
        scatter_kws={'alpha': 0.5},
        line_kws={'color': 'red'},
        ax=ax,
    )
    ax.set_title(f'{level} Correlation: {x} vs {y}')
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Two illustrative plots:
#   monthly news sentiment vs options volume,
#   yearly Reddit sentiment vs options % spike.
for aggregated, x_column, y_column, level_label in (
    (monthly_agg, 'Sentiment_Score', 'Options Volume', 'Monthly'),
    (yearly_agg, 'Reddit_Vader_Sentiment', 'Options % Spike', 'Yearly'),
):
    plot_correlation(aggregated, x_column, y_column, level_label)


Exploring lagged analysis

In [None]:
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 1. Load Datasets
# -------------------------------

# News articles with sentiment scores
articles_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
articles_df = pd.read_csv(articles_path)

# Missing dates with trading data
missing_data_path = "/content/drive/MyDrive/FYP/missing_dates_merged_articles_with_options_data.csv"
missing_df = pd.read_csv(missing_data_path)

# -------------------------------
# 2. Data Cleaning
# -------------------------------

# Parse the date columns (unparseable values become NaT)
articles_df['Formatted_Date'] = pd.to_datetime(articles_df['Formatted_Date'], errors='coerce')
missing_df['Date'] = pd.to_datetime(missing_df['Date'], errors='coerce')

# Coerce the value columns to numbers (unparseable values become NaN)
for frame, value_columns in (
    (articles_df, ('Sentiment_Score', 'Reddit_Vader_Sentiment')),
    (missing_df, ('Options Volume', 'Options Average Daily Volume', 'Options % Spike')),
):
    for value_column in value_columns:
        frame[value_column] = pd.to_numeric(frame[value_column], errors='coerce')

# Restrict both tables to AAPL
articles_aapl = articles_df.loc[articles_df['ticker'] == 'AAPL']
missing_aapl = missing_df.loc[missing_df['ticker'] == 'AAPL']

# -------------------------------
# 3. Aggregate Sentiment by Date
# -------------------------------

# Mean news sentiment per article date, with columns renamed for the merge
daily_sentiment = (
    articles_aapl
    .groupby('Formatted_Date')['Sentiment_Score']
    .mean()
    .reset_index()
    .rename(columns={'Formatted_Date': 'Date', 'Sentiment_Score': 'Avg_Sentiment'})
)

# -------------------------------
# 4. Lag Sentiment by One Day
# -------------------------------

# Each date receives the sentiment of the previous row.
# NOTE(review): shift(1) is positional, so the "previous day" is the previous
# date that has articles, which may be more than one calendar day earlier.
daily_sentiment['Lagged_Sentiment'] = daily_sentiment['Avg_Sentiment'].shift(1)

# -------------------------------
# 5. Merge Sentiment with Trading Data
# -------------------------------

# Attach the (lagged) sentiment to every trading-data row for AAPL
merged_df = missing_aapl.merge(daily_sentiment, how='left', on='Date')

# Keep rows where the lagged sentiment and both listed option metrics exist
merged_df = merged_df.dropna(subset=['Lagged_Sentiment', 'Options Volume', 'Options % Spike'])

# -------------------------------
# 6. Correlation Analysis
# -------------------------------

def compute_correlation(x, y, label_x, label_y, data=None):
    """Print and return the Pearson correlation between two columns.

    Parameters
    ----------
    x, y : str
        Column names to correlate.
    label_x, label_y : str
        Human-readable names used in the printed message.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level
        ``merged_df`` so existing four-argument calls keep working.

    Returns
    -------
    float
        The correlation coefficient, or NaN when fewer than two complete
        observations are available.
    """
    frame = merged_df if data is None else data

    # Drop NaNs for this pair only: merged_df is not guaranteed to be NaN-free
    # in every column that gets correlated (e.g. 'Options Average Daily Volume').
    valid = frame[[x, y]].dropna()

    # pearsonr raises ValueError for fewer than two observations
    if len(valid) < 2:
        print(f"⚠️ Not enough data to compute correlation between {label_x} and {label_y}.")
        return float('nan')

    corr, p_value = pearsonr(valid[x], valid[y])
    print(f"📊 Correlation between {label_x} and {label_y}: {corr:.4f} (P-value: {p_value:.4f})")
    return corr

# Correlate the lagged sentiment with each options-activity measure; the
# column name doubles as its printed label.
for activity_column in ('Options Volume', 'Options Average Daily Volume', 'Options % Spike'):
    compute_correlation('Lagged_Sentiment', activity_column, 'Lagged Sentiment', activity_column)

# -------------------------------
# 7. Visualization
# -------------------------------

def plot_lagged_correlation(x, y, title):
    """Show a regression scatter of two columns of the global ``merged_df``.

    Parameters
    ----------
    x, y : str
        Column names for the horizontal and vertical axes. The x-axis label
        is fixed; the y-axis is labelled with `y`.
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.regplot(
        x=x,
        y=y,
        data=merged_df,
        scatter_kws={'alpha': 0.5},
        line_kws={'color': 'red'},
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel('Lagged Sentiment Score')
    ax.set_ylabel(y)
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Plot lagged sentiment against options volume and against options % spike
for activity_column in ('Options Volume', 'Options % Spike'):
    plot_lagged_correlation('Lagged_Sentiment', activity_column, f'Lagged Sentiment vs. {activity_column}')


In [None]:
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 1. Load Datasets
# -------------------------------
articles_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
missing_data_path = "/content/drive/MyDrive/FYP/missing_dates_merged_articles_with_options_data.csv"

# Load datasets
articles_df = pd.read_csv(articles_path)
missing_df = pd.read_csv(missing_data_path)

# -------------------------------
# 2. Data Cleaning
# -------------------------------

# Convert dates to datetime format (unparseable values become NaT)
articles_df['Formatted_Date'] = pd.to_datetime(articles_df['Formatted_Date'], errors='coerce')
missing_df['Date'] = pd.to_datetime(missing_df['Date'], errors='coerce')

# Convert relevant columns to numeric (unparseable values become NaN)
articles_df['Sentiment_Score'] = pd.to_numeric(articles_df['Sentiment_Score'], errors='coerce')
articles_df['Options Volume'] = pd.to_numeric(articles_df['Options Volume'], errors='coerce')
missing_df['Options Volume'] = pd.to_numeric(missing_df['Options Volume'], errors='coerce')

# Filter for AAPL
articles_aapl = articles_df[articles_df['ticker'] == 'AAPL']
missing_aapl = missing_df[missing_df['ticker'] == 'AAPL']

# -------------------------------
# 3. Merge Articles and Missing Data
# -------------------------------

# Aggregate daily sentiment scores
articles_agg = (
    articles_aapl.groupby('Formatted_Date')['Sentiment_Score'].mean().reset_index()
    .rename(columns={'Formatted_Date': 'Date', 'Sentiment_Score': 'Avg_Sentiment'})
)

# Outer merge keeps every date present in either table.
# NOTE(review): 'Options Volume' is taken only from missing_df here; the column
# of the same name in articles_df is converted above but never merged - confirm.
combined_df = pd.merge(missing_aapl[['Date', 'Options Volume']], articles_agg, how='outer', on='Date')

# Assign the filled columns back explicitly. Calling fillna(..., inplace=True)
# on a column selected with [] is chained assignment: pandas warns about it and,
# with Copy-on-Write enabled, the parent frame is left unchanged.
combined_df['Options Volume'] = combined_df['Options Volume'].fillna(0)  # missing options volume -> 0
combined_df['Avg_Sentiment'] = combined_df['Avg_Sentiment'].fillna(0)    # missing sentiment -> 0

combined_df = combined_df.sort_values('Date')

# -------------------------------
# 4. Create Lagged Options Volume Columns
# -------------------------------

# shift(-k) pulls the value found k rows later in the date-sorted frame.
# NOTE(review): this equals "+k days" only if the frame has exactly one row per
# consecutive day - verify against the data.
combined_df['Options_Volume_Lag_0'] = combined_df['Options Volume']
combined_df['Options_Volume_Lag_1'] = combined_df['Options Volume'].shift(-1)
combined_df['Options_Volume_Lag_3'] = combined_df['Options Volume'].shift(-3)
combined_df['Options_Volume_Lag_5'] = combined_df['Options Volume'].shift(-5)

# -------------------------------
# 5. Correlation Analysis
# -------------------------------

def compute_correlation(x, y, label_x, label_y):
    """Print the Pearson correlation of two columns of the global ``combined_df``.

    Parameters
    ----------
    x, y : str
        Column names to correlate; rows with a NaN in either are ignored.
    label_x, label_y : str
        Human-readable names used in the printed messages.

    Returns
    -------
    None
        Results, or a skip notice when fewer than two rows remain, are printed.
    """
    pair = combined_df[[x, y]].dropna()
    n_valid = len(pair)

    # Debug: report how many complete observations are available
    print(f"\n🔍 Valid data points for {label_x} vs {label_y}: {n_valid}")

    if n_valid >= 2:
        correlation, p_value = pearsonr(pair[x], pair[y])

        print(f"\n📊 Correlation between {label_x} and {label_y}:")
        print(f"Pearson Correlation Coefficient: {correlation:.4f}")
        print(f"P-value: {p_value:.4f}")
    else:
        print(f"⚠️ Not enough data to compute correlation between {label_x} and {label_y}. Skipping...\n")

# Correlate sentiment with options volume at each forward offset
for lag_column, lag_label in (
    ('Options_Volume_Lag_0', 'Options Volume (Same Day)'),
    ('Options_Volume_Lag_1', 'Options Volume (+1 Day)'),
    ('Options_Volume_Lag_3', 'Options Volume (+3 Days)'),
    ('Options_Volume_Lag_5', 'Options Volume (+5 Days)'),
):
    compute_correlation('Avg_Sentiment', lag_column, 'Sentiment', lag_label)

# -------------------------------
# 6. Visualization of Relationships
# -------------------------------

def plot_correlation(x, y, title):
    """Show a regression scatter of two columns of the global ``combined_df``.

    Parameters
    ----------
    x, y : str
        Column names for the horizontal and vertical axes (the axis labels
        themselves are fixed text).
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    regression_style = {
        'scatter_kws': {'alpha': 0.5},
        'line_kws': {'color': 'red'},
    }
    sns.regplot(x=x, y=y, data=combined_df, ax=ax, **regression_style)
    ax.set_title(title)
    ax.set_xlabel('News Sentiment')
    ax.set_ylabel('Options Trading Volume')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# One plot per forward offset
for lag_column, horizon in (
    ('Options_Volume_Lag_0', 'Same Day'),
    ('Options_Volume_Lag_1', '+1 Day'),
    ('Options_Volume_Lag_3', '+3 Days'),
    ('Options_Volume_Lag_5', '+5 Days'),
):
    plot_correlation('Avg_Sentiment', lag_column, f'Sentiment vs. Options Volume ({horizon})')


In [None]:
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 1. Load Datasets
# -------------------------------
articles_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
missing_data_path = "/content/drive/MyDrive/FYP/missing_dates_merged_articles_with_options_data.csv"

# Load datasets
articles_df = pd.read_csv(articles_path)
missing_df = pd.read_csv(missing_data_path)

# -------------------------------
# 2. Data Cleaning
# -------------------------------

# Convert dates to datetime format (unparseable values become NaT)
articles_df['Formatted_Date'] = pd.to_datetime(articles_df['Formatted_Date'], errors='coerce')
missing_df['Date'] = pd.to_datetime(missing_df['Date'], errors='coerce')

# Convert relevant columns to numeric (unparseable values become NaN)
articles_df['Sentiment_Score'] = pd.to_numeric(articles_df['Sentiment_Score'], errors='coerce')
articles_df['Options % Spike'] = pd.to_numeric(articles_df['Options % Spike'], errors='coerce')
missing_df['Options % Spike'] = pd.to_numeric(missing_df['Options % Spike'], errors='coerce')

# Take the absolute value of Options % Spike
articles_df['Options % Spike'] = articles_df['Options % Spike'].abs()
missing_df['Options % Spike'] = missing_df['Options % Spike'].abs()

# Filter for AAPL
articles_aapl = articles_df[articles_df['ticker'] == 'AAPL']
missing_aapl = missing_df[missing_df['ticker'] == 'AAPL']

# -------------------------------
# 3. Merge Articles and Missing Data
# -------------------------------

# Aggregate daily sentiment scores (excluding sentiment = 0)
nonzero_articles = articles_aapl[articles_aapl['Sentiment_Score'] != 0]
articles_agg = (
    nonzero_articles.groupby('Formatted_Date')['Sentiment_Score'].mean().reset_index()
    .rename(columns={'Formatted_Date': 'Date', 'Sentiment_Score': 'Avg_Sentiment'})
)

# Outer merge keeps every date present in either table.
# NOTE(review): the spike values come only from missing_df; the column of the
# same name in articles_df is converted above but never merged - confirm.
combined_df = pd.merge(missing_aapl[['Date', 'Options % Spike']], articles_agg, how='outer', on='Date')

# Assign the filled columns back explicitly. Calling fillna(..., inplace=True)
# on a column selected with [] is chained assignment: pandas warns about it and,
# with Copy-on-Write enabled, the parent frame is left unchanged.
combined_df['Options % Spike'] = combined_df['Options % Spike'].fillna(0)  # missing % spike -> 0
combined_df['Avg_Sentiment'] = combined_df['Avg_Sentiment'].fillna(0)      # missing sentiment -> 0

# Drop rows where sentiment is 0, then order by date. Building a fresh frame
# (instead of sorting the filtered slice in place) avoids SettingWithCopyWarning
# when the lag columns are added below.
combined_df = combined_df[combined_df['Avg_Sentiment'] != 0].sort_values('Date').copy()

# -------------------------------
# 4. Create Lagged Absolute Options % Spike Columns
# -------------------------------

# shift(-k) pulls the value found k rows later in the date-sorted frame.
# NOTE(review): zero-sentiment dates were removed above, so "k rows later" can
# be more than k calendar days later - verify this is the intended lag.
combined_df['Options_Spike_Lag_0'] = combined_df['Options % Spike']
combined_df['Options_Spike_Lag_1'] = combined_df['Options % Spike'].shift(-1)
combined_df['Options_Spike_Lag_3'] = combined_df['Options % Spike'].shift(-3)
combined_df['Options_Spike_Lag_5'] = combined_df['Options % Spike'].shift(-5)

# -------------------------------
# 5. Correlation Analysis
# -------------------------------

def compute_correlation(x, y, label_x, label_y):
    """Print the Pearson correlation of two columns of the global ``combined_df``.

    Parameters
    ----------
    x, y : str
        Column names to correlate; rows with a NaN in either are ignored.
    label_x, label_y : str
        Human-readable names used in the printed messages.

    Returns
    -------
    None
        Results, or a skip notice when fewer than two rows remain, are printed.
    """
    complete_rows = combined_df[[x, y]].dropna()
    row_count = len(complete_rows)

    # Debug: report how many complete observations are available
    print(f"\n🔍 Valid data points for {label_x} vs {label_y}: {row_count}")

    too_few = row_count < 2
    if too_few:
        print(f"⚠️ Not enough data to compute correlation between {label_x} and {label_y}. Skipping...\n")
        return None

    correlation, p_value = pearsonr(complete_rows[x], complete_rows[y])
    for line in (
        f"\n📊 Correlation between {label_x} and {label_y}:",
        f"Pearson Correlation Coefficient: {correlation:.4f}",
        f"P-value: {p_value:.4f}",
    ):
        print(line)

# Correlate sentiment with the absolute % spike at each forward offset
for lag_column, lag_label in (
    ('Options_Spike_Lag_0', 'Abs Options % Spike (Same Day)'),
    ('Options_Spike_Lag_1', 'Abs Options % Spike (+1 Day)'),
    ('Options_Spike_Lag_3', 'Abs Options % Spike (+3 Days)'),
    ('Options_Spike_Lag_5', 'Abs Options % Spike (+5 Days)'),
):
    compute_correlation('Avg_Sentiment', lag_column, 'Sentiment', lag_label)

# -------------------------------
# 6. Visualization of Relationships
# -------------------------------

def plot_correlation(x, y, title):
    """Show a regression scatter of two columns of the global ``combined_df``.

    Parameters
    ----------
    x, y : str
        Column names for the horizontal and vertical axes (the axis labels
        themselves are fixed text).
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    regression_style = {
        'scatter_kws': {'alpha': 0.5},
        'line_kws': {'color': 'red'},
    }
    sns.regplot(x=x, y=y, data=combined_df, ax=ax, **regression_style)
    ax.set_title(title)
    ax.set_xlabel('News Sentiment')
    ax.set_ylabel('Absolute Options % Spike')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# One plot per forward offset
for lag_column, horizon in (
    ('Options_Spike_Lag_0', 'Same Day'),
    ('Options_Spike_Lag_1', '+1 Day'),
    ('Options_Spike_Lag_3', '+3 Days'),
    ('Options_Spike_Lag_5', '+5 Days'),
):
    plot_correlation('Avg_Sentiment', lag_column, f'Sentiment vs. Abs Options % Spike ({horizon})')


In [None]:
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# -------------------------------
# 1. Load Datasets
# -------------------------------
articles_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
missing_data_path = "/content/drive/MyDrive/FYP/missing_dates_merged_articles_with_options_data.csv"

# Load datasets
articles_df = pd.read_csv(articles_path)
missing_df = pd.read_csv(missing_data_path)

# -------------------------------
# 2. Data Cleaning
# -------------------------------

# Convert dates to datetime format (unparseable values become NaT)
articles_df['Formatted_Date'] = pd.to_datetime(articles_df['Formatted_Date'], errors='coerce')
missing_df['Date'] = pd.to_datetime(missing_df['Date'], errors='coerce')

# Convert relevant columns to numeric (unparseable values become NaN)
articles_df['Sentiment_Score'] = pd.to_numeric(articles_df['Sentiment_Score'], errors='coerce')
articles_df['Options % Spike'] = pd.to_numeric(articles_df['Options % Spike'], errors='coerce')
missing_df['Options % Spike'] = pd.to_numeric(missing_df['Options % Spike'], errors='coerce')

# log1p(|spike|) = log(1 + |spike|), so a zero spike maps to 0 rather than -inf
articles_df['Log_Options_%_Spike'] = np.log1p(articles_df['Options % Spike'].abs())
missing_df['Log_Options_%_Spike'] = np.log1p(missing_df['Options % Spike'].abs())

# Filter for AAPL
articles_aapl = articles_df[articles_df['ticker'] == 'AAPL']
missing_aapl = missing_df[missing_df['ticker'] == 'AAPL']

# -------------------------------
# 3. Merge Articles and Missing Data
# -------------------------------

# Aggregate daily sentiment scores (excluding sentiment = 0)
nonzero_articles = articles_aapl[articles_aapl['Sentiment_Score'] != 0]
articles_agg = (
    nonzero_articles.groupby('Formatted_Date')['Sentiment_Score'].mean().reset_index()
    .rename(columns={'Formatted_Date': 'Date', 'Sentiment_Score': 'Avg_Sentiment'})
)

# Outer merge keeps every date present in either table.
# NOTE(review): the log-spike values come only from missing_df; the column of
# the same name in articles_df is computed above but never merged - confirm.
combined_df = pd.merge(missing_aapl[['Date', 'Log_Options_%_Spike']], articles_agg, how='outer', on='Date')

# Assign the filled columns back explicitly. Calling fillna(..., inplace=True)
# on a column selected with [] is chained assignment: pandas warns about it and,
# with Copy-on-Write enabled, the parent frame is left unchanged.
combined_df['Log_Options_%_Spike'] = combined_df['Log_Options_%_Spike'].fillna(0)
combined_df['Avg_Sentiment'] = combined_df['Avg_Sentiment'].fillna(0)

# Drop rows where sentiment is 0, then order by date. Building a fresh frame
# (instead of sorting the filtered slice in place) avoids SettingWithCopyWarning
# when the lag columns are added below.
combined_df = combined_df[combined_df['Avg_Sentiment'] != 0].sort_values('Date').copy()

# -------------------------------
# 4. Create Lagged Log Options % Spike Columns
# -------------------------------

# shift(-k) pulls the value found k rows later in the date-sorted frame.
# NOTE(review): zero-sentiment dates were removed above, so "k rows later" can
# be more than k calendar days later - verify this is the intended lag.
combined_df['Log_Spike_Lag_0'] = combined_df['Log_Options_%_Spike']
combined_df['Log_Spike_Lag_1'] = combined_df['Log_Options_%_Spike'].shift(-1)
combined_df['Log_Spike_Lag_3'] = combined_df['Log_Options_%_Spike'].shift(-3)
combined_df['Log_Spike_Lag_5'] = combined_df['Log_Options_%_Spike'].shift(-5)

# -------------------------------
# 5. Correlation Analysis
# -------------------------------

def compute_correlation(x, y, label_x, label_y):
    """Print the Pearson correlation of two columns of the global ``combined_df``.

    Parameters
    ----------
    x, y : str
        Column names to correlate; rows with a NaN in either are ignored.
    label_x, label_y : str
        Human-readable names used in the printed messages.

    Returns
    -------
    None
        Results, or a skip notice when fewer than two rows remain, are printed.
    """
    usable = combined_df[[x, y]].dropna()
    usable_count = len(usable)

    # Debug: report how many complete observations are available
    print(f"\n🔍 Valid data points for {label_x} vs {label_y}: {usable_count}")

    if not usable_count >= 2:
        print(f"⚠️ Not enough data to compute correlation between {label_x} and {label_y}. Skipping...\n")
        return

    # Pearson correlation on the complete rows only
    correlation, p_value = pearsonr(usable[x], usable[y])

    print(f"\n📊 Correlation between {label_x} and {label_y}:")
    print(f"Pearson Correlation Coefficient: {correlation:.4f}")
    print(f"P-value: {p_value:.4f}")

# Correlate sentiment with the log absolute % spike at each forward offset
for lag_column, lag_label in (
    ('Log_Spike_Lag_0', 'Log Abs Options % Spike (Same Day)'),
    ('Log_Spike_Lag_1', 'Log Abs Options % Spike (+1 Day)'),
    ('Log_Spike_Lag_3', 'Log Abs Options % Spike (+3 Days)'),
    ('Log_Spike_Lag_5', 'Log Abs Options % Spike (+5 Days)'),
):
    compute_correlation('Avg_Sentiment', lag_column, 'Sentiment', lag_label)

# -------------------------------
# 6. Visualization of Relationships
# -------------------------------

def plot_correlation(x, y, title):
    """Show a regression scatter of two columns of the global ``combined_df``.

    Parameters
    ----------
    x, y : str
        Column names for the horizontal and vertical axes (the axis labels
        themselves are fixed text).
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    regression_style = {
        'scatter_kws': {'alpha': 0.5},
        'line_kws': {'color': 'red'},
    }
    sns.regplot(x=x, y=y, data=combined_df, ax=ax, **regression_style)
    ax.set_title(title)
    ax.set_xlabel('News Sentiment')
    ax.set_ylabel('Log of Absolute Options % Spike')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# One plot per forward offset
for lag_column, horizon in (
    ('Log_Spike_Lag_0', 'Same Day'),
    ('Log_Spike_Lag_1', '+1 Day'),
    ('Log_Spike_Lag_3', '+3 Days'),
    ('Log_Spike_Lag_5', '+5 Days'),
):
    plot_correlation('Avg_Sentiment', lag_column, f'Sentiment vs. Log Abs Options % Spike ({horizon})')


In [None]:
#Identifying tickers with highest correlation to explore industry trends

import pandas as pd
from scipy.stats import pearsonr
import numpy as np

# -------------------------------
# 1. Load Datasets
# -------------------------------
articles_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
missing_data_path = "/content/drive/MyDrive/FYP/missing_dates_merged_articles_with_options_data.csv"

# Read both tables
articles_df, missing_df = pd.read_csv(articles_path), pd.read_csv(missing_data_path)

# -------------------------------
# 2. Data Cleaning
# -------------------------------

# Parse the date columns (unparseable values become NaT)
articles_df['Formatted_Date'] = pd.to_datetime(articles_df['Formatted_Date'], errors='coerce')
missing_df['Date'] = pd.to_datetime(missing_df['Date'], errors='coerce')

# Coerce the value columns to numbers (unparseable values become NaN)
articles_df['Sentiment_Score'] = pd.to_numeric(articles_df['Sentiment_Score'], errors='coerce')
for table in (articles_df, missing_df):
    table['Options % Spike'] = pd.to_numeric(table['Options % Spike'], errors='coerce')

# log1p(|spike|) = log(1 + |spike|), so a zero spike maps to 0 rather than -inf
for table in (articles_df, missing_df):
    table['Log_Options_%_Spike'] = np.log1p(table['Options % Spike'].abs())

# -------------------------------
# 3. Correlation Analysis Across All Tickers
# -------------------------------

# One result record (dict) is collected per ticker
correlation_results = []

# Get the list of unique tickers
tickers = articles_df['ticker'].unique()

# Forward offsets, in rows of the date-sorted frame, at which the spike is sampled
lag_steps = (0, 1, 3, 5)


def compute_corr(x, y, frame=None):
    """Return the Pearson r between two columns, or NaN if it cannot be computed.

    Defined once, outside the loop, instead of being re-created per ticker.

    Parameters
    ----------
    x, y : str
        Column names to correlate; rows with a NaN in either are ignored.
    frame : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``combined_df`` so
        two-argument calls behave as before.
    """
    source = combined_df if frame is None else frame
    clean_df = source[[x, y]].dropna()
    if len(clean_df) < 2:  # pearsonr needs at least two observations
        return np.nan
    correlation, _ = pearsonr(clean_df[x], clean_df[y])
    return correlation


for ticker in tickers:
    # Filter data for each ticker
    articles_ticker = articles_df[articles_df['ticker'] == ticker]
    missing_ticker = missing_df[missing_df['ticker'] == ticker]

    # Aggregate daily sentiment scores (excluding sentiment = 0)
    nonzero_articles = articles_ticker[articles_ticker['Sentiment_Score'] != 0]
    articles_agg = (
        nonzero_articles.groupby('Formatted_Date')['Sentiment_Score'].mean().reset_index()
        .rename(columns={'Formatted_Date': 'Date', 'Sentiment_Score': 'Avg_Sentiment'})
    )

    # Outer merge keeps every date present in either table
    combined_df = pd.merge(missing_ticker[['Date', 'Log_Options_%_Spike']], articles_agg, how='outer', on='Date')

    # Assign the filled columns back explicitly. Calling fillna(..., inplace=True)
    # on a column selected with [] is chained assignment: pandas warns about it
    # and, with Copy-on-Write enabled, the parent frame is left unchanged.
    combined_df['Log_Options_%_Spike'] = combined_df['Log_Options_%_Spike'].fillna(0)
    combined_df['Avg_Sentiment'] = combined_df['Avg_Sentiment'].fillna(0)

    # Drop rows where sentiment is 0, then order by date (fresh frame, so the
    # column assignments below do not trigger SettingWithCopyWarning)
    combined_df = combined_df[combined_df['Avg_Sentiment'] != 0].sort_values('Date').copy()

    # Create lagged columns for Options % Spike.
    # NOTE(review): shift(-k) is positional; because zero-sentiment dates were
    # removed above, "k rows later" may be more than k calendar days later.
    for step in lag_steps:
        combined_df[f'Log_Spike_Lag_{step}'] = combined_df['Log_Options_%_Spike'].shift(-step)

    # Compute correlations for all lag periods and store them
    record = {'Ticker': ticker}
    for step in lag_steps:
        record[f'Corr_Lag_{step}'] = compute_corr('Avg_Sentiment', f'Log_Spike_Lag_{step}', frame=combined_df)
    correlation_results.append(record)

# Convert results to DataFrame
results_df = pd.DataFrame(correlation_results)

# -------------------------------
# 4. Identify Tickers with Highest Correlation
# -------------------------------

def _best_row(column):
    """Return the results_df row with the largest value in `column`, or None.

    None is returned when the column is missing or holds no valid (non-NaN)
    correlation; calling idxmax() on such a column would abort the cell.
    """
    if column not in results_df.columns:
        return None
    valid = results_df[column].dropna()
    if valid.empty:
        return None
    return results_df.loc[valid.idxmax()]


# Find the ticker with the highest (most positive) correlation for each lag
max_corr_lag_0 = _best_row('Corr_Lag_0')
max_corr_lag_1 = _best_row('Corr_Lag_1')
max_corr_lag_3 = _best_row('Corr_Lag_3')
max_corr_lag_5 = _best_row('Corr_Lag_5')

print("\n📊 **Tickers with Highest Correlations** 📊")
# The first entry carries a leading newline to separate it from the header
for heading, column, best in (
    ("\nSame Day (Lag 0)", 'Corr_Lag_0', max_corr_lag_0),
    ("+1 Day (Lag 1)", 'Corr_Lag_1', max_corr_lag_1),
    ("+3 Days (Lag 3)", 'Corr_Lag_3', max_corr_lag_3),
    ("+5 Days (Lag 5)", 'Corr_Lag_5', max_corr_lag_5),
):
    if best is None:
        print(f"{heading}: no valid correlation available")
    else:
        print(f"{heading}: {best['Ticker']} with correlation {best[column]:.4f}")

# -------------------------------
# 5. Save Results to CSV
# -------------------------------

results_df.to_csv("/content/drive/MyDrive/FYP/correlation_results_all_tickers.csv", index=False)
print("\n✅ Correlation results saved to 'correlation_results_all_tickers.csv'")


In [None]:
# Exploring which tickers exhibit greatest correlation over different time lags
import pandas as pd
from scipy.stats import pearsonr
import numpy as np

# -------------------------------
# 1. Load Datasets
# -------------------------------
articles_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
missing_data_path = "/content/drive/MyDrive/FYP/missing_dates_merged_articles_with_options_data.csv"

# Read both tables
articles_df = pd.read_csv(articles_path)
missing_df = pd.read_csv(missing_data_path)

# -------------------------------
# 2. Data Cleaning
# -------------------------------

# Parse the date columns (unparseable values become NaT)
for table, date_column in ((articles_df, 'Formatted_Date'), (missing_df, 'Date')):
    table[date_column] = pd.to_datetime(table[date_column], errors='coerce')

# Coerce the value columns to numbers (unparseable values become NaN)
for table, value_column in (
    (articles_df, 'Sentiment_Score'),
    (articles_df, 'Options % Spike'),
    (missing_df, 'Options % Spike'),
):
    table[value_column] = pd.to_numeric(table[value_column], errors='coerce')

# log1p(|spike|) = log(1 + |spike|), so a zero spike maps to 0 rather than -inf
articles_df['Log_Options_%_Spike'] = np.log1p(articles_df['Options % Spike'].abs())
missing_df['Log_Options_%_Spike'] = np.log1p(missing_df['Options % Spike'].abs())

# -------------------------------
# 3. Correlation Analysis Across All Tickers
# -------------------------------

# One result record (dict) is collected per ticker
correlation_results = []

# Get the list of unique tickers
tickers = articles_df['ticker'].unique()

# Forward offsets, in rows of the date-sorted frame, at which the spike is sampled
lag_steps = (0, 1, 3, 5)


def compute_corr(x, y, frame=None):
    """Return the Pearson r between two columns, or NaN if it cannot be computed.

    Defined once, outside the loop, instead of being re-created per ticker.

    Parameters
    ----------
    x, y : str
        Column names to correlate; rows with a NaN in either are ignored.
    frame : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``combined_df`` so
        two-argument calls behave as before.
    """
    source = combined_df if frame is None else frame
    clean_df = source[[x, y]].dropna()
    if len(clean_df) < 2:  # pearsonr needs at least two observations
        return np.nan
    correlation, _ = pearsonr(clean_df[x], clean_df[y])
    return correlation


for ticker in tickers:
    # Filter data for each ticker
    articles_ticker = articles_df[articles_df['ticker'] == ticker]
    missing_ticker = missing_df[missing_df['ticker'] == ticker]

    # Aggregate daily sentiment scores (excluding sentiment = 0)
    nonzero_articles = articles_ticker[articles_ticker['Sentiment_Score'] != 0]
    articles_agg = (
        nonzero_articles.groupby('Formatted_Date')['Sentiment_Score'].mean().reset_index()
        .rename(columns={'Formatted_Date': 'Date', 'Sentiment_Score': 'Avg_Sentiment'})
    )

    # Outer merge keeps every date present in either table
    combined_df = pd.merge(missing_ticker[['Date', 'Log_Options_%_Spike']], articles_agg, how='outer', on='Date')

    # Assign the filled columns back explicitly. Calling fillna(..., inplace=True)
    # on a column selected with [] is chained assignment: pandas warns about it
    # and, with Copy-on-Write enabled, the parent frame is left unchanged.
    combined_df['Log_Options_%_Spike'] = combined_df['Log_Options_%_Spike'].fillna(0)
    combined_df['Avg_Sentiment'] = combined_df['Avg_Sentiment'].fillna(0)

    # Drop rows where sentiment is 0, then order by date (fresh frame, so the
    # column assignments below do not trigger SettingWithCopyWarning)
    combined_df = combined_df[combined_df['Avg_Sentiment'] != 0].sort_values('Date').copy()

    # Create lagged columns for Options % Spike.
    # NOTE(review): shift(-k) is positional; because zero-sentiment dates were
    # removed above, "k rows later" may be more than k calendar days later.
    for step in lag_steps:
        combined_df[f'Log_Spike_Lag_{step}'] = combined_df['Log_Options_%_Spike'].shift(-step)

    # Compute correlations for all lag periods and store them
    record = {'Ticker': ticker}
    for step in lag_steps:
        record[f'Corr_Lag_{step}'] = compute_corr('Avg_Sentiment', f'Log_Spike_Lag_{step}', frame=combined_df)
    correlation_results.append(record)

# Convert results to DataFrame
results_df = pd.DataFrame(correlation_results)

# -------------------------------
# 4. Identify Tickers with Highest Correlation
# -------------------------------

def _best_row(column):
    """Return the results_df row with the largest value in `column`, or None.

    None is returned when the column is missing or holds no valid (non-NaN)
    correlation; calling idxmax() on such a column would abort the cell.
    """
    if column not in results_df.columns:
        return None
    valid = results_df[column].dropna()
    if valid.empty:
        return None
    return results_df.loc[valid.idxmax()]


# Find the ticker with the highest (most positive) correlation for each lag
max_corr_lag_0 = _best_row('Corr_Lag_0')
max_corr_lag_1 = _best_row('Corr_Lag_1')
max_corr_lag_3 = _best_row('Corr_Lag_3')
max_corr_lag_5 = _best_row('Corr_Lag_5')

print("\n📊 **Tickers with Highest Correlations** 📊")
# The first entry carries a leading newline to separate it from the header
for heading, column, best in (
    ("\nSame Day (Lag 0)", 'Corr_Lag_0', max_corr_lag_0),
    ("+1 Day (Lag 1)", 'Corr_Lag_1', max_corr_lag_1),
    ("+3 Days (Lag 3)", 'Corr_Lag_3', max_corr_lag_3),
    ("+5 Days (Lag 5)", 'Corr_Lag_5', max_corr_lag_5),
):
    if best is None:
        print(f"{heading}: no valid correlation available")
    else:
        print(f"{heading}: {best['Ticker']} with correlation {best[column]:.4f}")

# -------------------------------
# 5. Save Results to CSV
# -------------------------------

# NOTE(review): this writes to the same path as the preceding analysis cell,
# so the earlier output file is overwritten.
results_df.to_csv("/content/drive/MyDrive/FYP/correlation_results_all_tickers.csv", index=False)
print("\n✅ Correlation results saved to 'correlation_results_all_tickers.csv'")


In [None]:
#Better visualisation of all correlations available 

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr

# ----------------------------
# 1. Load the Dataset
# ----------------------------
file_path = "/content/drive/MyDrive/FYP/v2_articles_with_reddit_sentiment_all.csv"
df = pd.read_csv(file_path)

# ----------------------------
# 2. Data Preprocessing
# ----------------------------
# Columns that must be numeric before any correlation is computed
numeric_columns = [
    'Sentiment_Score',
    'Reddit_Vader_Sentiment',
    'Options Volume',
    'Options % Spike',
    'Trading Volume',
    '% Spike',
]

# Coerce all metric columns in one pass; entries that cannot be parsed become NaN
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Keep only rows that have a ticker and a value for every metric
df = df.dropna(subset=numeric_columns + ['ticker'])

# ----------------------------
# 3. Spearman Correlation Calculation
# ----------------------------
def calculate_spearman(df, metric):
    """Spearman rank correlation between 'Sentiment_Score' and `metric`, per ticker.

    Parameters
    ----------
    df : DataFrame with 'ticker', 'Sentiment_Score' and `metric` columns.
    metric : name of the column correlated against 'Sentiment_Score'.

    Returns
    -------
    dict mapping each ticker (in order of first appearance) to its correlation
    coefficient, or to None when the ticker has fewer than two rows.
    """
    def _rank_correlation(rows):
        # A single observation cannot be correlated
        if len(rows) <= 1:
            return None
        coefficient, _ = spearmanr(rows['Sentiment_Score'], rows[metric])
        return coefficient

    per_ticker = {}
    for symbol in df['ticker'].unique():
        per_ticker[symbol] = _rank_correlation(df[df['ticker'] == symbol])
    return per_ticker

# Metric column -> chart title, listed in plotting order (five metrics in total)
metrics = dict([
    ("Reddit_Vader_Sentiment", "Sentiment vs Reddit Vader Sentiment"),
    ("Options Volume", "Sentiment vs Options Trading Volume"),
    ("Options % Spike", "Sentiment vs Options % Spike"),
    ("Trading Volume", "Sentiment vs Trading Volume"),
    ("% Spike", "Sentiment vs % Spike"),
])

# One {ticker: correlation} mapping per metric, keyed by the chart title
correlation_results = {
    label: calculate_spearman(df, metric)
    for metric, label in metrics.items()
}

# ----------------------------
# 4. Visualization
# ----------------------------
def plot_horizontal_bars(correlations, title):
    """Show one horizontal bar per ticker with its Spearman correlation.

    Parameters
    ----------
    correlations : dict mapping ticker -> correlation; entries whose value is
        missing (None/NaN) are left out of the chart.
    title : text used as the chart title.
    """
    # Tabulate the mapping, discard missing values and order the bars ascending
    ranked = (
        pd.DataFrame(list(correlations.items()), columns=['Ticker', 'Spearman_Correlation'])
        .dropna()
        .sort_values('Spearman_Correlation')
    )

    plt.figure(figsize=(10, 8))
    sns.barplot(data=ranked, x='Spearman_Correlation', y='Ticker', palette='viridis')
    # Dashed reference line separating negative from positive correlations
    plt.axvline(0, color='black', linewidth=0.8, linestyle='--')
    plt.title(f'{title}')
    plt.xlabel('Spearman Rank Correlation')
    plt.ylabel('Ticker')
    plt.tight_layout()
    plt.show()

# Draw one chart per metric, titled with the metric's label
for label in correlation_results:
    correlations = correlation_results[label]
    plot_horizontal_bars(correlations, label)


In [None]:
# Exploring consistently negative correlations - why is this the case? 
import pandas as pd

# ----------------------------
# 1. Identify Consistently Negative Correlations
# ----------------------------

def find_consistently_negative(correlation_results):
    """Find tickers whose correlation is below zero for every metric.

    Parameters
    ----------
    correlation_results : dict of {metric label: {ticker: correlation}}.

    Returns
    -------
    (table, tickers) where `table` has one row per ticker and one column per
    metric, restricted to tickers that have a value for every metric, and
    `tickers` lists the index labels of rows that are negative in all columns.
    """
    # Tickers become the index, metrics the columns; incomplete rows are dropped
    table = pd.DataFrame(correlation_results).dropna()

    # True for rows where every metric's correlation is strictly negative
    all_negative = table.lt(0).all(axis=1)

    return table, table.loc[all_negative].index.tolist()

# Run the search over the per-metric correlations held in `correlation_results`
combined_correlation_df, consistently_negative_tickers = find_consistently_negative(correlation_results)

# ----------------------------
# 2. Display Results
# ----------------------------
print("\n📉 **Companies with Consistently Negative Correlation Across All Metrics:**")
for symbol in consistently_negative_tickers:
    print(f" - {symbol}")

# ----------------------------
# 3. Optional: Export Results to CSV
# ----------------------------
# The exported table is the full per-metric correlation table returned above
# (every ticker with a value for all metrics, not only the negative ones);
# no separate negative-flag column is written.
export_path = "/content/drive/MyDrive/FYP/consistently_negative_correlation_tickers.csv"
combined_correlation_df.to_csv(export_path, index=True)
print("\n📂 Results exported to 'consistently_negative_correlation_tickers.csv'")


In [None]:
# Looking into companies with mixed correlations (i.e. a positive correlation on one metric but a negative correlation on another)
import pandas as pd

# --------------------------
# 1. Load the Correlation Data
# --------------------------
# The rest of this cell reads the columns 'ticker', 'Sentiment_vs_Reddit',
# 'Sentiment_vs_Options_Spike' and 'Sentiment_vs_Trading_Spike' from this file.
# NOTE(review): presumably one row per ticker — confirm against the file.
file_path = "/content/drive/MyDrive/FYP/filtered_correlation_analysis_with_avg.csv"  # Update if needed
correlations_df = pd.read_csv(file_path)

# --------------------------
# 2. Identify Companies with Mixed Correlations
# --------------------------
def has_mixed_correlation(row):
    """Return True when the row's three correlations include both signs.

    `row` is indexed by 'Sentiment_vs_Reddit', 'Sentiment_vs_Options_Spike'
    and 'Sentiment_vs_Trading_Spike'. Zero and NaN count as neither positive
    nor negative.
    """
    metric_columns = (
        'Sentiment_vs_Reddit',
        'Sentiment_vs_Options_Spike',
        'Sentiment_vs_Trading_Spike',
    )
    readings = [row[name] for name in metric_columns]

    # Without a positive reading the row cannot be mixed
    if not any(reading > 0 for reading in readings):
        return False
    return any(reading < 0 for reading in readings)

# Flag every row, then keep only the flagged ones
correlations_df['Mixed_Correlation'] = correlations_df.apply(has_mixed_correlation, axis=1)
is_mixed = correlations_df['Mixed_Correlation'].eq(True)
mixed_correlation_companies = correlations_df.loc[is_mixed]

# --------------------------
# 3. Display and Save Results
# --------------------------
report_columns = ['ticker', 'Sentiment_vs_Reddit', 'Sentiment_vs_Options_Spike', 'Sentiment_vs_Trading_Spike']
print("📊 **Companies with Mixed Correlations Across Metrics:**")
print(mixed_correlation_companies[report_columns])

# Persist the flagged rows (all columns, including the flag itself)
mixed_output_path = "/content/drive/MyDrive/FYP/mixed_correlation_companies.csv"
mixed_correlation_companies.to_csv(mixed_output_path, index=False)
print("\n✅ Results saved to 'mixed_correlation_companies.csv'")


In [None]:
# Granger causality tests
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests

# Load the dataset.
# The path is assigned here so this cell does not silently reuse whatever
# `file_path` another cell last set; run top to bottom, that leftover value
# points at a correlation summary file rather than a dated sentiment table.
# NOTE(review): this is the file the heatmap cell further down loads and reads
# 'ticker', 'Formatted_Date', 'Sentiment_Score' and 'Reddit_Vader_Sentiment'
# from — confirm it is the intended input for this test as well.
file_path = "/content/drive/MyDrive/FYP/v2_reddit_sentiment_all_with_vader.csv"
df = pd.read_csv(file_path)

# Ensure the date column is properly formatted
df['Formatted_Date'] = pd.to_datetime(df['Formatted_Date'])

# Prepare results dictionary to store outcomes for all tickers
results_dict = {}

# Set maximum lag for Granger causality
max_lag = 3

# Iterate over all unique tickers in the dataset
all_tickers = df['ticker'].unique()

# grangercausalitytests (with its default constant term) rejects inputs that
# have 3 * maxlag + 1 rows or fewer, so at least one more row is required.
min_observations = 3 * max_lag + 2


def _granger_pvalues(target, predictor, lag_limit, ticker, direction):
    """Return {lag: ssr F-test p-value} for "predictor Granger-causes target".

    statsmodels tests whether the SECOND column helps to predict the FIRST,
    so the target series is stacked first. If the test fails, the error is
    reported and every lag maps to None so the ticker still appears in the
    results.
    """
    lags = range(1, lag_limit + 1)
    try:
        outcome = grangercausalitytests(
            np.column_stack([target, predictor]), maxlag=lag_limit, verbose=False
        )
        return {lag: outcome[lag][0]['ssr_ftest'][1] for lag in lags}
    except Exception as e:
        print(f"Error processing {ticker} for {direction}: {e}")
        return {lag: None for lag in lags}


for ticker in all_tickers:
    # Filter the data for the current ticker and sort by date
    df_ticker = df[df['ticker'] == ticker].sort_values("Formatted_Date")

    # Drop rows with NaN or infinite values
    df_ticker = df_ticker.replace([np.inf, -np.inf], np.nan).dropna(subset=['Sentiment_Score', 'Reddit_Vader_Sentiment'])

    # Ensure there are enough data points for Granger causality
    if len(df_ticker) < min_observations:
        print(f"Skipping {ticker}: Not enough data points.")
        continue

    # Extract the two time series
    news_sentiment = df_ticker['Sentiment_Score'].values
    reddit_sentiment = df_ticker['Reddit_Vader_Sentiment'].values

    # "news -> Reddit": does past news sentiment help to predict Reddit sentiment?
    pvals_n2r = _granger_pvalues(reddit_sentiment, news_sentiment, max_lag, ticker, "news -> Reddit")

    # "Reddit -> news": the same test with the roles swapped
    pvals_r2n = _granger_pvalues(news_sentiment, reddit_sentiment, max_lag, ticker, "Reddit -> news")

    # Store results in the dictionary
    results_dict[ticker] = {
        "News_Granger_Causes_Reddit": pvals_n2r,
        "Reddit_Granger_Causes_News": pvals_r2n
    }

# Flatten the nested results into one record per (ticker, lag) pair
rows = [
    {
        "Ticker": ticker,
        "Lag": lag,
        "News_Granger_Causes_Reddit_pval": outcomes["News_Granger_Causes_Reddit"].get(lag),
        "Reddit_Granger_Causes_News_pval": outcomes["Reddit_Granger_Causes_News"].get(lag),
    }
    for ticker, outcomes in results_dict.items()
    for lag in range(1, max_lag + 1)
]

results_df = pd.DataFrame(rows)

# Write the table next to the notebook's working directory
output_path = "granger_causality_results.csv"
results_df.to_csv(output_path, index=False)
print(f"Granger causality results saved to {output_path}")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import grangercausalitytests

# Load the dataset (replace with your file path)
file_path = "/content/drive/MyDrive/FYP/v2_reddit_sentiment_all_with_vader.csv"

# Read, parse the dates, order by ticker then date, and keep the relevant columns
data = (
    pd.read_csv(file_path)
    .assign(Formatted_Date=lambda frame: pd.to_datetime(frame['Formatted_Date']))
    .sort_values(by=['ticker', 'Formatted_Date'])
    .loc[:, ['ticker', 'Formatted_Date', 'Sentiment_Score', 'Reddit_Vader_Sentiment']]
)

# Tickers to process, in the order they appear in the sorted data
tickers = data['ticker'].unique()

# Largest lag passed to the Granger causality test
max_lag = 3

# Collects one {'Ticker', 'Lag', 'p-value'} record per successful test
granger_results = []

# grangercausalitytests (with its default constant term) rejects inputs that
# have 3 * maxlag + 1 rows or fewer, so at least one more row is required.
min_observations = 3 * max_lag + 2

# Iterate through each ticker
for ticker in tickers:
    # Filter data for the current ticker
    ticker_data = data[data['ticker'] == ticker].dropna()
    if len(ticker_data) < min_observations:
        continue  # Skip if not enough data points

    # Prepare the time series data
    news_sentiment = ticker_data['Sentiment_Score'].values
    reddit_sentiment = ticker_data['Reddit_Vader_Sentiment'].values

    # Reddit first, news second: the test asks whether the second column
    # helps to predict the first (news -> Reddit)
    data_news_to_reddit = np.column_stack([reddit_sentiment, news_sentiment])

    # A single call with maxlag=max_lag already evaluates every lag 1..max_lag,
    # so there is no need to re-run the test once per lag.
    try:
        test_result = grangercausalitytests(data_news_to_reddit, maxlag=max_lag, verbose=False)
    except Exception as e:
        print(f"Error processing {ticker} (lags 1-{max_lag}): {e}")
        continue

    for lag in range(1, max_lag + 1):
        p_value = test_result[lag][0]['ssr_ftest'][1]  # Extract p-value for the F-test
        granger_results.append({'Ticker': ticker, 'Lag': lag, 'p-value': p_value})

# Convert results to a DataFrame (columns are fixed so an empty result list
# still yields a frame with the expected schema)
granger_df = pd.DataFrame(granger_results, columns=['Ticker', 'Lag', 'p-value'])

if granger_df.empty:
    # Nothing to reshape: keep an empty table with the same index name
    pivot_df = pd.DataFrame(index=pd.Index([], name='Ticker'))
else:
    # Pivot the DataFrame for visualization: one row per ticker, one column per lag
    pivot_df = granger_df.pivot(index='Ticker', columns='Lag', values='p-value')

    # Drop tickers that don't have values for all lags
    pivot_df = pivot_df.dropna()

if pivot_df.empty:
    # A zero-height figure cannot be created, and there is nothing to save
    print("No ticker has Granger causality p-values for every lag; heatmap and CSV skipped.")
else:
    # Plot the heatmap; half an inch per ticker, with a floor so that a
    # handful of tickers still leaves room for the title and axis labels
    plt.figure(figsize=(12, max(2.0, len(pivot_df) * 0.5)))
    sns.heatmap(
        pivot_df,
        annot=True,
        fmt=".3f",
        cmap='RdYlGn_r',
        cbar_kws={'label': 'p-value'},
        linewidths=0.5,
        linecolor='gray',
        center=0.05  # colour scale pivots around the 5% significance level
    )
    plt.title("Granger Causality p-values for News Sentiment → Reddit Sentiment", fontsize=14, pad=20)
    plt.xlabel("Lag (days)", fontsize=12)
    plt.ylabel("Ticker", fontsize=12)
    plt.tight_layout()

    # Save the figure
    plt.savefig("granger_causality_news_reddit_filtered.png", dpi=300)
    plt.show()

    # Save the filtered data to a CSV for reference
    pivot_df.to_csv("filtered_granger_causality_results.csv")
    print("Filtered Granger causality data saved to filtered_granger_causality_results.csv")

In [None]:
# Summary sector wise distinctions - for thesis graphics 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the per-ticker results table
file_path = "ticker_optimal_weights.csv"  # Update this path
data = pd.read_csv(file_path)

# Stop early unless every column used below is present
required_columns = ['ticker', 'best_correlation']
if any(col not in data.columns for col in required_columns):
    raise ValueError("CSV file is missing one or more required columns: " + ", ".join(required_columns))

# Mapping of tickers to GICS sectors.
# Uses the GICS structure in force since March 2023, which is the structure the
# Financials placement of V and MA already follows. Under it the payment
# processor PYPL sits in Financials and ADP in Industrials. Newmont (NEM) is a
# gold miner (Materials), Dell (DELL) is a hardware maker (Information
# Technology) and Danaher (DHR) is a life-sciences company (Health Care).
ticker_to_sector = {
    # Communication Services
    'DIS': 'Communication Services', 'NFLX': 'Communication Services', 'CMCSA': 'Communication Services',
    'TMUS': 'Communication Services', 'GOOGL': 'Communication Services', 'CHTR': 'Communication Services',

    # Consumer Discretionary
    'AMZN': 'Consumer Discretionary', 'TSLA': 'Consumer Discretionary', 'NKE': 'Consumer Discretionary',
    'HD': 'Consumer Discretionary', 'AZO': 'Consumer Discretionary', 'YUM': 'Consumer Discretionary',

    # Consumer Staples
    'WMT': 'Consumer Staples', 'KO': 'Consumer Staples', 'PEP': 'Consumer Staples', 'MDLZ': 'Consumer Staples',
    'PG': 'Consumer Staples', 'KHC': 'Consumer Staples', 'MO': 'Consumer Staples',

    # Energy
    'XOM': 'Energy', 'CVX': 'Energy', 'COP': 'Energy',

    # Financials
    'BAC': 'Financials', 'JPM': 'Financials', 'C': 'Financials', 'MA': 'Financials', 'V': 'Financials',
    'WFC': 'Financials', 'CB': 'Financials', 'PYPL': 'Financials',

    # Health Care
    'JNJ': 'Health Care', 'PFE': 'Health Care', 'ABBV': 'Health Care', 'MRK': 'Health Care',
    'LLY': 'Health Care', 'GILD': 'Health Care', 'MDT': 'Health Care', 'UNH': 'Health Care',
    'DHR': 'Health Care',

    # Industrials
    'GE': 'Industrials', 'BA': 'Industrials', 'UPS': 'Industrials', 'MMM': 'Industrials',
    'ADP': 'Industrials',

    # Information Technology
    'AAPL': 'Information Technology', 'MSFT': 'Information Technology', 'NVDA': 'Information Technology',
    'QCOM': 'Information Technology', 'ADBE': 'Information Technology', 'INTC': 'Information Technology',
    'CSCO': 'Information Technology', 'TXN': 'Information Technology', 'IBM': 'Information Technology',
    'ORCL': 'Information Technology', 'AVGO': 'Information Technology', 'AMD': 'Information Technology',
    'INTU': 'Information Technology', 'DELL': 'Information Technology',

    # Materials
    'LIN': 'Materials', 'NEM': 'Materials',

    # Real Estate
    'PSA': 'Real Estate',

    # Utilities
    'DUK': 'Utilities', 'ED': 'Utilities', 'NEE': 'Utilities', 'PPL': 'Utilities',
}

# Attach a sector label to every row; tickers absent from the mapping get NaN
data['sector'] = data['ticker'].map(ticker_to_sector)

# Keep only rows whose ticker has a known sector
data = data[data['sector'].notna()]

# Keep only rows whose correlation is present and greater than -inf
correlation = data['best_correlation']
data = data[correlation.notna() & correlation.gt(-float('inf'))]


def _interquartile_range(values):
    """Distance between the 75th and the 25th percentile of `values`."""
    return values.quantile(0.75) - values.quantile(0.25)


# Median and IQR of the correlation within each sector
sector_stats = (
    data.groupby("sector")["best_correlation"]
    .agg(median="median", iqr=_interquartile_range)
    .reset_index()
)

# Order the sectors from widest to narrowest IQR and store that order on the
# column as an ordered categorical
sorted_sectors = sector_stats.sort_values("iqr", ascending=False)["sector"]
data["sector"] = pd.Categorical(data["sector"], categories=sorted_sectors, ordered=True)

# Plot settings
sns.set(style="whitegrid")

# Create a figure with two subplots: one for the histogram and one for the boxplot
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Histogram of correlation values grouped by sector
sns.histplot(
    data=data,
    x='best_correlation',
    hue='sector',
    multiple='stack',
    kde=False,
    palette='tab10',
    ax=axes[0]
)
axes[0].set_title("Histogram of Correlation Distribution by Sector", fontsize=14)
axes[0].set_xlabel("Correlation with Financial News Sentiment", fontsize=12)
axes[0].set_ylabel("Count", fontsize=12)
# histplot draws its hue legend from handles it builds itself; calling
# axes[0].legend(...) would look for labelled artists, find none and replace
# the sector legend with an empty one. Reposition the existing legend instead.
if axes[0].get_legend() is not None:
    sns.move_legend(axes[0], loc='upper left', bbox_to_anchor=(1.05, 1), title='Sector')

# Boxplot of correlation values grouped by sector
sns.boxplot(
    data=data,
    x='sector',
    y='best_correlation',
    palette='muted',
    ax=axes[1]
)

# Add median values to the boxplot.
# observed=False keeps one entry per category, in category order, so that
# position i lines up with the i-th box regardless of the pandas default.
medians = data.groupby("sector", observed=False)["best_correlation"].median()
for i, median in enumerate(medians):
    if pd.isna(median):
        continue  # category without data: no box to annotate
    axes[1].text(
        i, median, f"{median:.2f}", ha="center", va="center",
        color="white", fontweight="bold", fontsize=9,
        bbox=dict(facecolor="black", alpha=0.7)
    )

axes[1].set_title("Boxplot of Correlation Distribution by Sector", fontsize=14)
axes[1].set_xlabel("Sector (Sorted by IQR)", fontsize=12)
axes[1].set_ylabel("Correlation with Financial News Sentiment", fontsize=12)
axes[1].tick_params(axis='x', rotation=45)

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()
