In [None]:
import pandas as pd
from textblob import TextBlob
import numpy as np
from scipy.stats import pearsonr


In [None]:
from src.data_processing import load_merged_data
from src.visualization import plot_correlation_heatmap

In [None]:
# Fetch the merged dataset from the project's loader; the lag analysis further
# down reads its `rating_score` and `price_change` columns.
merged_df = load_merged_data()

In [None]:
import numpy as np
# Imported in this cell so it runs on a fresh kernel: the only other
# matplotlib import in the notebook sits in a later cell.
import matplotlib.pyplot as plt

# Correlate each rating with the price change `lag` rows away.
# `shift(-lag)` moves the value found `lag` rows ahead onto the rating's own
# row, so positive lags pair a rating with a LATER price change and negative
# lags with an EARLIER one.
# NOTE(review): lags are counted in rows; they equal days only if merged_df
# holds one row per consecutive day in chronological order — confirm.
lags = range(-5, 6)
correlations = []

for lag in lags:
    corr = merged_df['rating_score'].corr(merged_df['price_change'].shift(-lag))
    correlations.append(corr)

plt.figure(figsize=(10,5))
plt.plot(lags, correlations, marker='o')
plt.axvline(0, color='red', linestyle='--')  # lag 0 = same-row correlation
plt.title('Sentiment-Price Correlation at Different Lags')
plt.xlabel('Days After Rating')
plt.ylabel('Pearson Correlation')
plt.grid(True)
plt.tight_layout()
plt.savefig('../figures/lag_correlation.png')
plt.show()

In [None]:

# Read the raw news and price tables, parsing their timestamp columns up front
news = pd.read_csv('../data/raw/financial_news.csv', parse_dates=['date'])
prices = pd.read_csv('../data/raw/stock_prices.csv', parse_dates=['Date'])

# Build a comparable, timezone-naive `date` key on both tables:
#   news   -> its own timestamp with any timezone information dropped
#   prices -> the calendar day stamped at 16:00 (4 PM market close)
news_timestamps = pd.to_datetime(news['date'])
news['date'] = news_timestamps.dt.tz_localize(None)
price_days = pd.to_datetime(prices['Date']).dt.normalize()
prices['date'] = price_days + pd.Timedelta(hours=16)

# As-of join (both inputs must be sorted on the key): every news row receives
# the most recent price row whose `date` is at or before the news timestamp.
# NOTE(review): with direction='backward', news published before 16:00 is
# paired with the previous close — confirm that is the intended alignment.
news_by_time = news.sort_values('date')
prices_by_time = prices.sort_values('date')
merged = pd.merge_asof(news_by_time, prices_by_time, on='date', direction='backward')

In [None]:
# Calculate daily returns (in percent).
# pct_change compares each row with the row before it, so it has to run on
# chronologically ordered closes. The CSV's row order is not guaranteed, so the
# returns are computed on a date-sorted copy; assigning the result back aligns
# on the index, which leaves the row order of `prices` itself untouched.
closes_by_date = prices.sort_values('date', kind='stable')['Close']
prices['daily_return'] = closes_by_date.pct_change() * 100  # Percentage

# Merge with sentiment (inner join on exact `date` equality), then drop rows
# with any missing value — this includes the first day, which has no return.
# NOTE(review): prices['date'] carries a 16:00 time component from the loading
# step; daily_sentiment['date'] must use identical timestamps or the join
# comes back empty — confirm against where daily_sentiment is built.
final_df = pd.merge(
    daily_sentiment,
    prices[['date', 'daily_return']],
    on='date',
).dropna()

In [None]:

import matplotlib.pyplot as plt

# Pearson correlation (with its p-value) between the sentiment column and the
# daily return column of final_df
sentiment_scores = final_df['sentiment']
daily_returns = final_df['daily_return']
corr, p_value = pearsonr(sentiment_scores, daily_returns)
print(f"Pearson Correlation: {corr:.3f} (p-value: {p_value:.4f})")

# Scatter of the same two columns, saved to disk and then displayed
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(sentiment_scores, daily_returns, alpha=0.5)
ax.set_title(f"Sentiment vs. Daily Returns (Correlation: {corr:.2f})")
ax.set_xlabel('Average Daily Sentiment Score')
ax.set_ylabel('Daily Price Return (%)')
fig.savefig('../figures/sentiment_correlation.png')
plt.show()

In [None]:
# Correlate daily returns with sentiment shifted down by 0 to 3 rows. Each
# shifted series is also stored on final_df as `sentiment_lag_<k>`.
for lag in range(4):
    lag_col = f'sentiment_lag_{lag}'
    final_df[lag_col] = final_df['sentiment'].shift(lag)
    # 2x2 correlation matrix of the pair; pick the off-diagonal entry by label
    pair_corr = final_df[[lag_col, 'daily_return']].corr()
    corr = pair_corr.loc[lag_col, 'daily_return']
    print(f"Lag {lag} days: {corr:.3f}")