In [None]:
# 📦 Import required libraries
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns

# 📁 Load datasets
# Both paths are relative to the working directory. The timestamp column of
# each file ('date' for the news, 'Date' for the prices) is handed to the CSV
# reader for datetime parsing.
news_df = pd.read_csv(
    "./data/raw_analyst_ratings.csv",
    parse_dates=["date"],
)
stock_df = pd.read_csv(
    "./data/AAPL_historical_data.csv",
    parse_dates=["Date"],
)

# 🧹 Clean and prepare stock data
# Use the lowercase 'date' key so the price frame can be joined on the same
# column name as the news data.
stock_df.rename(columns={'Date': 'date'}, inplace=True)
# Returns are computed row-over-row, so rows must be chronological first.
stock_df.sort_values('date', inplace=True)
# Non-numeric closing prices become NaN instead of raising.
stock_df['Close'] = pd.to_numeric(stock_df['Close'], errors='coerce')
# fill_method=None: do not forward-fill missing closes before differencing.
# The implicit padding used by older pandas turns a NaN close into a
# fabricated 0.0 return (and a multi-day return on the following row), and
# pandas >= 2.1 deprecates that default. With None, returns touching a
# missing close stay NaN and can be dropped explicitly later.
stock_df['daily_return'] = stock_df['Close'].pct_change(fill_method=None)

# 🧹 Clean news data and perform sentiment analysis
# Rows without a headline cannot be scored, so they are removed up front.
news_df.dropna(subset=["headline"], inplace=True)


def _headline_polarity(headline):
    """Return TextBlob's polarity score for one headline (coerced to str)."""
    return TextBlob(str(headline)).sentiment.polarity


news_df["sentiment"] = news_df["headline"].apply(_headline_polarity)

# 📅 Group average daily sentiment
# Group on the calendar day, not the raw timestamp: if 'date' carries a
# time-of-day component, grouping on it directly yields one row per distinct
# timestamp, and those rows would not line up with one-row-per-day price data.
# normalize() resets the time to midnight and leaves any timezone untouched.
# The key Series keeps the name 'date', so the output columns are still
# ['date', 'sentiment'].
news_day = pd.to_datetime(news_df['date']).dt.normalize()
daily_sentiment = news_df.groupby(news_day)['sentiment'].mean().reset_index()

# 🔗 Merge news sentiment and stock returns by date
# Inner join: only dates present in both frames survive. Rows that still hold
# a missing return or sentiment value are then discarded.
merged_df = (
    stock_df[["date", "daily_return"]]
    .merge(daily_sentiment, on="date", how="inner")
    .dropna()
)

# 📈 Visualize the relationship
# Local import: only this plotting step needs the tick formatter.
from matplotlib.ticker import PercentFormatter

fig, ax = plt.subplots(figsize=(10, 6))
# Scatter of daily return against average sentiment with a fitted regression line.
sns.regplot(
    x='sentiment',
    y='daily_return',
    data=merged_df,
    scatter_kws={'alpha': 0.6},
    ax=ax,
)
ax.set_title("Correlation Between News Sentiment and Daily Stock Return")
ax.set_xlabel("Average Daily Sentiment")
ax.set_ylabel("Daily Stock Return (%)")
# daily_return comes from pct_change(), i.e. it is a fraction (0.01 == 1%).
# Render the ticks as percentages so the "(%)" axis label is actually true;
# the underlying data is left unchanged.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
ax.grid(True)
fig.tight_layout()
plt.show()

# 📊 Correlation coefficient
# Pearson is the default method of Series.corr; it is spelled out here so the
# computation visibly matches the printed label.
correlation = merged_df["sentiment"].corr(
    merged_df["daily_return"],
    method="pearson",
)
print(f"📌 Pearson Correlation Coefficient: {correlation:.4f}")
