In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
from scipy import stats
from textblob import TextBlob

# Plot defaults applied to every figure created in this notebook.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# The parent of the working directory is treated as the project root and put on
# the import path so that the `src` package imported below can be resolved.
project_root = Path.cwd().resolve().parent
if sys.path.count(str(project_root)) == 0:
    sys.path.append(str(project_root))

from src.features.sentiment_correlation import (
    load_data,
    normalize_dates,
    analyze_sentiment,
    calculate_daily_returns,
    aggregate_daily_sentiment,
    merge_sentiment_with_returns,
    calculate_correlation
)

from src.visualization.sentiment_visualization import (
    plot_sentiment_returns_scatter,
    plot_sentiment_returns_time_series,
    plot_lagged_correlations,
    plot_sentiment_distribution
)

In [None]:
# Input locations, expressed relative to the project root.
# NOTE(review): other cells in this notebook look for the price file under
# different names/directories — confirm which layout is the real one.
data_dir = project_root / "src"
news_path = data_dir.joinpath("data", "news", "raw_analyst_ratings.csv")

# Ticker analysed by the single-stock cells that follow.
ticker = "AAPL"
stock_path = data_dir.joinpath(f"{ticker}.csv")

# Best-effort load: any failure is printed instead of raised, and the next cell
# checks whether `news_df` / `stock_df` were actually created.
try:
    news_df, stock_df = load_data(str(news_path), str(stock_path))
    print(f"News data shape: {news_df.shape}")
    print(f"Stock data shape: {stock_df.shape}")

    for caption, frame in (
        ("\nNews data preview:", news_df),
        (f"\n{ticker} stock data preview:", stock_df),
    ):
        print(caption)
        display(frame.head())
except Exception as exc:
    print(f"Error loading data: {exc}")

In [None]:
# Fallback for when the load step did not produce both frames: build a small
# synthetic data set so the remaining cells can still be demonstrated.
if 'news_df' not in locals() or 'stock_df' not in locals():
    # NOTE(review): these paths differ from the ones used by the loading cell
    # (no "news" sub-directory, extra "historical_data" directory) — confirm
    # which layout is intended; `news_path` is read again further down.
    data_dir = project_root / "src"
    news_path = data_dir / "data" / "raw_analyst_ratings.csv"

    # Choose a stock ticker to analyze
    ticker = "AAPL"  # You can change this to any available ticker
    stock_path = data_dir / "historical_data" / f"{ticker}.csv"

    print("Creating sample data for demonstration...")

    # Seeded generator so the synthetic prices (and every figure and statistic
    # derived from them) are identical on each run.
    sample_rng = np.random.default_rng(42)

    # Sample news data: five template headlines repeated to cover 30 days.
    news_df = pd.DataFrame({
        'date': pd.date_range(start='2023-01-01', periods=30),
        'headline': [
            f"{ticker} announces new product line",
            f"{ticker} beats earnings expectations",
            f"{ticker} stock drops on market concerns",
            f"Analysts upgrade {ticker} to buy",
            f"{ticker} faces regulatory scrutiny"
        ] * 6
    })

    # Sample stock data: 50 days of independently drawn values. The columns are
    # not constrained against each other (e.g. High may fall below Low).
    stock_df = pd.DataFrame({
        'Date': pd.date_range(start='2023-01-01', periods=50),
        'Open': sample_rng.uniform(150, 180, 50),
        'High': sample_rng.uniform(160, 190, 50),
        'Low': sample_rng.uniform(140, 170, 50),
        'Close': sample_rng.uniform(145, 185, 50),
        # Upper bound is exclusive, matching np.random.randint.
        'Volume': sample_rng.integers(1000000, 5000000, 50)
    })

    print("Sample data created successfully")

# Pass both frames through the project's date normalizer.
# NOTE: `news_df` and `stock_df` are rebound here, so re-running this cell feeds
# already-normalized frames back into `normalize_dates`.
news_df, stock_df = normalize_dates(news_df, stock_df, 'date', 'Date')

# Report the first and last date present in each data set.
news_first, news_last = news_df['date'].min(), news_df['date'].max()
print(f"News data date range: {news_first} to {news_last}")
stock_first, stock_last = stock_df['Date'].min(), stock_df['Date'].max()
print(f"Stock data date range: {stock_first} to {stock_last}")

In [None]:
# Keep only the selected ticker's news when the data records which stock each
# item belongs to; without a 'stock' column every item is kept.
if 'stock' not in news_df.columns:
    ticker_news_df = news_df.copy()
    print("No stock column found in news data, using all news items")
else:
    ticker_news_df = news_df.loc[news_df['stock'] == ticker].copy()
    print(f"Found {len(ticker_news_df)} news items for {ticker}")

# Print up to five headlines as a quick sanity check.
if 'headline' in ticker_news_df.columns:
    print("\nSample headlines:")
    for headline in ticker_news_df['headline'].iloc[:5]:
        print(f"- {headline}")

In [None]:
# Score every headline with the project's sentiment helper.
news_with_sentiment = analyze_sentiment(ticker_news_df, 'headline')

# Preview the headline next to the sentiment columns read from the result.
preview_cols = ['headline', 'polarity', 'subjectivity', 'sentiment_category']
display(news_with_sentiment.loc[:, preview_cols].head(10))

# Number of headlines per sentiment category.
sentiment_counts = news_with_sentiment['sentiment_category'].value_counts()
display(sentiment_counts)

# Distribution of the polarity scores.
plot_sentiment_distribution(
    news_with_sentiment,
    title=f'Distribution of {ticker} News Sentiment',
    sentiment_col='polarity',
)


In [None]:
# Derive daily returns from the closing price via the project helper.
stock_with_returns = calculate_daily_returns(stock_df, 'Date', 'Close')

print("Stock returns calculation:")
display(stock_with_returns.loc[:, ['Date', 'Close', 'daily_return']].head(10))

# Summary statistics of the return series.
returns_stats = stock_with_returns['daily_return'].describe()
print("\nReturns statistics:")
display(returns_stats)

# Histogram with a KDE overlay; the dashed red line marks a zero return.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(stock_with_returns['daily_return'], kde=True, ax=ax)
ax.set_title(f'{ticker} Daily Returns Distribution')
ax.set_xlabel('Daily Return')
ax.set_ylabel('Frequency')
ax.axvline(x=0, color='r', linestyle='--')
ax.grid(alpha=0.3)
plt.show()

In [None]:
# Collapse the per-headline scores into one row per date.
daily_sentiment = aggregate_daily_sentiment(news_with_sentiment, 'date')

print("Daily sentiment aggregation:")
display(daily_sentiment.head(10))

# Dates on which more than one article contributed to the aggregate.
multi_article_days = daily_sentiment.loc[daily_sentiment['article_count'] > 1]
print(f"\nDays with multiple articles: {len(multi_article_days)}")
if len(multi_article_days):
    display(multi_article_days.head(5))

In [None]:
# Join the daily sentiment to the daily returns on their date columns.
merged_df = merge_sentiment_with_returns(
    daily_sentiment, stock_with_returns, 'date', 'Date'
)

print(f"Merged data shape: {merged_df.shape}")
print("Merged sentiment and returns data:")
display(merged_df.head(10))

# Row-count difference between the sentiment table and the merged table.
# NOTE(review): this equals the number of unmatched sentiment dates only if the
# merge yields at most one row per sentiment date — confirm against the helper.
unmatched_days = len(daily_sentiment) - len(merged_df)
print(f"\nSentiment dates without matching returns: {unmatched_days}")

In [None]:
# Correlate sentiment with returns using the project helper.
correlation_results = calculate_correlation(merged_df)

# Headline figures.
print("Correlation Results:")
print(f"Correlation coefficient: {correlation_results['correlation']:.3f}")
print(f"P-value: {correlation_results['p_value']:.3f}")
print(f"Statistically significant: {correlation_results['significant']}")
print(f"\nInterpretation:\n{correlation_results['interpretation']}")

# One line per lag, labelled against the 0.05 significance threshold.
print("\nLagged Correlations (sentiment leading returns):")
for entry in correlation_results['lagged_correlations']:
    lag = entry['lag']
    corr = entry['sentiment_leading_returns_corr']
    p_val = entry['sentiment_leading_returns_p']
    if p_val < 0.05:
        verdict = "significant"
    else:
        verdict = "not significant"
    print(f"  Lag {lag} day(s): {corr:.3f} (p={p_val:.3f}, {verdict})")

In [None]:
# Sentiment and returns over time for the selected ticker.
time_series_title = f'Sentiment and Returns Over Time for {ticker}'
plot_sentiment_returns_time_series(merged_df, title=time_series_title)
plt.show()


In [None]:
# Time series plot
# `plt` is already imported by the notebook's first cell, so it is not
# re-imported here.
# NOTE(review): this cell repeats the time-series figure drawn by an earlier
# cell; consider keeping a single copy.
plt.figure(figsize=(12, 6))
plot_sentiment_returns_time_series(merged_df, title=f'Sentiment and Returns Over Time for {ticker}')
plt.show()


In [None]:
# Time series plot
# `plt` is already imported by the notebook's first cell, so it is not
# re-imported here.
# NOTE(review): this cell repeats the time-series figure drawn by earlier
# cells; consider keeping a single copy.
plt.figure(figsize=(12, 6))
plot_sentiment_returns_time_series(merged_df, title=f'Sentiment and Returns Over Time for {ticker}')
plt.show()

In [None]:
# Correlation at each lag, drawn from the results computed above.
plt.figure(figsize=(12, 6))
lagged_title = f'Lagged Correlations for {ticker}'
plot_lagged_correlations(correlation_results, title=lagged_title)
plt.show()

In [None]:
# Define function to analyze multiple stocks
def _resolve_stock_path(ticker):
    """Return the first existing price file for ``ticker``, or ``None``.

    The notebook refers to price files under several naming schemes, so each
    one is tried in turn (the scheme this function originally used comes first).
    """
    candidates = (
        data_dir / f"{ticker}_historical_data.csv",
        data_dir / f"{ticker}.csv",
        data_dir / "historical_data" / f"{ticker}.csv",
    )
    return next((path for path in candidates if path.exists()), None)


def analyze_multiple_stocks(tickers):
    """Run the sentiment/returns correlation pipeline for several tickers.

    Args:
        tickers: Iterable of ticker symbols.

    Returns:
        dict mapping each successfully analysed ticker to the dictionary
        returned by ``calculate_correlation``, extended with a ``'data'`` entry
        holding the merged sentiment/returns frame. Tickers with no price
        file, no matching news, or a processing error are reported with
        ``print`` and left out of the result.
    """
    results = {}

    for ticker in tickers:
        stock_path = _resolve_stock_path(ticker)
        if stock_path is None:
            print(f"Stock data file not found for {ticker}")
            continue

        try:
            # `load_data` takes both paths, so the news file is read again for
            # every ticker.
            news_df_tmp, stock_df_tmp = load_data(str(news_path), str(stock_path))

            # Filter news for this ticker if the data says which stock it is about
            if 'stock' in news_df_tmp.columns:
                ticker_news = news_df_tmp[news_df_tmp['stock'] == ticker].copy()
                if len(ticker_news) == 0:
                    print(f"No news found for {ticker}")
                    continue
            else:
                ticker_news = news_df_tmp.copy()

            # Same steps as the single-ticker cells: normalize dates, score
            # headlines, compute returns, aggregate per day, merge, correlate.
            ticker_news, stock_df_tmp = normalize_dates(ticker_news, stock_df_tmp, 'date', 'Date')
            scored_news = analyze_sentiment(ticker_news, 'headline')
            returns_df = calculate_daily_returns(stock_df_tmp, 'Date', 'Close')
            sentiment_by_day = aggregate_daily_sentiment(scored_news, 'date')
            merged = merge_sentiment_with_returns(sentiment_by_day, returns_df, 'date', 'Date')

            correlation_results = calculate_correlation(merged)
            # Keep the merged frame next to the statistics for later plotting.
            correlation_results['data'] = merged

            results[ticker] = correlation_results

            print(f"{ticker}: Correlation = {correlation_results['correlation']:.3f}, "
                  f"p-value = {correlation_results['p_value']:.3f}, "
                  f"significant = {correlation_results['significant']}")

        except Exception as e:
            # Best effort: one failing ticker must not abort the whole batch.
            print(f"Error analyzing {ticker}: {str(e)}")

    return results

# Analyze multiple stocks
# Runs the pipeline once per ticker listed below; tickers that cannot be
# analysed are reported by `analyze_multiple_stocks` and omitted from the
# returned mapping.
print("Analyzing correlation for multiple stocks...\n")
stock_tickers = ["AAPL", "AMZN", "GOOG", "META", "MSFT", "NVDA", "TSLA"]
multi_stock_results = analyze_multiple_stocks(stock_tickers)

In [None]:
# Side-by-side comparison of the per-ticker correlations.
if not multi_stock_results:
    print("No results to compare.")
else:
    # Pull the headline statistics out of each ticker's result dictionary.
    tickers, correlations, p_values = [], [], []
    for symbol, outcome in multi_stock_results.items():
        tickers.append(symbol)
        correlations.append(outcome['correlation'])
        p_values.append(outcome['p_value'])

    comparison_df = pd.DataFrame({
        'Ticker': tickers,
        'Correlation': correlations,
        'P-Value': p_values,
        'Significant': [p < 0.05 for p in p_values]
    })

    # Strongest relationship first, regardless of sign.
    comparison_df = comparison_df.sort_values(
        'Correlation', key=lambda col: col.abs(), ascending=False
    )

    print("Correlation comparison across stocks:")
    display(comparison_df)

    # Bar chart: green bars are significant at the 0.05 level, gray are not.
    fig, ax = plt.subplots(figsize=(12, 6))
    colors = ['green' if flag else 'gray' for flag in comparison_df['Significant']]

    ax.bar(comparison_df['Ticker'], comparison_df['Correlation'], color=colors)
    ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax.set_title('Correlation between News Sentiment and Stock Returns')
    ax.set_xlabel('Stock Ticker')
    ax.set_ylabel('Pearson Correlation Coefficient')
    ax.grid(axis='y', alpha=0.3)

    # Label each bar with its value, above positive bars and below negative ones.
    for position, value in enumerate(comparison_df['Correlation']):
        above = value >= 0
        ax.text(
            position,
            value + (0.01 if above else -0.03),
            f'{value:.3f}',
            ha='center',
            va='bottom' if above else 'top',
        )

    # Legend explaining the bar colors.
    from matplotlib.patches import Patch
    ax.legend(
        handles=[
            Patch(facecolor='green', label='Statistically Significant (p<0.05)'),
            Patch(facecolor='gray', label='Not Significant'),
        ],
        loc='best',
    )

    plt.show()


In [None]:
import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [None]:
# Load the analyst-ratings news file; the path is relative to the notebook's
# working directory.
# NOTE(review): this location differs from the `news_path` built earlier in the
# notebook (no "src" component) — confirm which one is correct.
news = pd.read_csv("../data/news/raw_analyst_ratings.csv")
news.head()

In [None]:
# Load Apple's price history; the path is relative to the notebook's working
# directory. No index column is specified, so the frame keeps pandas' default
# integer index.
apple=pd.read_csv("../data/finance/AAPL.csv")
apple.head()

In [None]:
# Function to calculate sentiment polarity (-1 to 1)
def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    Args:
        text: Headline text to score.

    Returns:
        The polarity reported by TextBlob, or NaN when ``text`` is not a string
        (for example the NaN pandas stores for a missing headline). TextBlob
        rejects non-string input, and NaN lets later averages skip such rows
        instead of aborting the whole column.
    """
    if not isinstance(text, str):
        return float("nan")
    return TextBlob(text).sentiment.polarity

# Score every headline and store the result in a new "Sentiment" column.
news["Sentiment"] = news["headline"].map(get_sentiment)

In [None]:
# Daily return = percentage change of the closing price from the previous row.
# The first row has no previous close, so it is dropped.
# NOTE: `apple` is rebound here, so re-running this cell drops one more row.
apple["Daily_Return"] = apple["Close"].pct_change()
apple = apple.iloc[1:]

In [None]:
# Mean sentiment for each distinct value of the news "date" column.
# NOTE(review): every headline in the file is averaged, not only Apple's, and
# the grouping key is the raw "date" value — if that column carries a time of
# day, groups are per timestamp rather than per calendar day. Confirm both.
daily_sentiment = news["Sentiment"].groupby(news["date"]).mean()

In [None]:
def _calendar_days(values):
    """Map date-like values to tz-naive midnight timestamps (NaT if unparseable)."""
    # Only the leading ten characters are parsed, which discards any time-of-day
    # or UTC-offset suffix.
    # assumes ISO-style "YYYY-MM-DD..." text — TODO confirm against the CSVs
    day_text = pd.Series(list(values), dtype="object").astype(str).str.slice(0, 10)
    return pd.DatetimeIndex(pd.to_datetime(day_text, errors="coerce"))


# `apple` still carries the default integer index from `read_csv`, while
# `daily_sentiment` is keyed by the news "date" values, so an index-on-index
# merge has no keys in common. Re-key both sides by calendar day first.
# assumes the price file has a "Date" column, like the stock frames used
# earlier in this notebook — TODO confirm
apple_by_day = apple.set_index(_calendar_days(apple["Date"]))
apple_by_day = apple_by_day[apple_by_day.index.notna()]

# Keys that fall on the same day are averaged (unweighted); unparseable keys
# become NaT and are dropped by groupby.
sentiment_by_day = daily_sentiment.groupby(_calendar_days(daily_sentiment.index)).mean()

# Keep only the days present on both sides, in chronological order.
merged_df = pd.merge(
    apple_by_day,
    sentiment_by_day,
    left_index=True,
    right_index=True,
    how="inner"
).sort_index()

print(merged_df.head())

In [None]:
# Correlation between the daily return and the sentiment on the merged rows
# (Series.corr uses Pearson by default).
correlation = merged_df["Daily_Return"].corr(merged_df["Sentiment"])

# Plot as a single bar on the full [-1, 1] correlation scale.
plt.figure(figsize=(6,4))
plt.bar("Apple", correlation, color="skyblue")
# pyplot has no `y` function; `ylim` is the call that sets the y-axis range.
plt.ylim(-1, 1)
plt.ylabel("Correlation")
plt.title("Correlation between News Sentiment & Stock Returns")
plt.axhline(0, color='black', linewidth=0.8)
plt.show()

In [None]:
# Daily return (left axis, blue) and sentiment (right axis, orange) on a shared
# x-axis.
fig, ax1 = plt.subplots(figsize=(12,6))
# Set the title before tight_layout() so the layout pass leaves room for it.
ax1.set_title("Daily Stock Returns vs News Sentiment")
ax1.set_xlabel("Date")
ax1.set_ylabel("Daily Return", color="blue")
ax1.plot(merged_df.index, merged_df["Daily_Return"], color="blue", label="Daily Return")
ax1.tick_params(axis="y", labelcolor="blue")

# Second y-axis sharing the same x-axis, because the two series use different scales.
ax2 = ax1.twinx()
ax2.set_ylabel("Sentiment", color="orange")
ax2.plot(merged_df.index, merged_df["Sentiment"], color="orange", label="Sentiment")
ax2.tick_params(axis="y", labelcolor="orange")

fig.tight_layout()
plt.show()