# Correlation between news and stock movement
## ================================================================

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

from textblob import TextBlob
import nltk
# Make sure the NLTK resources used below are available, downloading each one
# only when it is missing from the local nltk_data directories.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    # The VADER lexicon is installed under "sentiment/" as a zip archive.
    # Probing "corpora/vader_lexicon" never matches, which forced a fresh
    # download on every run.
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer

from scipy.stats import pearsonr, spearmanr
from scipy import stats

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


In [None]:

class NewsStockCorrelationAnalyzer:
    def __init__(self):
        """Create an analyzer with empty state for every pipeline stage.

        Attributes:
            stock_data: dict of symbol -> price DataFrame, filled by
                ``load_stock_data``.
            news_data: news DataFrame, or ``None`` until ``load_news_data``
                has read a file.
            daily_sentiment: per-date/per-stock sentiment DataFrame, or
                ``None`` until ``aggregate_daily_sentiment`` succeeds.
            merged_data: dict of symbol -> merged price/sentiment DataFrame,
                filled by ``merge_data_for_correlation``.
            sia: VADER ``SentimentIntensityAnalyzer`` used to score headlines.
            correlation_results: dict of symbol -> correlation results,
                filled by ``calculate_correlations``.
        """
        self.stock_data = {}
        self.news_data = None
        # Declared here so the attribute always exists, even when the
        # aggregation step was skipped or failed.
        self.daily_sentiment = None
        self.merged_data = {}
        self.sia = SentimentIntensityAnalyzer()
        self.correlation_results = {}
    
    def load_stock_data(self, file_paths):

        
        for symbol, file_path in file_paths.items():
            try:
                df = pd.read_csv(file_path)
                df['Date'] = pd.to_datetime(df['Date'])
                df = df.sort_values('Date')
                
                # Calculate daily returns (percentage change)
                df['Daily_Return'] = df['Close'].pct_change() * 100
                df['Price_Change'] = df['Close'].diff()
                df['Volatility'] = df['Daily_Return'].rolling(window=5).std()
                
                # Remove NaN values
                df = df.dropna()
                
                self.stock_data[symbol] = df
                print(f"✓ {symbol}: {len(df)} records from {df['Date'].min().strftime('%Y-%m-%d')} to {df['Date'].max().strftime('%Y-%m-%d')}")
            except Exception as e:
                print(f"✗ Error loading {symbol}: {str(e)}")
    
    def load_news_data(self, file_path):
        """Load news data from CSV file"""
        print("\nLoading News Data...")
        print("=" * 50)

        try:
            self.news_data = pd.read_csv(file_path)

            # Debug: Check the date column format
            print("Debug - Date column info:")
            if 'date' in self.news_data.columns:
                print(f"Date column dtype: {self.news_data['date'].dtype}")
                print(f"Sample dates: {self.news_data['date'].head()}")

                # Try to convert dates immediately with error handling
                try:
                    self.news_data['date'] = pd.to_datetime(self.news_data['date'], errors='coerce')
                    print("✓ Date conversion successful")
                except Exception as e:
                    print(f"⚠️  Date conversion issue: {str(e)}")

            # Rest of your existing code...
            self.news_data['Date'] = pd.to_datetime(self.news_data['date']).dt.date
            self.news_data['Date'] = pd.to_datetime(self.news_data['Date'])

            print(f" Loaded {len(self.news_data)} news articles")
            print(f"  Date range: {self.news_data['Date'].min().strftime('%Y-%m-%d')} to {self.news_data['Date'].max().strftime('%Y-%m-%d')}")
            print(f"  Unique stocks: {self.news_data['stock'].nunique()}")
            print(f"  Stock symbols: {list(self.news_data['stock'].unique())}")

        except Exception as e:
            print(f"Error loading news data: {str(e)}")
    
    def perform_sentiment_analysis(self):
        """Perform sentiment analysis on news headlines"""
        print("\nPerforming Sentiment Analysis...")
        print("=" * 50)
        
        if self.news_data is None:
            print("No news data loaded!")
            return
        
        # Initialize sentiment columns
        self.news_data['textblob_polarity'] = 0.0
        self.news_data['textblob_subjectivity'] = 0.0
        self.news_data['vader_compound'] = 0.0
        self.news_data['vader_positive'] = 0.0
        self.news_data['vader_negative'] = 0.0
        self.news_data['vader_neutral'] = 0.0
        
        for idx, row in self.news_data.iterrows():
            headline = str(row['headline'])
            
            # TextBlob sentiment
            blob = TextBlob(headline)
            self.news_data.at[idx, 'textblob_polarity'] = blob.sentiment.polarity
            self.news_data.at[idx, 'textblob_subjectivity'] = blob.sentiment.subjectivity
            
            # VADER sentiment
            vader_scores = self.sia.polarity_scores(headline)
            self.news_data.at[idx, 'vader_compound'] = vader_scores['compound']
            self.news_data.at[idx, 'vader_positive'] = vader_scores['pos']
            self.news_data.at[idx, 'vader_negative'] = vader_scores['neg']
            self.news_data.at[idx, 'vader_neutral'] = vader_scores['neu']
        
        # Create sentiment categories
        self.news_data['sentiment_category'] = self.news_data['textblob_polarity'].apply(
            lambda x: 'Positive' if x > 0.1 else ('Negative' if x < -0.1 else 'Neutral')
        )
        
        print(f"Sentiment analysis completed for {len(self.news_data)} headlines")
        print("\nSentiment Distribution:")
        print(self.news_data['sentiment_category'].value_counts())
        
        # Show sample results
        print("\nSample Sentiment Analysis Results:")
        sample_df = self.news_data[['headline', 'textblob_polarity', 'vader_compound', 'sentiment_category']].head()
        for _, row in sample_df.iterrows():
            print(f"Headline: {row['headline'][:60]}...")
            print(f"  TextBlob: {row['textblob_polarity']:.3f}, VADER: {row['vader_compound']:.3f}, Category: {row['sentiment_category']}")
            print()
    
    def aggregate_daily_sentiment(self):
        """Aggregate sentiment scores by date and stock"""
        print("Aggregating Daily Sentiment Scores...")
        print("=" * 50)

        # Check if news data exists and has sentiment analysis completed
        if self.news_data is None:
            print("✗ Error: No news data loaded. Please load news data first.")
            return None

        # Debug: Print column names and first few rows
        print("Debug Info:")
        print(f"Available columns: {list(self.news_data.columns)}")
        print(f"Data shape: {self.news_data.shape}")
        print("\nFirst few rows of date column:")
        if 'date' in self.news_data.columns:
            print(self.news_data['date'].head())
            print(f"Date column dtype: {self.news_data['date'].dtype}")
            print(f"Sample date values: {self.news_data['date'].iloc[:5].tolist()}")

        # Check if Date column exists, if not try to create it
        if 'Date' not in self.news_data.columns:
            print("⚠️  'Date' column not found. Attempting to create it...")

            if 'date' in self.news_data.columns:
                # First, let's examine the date format
                print("Analyzing date format...")
                sample_dates = self.news_data['date'].dropna().head()
                print(f"Sample dates: {sample_dates.tolist()}")

                try:
                    # Try different approaches to convert dates
                    print("Attempting to convert dates...")

                    # Method 1: Direct conversion
                    try:
                        self.news_data['Date'] = pd.to_datetime(self.news_data['date'], errors='coerce')
                        print("✓ Method 1: Direct conversion successful")
                    except:
                        print("✗ Method 1: Direct conversion failed")

                        # Method 2: Try with specific format inference
                        try:
                            self.news_data['Date'] = pd.to_datetime(self.news_data['date'], infer_datetime_format=True, errors='coerce')
                            print("✓ Method 2: Format inference successful")
                        except:
                            print("✗ Method 2: Format inference failed")

                            # Method 3: Try common date formats
                            date_formats = ['%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%Y/%m/%d', '%m-%d-%Y', '%d-%m-%Y']
                            success = False

                            for fmt in date_formats:
                                try:
                                    self.news_data['Date'] = pd.to_datetime(self.news_data['date'], format=fmt, errors='coerce')
                                    print(f"✓ Method 3: Format {fmt} successful")
                                    success = True
                                    break
                                except:
                                    continue
                                
                            if not success:
                                print("✗ All conversion methods failed")
                                return None

                    # Check for NaT values after conversion
                    nat_count = self.news_data['Date'].isna().sum()
                    if nat_count > 0:
                        print(f"⚠️  Warning: {nat_count} dates could not be parsed and were set to NaT")
                        print("Dropping rows with invalid dates...")
                        self.news_data = self.news_data.dropna(subset=['Date'])
                        print(f"✓ Remaining rows after dropping invalid dates: {len(self.news_data)}")

                    print("✓ Date column created successfully")
                    print(f"Date range: {self.news_data['Date'].min()} to {self.news_data['Date'].max()}")

                except Exception as e:
                    print(f"✗ Error converting dates: {str(e)}")
                    print("Please check your date format. Expected formats include:")
                    print("- YYYY-MM-DD (e.g., 2023-01-15)")
                    print("- MM/DD/YYYY (e.g., 01/15/2023)")
                    print("- DD/MM/YYYY (e.g., 15/01/2023)")
                    return None
            else:
                print("✗ Error: No 'date' or 'Date' column found in news data")
                print(f"Available columns: {list(self.news_data.columns)}")
                return None

        # Verify Date column is datetime
        if not pd.api.types.is_datetime64_any_dtype(self.news_data['Date']):
            print("⚠️  Converting 'Date' column to datetime...")
            try:
                self.news_data['Date'] = pd.to_datetime(self.news_data['Date'], errors='coerce')
                print("✓ 'Date' column converted to datetime")
            except Exception as e:
                print(f"✗ Error converting Date column: {str(e)}")
                return None

        # Check if sentiment analysis has been performed
        required_columns = ['textblob_polarity', 'vader_compound', 'vader_positive', 'vader_negative', 'vader_neutral']
        missing_columns = [col for col in required_columns if col not in self.news_data.columns]

        if missing_columns:
            print(f"✗ Error: Sentiment analysis not completed. Missing columns: {missing_columns}")
            print("Please run perform_sentiment_analysis() first.")
            return None

        # Check if stock column exists
        if 'stock' not in self.news_data.columns:
            print("✗ Error: 'stock' column not found in news data")
            print(f"Available columns: {list(self.news_data.columns)}")
            return None

        print(f"✓ All required columns found. Proceeding with aggregation...")
        print(f"Date range: {self.news_data['Date'].min()} to {self.news_data['Date'].max()}")
        print(f"Unique stocks: {self.news_data['stock'].unique()}")

        try:
            # Group by date and stock, calculate daily averages
            daily_sentiment = self.news_data.groupby(['Date', 'stock']).agg({
                'textblob_polarity': ['mean', 'std', 'count'],
                'vader_compound': ['mean', 'std'],
                'vader_positive': 'mean',
                'vader_negative': 'mean',
                'vader_neutral': 'mean'
            }).reset_index()

            # Flatten column names
            daily_sentiment.columns = [
                'Date', 'stock', 'avg_textblob_polarity', 'std_textblob_polarity', 'news_count',
                'avg_vader_compound', 'std_vader_compound', 'avg_vader_positive', 
                'avg_vader_negative', 'avg_vader_neutral'
            ]

            # Fill NaN standard deviations with 0 (when only one news item per day)
            daily_sentiment['std_textblob_polarity'] = daily_sentiment['std_textblob_polarity'].fillna(0)
            daily_sentiment['std_vader_compound'] = daily_sentiment['std_vader_compound'].fillna(0)

            self.daily_sentiment = daily_sentiment

            print(f"✓ Daily sentiment aggregated for {len(daily_sentiment)} stock-date combinations")
            print(f"  Date range: {daily_sentiment['Date'].min().strftime('%Y-%m-%d')} to {daily_sentiment['Date'].max().strftime('%Y-%m-%d')}")

            return daily_sentiment

        except Exception as e:
            print(f"✗ Error during aggregation: {str(e)}")
            print("Debugging info:")
            print(f"news_data shape: {self.news_data.shape}")
            print(f"news_data columns: {list(self.news_data.columns)}")
            print(f"Date column dtype: {self.news_data['Date'].dtype}")
            print(f"Stock column dtype: {self.news_data['stock'].dtype}")
            return None

        
    def merge_data_for_correlation(self):
        """Merge stock and sentiment data for correlation analysis"""
        print("\nMerging Stock and Sentiment Data...")
        print("=" * 50)
        
        # Map stock symbols
        stock_symbol_map = {
            'AAPL': 'AAPL', 
            'AMZN': 'AMZN', 
            'GOOG': 'GOOG', 
            'MET': 'META', 
            'MSF': 'MSFT', 
            'NVDA': 'NVDA', 
            'TSLA': 'TSLA', 
        }
        
        merged_results = {}
        
        for news_symbol, stock_symbol in stock_symbol_map.items():
            if stock_symbol in self.stock_data:
                # Filter sentiment data for this stock
                stock_sentiment = self.daily_sentiment[self.daily_sentiment['stock'] == news_symbol].copy()
                
                # Get stock data
                stock_df = self.stock_data[stock_symbol].copy()
                
                # Merge on date
                merged = pd.merge(
                    stock_df[['Date', 'Close', 'Daily_Return', 'Volume', 'Volatility']], 
                    stock_sentiment, 
                    on='Date', 
                    how='inner'
                )
                
                if len(merged) > 0:
                    merged_results[stock_symbol] = merged
                    print(f" {stock_symbol}: {len(merged)} matching records")
                else:
                    print(f" {stock_symbol}: No matching dates found")
            else:
                print(f" Stock symbol {stock_symbol} not found in stock data")
        
        self.merged_data = merged_results
        return merged_results
    
    def calculate_correlations(self):
        """Calculate correlations between sentiment and stock returns"""
        print("\nCalculating Correlations...")
        print("=" * 50)
        
        correlation_results = {}
        
        for symbol, data in self.merged_data.items():
            print(f"\n{symbol} Analysis:")
            print("-" * 30)
            
            results = {}
            
            # Pearson correlations
            metrics = ['Daily_Return', 'Volume', 'Volatility']
            sentiment_metrics = ['avg_textblob_polarity', 'avg_vader_compound']
            
            for metric in metrics:
                for sentiment in sentiment_metrics:
                    try:
                        corr_coef, p_value = pearsonr(data[sentiment], data[metric])
                        results[f'{sentiment}_vs_{metric}'] = {
                            'correlation': corr_coef,
                            'p_value': p_value,
                            'significant': p_value < 0.05
                        }
                        
                        significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else ""
                        print(f"{sentiment} vs {metric}: r={corr_coef:.4f}, p={p_value:.4f} {significance}")
                        
                    except Exception as e:
                        print(f"Error calculating correlation for {sentiment} vs {metric}: {str(e)}")
            
            # Summary statistics
            results['summary'] = {
                'total_observations': len(data),
                'date_range': f"{data['Date'].min().strftime('%Y-%m-%d')} to {data['Date'].max().strftime('%Y-%m-%d')}",
                'avg_daily_return': data['Daily_Return'].mean(),
                'std_daily_return': data['Daily_Return'].std(),
                'avg_sentiment_textblob': data['avg_textblob_polarity'].mean(),
                'avg_sentiment_vader': data['avg_vader_compound'].mean()
            }
            
            correlation_results[symbol] = results
        
        self.correlation_results = correlation_results
        return correlation_results
    
    def create_visualizations(self):
        """Show sentiment-versus-market charts for every merged stock.

        Two figures are displayed per entry of ``self.merged_data``:

        * a 2x2 grid with daily return and TextBlob sentiment over time,
          scatter plots of daily return against the TextBlob and VADER
          averages (coloured by news count, with a linear trend line), and
          a histogram of the per-headline scores from ``self.news_data``;
        * a heatmap of the correlation matrix of ``Daily_Return``, the two
          sentiment averages, ``Volume`` and ``news_count``.
        """
        print("\nCreating Visualizations...")
        print("=" * 50)

        # Set up the plotting style
        plt.style.use('default')
        sns.set_palette("husl")

        for symbol, data in self.merged_data.items():
            fig, axes = plt.subplots(2, 2, figsize=(15, 12))
            fig.suptitle(f'{symbol}: News Sentiment vs Stock Movement Analysis', fontsize=16, fontweight='bold')

            # 1. Time series plot
            ax1 = axes[0, 0]
            ax1_twin = ax1.twinx()

            return_lines = ax1.plot(data['Date'], data['Daily_Return'], color='blue', alpha=0.7, label='Daily Return (%)')
            sentiment_lines = ax1_twin.plot(data['Date'], data['avg_textblob_polarity'], color='red', alpha=0.7, label='Sentiment Score')

            ax1.set_ylabel('Daily Return (%)', color='blue')
            ax1_twin.set_ylabel('Sentiment Score', color='red')
            ax1.set_title('Daily Returns vs Sentiment Over Time')
            ax1.tick_params(axis='x', rotation=45)
            ax1.grid(True, alpha=0.3)

            # The two series live on different axes, so their handles are
            # merged into one legend; the labels were otherwise never shown.
            handles = return_lines + sentiment_lines
            ax1.legend(handles, [handle.get_label() for handle in handles], loc='upper left')

            # 2. Scatter plot - TextBlob
            self._plot_sentiment_scatter(
                axes[0, 1], data, 'avg_textblob_polarity',
                'Average TextBlob Sentiment', 'Sentiment vs Daily Returns (TextBlob)'
            )

            # 3. Scatter plot - VADER
            self._plot_sentiment_scatter(
                axes[1, 0], data, 'avg_vader_compound',
                'Average VADER Sentiment', 'Sentiment vs Daily Returns (VADER)'
            )

            # 4. Sentiment distribution of the individual headlines
            ax4 = axes[1, 1]
            sentiment_data = self.news_data[self.news_data['stock'] == data['stock'].iloc[0]]
            ax4.hist([sentiment_data['textblob_polarity'], sentiment_data['vader_compound']],
                     bins=20, alpha=0.7, label=['TextBlob', 'VADER'])
            ax4.set_xlabel('Sentiment Score')
            ax4.set_ylabel('Frequency')
            ax4.set_title('Sentiment Score Distribution')
            ax4.legend()

            plt.tight_layout()
            plt.show()

            # Create correlation heatmap
            fig, ax = plt.subplots(1, 1, figsize=(10, 8))

            corr_data = data[['Daily_Return', 'avg_textblob_polarity', 'avg_vader_compound',
                              'Volume', 'news_count']].corr()

            sns.heatmap(corr_data, annot=True, cmap='RdBu_r', center=0,
                        square=True, ax=ax, cbar_kws={'label': 'Correlation Coefficient'})
            ax.set_title(f'{symbol}: Correlation Matrix', fontsize=14, fontweight='bold')

            plt.tight_layout()
            plt.show()

    @staticmethod
    def _plot_sentiment_scatter(ax, data, sentiment_column, x_label, title):
        """Scatter daily returns against one sentiment column, with a trend line."""
        points = ax.scatter(data[sentiment_column], data['Daily_Return'],
                            alpha=0.6, c=data['news_count'], cmap='viridis')
        ax.set_xlabel(x_label)
        ax.set_ylabel('Daily Return (%)')
        ax.set_title(title)

        # A fitted line is only meaningful with two or more distinct x values
        if data[sentiment_column].nunique() >= 2:
            coefficients = np.polyfit(data[sentiment_column], data['Daily_Return'], 1)
            trend = np.poly1d(coefficients)
            ax.plot(data[sentiment_column], trend(data[sentiment_column]), "r--", alpha=0.8)

        plt.colorbar(points, ax=ax, label='News Count')
    
    def generate_report(self):
        """Generate a comprehensive analysis report"""
        print("\n" + "="*70)
        print("NEWS SENTIMENT AND STOCK MOVEMENT CORRELATION ANALYSIS REPORT")
        print("="*70)
        
        print(f"\nAnalysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Stocks Analyzed: {list(self.correlation_results.keys())}")
        
        for symbol, results in self.correlation_results.items():
            print(f"\n{symbol} - DETAILED ANALYSIS")
            print("-" * 50)
            
            summary = results['summary']
            print(f"Period: {summary['date_range']}")
            print(f"Total Observations: {summary['total_observations']}")
            print(f"Average Daily Return: {summary['avg_daily_return']:.4f}%")
            print(f"Daily Return Volatility: {summary['std_daily_return']:.4f}%")
            print(f"Average TextBlob Sentiment: {summary['avg_sentiment_textblob']:.4f}")
            print(f"Average VADER Sentiment: {summary['avg_sentiment_vader']:.4f}")
            
            print(f"\nCORRELATION ANALYSIS:")
            print("Key Findings:")
            
            # Find strongest correlations
            strongest_corr = 0
            strongest_pair = ""
            
            for key, value in results.items():
                if key != 'summary' and abs(value['correlation']) > abs(strongest_corr):
                    strongest_corr = value['correlation']
                    strongest_pair = key
            
            print(f"• Strongest correlation: {strongest_pair}")
            print(f"  Correlation coefficient: {strongest_corr:.4f}")
        
        print(f"\nKEY INSIGHTS:")
        print("• This analysis examines the relationship between news sentiment and stock price movements")
        print("• Correlation coefficients range from -1 to +1")
        print("• Values closer to +1 indicate positive correlation (positive news → positive returns)")
        print("• Values closer to -1 indicate negative correlation (positive news → negative returns)")
        print("• P-values < 0.05 indicate statistically significant relationships")
        
        print(f"\nRECOMMendations FOR FURTHER ANALYSIS:")
        print("• Increase sample size with more news data")
        print("• Analyze lagged correlations (news impact on next-day returns)")
        print("• Consider market conditions and external factors")
        print("• Implement more sophisticated sentiment analysis models")
        print("• Analyze sector-specific patterns")

# Main execution function
def run_analysis(stock_files=None, news_file=None):
    """Run the whole news-sentiment/stock-movement pipeline.

    Args:
        stock_files: Optional mapping of stock symbol to price CSV path.
            Defaults to the seven yfinance files under
            ``../data/yfinance_data``.
        news_file: Optional path of the news CSV. Defaults to
            ``../data/raw_analyst_ratings.csv/raw_analyst_ratings.csv``.

    Returns:
        The ``NewsStockCorrelationAnalyzer`` that was used. When a stage
        cannot produce data, the remaining stages are skipped and the
        analyzer is returned as far as it got.
    """
    if stock_files is None:
        stock_files = {
            'AAPL': '../data/yfinance_data/AAPL_historical_data.csv',
            'AMZN': '../data/yfinance_data/AMZN_historical_data.csv',
            'GOOG': '../data/yfinance_data/GOOG_historical_data.csv',
            'META': '../data/yfinance_data/META_historical_data.csv',
            'MSFT': '../data/yfinance_data/MSFT_historical_data.csv',
            'NVDA': '../data/yfinance_data/NVDA_historical_data.csv',
            'TSLA': '../data/yfinance_data/TSLA_historical_data.csv'
        }
    if news_file is None:
        news_file = '../data/raw_analyst_ratings.csv/raw_analyst_ratings.csv'

    analyzer = NewsStockCorrelationAnalyzer()

    # Load data
    analyzer.load_stock_data(stock_files)
    analyzer.load_news_data(news_file)

    # Each later stage needs the output of the one before it, so stop at
    # the first stage that yields nothing rather than failing further on.
    if not analyzer.stock_data or analyzer.news_data is None:
        print("\nAnalysis stopped: stock data and news data are both required.")
        return analyzer

    # Perform analysis
    analyzer.perform_sentiment_analysis()
    if analyzer.aggregate_daily_sentiment() is None:
        print("\nAnalysis stopped: daily sentiment could not be aggregated.")
        return analyzer
    if not analyzer.merge_data_for_correlation():
        print("\nAnalysis stopped: no overlapping stock and news dates were found.")
        return analyzer
    analyzer.calculate_correlations()

    # Create visualizations
    analyzer.create_visualizations()

    # Generate report
    analyzer.generate_report()

    return analyzer

# Run the analysis
if __name__ == "__main__":
    print("Starting Task 3: News Sentiment and Stock Movement Correlation Analysis")
    print("="*70)

    # Run the complete analysis
    analyzer = run_analysis()

    print("\n" + "="*70)
    # Only claim success when at least one stock produced correlation results
    if analyzer.correlation_results:
        print("ANALYSIS COMPLETED SUCCESSFULLY!")
    else:
        print("ANALYSIS FINISHED WITHOUT RESULTS - see the messages above")
    print("="*70)