#  📈 Exploratory Data Analysis and Visualization
# 
# **IMPORTANT**: This notebook **LOADS DATA** that was already downloaded (see notebook 01), using the existing helper functions `load_raw_data` and `calculate_returns`.
# 
#  Objectives:
# - ✅ Statistical analysis (NEW)
# - ✅ Correlation analysis (NEW) 
# - ✅ Trend analysis (NEW)
# - ✅ Volatility analysis (NEW)

#  Load Previously Downloaded Data

In [None]:
import sys
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# The search path must be extended BEFORE the project-local import below is
# attempted; otherwise `data` can only be found if it happens to be importable
# from the current directory.
# NOTE(review): presumably `data` lives in ../src -- confirm.
sys.path.append('../src')
from data import load_raw_data, calculate_returns  # noqa: E402

warnings.filterwarnings('ignore')

# Configure plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)

# Load data that was already downloaded in notebook 01.
# One CSV per ticker is read from ../data/raw/ and kept in a dict keyed by ticker.
tickers = ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN']
stock_data = {}
for ticker in tickers:
    stock_data[ticker] = load_raw_data(f'../data/raw/{ticker}.csv')
    print(f"✅ Loaded {ticker}: {stock_data[ticker].shape[0]} records")

print(f"\n📊 Loaded {len(stock_data)} stocks for EDA analysis")

#  Statistical Summary Analysis

In [None]:
print("📊 STATISTICAL SUMMARY ANALYSIS")
print("=" * 50)


def _describe_stock(close, daily_returns):
    """Return the summary metrics for one stock as a dict keyed by metric name.

    ``close`` is the stock's 'Close' column and ``daily_returns`` is whatever
    ``calculate_returns`` produced for it.
    """
    # Drawdown = distance of each close below the running maximum so far
    running_peak = close.expanding().max()
    drawdown = (close / running_peak) - 1
    return {
        'Mean_Price': close.mean(),
        'Std_Price': close.std(),
        'Min_Price': close.min(),
        'Max_Price': close.max(),
        'Mean_Return': daily_returns.mean(),
        'Std_Return': daily_returns.std(),
        'Skewness': daily_returns.skew(),
        'Kurtosis': daily_returns.kurtosis(),
        # Mean over standard deviation, scaled by sqrt(252)
        'Sharpe_Ratio': daily_returns.mean() / daily_returns.std() * np.sqrt(252),
        'Max_Drawdown': drawdown.min(),
    }


# One metrics dict per ticker
summary_stats = {}
for ticker in tickers:
    data = stock_data[ticker]
    returns = calculate_returns(data['Close'])
    summary_stats[ticker] = _describe_stock(data['Close'], returns)

# Transpose so each ticker is a row and each metric a column
stats_df = pd.DataFrame(summary_stats).T
print("Key Statistics by Stock:")
print(stats_df.round(4))

#  Returns Distribution Analysis

In [None]:
print("\n📈 RETURNS DISTRIBUTION ANALYSIS")
print("=" * 50)

# Returns for every stock, kept both as a dict of Series and as one DataFrame
returns_data = {}
for ticker in tickers:
    returns_data[ticker] = calculate_returns(stock_data[ticker]['Close'])

returns_df = pd.DataFrame(returns_data)

# Number of histogram bins; also needed to scale the normal overlay correctly
hist_bins = 50

# 2x3 grid: one histogram per ticker, the last slot holds the combined box plot
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Returns Distribution Analysis', fontsize=16, fontweight='bold')

# Individual histograms
for i, ticker in enumerate(tickers):
    row, col = i // 3, i % 3
    ax = axes[row, col]
    ticker_returns = returns_data[ticker]

    ticker_returns.hist(bins=hist_bins, alpha=0.7, ax=ax)
    ax.set_title(f'{ticker} Returns Distribution')
    ax.set_xlabel('Daily Returns')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

    # Normal distribution overlay, expressed in the histogram's count units:
    # expected count per bin = (number of observations) * pdf(x) * bin width.
    # The bin width is the data range divided by the number of bins (NOT the
    # spacing of the 100 evaluation points), and only non-NaN observations
    # are counted because the histogram ignores NaN values.
    mu, sigma = ticker_returns.mean(), ticker_returns.std()
    lowest, highest = ticker_returns.min(), ticker_returns.max()
    x = np.linspace(lowest, highest, 100)
    bin_width = (highest - lowest) / hist_bins
    normal_curve = stats.norm.pdf(x, mu, sigma) * ticker_returns.count() * bin_width
    ax.plot(x, normal_curve, 'r-', linewidth=2, label='Normal')
    ax.legend()

# Combined box plot
axes[1, 2].boxplot([returns_data[ticker].dropna() for ticker in tickers],
                   labels=tickers)
axes[1, 2].set_title('Returns Box Plot Comparison')
axes[1, 2].set_ylabel('Daily Returns')
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Normality tests
print("\nNormality Test Results (Shapiro-Wilk):")
for ticker in tickers:
    # Size the sample from the NaN-free series: asking Series.sample for more
    # rows than the cleaned series holds raises a ValueError.
    clean_returns = returns_data[ticker].dropna()
    # At most 5000 observations are passed to the test
    sample = clean_returns.sample(min(5000, len(clean_returns)))
    stat, p_value = stats.shapiro(sample)
    normal = "✅ Normal" if p_value > 0.05 else "❌ Non-normal"
    print(f"{ticker}: p-value = {p_value:.2e} ({normal})")

#  Correlation Analysis 

In [None]:
print("\n🔗 CORRELATION ANALYSIS")
print("=" * 50)

# Wide frames with one column per ticker
price_df = pd.DataFrame(
    {ticker: stock_data[ticker]['Close'] for ticker in tickers})
volume_df = pd.DataFrame(
    {ticker: stock_data[ticker]['Volume'] for ticker in tickers})

# Pairwise correlation matrices for prices, returns and volumes
price_corr = price_df.corr()
returns_corr = returns_df.corr()
volume_corr = volume_df.corr()

# One heatmap per matrix, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Correlation Analysis', fontsize=16, fontweight='bold')

heatmap_panels = [
    (price_corr, 'Price Correlations'),
    (returns_corr, 'Returns Correlations'),
    (volume_corr, 'Volume Correlations'),
]
for panel_ax, (corr_matrix, panel_title) in zip(axes, heatmap_panels):
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                ax=panel_ax, cbar_kws={'label': 'Correlation'})
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Print highest correlations
print("Highest Returns Correlations:")
# Keep only the cells strictly above the diagonal so that each pair appears
# once and the self-correlations are excluded
above_diagonal = np.triu(np.ones(returns_corr.shape), k=1).astype(bool)
returns_corr_flat = returns_corr.where(above_diagonal)
high_corr = returns_corr_flat.stack().sort_values(ascending=False)
for (first_ticker, second_ticker), corr in high_corr.head(3).items():
    print(f"{first_ticker} - {second_ticker}: {corr:.3f}")

#  Trend Analysis

In [None]:
print("\n📊 TREND ANALYSIS")
print("=" * 50)

# Per-ticker copy of the raw frame, extended with moving averages and flags
ma_windows = (20, 50, 200)
trend_data = {}
for ticker in tickers:
    data = stock_data[ticker].copy()

    # Rolling means of the close over each window length
    for window in ma_windows:
        data[f'MA_{window}'] = data['Close'].rolling(window).mean()

    # Flags: shorter average above the longer one; 'Golden_Cross' is set on
    # days where both comparisons hold
    data['Trend_20_50'] = data['MA_20'] > data['MA_50']
    data['Trend_50_200'] = data['MA_50'] > data['MA_200']
    data['Golden_Cross'] = data['Trend_20_50'] & data['Trend_50_200']

    trend_data[ticker] = data

# 2x3 grid: one price panel per ticker, the last slot holds the bar chart
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Trend Analysis with Moving Averages',
             fontsize=16, fontweight='bold')

# axes.flat walks the grid row by row, so ticker k lands on axes[k // 3, k % 3]
for ax, ticker in zip(axes.flat, tickers):
    data = trend_data[ticker]

    ax.plot(data.index, data['Close'], label='Close', linewidth=1, alpha=0.8)
    for window in ma_windows:
        ax.plot(data.index, data[f'MA_{window}'],
                label=f'MA {window}', linewidth=1)

    ax.set_title(f'{ticker} - Price & Moving Averages')
    ax.set_ylabel('Price ($)')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Last subplot: number of days each ticker had the 'Golden_Cross' flag set
count_ax = axes[1, 2]
count_ax.clear()
golden_cross_counts = {
    ticker: trend_data[ticker]['Golden_Cross'].sum() for ticker in tickers
}

bars = count_ax.bar(golden_cross_counts.keys(), golden_cross_counts.values())
count_ax.set_title('Golden Cross Days Count')
count_ax.set_ylabel('Number of Days')
for bar, count in zip(bars, golden_cross_counts.values()):
    # Centre the label horizontally, slightly above the bar
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 5
    count_ax.text(label_x, label_y, str(count), ha='center')

plt.tight_layout()
plt.show()

# Status of each ticker on its last row
print("Current Trend Status:")
for ticker in tickers:
    latest = trend_data[ticker].iloc[-1]
    if latest['Golden_Cross']:
        trend_status = "🟢 Bullish"
    else:
        trend_status = "🔴 Bearish"
    print(
        f"{ticker}: {trend_status} (MA20: ${latest['MA_20']:.2f}, MA50: ${latest['MA_50']:.2f})")

#  Volatility Analysis

In [None]:
print("\n⚡ VOLATILITY ANALYSIS")
print("=" * 50)

# Per ticker: the return series plus three volatility measures
volatility_data = {}
for ticker in tickers:
    prices = stock_data[ticker]
    open_price, high, low, close = (
        prices[column] for column in ('Open', 'High', 'Low', 'Close'))
    returns = calculate_returns(close)

    # 30-row rolling standard deviation, scaled by sqrt(252)
    rolling_vol = returns.rolling(30).std() * np.sqrt(252)

    # 5-row rolling standard deviation (short window, unscaled)
    vol_clustering = returns.rolling(5).std()

    # Garman-Klass style estimate built from the OHLC columns
    range_term = 0.5 * np.log(high / low) ** 2
    open_close_term = (2 * np.log(2) - 1) * np.log(close / open_price) ** 2
    gk_vol = np.sqrt(range_term - open_close_term)

    volatility_data[ticker] = {
        'returns': returns,
        'rolling_vol': rolling_vol,
        'vol_clustering': vol_clustering,
        'gk_vol': gk_vol,
    }

# 2x2 grid of volatility views
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Volatility Analysis', fontsize=16, fontweight='bold')
rolling_ax, dist_ax, cluster_ax, avg_ax = axes.flat

# Top-left: rolling volatility of every ticker on one axis
for ticker in tickers:
    ticker_vol = volatility_data[ticker]['rolling_vol']
    rolling_ax.plot(ticker_vol.index, ticker_vol, label=ticker, alpha=0.7)

rolling_ax.set_title('30-Day Rolling Volatility')
rolling_ax.set_ylabel('Annualized Volatility')
rolling_ax.legend()
rolling_ax.grid(True, alpha=0.3)

# Top-right: distribution of the rolling volatility per ticker
vol_values = [volatility_data[ticker]['rolling_vol'].dropna()
              for ticker in tickers]
dist_ax.boxplot(vol_values, labels=tickers)
dist_ax.set_title('Volatility Distribution by Stock')
dist_ax.set_ylabel('Volatility')
dist_ax.grid(True, alpha=0.3)

# Bottom-left: raw TSLA returns as a volatility clustering example
tsla_returns = volatility_data['TSLA']['returns']
cluster_ax.plot(tsla_returns.index, tsla_returns, alpha=0.6)
cluster_ax.set_title('TSLA Returns (Volatility Clustering)')
cluster_ax.set_ylabel('Daily Returns')
cluster_ax.grid(True, alpha=0.3)

# Bottom-right: mean rolling volatility per ticker, with value labels
avg_vols = {ticker: volatility_data[ticker]['rolling_vol'].mean()
            for ticker in tickers}
bars = avg_ax.bar(avg_vols.keys(), avg_vols.values())
avg_ax.set_title('Average Volatility by Stock')
avg_ax.set_ylabel('Volatility')
for bar, vol in zip(bars, avg_vols.values()):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.005
    avg_ax.text(label_x, label_y, f'{vol:.3f}', ha='center')

plt.tight_layout()
plt.show()

# Tickers ordered from highest to lowest mean rolling volatility
print("Volatility Rankings (30-day average):")
vol_ranking = sorted(avg_vols.items(), key=lambda entry: entry[1],
                     reverse=True)
for i, (ticker, vol) in enumerate(vol_ranking, 1):
    print(f"{i}. {ticker}: {vol:.3f}")

#  Market Regime Analysis 

In [None]:
print("\n🔄 MARKET REGIME ANALYSIS")
print("=" * 50)

# Simple regime identification: the rolling volatility of an equal-weighted
# basket of the loaded stocks stands in for a market stress indicator.

# Row-wise mean of the per-ticker returns = equal-weighted "market" return
market_returns = returns_df.mean(axis=1)
# 30-row rolling standard deviation, scaled by sqrt(252)
market_vol = market_returns.rolling(30).std() * np.sqrt(252)

# Regime thresholds: lower and upper quartile of the market volatility
vol_q25, vol_q75 = market_vol.quantile([0.25, 0.75])


def get_regime(vol):
    """Classify a single market-volatility value into a regime label.

    The thresholds are the module-level quartiles ``vol_q25`` and ``vol_q75``.

    Args:
        vol: One volatility value (an element of ``market_vol``).

    Returns:
        'Low Vol' when ``vol`` is at or below ``vol_q25``, 'High Vol' when it
        is at or above ``vol_q75``, 'Normal' in between, and NaN when ``vol``
        itself is missing.
    """
    # A missing value (e.g. the rolling-window warm-up) fails both comparisons
    # below and would otherwise be counted as 'Normal'; propagate it instead so
    # it is excluded from regime counts and averages.
    if pd.isna(vol):
        return np.nan
    if vol <= vol_q25:
        return 'Low Vol'
    elif vol >= vol_q75:
        return 'High Vol'
    else:
        return 'Normal'


# Label every market-volatility value with its regime
market_regime = market_vol.apply(get_regime)

# Fixed display order of the regimes and the colour used for each
regime_order = ['Low Vol', 'Normal', 'High Vol']
colors = {'Low Vol': 'green', 'Normal': 'blue', 'High Vol': 'red'}

# 2x2 grid of regime views
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Market Regime Analysis', fontsize=16, fontweight='bold')
scatter_ax, pie_ax, box_ax, bar_ax = axes.flat

# Top-left: market volatility, one scatter series per regime
for regime in regime_order:
    in_regime = market_regime == regime
    scatter_ax.scatter(market_vol[in_regime].index, market_vol[in_regime],
                       c=colors[regime], label=regime, alpha=0.6, s=10)

scatter_ax.set_title('Market Volatility Regimes')
scatter_ax.set_ylabel('Market Volatility')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

# Top-right: share of rows spent in each regime
regime_counts = market_regime.value_counts()
pie_ax.pie(regime_counts.values, labels=regime_counts.index, autopct='%1.1f%%',
           colors=[colors[regime] for regime in regime_counts.index])
pie_ax.set_title('Regime Distribution')

# Bottom-left: market returns split by regime
regime_returns = {
    regime: market_returns[market_regime == regime] for regime in regime_order
}

box_ax.boxplot([regime_returns[regime].dropna() for regime in regime_order],
               labels=regime_order)
box_ax.set_title('Returns by Market Regime')
box_ax.set_ylabel('Daily Returns')
box_ax.grid(True, alpha=0.3)

# Bottom-right: mean return of each stock within each regime
regime_performance = {}
for ticker in tickers:
    # Align the ticker's returns to the regime index once per ticker
    aligned_returns = returns_data[ticker].reindex(market_regime.index)
    regime_performance[ticker] = {
        regime: aligned_returns[market_regime == regime].mean()
        for regime in regime_order
    }

perf_df = pd.DataFrame(regime_performance).T
perf_df.plot(kind='bar', ax=bar_ax)
bar_ax.set_title('Average Stock Returns by Regime')
bar_ax.set_ylabel('Average Daily Return')
bar_ax.legend(title='Regime')
bar_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("Average Returns by Market Regime:")
for regime in regime_order:
    avg_return = market_returns[market_regime == regime].mean()
    print(f"{regime}: {avg_return:.4f}")

#  Summary and Key Insights

In [None]:
print("\n🎯 KEY INSIGHTS SUMMARY")
print("=" * 60)

# 1. Extremes of the per-stock summary table
print("1. STATISTICAL INSIGHTS:")
return_std = stats_df['Std_Return']
most_volatile = return_std.idxmax()
least_volatile = return_std.idxmin()
best_sharpe = stats_df['Sharpe_Ratio'].idxmax()
print(f"   • Most volatile: {most_volatile}")
print(f"   • Least volatile: {least_volatile}")
print(f"   • Best Sharpe ratio: {best_sharpe}")

# 2. Most and least correlated ticker pairs (upper triangle of returns_corr)
print("\n2. CORRELATION INSIGHTS:")
pairwise_corr = returns_corr_flat.stack()
highest_corr_pair = pairwise_corr.idxmax()
lowest_corr_pair = pairwise_corr.idxmin()
print(
    f"   • Highest correlation: {highest_corr_pair[0]}-{highest_corr_pair[1]}")
print(f"   • Lowest correlation: {lowest_corr_pair[0]}-{lowest_corr_pair[1]}")

# 3. Split the tickers by the 'Golden_Cross' flag on their last row
print("\n3. TREND INSIGHTS:")
bullish_stocks = []
bearish_stocks = []
for ticker in tickers:
    if trend_data[ticker].iloc[-1]['Golden_Cross']:
        bullish_stocks.append(ticker)
    else:
        bearish_stocks.append(ticker)
print(f"   • Currently bullish: {bullish_stocks}")
print(f"   • Currently bearish: {bearish_stocks}")

# 4. First and last entries of the volatility ranking (sorted descending)
print("\n4. VOLATILITY INSIGHTS:")
top_ticker, top_vol = vol_ranking[0]
bottom_ticker, bottom_vol = vol_ranking[-1]
print(
    f"   • Highest average volatility: {top_ticker} ({top_vol:.3f})")
print(
    f"   • Lowest average volatility: {bottom_ticker} ({bottom_vol:.3f})")

# 5. Regime of the last row and the number of rows labelled 'High Vol'
print("\n5. REGIME INSIGHTS:")
current_regime = market_regime.iloc[-1]
regime_days = len(market_regime[market_regime == current_regime])
high_vol_days = len(market_regime[market_regime == 'High Vol'])
print(f"   • Current market regime: {current_regime}")
print(
    f"   • Days in high volatility regime: {high_vol_days}")

print("\n📊 EDA Analysis Complete!")
print("=" * 60)