# Exploratory Data Analysis
## Statistical Arbitrage Research Project

This notebook performs the initial exploratory analysis, including:
- Data loading and quality checks
- Price series visualization
- Return distribution analysis
- Correlation analysis
- Stationarity testing
- Train/test split

In [None]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from data_pipeline import DataPipeline
from stationarity_tests import StationarityTester
from utils import load_config

# Set plotting style
# Seaborn 'whitegrid' theme plus a large default figure size; cells that pass
# an explicit figsize to plt.figure / plt.subplots override this default.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# autoreload mode 2: re-import modules before each cell runs, so edits to the
# project modules imported from ../src are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## 1. Load Configuration and Data

In [None]:
# Read the strategy configuration and create the pipeline object that the
# later cells keep reusing.
config = load_config('../config/strategy_config.yaml')
pipeline = DataPipeline()

# The universe and the date window all live under the 'data' section.
data_cfg = config['data']
tickers = data_cfg['universe']
start_date = data_cfg['start_date']
end_date = data_cfg['end_date']

# Fetch the price history for the configured universe.
print(f"Downloading data for {len(tickers)} tickers...")
prices = pipeline.download_data(tickers, start_date, end_date)

# Quick sanity report: dimensions first, then the first/last index labels.
print(f"\nData shape: {prices.shape}")
first_label = prices.index[0]
last_label = prices.index[-1]
print(f"Date range: {first_label} to {last_label}")
prices.head()

## 2. Data Quality Checks

In [None]:
# Boolean mask of missing entries, computed once and reused for both the
# per-ticker counts and the heatmap.
nan_mask = prices.isnull()

# Number of missing observations in each column.
missing_data = nan_mask.sum()
print("Missing data by ticker:")
print(missing_data)

# Heatmap of the mask: one column per ticker, one row per index entry
# (row tick labels are suppressed, colour bar hidden).
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(nan_mask, cbar=False, yticklabels=False, ax=ax)
ax.set_title('Missing Data Pattern')
ax.set_xlabel('Ticker')
ax.set_ylabel('Date')
fig.tight_layout()
fig.savefig('../reports/figures/missing_data_heatmap.png', dpi=300)
plt.show()

In [None]:
# Preprocess data
# preprocess() takes no arguments, so it presumably operates on the data the
# pipeline stored during download_data() in the earlier cell -- TODO confirm
# against DataPipeline. This cell must therefore run after the download cell.
processed_prices = pipeline.preprocess()

# Get summary statistics
# NOTE(review): also argument-free; presumably summarises the pipeline's
# internally held (processed) data -- verify in DataPipeline.
summary_stats = pipeline.get_summary_statistics()
print("\nSummary Statistics:")
# Bare last expression so the notebook renders the object with its rich repr.
summary_stats

## 3. Price Series Visualization

In [None]:
# Rebase every series so that its first observation equals 100, which makes
# tickers with very different price levels comparable on one axis.
base_row = processed_prices.iloc[0]
normalized_prices = processed_prices / base_row * 100

# One line per ticker on a shared axis.
fig, ax = plt.subplots(figsize=(14, 8))
for ticker, series in normalized_prices.items():
    ax.plot(series.index, series, label=ticker, linewidth=2)

ax.set_title('Normalized Price Series', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Normalized Price (Base=100)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='best', fontsize=10)

fig.tight_layout()
fig.savefig('../reports/figures/normalized_prices.png', dpi=300)
plt.show()

## 4. Return Analysis

In [None]:
# Compute simple period-over-period returns. dropna() removes every row that
# contains a NaN in any column, which always includes the first row produced
# by pct_change().
returns = processed_prices.pct_change().dropna()

# Plot return distributions
# Size the subplot grid from the number of series instead of assuming exactly
# eight tickers: up to four panels per row, as many rows as needed. With eight
# tickers this reproduces the original 2x4 grid and (16, 8) figure size.
n_series = returns.shape[1]
n_cols = min(4, max(n_series, 1))
n_rows = max(1, -(-n_series // n_cols))  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False
)
# squeeze=False guarantees a 2-D array even for a single panel, so flatten()
# always yields a 1-D sequence of Axes.
axes = axes.flatten()

for ax, col in zip(axes, returns.columns):
    ax.hist(returns[col], bins=50, alpha=0.7, edgecolor='black')
    ax.set_title(f'{col} Returns', fontsize=10)
    ax.set_xlabel('Return', fontsize=9)
    ax.set_ylabel('Frequency', fontsize=9)
    # Reference line at zero return.
    ax.axvline(0, color='red', linestyle='--', linewidth=1)
    ax.grid(True, alpha=0.3)

# Hide any leftover panels when the grid has more slots than series.
for unused_ax in axes[n_series:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('../reports/figures/return_distributions.png', dpi=300)
plt.show()

In [None]:
# Per-period moments, computed once and reused below.
period_mean = returns.mean()
period_std = returns.std()

# Mean, volatility and Sharpe are scaled with a factor of 252 (sqrt(252) for
# the latter two); skewness and kurtosis are reported unscaled. The Sharpe
# ratio here is mean/std with no risk-free rate subtracted.
annualization_root = np.sqrt(252)
return_stats = pd.DataFrame({
    'Mean': period_mean * 252,
    'Volatility': period_std * annualization_root,
    'Sharpe': (period_mean / period_std) * annualization_root,
    'Skewness': returns.skew(),
    'Kurtosis': returns.kurtosis()
})

print("\nAnnualized Return Statistics:")
return_stats

## 5. Correlation Analysis

In [None]:
# Pairwise correlation of the processed price levels.
# NOTE(review): this is computed on prices, not on `returns`; correlation of
# trending price levels can be inflated -- confirm this is intended.
corr_matrix = processed_prices.corr()

# Annotated heatmap, diverging palette centred on zero.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1, ax=ax)
ax.set_title('Price Correlation Matrix', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig('../reports/figures/correlation_matrix.png', dpi=300)
plt.show()

# List every unordered pair whose correlation exceeds 0.8. Walking the strict
# upper triangle (k=1) visits each pair once and skips the diagonal, in the
# same row-by-row order as a nested i < j loop.
print("\nHighly Correlated Pairs (>0.8):")
labels = corr_matrix.columns
for left, right in zip(*np.triu_indices(len(labels), k=1)):
    rho = corr_matrix.iloc[left, right]
    if rho > 0.8:
        print(f"{labels[left]} - {labels[right]}: {rho:.3f}")

## 6. Stationarity Testing

In [None]:
# Tester configured at the 5% significance level; the next cell reuses it.
tester = StationarityTester(significance_level=0.05)

# Run the 'combined' test on every column of the processed price levels.
print("Testing stationarity of price series...\n")
price_stationarity = tester.test_multiple_series(
    processed_prices,
    test_type='combined',
)
price_stationarity

In [None]:
# Same 'combined' test, now applied to the return series, which are expected
# to come out stationary.
print("\nTesting stationarity of return series...\n")
return_stationarity = tester.test_multiple_series(
    returns,
    test_type='combined',
)
return_stationarity

## 7. Train/Test Split

In [None]:
# Split data
# The split ratio comes from the strategy config; the pipeline performs the
# split on the data it holds.
train_ratio = config['data']['train_test_split']
train_prices, test_prices = pipeline.train_test_split(train_ratio=train_ratio)

print(f"Training set: {len(train_prices)} observations")
print(f"Testing set: {len(test_prices)} observations")

# Visualize split
# The ticker universe is config-driven, so do not assume SPY is present:
# use SPY when available, otherwise fall back to the first column.
plot_ticker = 'SPY' if 'SPY' in train_prices.columns else train_prices.columns[0]

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(train_prices.index, train_prices[plot_ticker], label='Train', color='blue', linewidth=2)
ax.plot(test_prices.index, test_prices[plot_ticker], label='Test', color='red', linewidth=2)
# Vertical marker at the last training observation.
ax.axvline(train_prices.index[-1], color='black', linestyle='--', linewidth=2, label='Split')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel(f'{plot_ticker} Price', fontsize=12)
ax.set_title('Train/Test Split', fontsize=14, fontweight='bold')
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('../reports/figures/train_test_split.png', dpi=300)
plt.show()

## 8. Summary

### Key Findings:

1. **Data Quality**: All series have minimal missing data and are properly aligned
2. **Stationarity**: Price series are non-stationary (as expected); return series are stationary
3. **Correlations**: Several pairs show price-level correlations above 0.8. Because correlation between non-stationary price levels can be spurious, these pairs are candidates only and must be confirmed by cointegration testing
4. **Returns**: Distributions show slight negative skew and excess kurtosis (fat tails)
5. **Train/Test**: Clean split with sufficient data in both sets

### Next Steps:

1. Proceed to cointegration testing (Notebook 02)
2. Identify statistically significant pairs
3. Construct spreads using Kalman Filter
4. Generate trading signals and backtest