# CCTeu Pricing Model - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis for the CCTeu pricing model.

## Analysis Structure:
1. Data Loading and Initial Inspection
2. Time Series Analysis
3. Correlation Analysis
4. Volatility Analysis
5. Stationarity Tests
6. Relative Value Analysis
7. Feature Engineering Validation
8. Summary and Insights

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from scipy import stats
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's 'seaborn-v0_8' style sheet plus the "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Set the column-count, total-width and per-cell-width display options to None
# so wide feature tables are not cut down to pandas' default limits
for _display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

## 1. Data Loading and Initial Inspection

In [None]:
# Read the processed feature table; the first CSV column becomes the index
# and pandas is asked to parse it as dates.
try:
    features_df = pd.read_csv(
        '../data/processed/features.csv',
        index_col=0,
        parse_dates=True,
    )
except FileNotFoundError:
    # Later cells test `features_df is not None` and skip themselves
    print("Features file not found. Please run main.py first.")
    features_df = None
else:
    # Quick sanity report: size, covered period, total count of missing cells
    print(f"Features dataset shape: {features_df.shape}")
    print(f"Date range: {features_df.index.min()} to {features_df.index.max()}")
    print(f"Missing values: {features_df.isnull().sum().sum()}")

In [None]:
if features_df is not None:
    # Descriptive statistics for the feature columns, rendered via display()
    summary_table = features_df.describe()
    display(summary_table)

    # Per-column dtypes, preceded by a blank line and a header
    print("\nData types:", features_df.dtypes, sep="\n")

## 2. Time Series Analysis

In [None]:
if features_df is not None:
    # Every column named price_* is plotted as a price level
    price_cols = [col for col in features_df.columns if col.startswith('price_')]

    if price_cols:
        # Columns whose name contains 'IT000' go to the top panel (titled
        # 'CCTeu Price Levels'); all remaining price columns go to the bottom one
        ccteu_prices = [col for col in price_cols if 'IT000' in col]
        benchmark_prices = [col for col in price_cols if 'IT000' not in col]

        fig = make_subplots(
            rows=2,
            cols=1,
            subplot_titles=('CCTeu Price Levels', 'Benchmark Instruments'),
            vertical_spacing=0.1,
        )

        # One line trace per column, labelled without the 'price_' prefix
        for panel_row, panel_cols in ((1, ccteu_prices), (2, benchmark_prices)):
            for col in panel_cols:
                trace = go.Scatter(
                    x=features_df.index,
                    y=features_df[col],
                    name=col.replace('price_', ''),
                    mode='lines',
                )
                fig.add_trace(trace, row=panel_row, col=1)

        fig.update_layout(height=800, title_text="Historical Price Analysis")
        fig.show()

In [None]:
if features_df is not None:
    # Every column named ret_* is treated as a return series
    return_cols = [col for col in features_df.columns if col.startswith('ret_')]

    if return_cols:
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        ts_ax, hist_ax = axes[0, 0], axes[0, 1]
        vol_ax, qq_ax = axes[1, 0], axes[1, 1]

        # Top-left: raw return series, legend pushed outside the axes
        features_df[return_cols].plot(ax=ts_ax, alpha=0.7)
        ts_ax.set_title('Returns Time Series')
        ts_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

        # Top-right: overlaid histograms, capped at the first five columns
        # to keep the panel readable
        for col in return_cols[:5]:
            hist_ax.hist(features_df[col].dropna(), alpha=0.6, bins=50, label=col)
        hist_ax.set_title('Returns Distribution')
        hist_ax.legend()

        # Bottom-left: 20-observation rolling standard deviation scaled by sqrt(252)
        rolling_std = features_df[return_cols].rolling(window=20).std()
        rolling_vol = rolling_std * np.sqrt(252)
        rolling_vol.plot(ax=vol_ax, alpha=0.7)
        vol_ax.set_title('Rolling Volatility (20-day)')
        vol_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

        # Bottom-right: normal Q-Q plot of the first return column only
        # (return_cols is known to be non-empty inside this branch)
        first_ret_col = return_cols[0]
        stats.probplot(features_df[first_ret_col].dropna(), dist="norm", plot=qq_ax)
        qq_ax.set_title(f'Q-Q Plot: {first_ret_col}')

        plt.tight_layout()
        plt.show()

## 3. Correlation Analysis

In [None]:
if features_df is not None:
    # Pairwise correlation of every ret_* column (DataFrame.corr defaults)
    return_cols = [col for col in features_df.columns if col.startswith('ret_')]

    if return_cols:
        corr_matrix = features_df[return_cols].corr()

        plt.figure(figsize=(12, 10))
        # The matrix is symmetric, so hide the upper triangle and the diagonal
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
                    square=True, linewidths=0.5, cbar_kws={"shrink": .8})
        plt.title('Returns Correlation Matrix')
        plt.tight_layout()
        plt.show()

        # Print highest correlations
        print("\nHighest correlations (excluding self-correlation):")
        # Keep only the strict upper triangle (k=1 drops the diagonal). This
        # removes self-correlations by position instead of by a value
        # threshold, so genuinely high correlations (>= 0.99) are kept and
        # each pair is listed once rather than as both (a, b) and (b, a).
        pair_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        corr_unstack = (
            corr_matrix.where(pair_mask)
            .unstack()
            .dropna()  # masked-out cells (and undefined correlations) are NaN
            .sort_values(ascending=False)
        )
        print(corr_unstack.head(10))

## 4. Volatility Analysis

In [None]:
if features_df is not None:
    # Per-series risk statistics for every ret_* column
    return_cols = [col for col in features_df.columns if col.startswith('ret_')]

    if return_cols:
        # Collect one dict per series and build the table in a single call,
        # instead of enlarging an empty DataFrame one cell at a time.
        stat_rows = []

        for col in return_cols:
            returns = features_df[col].dropna()
            # Computed once and reused below
            cumulative = returns.cumsum()
            var_95 = returns.quantile(0.05)

            stat_rows.append({
                # Standard deviation scaled by sqrt(252)
                # (presumably 252 trading days per year; confirm data frequency)
                'Ann_Volatility': returns.std() * np.sqrt(252),
                'Skewness': returns.skew(),
                'Kurtosis': returns.kurtosis(),
                # Largest drop of the cumulative return sum below its running peak
                'Max_Drawdown': (cumulative - cumulative.cummax()).min(),
                # 5th percentile of returns, and the mean of returns at or below it
                'VaR_95': var_95,
                'CVaR_95': returns[returns <= var_95].mean(),
            })

        vol_stats = pd.DataFrame(stat_rows, index=return_cols)

        display(vol_stats.round(4))

## 5. Stationarity Tests

In [None]:
if features_df is not None:
    # Augmented Dickey-Fuller test for stationarity
    def adf_test(series, name):
        """Run the ADF test on *series* (NaNs dropped) and summarise the result.

        Args:
            series: pandas Series to test.
            name: label stored under the 'Series' key of the result.

        Returns:
            dict with the test statistic, p-value, the 1% and 5% critical
            values, and a 'Stationary' flag that is True when p-value < 0.05.

        Raises:
            ValueError: propagated from ``adfuller`` when the input cannot be
                tested (for example a constant series).
        """
        result = adfuller(series.dropna())
        return {
            'Series': name,
            'ADF_Statistic': result[0],
            'p_value': result[1],
            'Critical_Values_1%': result[4]['1%'],
            'Critical_Values_5%': result[4]['5%'],
            'Stationary': result[1] < 0.05
        }

    # Test all numeric columns
    numeric_cols = features_df.select_dtypes(include=[np.number]).columns
    stationarity_results = []

    for col in numeric_cols:
        if features_df[col].dropna().shape[0] > 10:  # Minimum observations
            try:
                stationarity_results.append(adf_test(features_df[col], col))
            except ValueError as exc:
                # One untestable column (e.g. constant values) must not abort
                # the whole table; report it and move on.
                print(f"Skipping ADF test for {col}: {exc}")

    stationarity_df = pd.DataFrame(stationarity_results)
    display(stationarity_df.round(4))

## 6. Relative Value Analysis

In [None]:
# Load the relative value signals; only the read itself is guarded so that a
# FileNotFoundError raised anywhere else is not misreported as a missing file.
try:
    rv_signals = pd.read_csv('../data/processed/relative_value_signals.csv', index_col=0, parse_dates=True)
except FileNotFoundError:
    print("Relative value signals file not found. Please run main.py first.")
else:
    if not rv_signals.empty:
        # Plot relative value signals
        fig = make_subplots(rows=2, cols=1,
                            subplot_titles=('Relative Value Z-Scores', 'Current Positioning'),
                            vertical_spacing=0.15)

        # Time series of RV signals, one line per column
        for col in rv_signals.columns:
            fig.add_trace(go.Scatter(x=rv_signals.index, y=rv_signals[col],
                                     name=col, mode='lines'), row=1, col=1)

        # Current positioning (bar chart) taken from the last row of the file
        current_signals = rv_signals.iloc[-1]
        fig.add_trace(go.Bar(x=current_signals.index, y=current_signals.values,
                             name='Current Z-Score'), row=2, col=1)

        # Zero reference line on both panels
        fig.add_hline(y=0, line_dash="dash", line_color="black", row=1, col=1)
        fig.add_hline(y=0, line_dash="dash", line_color="black", row=2, col=1)

        fig.update_layout(height=800, title_text="Relative Value Analysis")
        fig.show()

        # Summary statistics
        print("\nRelative Value Signal Statistics:")
        display(rv_signals.describe())

        print("\nCurrent Signals (Most Recent):")
        # Drop missing values before ranking: sort_values() places NaN last,
        # which would otherwise be reported as the "most overvalued" signal.
        current_sorted = current_signals.dropna().sort_values()
        if current_sorted.empty:
            print("No valid signals in the most recent row.")
        else:
            print(f"Most Undervalued: {current_sorted.index[0]} (Z-Score: {current_sorted.iloc[0]:.2f})")
            print(f"Most Overvalued: {current_sorted.index[-1]} (Z-Score: {current_sorted.iloc[-1]:.2f})")

## 7. Feature Engineering Validation

In [None]:
if features_df is not None:
    # Every column named z_* is treated as a z-score feature
    z_score_cols = [col for col in features_df.columns if col.startswith('z_')]

    if z_score_cols:
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        z_ts_ax, z_hist_ax = axes[0, 0], axes[0, 1]
        spread_ax, importance_ax = axes[1, 0], axes[1, 1]

        # Top-left: z-score series with reference lines at 0 (black) and +/-2 (red)
        features_df[z_score_cols].plot(ax=z_ts_ax, alpha=0.7)
        z_ts_ax.set_title('Z-Scores Time Series')
        for level, line_color in ((0, 'black'), (2, 'red'), (-2, 'red')):
            z_ts_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.5)
        z_ts_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

        # Top-right: overlaid histograms of every z-score column
        for col in z_score_cols:
            z_hist_ax.hist(features_df[col].dropna(), alpha=0.6, bins=50, label=col)
        z_hist_ax.set_title('Z-Scores Distribution')
        z_hist_ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
        z_hist_ax.legend()

        # Bottom-left: any column whose name contains 'spread'
        # (panel stays blank when there is none)
        spread_cols = [col for col in features_df.columns if 'spread' in col]
        if spread_cols:
            features_df[spread_cols].plot(ax=spread_ax, alpha=0.7)
            spread_ax.set_title('Spread Analysis')
            spread_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

        # Bottom-right: feature importance proxy, i.e. absolute correlation of
        # each ret_*/z_* column with the first ret_* column containing 'IT000'
        ccteu_ret_cols = [
            col for col in features_df.columns
            if col.startswith('ret_') and 'IT000' in col
        ]
        if ccteu_ret_cols:
            target_col = ccteu_ret_cols[0]
            feature_cols = [
                col for col in features_df.columns
                if col.startswith(('ret_', 'z_')) and col != target_col
            ]

            target_series = features_df[target_col]
            correlations = [
                {'Feature': col, 'Correlation': abs(target_series.corr(features_df[col]))}
                for col in feature_cols
            ]

            # Ascending sort puts the strongest feature at the top of the bar chart
            corr_df = pd.DataFrame(correlations).sort_values('Correlation', ascending=True)

            bar_positions = range(len(corr_df))
            importance_ax.barh(bar_positions, corr_df['Correlation'])
            importance_ax.set_yticks(bar_positions)
            importance_ax.set_yticklabels(corr_df['Feature'])
            importance_ax.set_title(f'Feature Importance (Correlation with {target_col})')
            importance_ax.set_xlabel('Absolute Correlation')

        plt.tight_layout()
        plt.show()

## 8. Summary and Insights

In [None]:
if features_df is not None:
    banner = "=" * 60
    print(banner)
    print("EXPLORATORY DATA ANALYSIS - SUMMARY")
    print(banner)

    # --- 1. Data quality: size, covered period, share of missing cells ---
    missing_count = features_df.isnull().sum().sum()
    first_day = features_df.index.min().strftime('%Y-%m-%d')
    last_day = features_df.index.max().strftime('%Y-%m-%d')
    print("\n1. DATA QUALITY:")
    print(f"   - Dataset shape: {features_df.shape}")
    print(f"   - Date range: {first_day} to {last_day}")
    print(f"   - Missing values: {missing_count}/{features_df.size} ({missing_count/features_df.size*100:.2f}%)")

    # --- 2. Returns: cross-column averages of the per-column mean and std ---
    return_cols = [col for col in features_df.columns if col.startswith('ret_')]
    if return_cols:
        print("\n2. RETURNS ANALYSIS:")
        returns_stats = features_df[return_cols].describe()
        daily_std = returns_stats.loc['std']
        annualizer = np.sqrt(252)
        print(f"   - Average daily return: {returns_stats.loc['mean'].mean():.6f}")
        print(f"   - Average daily volatility: {daily_std.mean():.6f}")
        # NOTE(review): the value is printed with a '%' suffix but is not
        # multiplied by 100; this is only right if the ret_* columns are
        # already expressed in percent. Confirm the units produced upstream.
        print(f"   - Annualized volatility range: {daily_std.min()*annualizer:.2f}% - {daily_std.max()*annualizer:.2f}%")

    # --- 3. Correlation: statistics over the distinct pairs (strict upper triangle) ---
    if len(return_cols) > 1:
        corr_matrix = features_df[return_cols].corr()
        pair_corrs = corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)]
        avg_corr = pair_corrs.mean()
        print("\n3. CORRELATION ANALYSIS:")
        print(f"   - Average pairwise correlation: {avg_corr:.3f}")
        print(f"   - Highest correlation: {pair_corrs.max():.3f}")
        print(f"   - Lowest correlation: {pair_corrs.min():.3f}")

    # --- 4. Feature engineering: share of z-score observations with |z| > 2 ---
    z_score_cols = [col for col in features_df.columns if col.startswith('z_')]
    if z_score_cols:
        z_scores = features_df[z_score_cols]
        print("\n4. FEATURE ENGINEERING:")
        print(f"   - Z-score features: {len(z_score_cols)}")
        extreme_z_scores = (z_scores.abs() > 2).sum().sum()
        total_z_obs = z_scores.count().sum()  # non-missing observations only
        print(f"   - Extreme z-scores (|z| > 2): {extreme_z_scores}/{total_z_obs} ({extreme_z_scores/total_z_obs*100:.1f}%)")

    print("\n" + banner)
    print("Analysis complete. Review plots above for detailed insights.")
    print(banner)