In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.panel_builder import PanelBuilder
from src.features.pipeline import FeaturePipeline
from src.evaluation.ic import compute_ic_series

%matplotlib inline
sns.set_style('whitegrid')

In [None]:
# Read the three parquet inputs used by this notebook. Paths are relative to
# the notebook's working directory.
# `prices` is loaded here but not referenced by the cells visible in this section.
prices = pd.read_parquet(path='../data/processed/prices.parquet')
returns = pd.read_parquet(path='../data/processed/returns.parquet')
features = pd.read_parquet(path='../data/features/features.parquet')

In [None]:
# Summary statistics for a subset of features.
# Only the first 20 columns of `features` are kept; `feature_cols` is reused by
# the IC and correlation cells further down.
feature_cols = features.columns[:20]
features.loc[:, feature_cols].describe()

In [None]:
# Univariate IC analysis
# For each selected feature, compute its IC series against the next-row return
# and rank the features by their mean IC.

# shift(-1) moves every value up one row, so row t holds the value that was at
# row t+1.
# NOTE(review): this assumes `returns` is a wide frame with one row per period;
# if it is long-format, the shift would cross entity boundaries. Confirm.
target = returns.shift(-1)

univariate_ics = {}
for col in feature_cols:
    # unstack() pivots the innermost index level of the feature Series into
    # columns before it is passed to compute_ic_series together with `target`.
    ic = compute_ic_series(features[col].unstack(), target)
    univariate_ics[col] = ic.mean()

# Highest mean IC first.
ic_df = pd.Series(univariate_ics).sort_values(ascending=False)
ic_df.head(20).plot(kind='barh', figsize=(10, 8))
# barh draws the first row at the bottom of the axes; flip the y-axis so the
# highest-IC feature is at the top, matching the "Top 20" title.
plt.gca().invert_yaxis()
plt.xlabel('Mean IC')
plt.title('Top 20 Features by IC')

In [None]:
# Pairwise correlation between the selected feature columns, drawn as a heatmap.
corr = features.loc[:, feature_cols].corr()
plt.figure(figsize=(12, 10))
# Diverging palette centred on 0, with the colour scale pinned to [-1, 1].
sns.heatmap(data=corr, vmin=-1, vmax=1, center=0, cmap='RdBu_r')
plt.title(label='Feature Correlation Matrix')