# 02 - Feature Analysis

Analyze computed features: distributions, correlations, and importance.

**Contents:**
- Load feature data from Parquet
- Feature distributions and summary statistics
- Feature-target correlations (IC analysis)
- Feature importance from Ridge baseline
- Cross-asset and regime feature analysis

In [None]:
# === Colab Auto-Detection ===
# On Google Colab: clone the repo, install it in editable mode, mount Drive and
# copy any Parquet data found there into the repo's data/ tree.
# Elsewhere: make the sibling ../src directory importable.
import os
import sys

if "google.colab" in sys.modules:
    import shutil
    import subprocess
    from pathlib import Path

    repo_dir = "/content/quant-lab"
    if not os.path.exists(repo_dir):
        subprocess.run(["git", "clone", "https://github.com/Mohit1053/quant-lab.git", repo_dir], check=True)
    os.chdir(repo_dir)
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-e", "."], check=True)

    # An editable install is registered through a .pth file, which Python only
    # processes at interpreter start-up. Put the source directory on sys.path
    # as well so the package is importable without restarting the runtime.
    # NOTE(review): assumes a src/ layout, mirroring the "../src" path used in
    # the non-Colab branch below - confirm against the repository.
    src_dir = os.path.join(repo_dir, "src")
    if os.path.isdir(src_dir) and src_dir not in sys.path:
        sys.path.insert(0, src_dir)

    from google.colab import drive
    drive.mount("/content/drive", force_remount=False)

    # Copy (not symlink) Parquet files from Drive into the local data folders,
    # if the Drive data directory is available.
    drive_data = Path("/content/drive/MyDrive/quant_lab/data")
    if drive_data.exists():
        for sub in ["raw", "cleaned", "features"]:
            src = drive_data / sub
            dst = Path("data") / sub
            if src.exists():
                dst.mkdir(parents=True, exist_ok=True)
                for f in src.glob("*.parquet"):
                    shutil.copy(f, dst / f.name)
    print("Colab setup complete!")
elif "../src" not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.insert(0, "../src")


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from quant_lab.features.feature_store import FeatureStore
from quant_lab.features.engine import FeatureEngine
from quant_lab.models.linear_baseline import RidgeBaseline
from quant_lab.data.datasets import TemporalSplit, create_flat_datasets

In [None]:
# Load features
# The feature directory sits at a different relative path depending on where
# the kernel runs: this notebook normally reads '../data/features', but the
# Colab setup cell chdirs into the repository root and copies Parquet files
# into 'data/features'. Use the first candidate that exists, and fall back to
# the original path so a missing directory still surfaces the loader's error.
import os

features_dir_candidates = ('../data/features', 'data/features')
features_dir = next(
    (path for path in features_dir_candidates if os.path.isdir(path)),
    features_dir_candidates[0],
)

store = FeatureStore(features_dir)
df = store.load_features('nifty50_features')
print(f'Shape: {df.shape}')
print(f'Columns: {list(df.columns)}')

In [None]:
# Identify feature columns
# Identifier and raw price/volume fields are not features; every other column
# of the loaded frame is treated as one, in its original column order.
base_cols = {'date', 'ticker', 'open', 'high', 'low', 'close', 'volume', 'adj_close'}
feature_cols = list(filter(lambda name: name not in base_cols, df.columns))
print(
    f'Number of features: {len(feature_cols)}',
    f'Features: {feature_cols}',
    sep='\n',
)

In [None]:
# Feature distributions
# Reshape the first ten feature columns to long form so that each feature
# ('variable') gets its own box of observations ('value') along the x axis.
long_form = pd.melt(df[feature_cols[:10]])
fig = px.box(
    long_form,
    x='variable',
    y='value',
    title='Feature Distributions (First 10 Features)',
)
fig.update_layout(template='plotly_white', height=400)
fig.show()

In [None]:
# Feature-target correlations (Information Coefficient)
# For every feature, compute the Pearson correlation with the 1-day log return
# over the rows where both values are present, then plot the 20 features with
# the largest absolute correlation.
# NOTE(review): the target is the same-row return, not a forward (next-day)
# return, and the correlation is Pearson rather than rank-based - confirm this
# matches the IC definition intended for the project.
target_col = 'log_return_1d'
if target_col not in df.columns:
    # NOTE(review): shift(1) assumes rows are ordered by date within each
    # ticker - verify the ordering of the stored feature frame.
    df[target_col] = df.groupby('ticker')['adj_close'].transform(lambda s: np.log(s / s.shift(1)))

ic_values = {}
for col in feature_cols:
    # When the target already exists in the loaded frame it is also listed in
    # feature_cols; selecting it twice yields duplicate column labels and
    # breaks the correlation call, and correlating it with itself is
    # meaningless anyway.
    if col == target_col:
        continue
    # Series.corr needs numeric data; skip any non-numeric extra columns.
    if not pd.api.types.is_numeric_dtype(df[col]):
        continue
    valid = df[[col, target_col]].dropna()
    # Require a minimum number of paired observations for a usable estimate.
    if len(valid) > 10:
        ic_values[col] = valid[col].corr(valid[target_col])

ic_df = pd.DataFrame(list(ic_values.items()), columns=['feature', 'IC'])
# Rank by magnitude so strongly negative features surface alongside positive ones.
ic_df = ic_df.sort_values('IC', key=abs, ascending=False)

fig = px.bar(ic_df.head(20), x='feature', y='IC',
             title='Feature Information Coefficient (Top 20)',
             color='IC', color_continuous_scale='RdBu_r')
fig.update_layout(height=400, template='plotly_white')
fig.show()

In [None]:
# Feature correlation heatmap
# Pairwise correlations among the first 15 features, drawn on a colour scale
# pinned to [-1, 1] so the midpoint of the diverging palette sits at zero.
feat_corr = df.loc[:, feature_cols[:15]].corr()
heatmap_options = dict(
    title='Feature Correlation Matrix',
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
)
fig = px.imshow(feat_corr, **heatmap_options)
fig.update_layout(width=600, height=600)
fig.show()