# 01 EDA and Gaussian Baseline

Exploratory analysis of the daily dataset and a simple Gaussian return generator.


## Imports and setup


In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: start from matplotlib's bundled seaborn-like style sheet.
plt.style.use('seaborn-v0_8')

# Locate the project root. When the kernel is started from inside the
# `notebooks` directory, the root is one level up; otherwise the current
# working directory is taken to be the root.
PROJECT_ROOT = Path.cwd().resolve()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == 'notebooks' else PROJECT_ROOT

# Make the project's `src` directory importable (added only once so that
# re-running this cell does not pile up duplicate sys.path entries).
SRC_PATH = PROJECT_ROOT / 'src'
if sys.path.count(str(SRC_PATH)) == 0:
    sys.path.append(str(SRC_PATH))

# Location of the processed daily dataset read by the cells below.
DATA_PATH = PROJECT_ROOT.joinpath('data', 'processed', 'market_daily.parquet')

# Apply seaborn's 'talk' context and 'whitegrid' style on top of the
# matplotlib style sheet selected above.
sns.set(context='talk', style='whitegrid')


## Load processed daily data


In [None]:
# Read the processed dataset and make sure the `date` column holds pandas
# datetimes rather than whatever type was stored in the parquet file.
df = pd.read_parquet(DATA_PATH).assign(
    date=lambda frame: pd.to_datetime(frame['date']),
)
# Last expression of the cell: show the first five rows.
df.head(5)


## Quick price / cumulative return view


In [None]:
# Assets to include in the overview chart.
plot_assets = ['SPX', 'NDX', 'EURUSD']

# Restrict the long-format frame to those assets.
asset_mask = df['asset'].isin(plot_assets)
subset = df.loc[asset_mask]

# Reshape to one column per asset, indexed by date. Missing observations
# are replaced by 0.0 so they leave the running sum unchanged.
returns_wide = subset.pivot(index='date', columns='asset', values='log_return_1d')
returns_wide = returns_wide.fillna(0.0)

# Running sum of `log_return_1d`, then exponentiate to obtain a growth
# multiple relative to the start of the sample.
cum_log = returns_wide.cumsum()
cum_growth = np.exp(cum_log)

ax = cum_growth.plot(figsize=(10, 5))
ax.set(title='Cumulative growth (log-return exp)', ylabel='Growth multiple')
plt.show()


## Correlation matrix of daily log returns


In [None]:
# Wide date-by-asset table of `log_return_1d` for every asset in the dataset.
returns_matrix = df.pivot(index='date', columns='asset', values='log_return_1d')

# Pairwise correlation between the asset columns (pandas `DataFrame.corr`
# with its default settings).
corr = returns_matrix.corr()

# Draw the heatmap on a fresh 8x6 figure; the diverging colormap is centred
# at zero so positive and negative correlations get opposite colours.
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(corr, center=0.0, cmap='coolwarm')
heatmap_ax.set_title('Daily log_return_1d correlation matrix')
heatmap_ax.figure.tight_layout()
plt.show()


## Gaussian baseline sampling


In [None]:
# Project-local model; importable because the setup cell put `src` on sys.path.
from jepa_worldmodel.models import GaussianReturnModel

# Fit the baseline on the full long-format dataset.
gauss_model = GaussianReturnModel.fit(df)

# Draw 100 scenarios of 60 steps each with a fixed seed so reruns of this
# cell use the same `random_state`.
sampling_kwargs = {
    'n_steps': 60,
    'n_scenarios': 100,
    'random_state': 42,
}
samples = gauss_model.sample_paths(**sampling_kwargs)

# Last expression of the cell: display the shape of the sampled array.
samples.shape


### Inspect sample paths (SPX vs EURUSD)


In [None]:
# Plot a handful of sampled return paths, one subplot per requested asset.
asset_to_plot = ['SPX', 'EURUSD']
idx_map = {asset: i for i, asset in enumerate(gauss_model.assets)}

# Keep every asset name paired with its own index. Filtering only the index
# list (and then zipping it against the unfiltered name list) would put the
# wrong title on a series whenever a requested asset is missing from the model.
selected = [(a, idx_map[a]) for a in asset_to_plot if a in idx_map]
selected_idx = [a_idx for _, a_idx in selected]
if not selected:
    # Fail with a clear message instead of letting plt.subplots(0, 1) raise.
    raise ValueError(
        f'None of the requested assets {asset_to_plot} are among the fitted '
        f'model assets {list(gauss_model.assets)}.'
    )

# NOTE(review): the indexing below assumes `samples` is laid out as
# (scenario, step, asset) — confirm against GaussianReturnModel.sample_paths.
# Never try to draw more scenarios than were actually sampled.
n_plot = min(5, samples.shape[0])
time = np.arange(samples.shape[1])

# squeeze=False always returns a 2-D array of Axes, so the single-asset case
# needs no special handling; ravel() flattens it to one Axes per asset.
fig, axes = plt.subplots(
    len(selected), 1, figsize=(10, 6), sharex=True, squeeze=False
)
axes = axes.ravel()
for ax, (asset, a_idx) in zip(axes, selected):
    for scen in range(n_plot):
        ax.plot(time, samples[scen, :, a_idx], alpha=0.6)
    ax.set_title(f'Sample Gaussian returns: {asset}')
    # Zero line as a visual reference for the sign of each return.
    ax.axhline(0.0, color='black', linewidth=0.8, linestyle='--')
axes[-1].set_xlabel('Step')
plt.tight_layout()
plt.show()


## TODOs
- Compare empirical vs Gaussian correlations.
- Evaluate regime-conditioned statistics.
- Integrate JEPA encoder outputs once available.
