# 03 - Embedding Space Visualization

Visualize the market embeddings extracted from the pre-trained masked encoder.

**Contents:**
- Load pre-trained encoder and extract embeddings
- t-SNE / UMAP dimensionality reduction
- Color by regime, ticker, and time period
- Nearest-neighbor analysis in embedding space

In [None]:
# === Colab Auto-Detection ===
# On Google Colab: clone the repository, install it in editable mode, mount
# Google Drive and copy any parquet data found there into ./data.
# Anywhere else: make ../src importable.
import os
import sys

if "google.colab" not in sys.modules:
    sys.path.insert(0, "../src")
else:
    import subprocess

    repo_dir = "/content/quant-lab"
    # Clone only once so re-running the cell keeps the existing checkout.
    if not os.path.exists(repo_dir):
        clone_cmd = ["git", "clone", "https://github.com/Mohit1053/quant-lab.git", repo_dir]
        subprocess.run(clone_cmd, check=True)
    os.chdir(repo_dir)
    install_cmd = [sys.executable, "-m", "pip", "install", "-q", "-e", "."]
    subprocess.run(install_cmd, check=True)

    from google.colab import drive
    drive.mount("/content/drive", force_remount=False)

    # Copy parquet data from Drive if available
    # NOTE(review): files land in ./data under the repo root, while later cells
    # read '../data/...' and '../outputs/...' relative to the working
    # directory - confirm both resolve to the same place on Colab.
    from pathlib import Path
    drive_data = Path("/content/drive/MyDrive/quant_lab/data")
    if drive_data.exists():
        import shutil
        for subdir in ("raw", "cleaned", "features"):
            source_dir = drive_data / subdir
            if not source_dir.exists():
                continue
            target_dir = Path("data") / subdir
            target_dir.mkdir(parents=True, exist_ok=True)
            for parquet_file in source_dir.glob("*.parquet"):
                shutil.copy(parquet_file, target_dir / parquet_file.name)
    print("Colab setup complete!")


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE

from quant_lab.data.storage.parquet_store import ParquetStore

In [None]:
# Load embeddings (if pre-computed)
# Defines three names for the following cells:
#   emb_df     - frame loaded from the store (or a placeholder frame)
#   emb_cols   - names of the columns starting with 'emb_'
#   embeddings - 2-D array built from those columns
emb_store = ParquetStore('../data/embeddings')
if emb_store.exists('embeddings'):
    emb_df = emb_store.load('embeddings')
    print(f'Embeddings shape: {emb_df.shape}')
    emb_cols = [c for c in emb_df.columns if c.startswith('emb_')]
    if not emb_cols:
        # Fail here with a clear message instead of handing an (n, 0) matrix
        # to the dimensionality-reduction step.
        raise ValueError("Loaded embeddings contain no 'emb_*' columns.")
    embeddings = emb_df[emb_cols].values
    print(f'Embedding dim: {len(emb_cols)}')
else:
    print('No pre-computed embeddings found.')
    print('Run: python scripts/pretrain.py')
    # Placeholder: seeded so the demo plots are reproducible across runs.
    placeholder_rng = np.random.default_rng(42)
    embeddings = placeholder_rng.standard_normal((200, 32))
    # Mirror the 'emb_*' column layout of the real branch so emb_cols and
    # emb_df stay consistent with each other in the fallback as well.
    emb_cols = [f'emb_{i}' for i in range(embeddings.shape[1])]
    emb_df = pd.DataFrame(embeddings, columns=emb_cols)
    emb_df.insert(0, 'date', pd.date_range('2020-01-01', periods=200))

In [None]:
# t-SNE visualization
# Reduce (at most) the first 1000 embedding rows to 2-D and scatter-plot them,
# coloured by row position.
subset = embeddings[:1000]  # Subsample for speed

# scikit-learn requires perplexity < n_samples; clamp it so that small inputs
# still work. For more than 30 rows this is the original value of 30.
perplexity = min(30, max(1, len(subset) - 1))
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
emb_2d = tsne.fit_transform(subset)

plot_df = pd.DataFrame({
    'tsne_1': emb_2d[:, 0],
    'tsne_2': emb_2d[:, 1],
    'index': range(len(emb_2d)),
})

# NOTE(review): the row index is used as a proxy for time, which assumes the
# embedding rows are in chronological order - confirm.
fig = px.scatter(plot_df, x='tsne_1', y='tsne_2', color='index',
                 title='t-SNE of Market Embeddings (colored by time)',
                 color_continuous_scale='Viridis')
fig.update_layout(height=600, template='plotly_white')
fig.show()

In [None]:
# Color by regime (if available)
# Adds a string 'regime' column to plot_df and re-draws the t-SNE scatter
# coloured by it. Skipped with a hint when the labels file does not exist.
regime_path = '../outputs/regimes/regime_labels.parquet'
try:
    # Keep the try body to the one call whose FileNotFoundError is expected.
    regime_df = pd.read_parquet(regime_path)
except FileNotFoundError:
    print('No regime labels found. Run: python scripts/detect_regimes.py')
else:
    # NOTE(review): labels are matched to points by row position, which assumes
    # both files share the same row order - confirm.
    labels = regime_df['regime_label'].values[:len(plot_df)].astype(str)

    # Fewer labels than points would make the column assignment fail with a
    # length mismatch; pad the tail with an explicit category instead.
    n_unlabeled = len(plot_df) - len(labels)
    if n_unlabeled > 0:
        print(f'Only {len(labels)} regime labels for {len(plot_df)} points; '
              f'marking the remaining {n_unlabeled} as "unlabeled".')
        labels = np.concatenate([labels, np.full(n_unlabeled, 'unlabeled')])
    plot_df['regime'] = labels

    fig = px.scatter(plot_df, x='tsne_1', y='tsne_2', color='regime',
                     title='t-SNE colored by Regime')
    fig.update_layout(height=600, template='plotly_white')
    fig.show()