# 01 - Data Exploration

Explore the raw and cleaned market data for the NIFTY 50 universe.

**Contents:**
- Load raw & cleaned data from Parquet
- Check data quality: missing values, date gaps, ticker coverage
- Visualize price histories, volume patterns
- Correlation heatmap across tickers
- Return distributions and basic statistics

In [None]:
# === Colab Auto-Detection ===
# On Google Colab: clone the repository (if not already present), install it in
# editable mode, mount Google Drive and copy any Parquet files found there into
# the repo-local data/ tree. Anywhere else: put ../src on the import path.
import os
import sys

if "google.colab" not in sys.modules:
    # Not on Colab: make the package importable from the sibling src/ directory
    sys.path.insert(0, "../src")
else:
    import shutil
    import subprocess
    from pathlib import Path

    repo_dir = "/content/quant-lab"
    if not os.path.exists(repo_dir):
        subprocess.run(["git", "clone", "https://github.com/Mohit1053/quant-lab.git", repo_dir], check=True)
    # All relative paths below are resolved against the repo root
    os.chdir(repo_dir)
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-e", "."], check=True)

    from google.colab import drive
    drive.mount("/content/drive", force_remount=False)

    # Copy (not link) Parquet files from Drive when the folder is available
    drive_data = Path("/content/drive/MyDrive/quant_lab/data")
    if drive_data.exists():
        for subdir in ("raw", "cleaned", "features"):
            drive_subdir = drive_data / subdir
            if not drive_subdir.exists():
                continue
            local_subdir = Path("data") / subdir
            local_subdir.mkdir(parents=True, exist_ok=True)
            for parquet_file in drive_subdir.glob("*.parquet"):
                shutil.copy(parquet_file, local_subdir / parquet_file.name)
    print("Colab setup complete!")


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from quant_lab.data.storage.parquet_store import ParquetStore
from quant_lab.data.universe import get_universe

In [None]:
# Load cleaned data
# On Colab the setup cell changes the working directory to the repo root and
# copies the Drive Parquet files into ./data/<sub>, so the cleaned store lives at
# 'data/cleaned' there. Outside Colab the original relative path is kept.
import sys

cleaned_dir = 'data/cleaned' if 'google.colab' in sys.modules else '../data/cleaned'
store = ParquetStore(cleaned_dir)
df = store.load('nifty50_cleaned')

# Quick sanity summary: frame size, number of distinct tickers, covered dates
print(f'Shape: {df.shape}')
print(f'Tickers: {df["ticker"].nunique()}')
print(f'Date range: {df["date"].min()} to {df["date"].max()}')
df.head()

In [None]:
# Missing data summary per ticker
# Count the nulls in every row first, then total those counts per ticker. This
# gives the same per-ticker totals as a groupby-apply over whole sub-frames, but
# stays vectorized and avoids the DeprecationWarning that pandas >= 2.2 emits
# when DataFrameGroupBy.apply operates on the grouping column.
nulls_per_row = df.isnull().sum(axis=1)
missing = nulls_per_row.groupby(df['ticker']).sum().sort_values(ascending=False)
print('Missing values per ticker (top 10):')
print(missing.head(10))

In [None]:
# Price histories for a sample of tickers
# The sample is the first five distinct tickers in order of appearance in df.
sample_tickers = df['ticker'].unique()[:5]
in_sample = df['ticker'].isin(sample_tickers)
sample = df.loc[in_sample]

# One adj_close line per ticker; update_layout returns the same figure object
fig = px.line(
    sample,
    x='date',
    y='adj_close',
    color='ticker',
    title='Price Histories (Sample Tickers)',
).update_layout(height=500, template='plotly_white')
fig.show()

In [None]:
# Return distributions
# daily_return is the row-over-row percentage change of adj_close, computed
# separately inside each ticker group so returns never span two tickers.
close_by_ticker = df.groupby('ticker')['adj_close']
df['daily_return'] = close_by_ticker.transform(pd.Series.pct_change)

# Rows without a return (NaN) are left out of the histogram only; df keeps them
valid_returns = df.dropna(subset=['daily_return'])
fig = px.histogram(
    valid_returns,
    x='daily_return',
    nbins=200,
    title='Distribution of Daily Returns (All Tickers)',
    marginal='box',
)
fig.update_layout(height=400, template='plotly_white')
fig.show()

print('\nReturn statistics:')
print(df['daily_return'].describe())

In [None]:
# Correlation matrix
# Reshape to one column of daily returns per ticker, indexed by date.
returns_by_date = df.pivot(index='date', columns='ticker', values='daily_return')
# dropna() keeps only the dates on which every ticker has a return, so the
# correlations are all computed over one common set of dates.
returns_wide = returns_by_date.dropna()
corr = returns_wide.corr()

# Diverging colour scale pinned to the full [-1, 1] correlation range
fig = px.imshow(
    corr,
    title='Return Correlation Matrix',
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
)
fig.update_layout(height=700, width=700)
fig.show()