# 03 - Exploratory Data Analysis (EDA)

## Objective
Explore the cleaned price dataset to understand trends, volatility, and relationships between tickers.
Generate insights and plots that will later support dashboard visuals and project hypotheses.

## Inputs
- Cleaned dataset: `data/processed/<version>/clean_prices_<version>_latest.csv`

## Outputs
- EDA plots displayed in-notebook
- Figures saved as PNG files to: `outputs/<version>/figures/`
- Summary stats (returns, volatility, drawdown)

## CRISP-DM Stage
Data Understanding

In [1]:
# Make the project root importable (so `import src...` works in notebooks)
import sys
from pathlib import Path

# The notebook runs from jupyter_notebooks/, so the repository root is the
# parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()

# Prepend the root once; re-running the cell must not add duplicates.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

print("Project root added to sys.path:", PROJECT_ROOT)

In [2]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

from src.config import DEFAULT_VERSION, get_paths

In [3]:
# Resolve the versioned project directories from the shared config.
VERSION = DEFAULT_VERSION
paths = get_paths(VERSION)
PROCESSED_DIR = paths.processed_dir

# Every figure produced by this notebook is written here; create it up front.
OUTPUT_FIG_DIR = paths.outputs_dir / "figures"
OUTPUT_FIG_DIR.mkdir(parents=True, exist_ok=True)

# Load the latest cleaned price file, parsing "Date" as datetimes.
data_path = PROCESSED_DIR / f"clean_prices_{VERSION}_latest.csv"
print("Loading:", data_path)
df = pd.read_csv(data_path, parse_dates=["Date"])

# Quick overview: size, ticker universe and covered period.
print("Shape:", df.shape)
ticker_names = sorted(df["Ticker"].unique().tolist())
print("Tickers:", ticker_names)
date_col = df["Date"]
print("Date range:", date_col.min().date(), "to", date_col.max().date())
df.head()

In [4]:
# Schema check: stop immediately if a required column is absent.
expected_cols = ["Date", "Open", "High", "Low", "Close", "Adj_Close", "Volume", "Ticker"]
missing_cols = [c for c in expected_cols if c not in df.columns]
if missing_cols:
    raise KeyError(f"Missing expected columns: {missing_cols}")

# Order rows per ticker chronologically and rebuild a clean 0..n-1 index.
df = df.sort_values(by=["Ticker", "Date"])
df = df.reset_index(drop=True)

# Force price/volume columns to numeric; unparseable values become NaN
# (errors="coerce") and show up in the missing-value summary below.
numeric_cols = ["Open", "High", "Low", "Close", "Adj_Close", "Volume"]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

print(df.dtypes)

# Percentage of missing values per column, largest first (top 10).
na_fraction = df.isna().mean()
na_fraction.sort_values(ascending=False).head(10) * 100

In [5]:
# Reshape long -> wide: one row per Date, one Adj_Close column per Ticker,
# rows in chronological order. Used for side-by-side time series comparisons.
prices = (
    df.pivot(index="Date", columns="Ticker", values="Adj_Close")
    .sort_index()
)

print("Wide prices shape:", prices.shape)
prices.tail()

In [6]:
# Adjusted close price per ticker over time.
# Draw on an explicitly created Axes: DataFrame.plot() opens its own figure
# when no `ax` is passed, so a bare plt.figure() beforehand would only leave
# an empty extra figure behind.
fig, ax = plt.subplots()
prices.plot(ax=ax)
ax.set_title("Adjusted Close Price Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Adj Close")
fig.tight_layout()

# Save before plt.show() so the written file contains the rendered figure.
out_path = OUTPUT_FIG_DIR / "eda_adj_close_timeseries.png"
fig.savefig(out_path, dpi=150)
print("Saved:", out_path)
plt.show()

In [7]:
# Daily simple returns as fractional change (0.01 == 1%), one column per ticker.
# fill_method=None: do not forward-fill missing prices before differencing.
# The implicit padding default is deprecated in recent pandas and would turn
# gaps into artificial zero returns. dropna() then keeps only the dates on
# which every ticker has a return (this also removes the first row).
returns = prices.pct_change(fill_method=None).dropna()

# Draw on an explicitly created Axes: DataFrame.plot() opens its own figure
# when no `ax` is passed, so a bare plt.figure() beforehand would only leave
# an empty extra figure behind.
fig, ax = plt.subplots()
returns.plot(ax=ax)
ax.set_title("Daily Returns Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Daily Return")
fig.tight_layout()

# Save before plt.show() so the written file contains the rendered figure.
out_path = OUTPUT_FIG_DIR / "eda_daily_returns_timeseries.png"
fig.savefig(out_path, dpi=150)
print("Saved:", out_path)
plt.show()

In [8]:
# Overlaid histograms of daily returns (60 bins, semi-transparent so the
# per-ticker distributions remain visible where they overlap).
# Draw on an explicitly created Axes: DataFrame.plot() opens its own figure
# when no `ax` is passed, so a bare plt.figure() beforehand would only leave
# an empty extra figure behind.
fig, ax = plt.subplots()
returns.plot(kind="hist", bins=60, alpha=0.6, ax=ax)
ax.set_title("Distribution of Daily Returns")
ax.set_xlabel("Daily Return")
ax.set_ylabel("Frequency")
fig.tight_layout()

# Save before plt.show() so the written file contains the rendered figure.
out_path = OUTPUT_FIG_DIR / "eda_daily_returns_hist.png"
fig.savefig(out_path, dpi=150)
print("Saved:", out_path)
plt.show()

In [9]:
# One box per ticker: the spread of daily returns gives a quick visual
# comparison of volatility across tickers.
fig, ax = plt.subplots()
returns.boxplot(ax=ax)
ax.set_title("Daily Returns Boxplot (Volatility Comparison)")
ax.set_ylabel("Daily Return")
fig.tight_layout()

# Write the figure to disk, then render it in the notebook.
out_path = OUTPUT_FIG_DIR / "eda_daily_returns_boxplot.png"
fig.savefig(out_path, dpi=150)
print("Saved:", out_path)
plt.show()

In [10]:
# Rolling volatility: standard deviation of daily returns over a trailing
# window of 30 observations (rows), computed per ticker. With an integer
# window the first 29 rows are NaN because a full window is required.
rolling_vol = returns.rolling(30).std()

# Draw on an explicitly created Axes: DataFrame.plot() opens its own figure
# when no `ax` is passed, so a bare plt.figure() beforehand would only leave
# an empty extra figure behind.
fig, ax = plt.subplots()
rolling_vol.plot(ax=ax)
ax.set_title("30-Day Rolling Volatility")
ax.set_xlabel("Date")
ax.set_ylabel("Rolling Std (Volatility)")
fig.tight_layout()

# Save before plt.show() so the written file contains the rendered figure.
out_path = OUTPUT_FIG_DIR / "eda_rolling_volatility_30d.png"
fig.savefig(out_path, dpi=150)
print("Saved:", out_path)
plt.show()

In [11]:
# Pairwise correlation of daily returns between tickers.
corr = returns.corr()

# Render the matrix as a heatmap, labelling both axes with the ticker names.
fig, ax = plt.subplots()
heatmap = ax.imshow(corr, aspect="auto")
ax.set_title("Correlation of Daily Returns")
ax.set_xticks(range(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=45)
ax.set_yticks(range(len(corr.index)))
ax.set_yticklabels(corr.index)
fig.colorbar(heatmap, ax=ax)
fig.tight_layout()

# Write the figure to disk, then render it in the notebook.
out_path = OUTPUT_FIG_DIR / "eda_returns_correlation_heatmap.png"
fig.savefig(out_path, dpi=150)
print("Saved:", out_path)
plt.show()

# Show the numeric correlation table as the cell output.
corr