# Task 1 — Financial Analysis (Cleaning, Returns, Stationarity, Risk Metrics)

This notebook is primarily a **reporting notebook**: the official deliverables are generated by scripts and saved to disk.

## Rubric-fast deliverables covered here
1. Scaling/normalization evidence (a saved scaled dataset)
2. Three visualizations saved as files
3. Returns + stationarity (ADF) + risk metrics loaded from artifacts

**Artifacts expected (after running scripts):**
- `data/task1/processed/prices.parquet`
- `data/task1/processed/returns.parquet`
- `data/task1/processed/scaled_task1_prices.parquet` ✅ scaling evidence
- `data/task1/processed/task1_adf_results.csv`
- `data/task1/processed/task1_risk_metrics.csv`
- `outputs/task1/viz/task1_prices_timeseries.png` ✅ plot 1
- `outputs/task1/viz/task1_daily_pct_change.png` ✅ plot 2
- `outputs/task1/viz/task1_rolling_mean_std.png` ✅ plot 3
- `data/task1/processed/task1_outliers.csv` ✅ outlier evidence
- `outputs/task1/viz/task1_outliers_returns.png` ✅ outlier plot


In [None]:
import os
import sys
from pathlib import Path
import importlib

import pandas as pd
import matplotlib.pyplot as plt


def _find_repo_root(start: Path) -> Path:
    """Locate the repository root by searching `start` and its ancestors.

    A directory qualifies when it has a `src/` directory and also contains an
    `outputs` or `data` entry. The nearest qualifying directory wins; if none
    qualifies, the resolved `start` is returned.
    """
    origin = start.resolve()

    def _looks_like_root(folder: Path) -> bool:
        # `src` must be a real directory; the marker entries only need to exist.
        if not (folder / "src").is_dir():
            return False
        return any((folder / marker).exists() for marker in ("outputs", "data"))

    return next(
        (folder for folder in (origin, *origin.parents) if _looks_like_root(folder)),
        origin,
    )


# Resolve the repository root by searching upward from the current working directory.
REPO_ROOT = _find_repo_root(Path.cwd())

# Make imports and relative paths work consistently from notebooks/:
# put the repo root at the front of sys.path (so `src` is importable) and
# switch the working directory to it (so relative paths resolve from there).
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
os.chdir(REPO_ROOT)

# Import the project config, then reload it so this cell always works with a
# freshly executed copy of the module when re-run in the same kernel.
from src import config as config
config = importlib.reload(config)

# Echo the resolved locations. Settings read through getattr(..., None)
# print None instead of raising when config does not define them.
print("Repo root:", REPO_ROOT)
print("Notebook working directory (after chdir):", os.getcwd())
print("config file:", getattr(config, "__file__", None))
print("PRICES_PATH:", config.PRICES_PATH)
print("RETURNS_PATH:", config.RETURNS_PATH)
print("TASK1_SCALED_PRICES_PATH:", getattr(config, "TASK1_SCALED_PRICES_PATH", None))
print("TASK1_VIZ_DIR:", getattr(config, "TASK1_VIZ_DIR", None))


## 1) Load Task 1 datasets (prices + returns)

These should already exist if you ran your Task 1 pipeline scripts.

In [None]:
# Read the Task 1 price and return tables from the locations set in config.
prices = pd.read_parquet(config.PRICES_PATH)
returns = pd.read_parquet(config.RETURNS_PATH)

# Preview the first rows of each table, then report their dimensions.
for frame in (prices, returns):
    display(frame.head())
for label, frame in (('prices shape:', prices), ('returns shape:', returns)):
    print(label, frame.shape)

In [None]:
# 1b) Quick EDA summary table (visible evidence)
# Per-asset descriptive statistics of the `return` column, including the
# 1st/99th percentiles. Nothing is shown when `return` is absent.

ret_col = "return" if "return" in returns.columns else None
if ret_col:
    # Output column name -> aggregation applied to each asset's returns.
    stat_spec = {
        "count": "count",
        "mean": "mean",
        "std": "std",
        "min": "min",
        "q01": lambda s: s.quantile(0.01),
        "q99": lambda s: s.quantile(0.99),
        "max": "max",
    }
    per_asset = returns.groupby("asset")[ret_col]
    summary = per_asset.agg(**stat_spec).reset_index()
    display(summary)

In [None]:
# 1c) Outlier detection evidence (required)
# Loads the outlier table from disk and shows per-asset counts plus the
# 20 most extreme rows.

outliers_path = getattr(config, "TASK1_OUTLIERS_PATH", None)
if outliers_path is None:
    # config does not define the path: fall back to the repo convention.
    outliers_path = str(REPO_ROOT / "data" / "task1" /
                        "processed" / "task1_outliers.csv")

if not os.path.exists(outliers_path):
    raise FileNotFoundError(
        f"Missing outlier evidence file: {outliers_path}. "
        "Generate it in Task 1 scripts (recommended) and commit it."
    )

outliers = pd.read_csv(outliers_path)
print("Outliers file:", outliers_path)
print("outliers shape:", outliers.shape)

# Show counts per asset + sample rows
if "asset" in outliers.columns:
    display(outliers["asset"].value_counts().rename(
        "outlier_count").to_frame())

# Show “most extreme” first if you have a score column
score_cols = [c for c in ["z_score", "abs_z",
                          "abs_return", "return"] if c in outliers.columns]
if score_cols:
    c = score_cols[0]
    # Rank by magnitude rather than signed value: the score may be signed
    # (e.g. `z_score`, `return`), and a plain descending sort would hide the
    # large negative moves. Values that cannot be read as numbers sort last.
    magnitude = pd.to_numeric(outliers[c], errors="coerce").abs()
    extreme_first = magnitude.sort_values(ascending=False).index
    display(outliers.loc[extreme_first].head(20))
else:
    display(outliers.head(20))

print(
    "Method note: Outliers are detected on daily returns using a fixed threshold "
    "(e.g., |z-score| > 3 or IQR rule). See scripts/ + README for the exact rule."
)

In [None]:
# 1d) Outlier visualization (required)
# Draws each asset's return series with the rows listed in the outliers table
# highlighted in red, saves the figure to disk, and displays the saved file.

from IPython.display import Image, display
outlier_fig = str(REPO_ROOT / "outputs" / "task1" /
                  "viz" / "task1_outliers_returns.png")
os.makedirs(os.path.dirname(outlier_fig), exist_ok=True)

# Requires columns: date, asset, return (returns) and date, asset (outliers)
returns_ = returns.copy()
returns_["date"] = pd.to_datetime(returns_["date"])

ret_col = "return" if "return" in returns_.columns else None
if ret_col is None:
    raise ValueError(
        "Expected `return` column in returns.parquet for outlier visualization.")

# Outliers are matched to returns on (date, asset); fail with a clear message
# instead of a bare KeyError when the outliers table lacks either key.
key_cols = ["date", "asset"]
outliers_ = outliers.copy()
missing_keys = [k for k in key_cols if k not in outliers_.columns]
if missing_keys:
    raise ValueError(
        f"Outliers table is missing key column(s) {missing_keys}; "
        "cannot match outliers to returns.")
outliers_["date"] = pd.to_datetime(outliers_["date"])

flag_cols = [c for c in outliers_.columns if c.lower(
) in ["is_outlier", "outlier_flag", "flag"]]
flag_col = flag_cols[0] if flag_cols else None

# Decide which rows of the outliers table actually denote an outlier.
# With a flag column, only truthy rows count (a listed row with an empty flag
# is still treated as an outlier); without one, every listed row counts.
if flag_col:
    listed_as_outlier = outliers_[flag_col].fillna(True).astype(bool)
else:
    listed_as_outlier = pd.Series(True, index=outliers_.index)

# De-duplicate keys so the left merge cannot multiply returns rows.
outlier_keys = outliers_.loc[listed_as_outlier, key_cols].drop_duplicates()

# Returns rows with no match in the outliers table are NOT outliers.
m = returns_.merge(outlier_keys, on=key_cols, how="left",
                   indicator="_outlier_match")
m["is_outlier"] = m["_outlier_match"].eq("both")
m = m.drop(columns="_outlier_match")

# Plot per asset
assets = sorted(m["asset"].unique())
if not assets:
    raise ValueError("No assets found in returns; nothing to plot.")

# squeeze=False always yields a 2-D axes array, so one asset needs no special case.
fig, axes = plt.subplots(len(assets), 1, figsize=(
    12, 3*len(assets)), sharex=True, squeeze=False)

for ax, a in zip(axes[:, 0], assets):
    g = m[m["asset"] == a].sort_values("date")
    ax.plot(g["date"], g[ret_col], linewidth=1.0, label=f"{a} returns")
    go = g[g["is_outlier"]]
    ax.scatter(go["date"], go[ret_col], s=25,
               color="red", label="outliers", zorder=3)
    ax.axhline(0, color="black", linewidth=1, alpha=0.6)
    ax.set_title(f"{a}: returns with outliers highlighted")
    ax.grid(True, alpha=0.3)
    ax.legend()

plt.tight_layout()
plt.savefig(outlier_fig, dpi=150)
plt.close(fig)

print("Saved outlier visualization:", outlier_fig)

display(Image(filename=outlier_fig))

## 2) Scaling / normalization evidence

Rubric requirement: demonstrate scaling/normalization. Preferred evidence: a saved scaled dataset.

This notebook will **auto-generate** a scaled dataset if it does not exist yet (see the next cell), and save it to `config.TASK1_SCALED_PRICES_PATH`.


In [None]:
# Scaling/normalization evidence
# Reuse the saved scaled dataset when it is on disk; otherwise min-max scale
# the price column within each asset and persist the result.

scaled_path = getattr(config, 'TASK1_SCALED_PRICES_PATH', None)
if scaled_path is None:
    # fallback to the repo convention used by this notebook
    scaled_path = str(REPO_ROOT / 'data' / 'task1' / 'processed' / 'scaled_task1_prices.parquet')

if not os.path.exists(scaled_path):
    # No saved copy yet: build it from `prices` (loading it first if this
    # cell is run before the data-loading cell).
    if 'prices' not in globals():
        prices = pd.read_parquet(config.PRICES_PATH)

    price_col = getattr(config, 'PRICE_COL', 'adj_close')

    def _minmax(s: pd.Series) -> pd.Series:
        """Min-max rescale one series; a series with zero range maps to zeros."""
        values = s.astype(float)
        low = values.min()
        spread = values.max() - low
        if spread == 0:
            return values * 0.0
        return (values - low) / spread

    scaled_column = f'{price_col}_scaled'
    scaled_prices = prices.copy()
    scaled_prices[scaled_column] = scaled_prices.groupby('asset')[price_col].transform(_minmax)

    os.makedirs(os.path.dirname(scaled_path), exist_ok=True)
    scaled_prices.to_parquet(scaled_path, index=False)
    print('Created scaled dataset:', scaled_path)
else:
    scaled_prices = pd.read_parquet(scaled_path)
    print('Loaded scaled dataset:', scaled_path)

display(scaled_prices.head())
print('scaled_prices shape:', scaled_prices.shape)
print('scaled columns:', list(scaled_prices.columns))


## 3) Visualizations (must exist as files)

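This section verifies that the three plot files are present; if any are missing, the next cell regenerates all three from the loaded data.

To regenerate them with the official script instead, run: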
```bash
python scripts/02_task1_scale_and_viz.py
```

In [None]:
# Verify that the three required Task 1 plots exist. If any is missing, all
# three are rebuilt from `prices` / `returns` and saved to `viz_dir`.
viz_dir = getattr(config, 'TASK1_VIZ_DIR', 'outputs/task1/viz')
os.makedirs(viz_dir, exist_ok=True)

expected = [
    os.path.join(viz_dir, 'task1_prices_timeseries.png'),
    os.path.join(viz_dir, 'task1_daily_pct_change.png'),
    os.path.join(viz_dir, 'task1_rolling_mean_std.png'),
]

missing = [p for p in expected if not os.path.exists(p)]
print('Expected plots:')
for p in expected:
    print(' -', p, 'OK' if os.path.exists(p) else 'MISSING')

if missing:
    print('\nSome plots are missing; generating them now...')

    # Load the inputs if this cell is run before the data-loading cell.
    if 'prices' not in globals():
        prices = pd.read_parquet(config.PRICES_PATH)
    if 'returns' not in globals():
        returns = pd.read_parquet(config.RETURNS_PATH)

    prices['date'] = pd.to_datetime(prices['date'])
    returns['date'] = pd.to_datetime(returns['date'])

    price_col = getattr(config, 'PRICE_COL', 'adj_close')

    # 1) Prices time series
    plt.figure(figsize=(12, 5))
    for asset, g in prices.sort_values('date').groupby('asset'):
        plt.plot(g['date'], g[price_col], label=asset, linewidth=1.5)
    plt.title(f'Prices over time ({price_col})')
    plt.xlabel('Date')
    plt.ylabel(price_col)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.savefig(expected[0], dpi=150)
    plt.close()

    # 2) Daily percent change
    # Both branches sort by date first: line plots connect points in row
    # order, and the rolling windows further down need chronological rows.
    if 'return' in returns.columns:
        tmp = returns.sort_values('date').copy()
        # Scaled by 100 to plot as percent (assumes `return` is a fraction — confirm).
        tmp['pct_change'] = tmp['return'].astype(float) * 100.0
    else:
        # No precomputed returns: derive the daily change from prices per asset.
        tmp = prices.sort_values('date').copy()
        tmp['pct_change'] = tmp.groupby('asset')[price_col].pct_change() * 100.0

    plt.figure(figsize=(12, 5))
    for asset, g in tmp.groupby('asset'):
        plt.plot(g['date'], g['pct_change'], label=asset, linewidth=1.0)
    plt.title('Daily % change')
    plt.xlabel('Date')
    plt.ylabel('%')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.savefig(expected[1], dpi=150)
    plt.close()

    # 3) Rolling mean/std (20D) of daily % change
    window = 20
    tmp2 = tmp.copy()  # `tmp` is already date-sorted
    tmp2['roll_mean'] = tmp2.groupby('asset')['pct_change'].transform(lambda s: s.rolling(window).mean())
    tmp2['roll_std'] = tmp2.groupby('asset')['pct_change'].transform(lambda s: s.rolling(window).std())

    fig, axes = plt.subplots(2, 1, figsize=(12, 7), sharex=True)
    for asset, g in tmp2.groupby('asset'):
        axes[0].plot(g['date'], g['roll_mean'], label=asset, linewidth=1.2)
        axes[1].plot(g['date'], g['roll_std'], label=asset, linewidth=1.2)

    axes[0].set_title(f'Rolling mean of daily % change ({window}D)')
    axes[1].set_title(f'Rolling std of daily % change ({window}D)')
    for ax in axes:
        ax.grid(True, alpha=0.3)
        ax.legend()

    axes[1].set_xlabel('Date')
    fig.tight_layout()
    fig.savefig(expected[2], dpi=150)
    plt.close(fig)

    # Re-check after generation so a failed save is still reported below.
    missing = [p for p in expected if not os.path.exists(p)]

if missing:
    raise FileNotFoundError(
        'Missing required Task 1 plot files:\n' + '\n'.join(missing)
    )

print('\nAll required plot files exist.')


### Display the saved plot images

In [None]:
# 3b) EDA — Visual evidence (required)
# Show each saved plot inline, followed by its file name and a one-line reading.

from IPython.display import Image, display

prices_png = expected[0]
pct_change_png = expected[1]
rolling_png = expected[2]

captions = {
    prices_png: "Prices over time: shows overall trend/regime shifts and relative price levels across assets.",
    pct_change_png: "Daily % change: highlights volatility clustering and extreme move days (spikes).",
    rolling_png: "Rolling mean/std: shows time-varying volatility and stability (or instability) of returns."
}

separator = "-" * 80
for plot_path in expected:
    display(Image(filename=plot_path))
    print("Figure:", os.path.basename(plot_path))
    print("Interpretation:", captions.get(plot_path, ""))
    print(separator)

## 4) Stationarity evidence (ADF results)

ADF outputs should be saved to CSV by your Task 1 scripts.

In [None]:
# Load and show the ADF results table; stop with a clear error if it is absent.
adf_path = config.TASK1_ADF_PATH
if os.path.exists(adf_path):
    adf = pd.read_csv(adf_path)
else:
    raise FileNotFoundError(f'Missing ADF results: {adf_path}. Run Task 1 scripts.')

display(adf)
print('ADF file:', adf_path)

## 5) Risk metrics evidence

Risk metrics (e.g., annualized return/volatility, Sharpe, VaR) should be saved by scripts.

In [None]:
# Load and show the risk metrics table; stop with a clear error if it is absent.
risk_path = config.TASK1_RISK_PATH
if os.path.exists(risk_path):
    risk = pd.read_csv(risk_path)
else:
    raise FileNotFoundError(f'Missing risk metrics: {risk_path}. Run Task 1 scripts.')

display(risk)
print('Risk metrics file:', risk_path)

## 6) Quick interpretation (short)

- Prices are typically non-stationary; returns/log returns tend to be closer to stationary.
- Scaling provides comparable magnitudes across assets for visualization and certain models.
- Risk metrics summarize reward vs risk and tail behavior for each asset.