# 04 - Feature Engineering

## Objective
Transform the cleaned price dataset into model-ready features (returns, volatility,
momentum, drawdowns, lagged returns) suitable for time-series forecasting tasks.

## Inputs
- Cleaned dataset: `data/processed/<version>/clean_prices_<version>_latest.csv`

## Outputs
- Feature dataset saved to: `data/processed/<version>/features_<version>_latest.csv`
- Timestamped audit copy saved to: `data/processed/<version>/features_<version>_<timestamp>.csv`

## CRISP-DM Stage
Data Preparation

In [None]:
# Make the project root importable (so `import src...` works in notebooks)
import sys
from pathlib import Path

# Notebooks live in jupyter_notebooks/, so the project root is one level up
# from the current working directory.
PROJECT_ROOT = Path("..").resolve()

# Prepend the root only when it is missing, so re-running this cell does not
# stack duplicate entries on sys.path.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

print("Project root added to sys.path:", PROJECT_ROOT)

In [None]:
from pathlib import Path
import pandas as pd

from src.config import DEFAULT_VERSION, get_paths
from src.data_processing import load_clean_prices_latest

In [None]:
# Use the project's default dataset version and look up its path bundle.
VERSION = DEFAULT_VERSION
paths = get_paths(VERSION)

# Load the latest cleaned price table for this version from the processed dir.
df = load_clean_prices_latest(paths.processed_dir, VERSION)

# Quick sanity summary: size, ticker universe, and covered date span.
print("Shape:", df.shape)

tickers = sorted(df["Ticker"].unique().tolist())
print("Tickers:", tickers)

first_day, last_day = df["Date"].min().date(), df["Date"].max().date()
print("Date range:", first_day, "to", last_day)

# Keep this expression last so the notebook renders the preview.
df.head()

In [None]:
def add_features_per_ticker(g: pd.DataFrame) -> pd.DataFrame:
    """Add return, volatility, momentum, drawdown and lag features for one ticker.

    Parameters
    ----------
    g : pd.DataFrame
        Rows to process as a single series. Must contain a ``Date`` column
        (used for ordering) and an ``Adj_Close`` column. ``Adj_Close`` is
        coerced to numeric; values that cannot be parsed become NaN.

    Returns
    -------
    pd.DataFrame
        A copy of ``g`` sorted by ``Date`` with these float columns appended:
        ``log_price``, ``log_return_1d``, ``return_1d``, ``vol_30d``,
        ``vol_90d``, ``mom_30d``, ``mom_90d``, ``drawdown``,
        ``lag_return_1``, ``lag_return_5``, ``lag_return_21``.
        The input frame is not modified.
    """
    # Local import keeps this cell self-contained; numpy ships with pandas.
    import numpy as np

    g = g.sort_values("Date").copy()

    # Coerce once and reuse: every feature below derives from this series.
    s = pd.to_numeric(g["Adj_Close"], errors="coerce")

    # Natural log of the adjusted close. The logarithm is only defined for
    # strictly positive prices, so anything else is masked to NaN rather than
    # producing -inf or runtime warnings. Columns stay float64 (NaN, not pd.NA)
    # so downstream rolling/arithmetic operations keep working.
    positive_price = s.where(s > 0)
    g["log_price"] = np.log(positive_price)

    # Log return = log(P_t / P_{t-1}) = log(P_t) - log(P_{t-1}).
    g["log_return_1d"] = g["log_price"].diff()

    # Simple (percentage) daily return.
    g["return_1d"] = s.pct_change()

    # Rolling volatility (std of daily returns)
    g["vol_30d"] = g["return_1d"].rolling(30).std()
    g["vol_90d"] = g["return_1d"].rolling(90).std()

    # Momentum: rolling mean returns
    g["mom_30d"] = g["return_1d"].rolling(30).mean()
    g["mom_90d"] = g["return_1d"].rolling(90).mean()

    # Drawdown: price relative to its running peak (0 at a new high, < 0 below it).
    running_max = s.cummax()
    g["drawdown"] = s / running_max - 1.0

    # Lag features (common, strong baseline)
    g["lag_return_1"] = g["return_1d"].shift(1)
    g["lag_return_5"] = g["return_1d"].shift(5)
    g["lag_return_21"] = g["return_1d"].shift(21)

    return g