In [1]:
# Setup: Keep the notebook reproducible and self-contained with explicit setup.
from __future__ import annotations

from pathlib import Path
import sys
import json
import logging
import warnings

import numpy as np
import pandas as pd

# Suppress every warning for the rest of the session so cell outputs stay compact.
warnings.filterwarnings(action="ignore")
# Root logging configuration: INFO and above, rendered as "LEVEL: message".
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
# Named logger for this notebook.
logger = logging.getLogger("baselines")

def find_project_root(start: Path) -> Path:
    """Locate the nearest directory at or above ``start`` that holds 'config' and 'src'.

    Args:
        start: Path to begin the search from; it is resolved before searching.

    Returns:
        The first candidate (the resolved ``start`` itself, then each of its
        parents in order) that contains both a 'config' and a 'src' entry.

    Raises:
        AssertionError: If no candidate contains both entries.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    # next() with a default yields the closest match, or None when nothing matches.
    found = next(
        (c for c in candidates if (c / "config").exists() and (c / "src").exists()),
        None,
    )
    if found is None:
        raise AssertionError("Project folders missing: expected 'config' and 'src' somewhere above the notebook.")
    return found

# Anchor every path on the project root discovered from the current working directory.
ROOT = find_project_root(Path.cwd())
ARTIFACTS: Path = ROOT.joinpath("artifacts")
FEATURES_PATH: Path = ARTIFACTS.joinpath("data", "features_monthly.parquet")
METRICS_DIR: Path = ARTIFACTS.joinpath("metrics")
# Make sure the metrics folder exists before anything is written into it.
METRICS_DIR.mkdir(parents=True, exist_ok=True)

# Put the project root on sys.path (once) so `src...` packages can be imported below.
src_path = str(ROOT)
if src_path not in sys.path:
    sys.path.append(src_path)

# Fail early, with an actionable message, if scikit-learn cannot be imported.
try:
    import sklearn
except Exception as exc:
    raise ImportError("scikit-learn must be installed to compute baseline metrics.") from exc

In [2]:
# Helpers
from typing import Tuple
from src.utils.splits import time_split, split_Xy


def _read_features(path: Path) -> pd.DataFrame:
    """Load the monthly feature table and validate its index/columns.
    Args: path: Path to the Parquet file (monthly features + targets).
    Returns: DataFrame with a monotonic increasing DatetimeIndex.
    Raises:
        FileNotFoundError: If the features file is missing.
        TypeError: If the index is not a DatetimeIndex.
        KeyError: If required target columns are missing."""
    if not path.exists():
        raise FileNotFoundError(f"Features file not found: {path}. "
                                "Run feature-building notebook first.")
    df = pd.read_parquet(path)

    # Many time-series ops depend on a DatetimeIndex.
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("features_monthly must have a DatetimeIndex.")
    if not df.index.is_monotonic_increasing:
        df = df.sort_index()  # Deterministic ordering for splits and metrics.

    # Baselines depend on these targets.
    required_cols = {"y_return_next_pct", "y_direction_next"}
    missing = required_cols - set(df.columns)
    if missing:
        raise KeyError(f"Missing required target column(s): {sorted(missing)}")
    return df


def _persist_json(obj: dict, path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(obj, indent=2), encoding="utf-8")

In [3]:
# Data & splits
DF: pd.DataFrame = _read_features(FEATURES_PATH)

# Fixed chronological hold-out for baseline comparisons:
# train through 2019-12-31 (pre-pandemic), test from 2020-01-31 onward.
train, test = time_split(DF, train_end="2019-12-31", test_start="2020-01-31")

# Target column names are declared once and passed to both split_Xy calls.
# NOTE(review): the original comment states split_Xy enforces the dtype of the
# classification target — confirm in src.utils.splits.
target_cols = dict(y_reg="y_return_next_pct", y_clf="y_direction_next")
Xtr, ytr_reg, ytr_clf = split_Xy(train, **target_cols)
Xte, yte_reg, yte_clf = split_Xy(test, **target_cols)

# Report the date span and the row count of each partition.
for label, part in (("Train:", train), ("Test :", test)):
    print(label, part.index.min().date(), "→", part.index.max().date(), "| n=", len(part))

Train: 2009-02-28 → 2019-12-31 | n= 131
Test : 2020-01-31 → 2025-05-31 | n= 65


In [4]:
# Baselines: predictions are built as Series (not ndarrays) that carry the index
# of the matching test target.

# Regression baselines
reg_index = yte_reg.index
# Constant 0%-return benchmark.
yhat_reg_zero: pd.Series = pd.Series(0.0, index=reg_index)
# Persistence benchmark: the target is shifted by one row over the full table,
# then restricted to the test dates.
lagged_target = DF["y_return_next_pct"].shift(1)
yhat_reg_lag1: pd.Series = lagged_target.reindex(reg_index)

# Classification baselines
clf_index = yte_clf.index
yhat_clf_ones: pd.Series = pd.Series(1, index=clf_index, dtype="int8")   # Always "Up".
yhat_clf_zeros: pd.Series = pd.Series(0, index=clf_index, dtype="int8")  # Always "Down".

In [5]:
# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, f1_score


def rmse(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Return the root mean squared error of ``y_pred`` against ``y_true``.

    The result is cast to a built-in float so it can be serialized to JSON.
    """
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(squared_error_mean))


def mae(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Return the mean absolute error of ``y_pred`` against ``y_true`` as a built-in float."""
    abs_error_mean = mean_absolute_error(y_true, y_pred)
    return float(abs_error_mean)


def acc(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Return the classification accuracy of ``y_pred`` against ``y_true`` as a built-in float."""
    hit_rate = accuracy_score(y_true, y_pred)
    return float(hit_rate)


def f1(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Return the binary F1 score of ``y_pred`` against ``y_true`` as a built-in float.

    ``zero_division=0`` is passed through so that a baseline predicting a
    single class yields 0 instead of triggering an undefined-metric warning.
    """
    score = f1_score(y_true, y_pred, zero_division=0)
    return float(score)


# Missing lag-1 predictions are replaced by 0.0 once, and the filled series is
# reused for both regression metrics.
lag1_filled = yhat_reg_lag1.fillna(0.0)

metrics: dict[str, float] = {
    # Regression: zero-return vs. lag-1 persistence
    "REG_RMSE_zero": rmse(yte_reg, yhat_reg_zero),
    "REG_RMSE_lag1": rmse(yte_reg, lag1_filled),
    "REG_MAE_zero": mae(yte_reg, yhat_reg_zero),
    "REG_MAE_lag1": mae(yte_reg, lag1_filled),
    # Classification: always-up vs. always-down
    "CLF_Acc_ones": acc(yte_clf, yhat_clf_ones),
    "CLF_Acc_zeros": acc(yte_clf, yhat_clf_zeros),
    "CLF_F1_ones": f1(yte_clf, yhat_clf_ones),
    "CLF_F1_zeros": f1(yte_clf, yhat_clf_zeros),
}

# One "name: value" line per metric, four decimals, in insertion order.
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics.items()))

REG_RMSE_zero: 5.2689
REG_RMSE_lag1: 7.6722
REG_MAE_zero: 4.4111
REG_MAE_lag1: 6.0467
CLF_Acc_ones: 0.6308
CLF_Acc_zeros: 0.3692
CLF_F1_ones: 0.7736
CLF_F1_zeros: 0.0000


In [6]:
# Persist
out_path = METRICS_DIR / "baselines.json"
_persist_json(metrics, out_path)

# Report the location relative to the project root: an absolute path would
# embed the local user directory in the saved notebook output and differ
# between machines.
try:
    shown_path = out_path.relative_to(ROOT).as_posix()
except ValueError:
    # out_path is not located under ROOT; fall back to the full path.
    shown_path = str(out_path)
print("Saved:", shown_path)

Saved: C:\Users\gamer\Desktop\AktienPrognose\artifacts\metrics\baselines.json
