In [None]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import optuna
from sklearn.ensemble import HistGradientBoostingRegressor

def sharpe_ratio_np(returns, freq: int = 252) -> float:
    """
    Annualized Sharpe ratio (zero risk-free rate) of a series of periodic returns.

    Parameters
    ----------
    returns : array-like of per-period returns (anything `np.asarray` accepts).
    freq    : number of periods per year used to annualize (252 for daily data).

    Returns
    -------
    float
        sqrt(freq) * mean / std (population std, ddof=0).
        0.0 when the input is empty or its volatility is numerically zero.
        NaN if the input contains NaN (NaNs are propagated, not dropped).
    """
    values = np.asarray(returns, dtype=float)
    if values.size == 0:
        # mean/std of an empty array are NaN (plus RuntimeWarnings); "no data" -> 0.0
        return 0.0

    sigma = values.std()
    # An exact `== 0` test misses floating-point residue: a constant series such as
    # [0.1, 0.1, 0.1] can have std ~1e-17, which would produce an absurdly large ratio.
    if np.isclose(sigma, 0.0, atol=1e-12):
        return 0.0

    return float(np.sqrt(freq) * values.mean() / sigma)


# Put the parent of the current working directory on the import path (once),
# so the `src` package imported below can be resolved.
PROJECT_ROOT = os.path.abspath("..")
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path[:0] = [PROJECT_ROOT]

print("PROJECT_ROOT:", PROJECT_ROOT)

# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the project's .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2


from src.data_loading_cross import load_sp500_adj_close
from src.signals_cross import (
    make_cross_sectional_signals,
    build_cross_sectional_matrix,
    CROSS_FEATURES,
)




In [None]:
# Load SP500 panel, as long as we can reasonably go
# NOTE(review): force_download=True presumably re-fetches the data on every run;
# confirm whether a cached load is acceptable for Restart & Run All.

prices = load_sp500_adj_close(start="2000-01-01", force_download=True)

prices.info()
print("Price panel shape:", prices.shape)
print("Date range:", prices.index.min(), "->", prices.index.max())
print("Number of tickers:", len(prices.columns))

# Jupyter only renders a cell's LAST expression; a bare `prices.head()` in the
# middle of the cell is silently discarded, so the preview goes at the end.
prices.head()


In [None]:
lookahead = 21  # ~1 month forward return

# Long-format signal panel; the index carries "date" and "ticker" levels (read below).
signals_df = make_cross_sectional_signals(prices, lookahead=lookahead)

print("Signals shape:", signals_df.shape)
print("Columns:", signals_df.columns.tolist())

dates_all = signals_df.index.get_level_values("date")
tickers_all = signals_df.index.get_level_values("ticker")

print("Signals date range:", dates_all.min(), "->", dates_all.max())
print("Unique tickers in signals:", len(np.unique(tickers_all)))

# Jupyter only renders a cell's LAST expression; a bare `signals_df.head()` in the
# middle of the cell is silently discarded, so the preview goes at the end.
signals_df.head()




In [None]:
# Flatten the signal panel into model inputs: feature matrix, target vector and
# the per-row date / ticker labels that keep each sample identifiable.
X, y, dates, tickers = build_cross_sectional_matrix(signals_df)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Feature names:", CROSS_FEATURES)
print("Min/max date:", dates.min(), "->", dates.max())
print("Num unique tickers:", len(np.unique(tickers)))

# Eyeball the target distribution before modelling
plt.figure(figsize=(6, 4))
plt.hist(y, bins=100)
plt.gca().set(
    title=f"Distribution of {lookahead}-day forward returns",
    xlabel="Forward return",
    ylabel="Frequency",
)
plt.show()


In [None]:
# --- Date-based split: train / val / test, purged at the split boundaries ---
#
# The target is a `lookahead`-day FORWARD return, so a row dated d is labelled
# with prices up to roughly d + lookahead. Rows in the last `lookahead` dates of
# a split would therefore carry labels realised inside the NEXT split (label
# leakage into validation / test). We drop those rows from the end of train and
# the end of val. `train_end`, `val_end` and the test window are unchanged.
# NOTE(review): assumes `lookahead` is measured in rows of the trading-day axis
# used here; confirm against make_cross_sectional_signals.

unique_dates = np.array(sorted(dates.unique()))
n_dates = len(unique_dates)
print("Number of trading days in panel:", n_dates)

# 60% train, 20% val, 20% test by time
train_end_pos = int(n_dates * 0.6)
val_end_pos   = int(n_dates * 0.8)

purge_days = int(lookahead)
if train_end_pos - purge_days < 0 or val_end_pos - purge_days <= train_end_pos:
    raise ValueError(
        f"Panel too short ({n_dates} dates) for a 60/20/20 split with a "
        f"{purge_days}-day purge gap"
    )

train_end = unique_dates[train_end_pos]
val_end   = unique_dates[val_end_pos]

# Last dates whose forward-return window closes on or before the boundary
train_last = unique_dates[train_end_pos - purge_days]
val_last   = unique_dates[val_end_pos - purge_days]

print("Train end date:", train_end)
print("Val   end date:", val_end)
print("Last train date after purge:", train_last)
print("Last val   date after purge:", val_last)

mask_train = dates <= train_last
mask_val   = (dates > train_end) & (dates <= val_last)
mask_test  = dates > val_end

X_train, y_train = X[mask_train], y[mask_train]
X_val,   y_val   = X[mask_val],   y[mask_val]
X_test,  y_test  = X[mask_test],  y[mask_test]

dates_train = dates[mask_train]
dates_val   = dates[mask_val]
dates_test  = dates[mask_test]

tickers_train = tickers[mask_train]
tickers_val   = tickers[mask_val]
tickers_test  = tickers[mask_test]


print("Train samples:", len(y_train))
print("Val   samples:", len(y_val))
print("Test  samples:", len(y_test))


In [None]:
# --- Baseline cross-sectional momentum on TEST period ---

# Keep only the rows of the signal panel dated strictly after val_end,
# i.e. the same out-of-sample window as the test split.
idx_dates = signals_df.index.get_level_values("date")
signals_test = signals_df[idx_dates > val_end].copy()

signals_test.head()


In [None]:
def compute_cs_daily_returns(df: pd.DataFrame, q: float = 0.1, horizon: int = 21):
    """
    Turn per-stock predictions into per-date portfolio returns.

    Parameters
    ----------
    df : DataFrame that can be grouped by "date" (index level or column), with
         columns `y_true` (realised `horizon`-day forward return) and `y_pred`
         (predicted forward return).
    q : fraction of names per date placed in each of the long and short buckets.
    horizon : length in days of the forward return; used to convert each bucket's
         mean return into a daily-equivalent return via (1 + R) ** (1 / horizon) - 1.

    Returns
    -------
    (eqw, long, long_short) : three float Series indexed by date
        eqw        - equal-weight mean of `y_true` over all rows of the date
        long       - mean `y_true` of the top-q names by `y_pred`
        long_short - long minus the bottom-q names by `y_pred`
    All three are empty Series when `df` has no rows.

    Notes
    -----
    * Rows whose `y_pred` is NaN are excluded from the ranking. `sort_values`
      places NaN last, so they would otherwise be picked as the "best" names.
    * Dates with fewer than 10 rankable names contribute 0.0 to all three series.
    """
    columns = ["eqw", "long", "long_short"]

    if len(df) == 0:
        # Nothing to group: return empty, correctly named float Series
        return tuple(pd.Series(dtype=float, name=col) for col in columns)

    def to_daily(R):
        return (1.0 + R) ** (1.0 / horizon) - 1.0

    def _per_date(group: pd.DataFrame) -> dict:
        ranked = group[group["y_pred"].notna()].sort_values("y_pred")
        n = len(ranked)
        if n < 10:
            return {"eqw": 0.0, "long": 0.0, "long_short": 0.0}

        k = max(1, int(n * q))

        # Equal-weight all stocks -> benchmark
        eqw_daily = to_daily(group["y_true"].mean())
        # Best / worst predicted names
        long_daily = to_daily(ranked["y_true"].iloc[-k:].mean())
        short_daily = to_daily(ranked["y_true"].iloc[:k].mean())

        return {
            "eqw": eqw_daily,
            "long": long_daily,
            "long_short": long_daily - short_daily,
        }

    # Explicit loop over the groups (sorted by date, like groupby's default)
    group_keys, rows = [], []
    for date, group in df.groupby("date"):
        group_keys.append(date)
        rows.append(_per_date(group))

    daily = pd.DataFrame(
        rows, index=pd.Index(group_keys, name="date"), columns=columns, dtype=float
    )

    eqw = daily["eqw"].astype(float)
    long = daily["long"].astype(float)
    long_short = daily["long_short"].astype(float)

    return eqw, long, long_short


In [None]:
def objective_tree_cs(trial):
    """
    Optuna objective: negative long-short Sharpe on the validation split.

    Samples the gradient-boosting hyperparameters plus the bucket fraction `q`,
    fits on (X_train, y_train), predicts X_val, builds the per-date portfolios
    and scores the long-short return series.

    Returns
    -------
    float
        -Sharpe of the validation long-short series (the study minimizes), or
        0.0 when fewer than 20 finite observations are available.
    """
    # Model hyperparameters (sampled in a fixed order)
    model_params = {
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_iter": trial.suggest_int("max_iter", 100, 500),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 20, 200),
    }
    # Trading hyperparameter: size of the long / short buckets (top/bottom 5-30%)
    bucket_fraction = trial.suggest_float("q", 0.05, 0.3)

    # Fit on TRAIN only, score on VALIDATION
    regressor = HistGradientBoostingRegressor(random_state=42, **model_params)
    regressor.fit(X_train, y_train)
    val_predictions = regressor.predict(X_val)

    val_frame = pd.DataFrame(
        {
            "date":   dates_val,
            "symbol": tickers_val,
            "y_true": y_val,
            "y_pred": val_predictions,
        }
    )
    val_frame = val_frame.set_index(["date", "symbol"]).sort_index()

    # Only the long-short leg matters here: market-neutral alpha is the target
    long_short_val = compute_cs_daily_returns(val_frame, q=bucket_fraction, horizon=21)[2]
    usable = long_short_val.replace([np.inf, -np.inf], np.nan).dropna()

    if len(usable) < 20:
        return 0.0  # too little data to score

    # Negated because the study minimizes its objective
    return -sharpe_ratio_np(usable.values)


In [None]:
# Seed the sampler: without it every Restart & Run All draws different trials,
# so the "best" params and every downstream metric change from run to run.
# TPE is Optuna's default single-objective sampler, so only the seed is new.
# (Reproducible for sequential optimization, i.e. the default n_jobs=1.)
study_tree_cs = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_tree_cs.optimize(objective_tree_cs, n_trials=10)

print("Best params:", study_tree_cs.best_params)
print("Best value (negative Sharpe):", study_tree_cs.best_value)


In [None]:
# Split the winning trial into the bucket fraction `q` (a trading parameter)
# and the remaining entries, which are passed to the estimator as kwargs.
q_best = study_tree_cs.best_params["q"]
best_params = {
    name: value
    for name, value in study_tree_cs.best_params.items()
    if name != "q"
}

tree_cs_best = HistGradientBoostingRegressor(random_state=42, **best_params)

# Refit on TRAIN + VAL
X_trainval = np.vstack([X_train, X_val])
y_trainval = np.concatenate([y_train, y_val])
tree_cs_best.fit(X_trainval, y_trainval)

# Score the held-out TEST rows
y_pred_test = tree_cs_best.predict(X_test)

df_test = pd.DataFrame(
    {
        "date":   dates_test,
        "symbol": tickers_test,
        "y_true": y_test,
        "y_pred": y_pred_test,
    }
)
df_test = df_test.set_index(["date", "symbol"]).sort_index()

# Per-date portfolios on TEST, using the tuned bucket fraction
eqw_tree, tree_long_daily, tree_ls_daily = compute_cs_daily_returns(
    df_test,
    q=q_best,
    horizon=21,
)


In [None]:
def daily_momentum_cs(
    group: pd.DataFrame,
    q: float = 0.1,
    horizon: int = 21,
    signal_col: str = "ret_21",
    target_col: str = "target_fwd_21",
) -> pd.Series:
    """
    Cross-sectional momentum for a single date.

    Parameters
    ----------
    group : rows for one date, many tickers.
    q : top/bottom quantile, e.g. 0.1 for deciles.
    horizon : forward horizon used for the target (e.g. 21 days); used to convert
        each bucket's mean return to a daily-equivalent return.
    signal_col : column used to rank the names (trailing return by default).
    target_col : column holding the realised forward return.

    Returns
    -------
    pd.Series with keys "eqw", "long", "long_short" (daily-equivalent returns):
        eqw        - equal-weight mean of `target_col` over all rows of the date
        long       - top-q names by `signal_col`
        long_short - long minus bottom-q names

    Notes
    -----
    * Rows whose signal is NaN are excluded from the ranking. `sort_values`
      places NaN last, so they would otherwise end up in the "top" bucket.
    * Dates with fewer than 10 rankable names return 0.0 for all three entries.
    """
    ranked = group[group[signal_col].notna()].sort_values(signal_col)
    n = len(ranked)
    if n < 10:
        return pd.Series({"eqw": 0.0, "long": 0.0, "long_short": 0.0})

    k = max(1, int(n * q))

    # convert horizon-day returns to daily-equivalent returns
    def to_daily(R):
        return (1.0 + R) ** (1.0 / horizon) - 1.0

    # equal-weight all stocks -> "index"
    eqw_ret_daily = to_daily(group[target_col].mean())
    # winners / losers by the ranking signal
    long_ret_daily = to_daily(ranked[target_col].iloc[-k:].mean())
    short_ret_daily = to_daily(ranked[target_col].iloc[:k].mean())

    return pd.Series(
        {
            "eqw": eqw_ret_daily,
            "long": long_ret_daily,
            "long_short": long_ret_daily - short_ret_daily,
        }
    )


In [None]:
# Momentum baseline on the test window: one row of portfolio returns per date
daily_cs_rets = signals_test.groupby("date").apply(daily_momentum_cs, q=0.1)

eqw_returns        = daily_cs_rets["eqw"]
mom_long_returns   = daily_cs_rets["long"]
mom_ls_returns     = daily_cs_rets["long_short"]

# Jupyter only renders a cell's LAST expression; a bare `.head()` in the middle
# of the cell is silently discarded, so the preview goes at the end.
daily_cs_rets.head()


In [None]:
from src.backtest import (
    equity_curve_from_returns,
    cagr,
    annualized_vol,
    sharpe_ratio,
    max_drawdown,
)

# Equity curves for the benchmark and the momentum baseline
equity_eqw, equity_mom_L, equity_mom_LS = (
    equity_curve_from_returns(rets)
    for rets in (eqw_returns, mom_long_returns, mom_ls_returns)
)

# Equity curves for the Optuna-tuned tree strategy
equity_tree_L_opt, equity_tree_LS_opt = (
    equity_curve_from_returns(rets)
    for rets in (tree_long_daily, tree_ls_daily)
)




def _plot_equity_curves(title, curves):
    """Draw several equity curves on one new 10x4 figure, then add legend and title.

    curves: iterable of (equity, plot_kwargs) pairs; each equity object is drawn
    through its own `.plot(**plot_kwargs)` method, in the order given.
    """
    plt.figure(figsize=(10, 4))
    for equity, plot_kwargs in curves:
        equity.plot(**plot_kwargs)
    plt.legend()
    plt.title(title)
    plt.show()


# Long-only: EW vs momentum vs tree (Optuna)
_plot_equity_curves(
    "Cross-sectional long-only: momentum vs tree (test)",
    [
        (equity_eqw, {"label": "EW all stocks (test)"}),
        (equity_mom_L, {"label": "Momentum long-only (top decile)"}),
        (equity_tree_L_opt, {"label": "Tree long-only (Optuna, top q)", "linestyle": "--"}),
    ],
)

# Long-short: momentum vs tree (Optuna)
_plot_equity_curves(
    "Cross-sectional long-short: momentum vs tree (test)",
    [
        (equity_mom_LS, {"label": "Momentum long-short (top-bottom decile)"}),
        (equity_tree_LS_opt, {"label": "Tree long-short (Optuna, top q)", "linestyle": "--"}),
    ],
)


# Summary statistics on the test window: CAGR and max drawdown are computed
# from the equity curves, volatility and Sharpe from the return series.
metrics_cs = dict(
    # equal-weight benchmark
    eqw_cagr=cagr(equity_eqw),
    eqw_vol=annualized_vol(eqw_returns),
    eqw_sharpe=sharpe_ratio(eqw_returns),
    eqw_max_dd=max_drawdown(equity_eqw),
    # momentum, long-only
    momL_cagr=cagr(equity_mom_L),
    momL_vol=annualized_vol(mom_long_returns),
    momL_sharpe=sharpe_ratio(mom_long_returns),
    momL_max_dd=max_drawdown(equity_mom_L),
    # momentum, long-short
    momLS_cagr=cagr(equity_mom_LS),
    momLS_vol=annualized_vol(mom_ls_returns),
    momLS_sharpe=sharpe_ratio(mom_ls_returns),
    momLS_max_dd=max_drawdown(equity_mom_LS),
)
metrics_cs


In [None]:
# Momentum baseline vs Optuna-tuned tree, side by side, for both the long-only
# and the long-short variants (same four statistics per strategy).
metrics_tree_cs_opt = dict(
    # long-only: momentum
    momL_cagr=cagr(equity_mom_L),
    momL_vol=annualized_vol(mom_long_returns),
    momL_sharpe=sharpe_ratio(mom_long_returns),
    momL_max_dd=max_drawdown(equity_mom_L),
    # long-only: tree
    treeL_cagr=cagr(equity_tree_L_opt),
    treeL_vol=annualized_vol(tree_long_daily),
    treeL_sharpe=sharpe_ratio(tree_long_daily),
    treeL_max_dd=max_drawdown(equity_tree_L_opt),
    # long-short: momentum
    momLS_cagr=cagr(equity_mom_LS),
    momLS_vol=annualized_vol(mom_ls_returns),
    momLS_sharpe=sharpe_ratio(mom_ls_returns),
    momLS_max_dd=max_drawdown(equity_mom_LS),
    # long-short: tree
    treeLS_cagr=cagr(equity_tree_LS_opt),
    treeLS_vol=annualized_vol(tree_ls_daily),
    treeLS_sharpe=sharpe_ratio(tree_ls_daily),
    treeLS_max_dd=max_drawdown(equity_tree_LS_opt),
)
metrics_tree_cs_opt
