In [3]:
import numpy as np
import pandas as pd
import yfinance as yf
from statsmodels.tsa.stattools import adfuller
from ripser import ripser
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",
    message="The input point cloud has more columns than rows; did you mean to transpose\\?")

In [4]:
data = yf.download('SPY', start='2018-01-01', end='2025-05-01',
                   auto_adjust=True, progress=False)
prices = data['Close'].dropna()
log_ret = np.log(prices).diff().dropna()
pct_ret = prices.pct_change().dropna()
stat_p, pval_p = adfuller(prices)[:2]
stat_r, pval_r = adfuller(log_ret)[:2]

print(f"Prices ADF p-value:     {pval_p:.3f}  (non-stationary)")
print(f"Log-returns ADF p-value: {pval_r:.3f}  (stationary)\n")

window_size, horizon, step = 90, 1, 1
L = len(pct_ret)
runs = 10
experiments = {
    'Prices': prices,
    'Log-Returns': log_ret
}
results = {}
thr = np.median(np.abs(pct_ret.values))

for name, series in experiments.items():
    Xb, Xt, y = [], [], []
    vals = series.values
    for i in range(0, L - window_size - horizon + 1, step):
        win = vals[i:i+window_size]
        fut = pct_ret.values[i+window_size]
        label = int(abs(fut) > thr)
        tech = [win.mean(), win.std()]
        emb = np.vstack([win[:-1], win[1:]]).T
        dgms = ripser(emb, maxdim=1)['dgms']
        tda = []
        for diag in dgms:
            ls = diag[:,1] - diag[:,0]
            ls = np.where(np.isfinite(ls), ls, 0)
            tda.extend(np.sort(ls)[::-1][:3])
        Xb.append(tech)
        Xt.append(tech + tda)
        y.append(label)
    Xb = np.array(Xb); Xt = np.array(Xt); y = np.array(y)
    acc_b, acc_t, auc_b, auc_t = [], [], [], []

    for seed in range(runs):
        idx = np.arange(len(y))
        tr, te = train_test_split(idx, test_size=0.3,random_state=seed, stratify=y)
        clf = RandomForestClassifier(random_state=seed)
        clf.fit(Xb[tr], y[tr])
        pred = clf.predict(Xb[te]); prob = clf.predict_proba(Xb[te])[:,1]
        acc_b.append(accuracy_score(y[te], pred))
        auc_b.append(roc_auc_score(y[te], prob))
        clf.fit(Xt[tr], y[tr])
        pred = clf.predict(Xt[te]); prob = clf.predict_proba(Xt[te])[:,1]
        acc_t.append(accuracy_score(y[te], pred))
        auc_t.append(roc_auc_score(y[te], prob))

    results[name] = {
        'base_acc': np.mean(acc_b),
        'tda_acc':  np.mean(acc_t),
        'Δacc':     np.mean(acc_t)-np.mean(acc_b),
        'base_auc': np.mean(auc_b),
        'tda_auc':  np.mean(auc_t),
        'Δauc':     np.mean(auc_t)-np.mean(auc_b)
    }

res_df = pd.DataFrame(results).T
print(res_df)

Prices ADF p-value:     0.908  (non-stationary)
Log-returns ADF p-value: 0.000  (stationary)

             base_acc   tda_acc     Δacc  base_auc   tda_auc      Δauc
Prices       0.567110  0.566730 -0.00038  0.589347  0.588945 -0.000402
Log-Returns  0.575095  0.574715 -0.00038  0.600740  0.602966  0.002226
