In [None]:
%pip -q install hmmlearn arch statsmodels pandas numpy matplotlib

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from hmmlearn.hmm import GaussianHMM
from arch import arch_model
from statsmodels.tsa.arima.model import ARIMA

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (12,5)
np.random.seed(42)


In [None]:
# ---------------- Overlay parameters ----------------

# Forecast horizon in bars; should match the horizon used by the model target.
H = 3

# Baseline confidence threshold applied to proba_up / proba_down.
BASE_THR = 0.53

# Margin by which p_hit_tp must exceed p_hit_sl for a trade to pass.
MARGIN_HIT = 0.05

# Volatility guard: no trading when sigma_t is at or above this quantile
# (i.e. the top 5% of estimated volatility).
VOL_PCTL_ABS = 0.95

# Volatility target used for volatility-based sizing (described as a daily
# target, example value).
# NOTE(review): it is divided by the per-bar sigma estimate — confirm the units
# match the bar frequency of the input data.
TARGET_VOL = 0.02

# Upper bound on the fraction of capital per trade (cap on the Kelly size).
F_MAX = 0.02

# Baseline ATR multiplier for the take-profit distance (as in the baseline).
TP_ATR_K = 2

# Baseline ATR multiplier for the stop-loss distance (as in the baseline).
SL_ATR_K = 1

# ----------------------------------------------------


In [None]:
def compute_atr_wilder(df, n=14):
    """Compute the Average True Range with Wilder-style smoothing.

    Intended as a fallback when the input has no precomputed ``atr_14`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``high``, ``low`` and ``close`` columns (cast to float).
    n : int, default 14
        Smoothing length; the exponential weight is ``alpha = 1 / n``.

    Returns
    -------
    pandas.Series
        ATR values aligned with ``df.index``.
    """
    hi = df['high'].astype(float)
    lo = df['low'].astype(float)
    last_close = df['close'].astype(float).shift(1)

    # True range is the largest of the three candidate ranges. The row-wise
    # max skips NaN, so the first bar (no previous close) uses high - low.
    candidates = [
        (hi - lo).abs(),
        (hi - last_close).abs(),
        (lo - last_close).abs(),
    ]
    true_range = pd.concat(candidates, axis=1).max(axis=1)

    # Recursive (adjust=False) exponential mean with alpha = 1/n.
    return true_range.ewm(alpha=1/n, adjust=False).mean()

def fit_hmm_regimes(returns, n_states=2):
    """Fit a Gaussian HMM to a return series and label each observation.

    Parameters
    ----------
    returns : pandas.Series
        Return series; NaN entries are dropped before fitting.
    n_states : int, default 2
        Number of hidden states.

    Returns
    -------
    tuple
        ``(model, states)`` where ``states`` is a Series of decoded state
        labels reindexed to ``returns.index`` (NaN where the return was NaN),
        or ``(None, None)`` when fewer than 200 non-NaN observations exist.

    Notes
    -----
    The model is fitted and decoded on the whole series passed in.
    NOTE(review): state numbering comes from the fitted model and presumably
    carries no inherent ordering (e.g. by variance) — confirm before treating
    a specific label as the high-volatility state.
    """
    clean = returns.dropna()
    # Below 200 points the fit is considered too unstable to use.
    if len(clean) < 200:
        return None, None

    observations = clean.values.reshape(-1, 1)
    model = GaussianHMM(
        n_components=n_states,
        covariance_type='full',
        n_iter=200,
        random_state=42,
    )
    model.fit(observations)

    states = pd.Series(model.predict(observations), index=clean.index)
    return model, states.reindex(returns.index)

def garch_sigma(returns):
    """Estimate per-observation volatility sigma_t of a return series.

    With at least 300 non-NaN returns a constant-mean GARCH(1,1) with normal
    innovations is fitted (on returns scaled to percent) and its conditional
    volatility is returned, scaled back to fractions. With fewer observations
    the function falls back to a 20-period rolling standard deviation of the
    returns.

    Parameters
    ----------
    returns : pandas.Series
        Return series (fractions, not percent); may contain NaN.

    Returns
    -------
    pandas.Series
        Volatility estimate aligned with ``returns.index`` (NaN where no
        estimate is available).

    Notes
    -----
    The GARCH parameters are estimated once on the whole series passed in.
    """
    r = (returns.dropna()*100).astype(float)  # percent scale for the optimizer
    if len(r) < 300:
        # Fallback: rolling std of the raw returns. Taking the std of
        # |returns| would understate volatility (it is 0 for returns of
        # constant magnitude) and would disagree with the rolling-std
        # fallback used by overlays_postprocess.
        return returns.rolling(20).std().reindex(returns.index)

    am = arch_model(r, p=1, q=1, mean='Constant', vol='GARCH', dist='normal')
    res = am.fit(disp="off")
    cond_vol = res.conditional_volatility / 100.0  # back to fractions
    return cond_vol.reindex(returns.index)

def arima_sign(returns):
    """Sign of a one-step-ahead ARIMA(1,0,1) prediction for every row.

    For each observation the value is the sign of the model's prediction of
    the *next* return: in-sample one-step-ahead predictions are shifted back
    by one step, and the last row uses the out-of-sample one-step forecast.

    Previously a single forecast for the step after the end of the sample was
    broadcast to all rows, so every row received the same constant that was
    computed from future data.

    Parameters
    ----------
    returns : pandas.Series
        Return series; NaN entries are dropped before fitting.

    Returns
    -------
    pandas.Series
        Values in {-1.0, 0.0, 1.0} aligned with ``returns.index``. Rows before
        the first non-NaN return stay NaN; later gaps are forward-filled.
        All-NaN when fewer than 100 non-NaN observations are available.

    Notes
    -----
    The model parameters are still estimated once on the whole series, so the
    coefficients (not the per-row conditioning data) use the full sample.
    """
    y = returns.dropna()
    if len(y) < 100:
        return pd.Series(index=returns.index, dtype=float)

    res = ARIMA(y, order=(1,0,1)).fit()

    # fitted[t] is the prediction of y[t] made from data up to t-1, so the
    # prediction relevant for a decision taken at row t is fitted[t+1].
    fitted = np.asarray(res.fittedvalues, dtype=float).ravel()
    next_step = float(np.asarray(res.forecast(1), dtype=float).ravel()[0])
    pred_next = np.append(fitted[1:], next_step)

    sgn = pd.Series(np.sign(pred_next), index=y.index)
    return sgn.reindex(returns.index).ffill()

def mc_hit_probs(close_series, mu, sigma, H, n_paths=2000, dt=1):
    """Simulate simplified GBM price paths and return their extremes.

    Parameters
    ----------
    close_series : pandas.Series
        Price history; only the last value is used as the starting price.
    mu : float
        Drift per step; each log-step is ``(mu - 0.5*sigma**2)*dt`` plus noise.
    sigma : float
        Volatility per step.
    H : int
        Number of simulated steps.
    n_paths : int, default 2000
        Number of simulated paths.
    dt : float, default 1
        Step length.

    Returns
    -------
    tuple of numpy.ndarray
        ``(max_arr, min_arr)``: per-path maximum and minimum simulated price
        over the H steps, each of shape ``(n_paths,)``. The starting price
        itself is not part of the path.

    Notes
    -----
    Draws come from NumPy's global random state.
    """
    start_price = close_series.iloc[-1]
    shocks = np.random.normal(size=(n_paths, H))

    drift = (mu - 0.5*sigma**2)*dt
    diffusion = sigma*np.sqrt(dt)
    log_moves = drift + diffusion*shocks

    # Cumulative log-moves along each path, converted to price levels.
    paths = start_price * np.exp(log_moves.cumsum(axis=1))
    return paths.max(axis=1), paths.min(axis=1)

def kelly_lite(prob_up, R=1.0, f_max=F_MAX):
    """Capped Kelly fraction for a bet with win probability and payoff ratio.

    Parameters
    ----------
    prob_up : float
        Probability of the favourable outcome; clipped to [1e-6, 1 - 1e-6].
    R : float, default 1.0
        Reward-to-risk ratio of the bet.
    f_max : float
        Upper bound for the returned fraction; the default is the value of the
        module-level ``F_MAX`` at definition time.

    Returns
    -------
    float
        ``(p*R - (1 - p)) / R`` clipped to ``[0, f_max]``.
    """
    win_p = np.clip(prob_up, 1e-6, 1-1e-6)
    lose_p = 1 - win_p
    raw_fraction = (win_p*R - lose_p)/R
    # A negative edge means no position; f_max caps the size.
    return float(np.clip(raw_fraction, 0.0, f_max))

def _arima_vetoes(row, direction, thr):
    """Return True when the ARIMA sign contradicts a low-confidence candidate."""
    sign = row['arima_sign']
    if sign != sign:  # NaN: ARIMA has no opinion, never veto
        return False
    if direction == 'long':
        return bool(sign < 0 and row['proba_up'] < (thr + 0.05))
    return bool(sign > 0 and row['proba_down'] < (thr + 0.05))


def overlays_postprocess(df):
    """Turn model probabilities into filtered long/short orders.

    Applies a chain of filters (HMM regime, GARCH volatility cut-off, ARIMA
    sign check, Monte Carlo TP/SL hit comparison) and sizes the surviving
    trades with a capped Kelly fraction and a volatility target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time``, ``close``, ``high``, ``low`` and at least one of
        ``proba_up`` / ``pred_class``. ``atr_14`` and ``proba_down`` are used
        when present and derived otherwise.

    Returns
    -------
    pandas.DataFrame
        A time-sorted copy of the input with a fresh integer index and the
        added columns ``atr_14`` (if missing), ``logret``, ``regime``,
        ``sigma_t``, ``arima_sign``, ``proba_up`` / ``proba_down`` (if missing),
        ``signal`` (+1 long, -1 short, 0 skip), ``tp_price``, ``sl_price`` and
        ``size``.

    Raises
    ------
    AssertionError
        If ``close`` / ``high`` / ``low`` are missing.
    ValueError
        If neither ``proba_up`` nor ``pred_class`` is present.

    Notes
    -----
    * The regime model, the volatility model, the ARIMA model and the
      volatility cut-off quantile are all computed once on the full input, so
      rows are filtered using statistics that include later data.
    * The Monte Carlo filter compares the share of paths reaching TP with the
      share reaching SL independently; a path touching both counts for both.
    * NOTE(review): ``mu_loc`` is a mean log return while ``mc_hit_probs``
      subtracts ``0.5*sigma**2`` from the drift it is given — confirm the
      intended drift convention.
    """
    # Sort by time and copy to avoid modifying the caller's frame.
    df = df.sort_values('time').reset_index(drop=True).copy()

    # Ensure required price columns exist.
    assert {'close','high','low'}.issubset(df.columns), "Нужны колонки close/high/low"

    # Must have either proba_up or pred_class to derive probabilities.
    if ('proba_up' not in df.columns) and ('pred_class' not in df.columns):
        raise ValueError("Добавьте в входной набор либо 'proba_up', либо 'pred_class' из вашей модели.")

    # If ATR is missing, compute it via the Wilder method.
    if 'atr_14' not in df.columns:
        df['atr_14'] = compute_atr_wilder(df, n=14)

    # Log returns drive the regime / volatility / ARIMA estimates.
    df['logret'] = np.log(df['close']).diff()

    # 1) Hidden Markov regime classification on returns.
    _hmm, regimes = fit_hmm_regimes(df['logret'])
    # Store NaN (not None) when no model was fitted: a column of None would
    # pass the "x == x" NaN test and crash on int(None) further down.
    df['regime'] = regimes if regimes is not None else np.nan

    # HMM state numbers are arbitrary, so identify the high-volatility state
    # from the data instead of assuming it is state 1.
    high_vol_regime = None
    if regimes is not None:
        vol_by_regime = df.groupby('regime')['logret'].std()
        if vol_by_regime.notna().any():
            high_vol_regime = vol_by_regime.idxmax()

    # 2) Conditional volatility sigma_t and the "too volatile to trade" cut-off.
    df['sigma_t'] = garch_sigma(df['logret'])
    vol_cut = df['sigma_t'].quantile(VOL_PCTL_ABS)
    # Fallback volatility for rows without sigma_t (computed once, not per row).
    fallback_sigma = df['logret'].rolling(20).std()

    # 3) ARIMA one-step sign estimate (sanity check).
    df['arima_sign'] = arima_sign(df['logret'])

    # Build probability columns: if proba_up is missing, derive it from pred_class.
    if 'proba_up' not in df.columns:
        df['proba_up'] = (df['pred_class'] == 2).astype(float) * 0.6 + 0.2
    # For symmetric trading proba_down is needed too; derive it from pred_class
    # when available, otherwise use the complement of proba_up.
    if 'proba_down' not in df.columns:
        if 'pred_class' in df.columns:
            df['proba_down'] = (df['pred_class'] == 0).astype(float) * 0.6 + 0.2
        else:
            df['proba_down'] = 1.0 - df['proba_up']

    # One tuple per row: (signal, tp_price, sl_price, size).
    no_trade = (0, float('nan'), float('nan'), 0.0)
    out = []
    for i in range(len(df)):
        row = df.iloc[i]

        # Skip the warm-up rows and rows without a return.
        if i < 50 or (row['logret'] != row['logret']):  # NaN check
            out.append(no_trade)
            continue

        # Skip trading in extreme volatility.
        if (row['sigma_t'] == row['sigma_t']) and row['sigma_t'] >= vol_cut:
            out.append(no_trade)
            continue

        # Demand more confidence while in the high-volatility regime.
        thr = BASE_THR
        if high_vol_regime is not None and row['regime'] == high_vol_regime:
            thr = BASE_THR + 0.05

        # Candidate directions based on probabilities.
        long_cond = (row['proba_up'] >= thr)
        short_cond = (row['proba_down'] >= thr)
        if not long_cond and not short_cond:
            out.append(no_trade)
            continue

        # Single candidate wins; if both qualify, take the more probable one
        # (ties go to long).
        if long_cond and (not short_cond or row['proba_up'] >= row['proba_down']):
            direction = 'long'
        else:
            direction = 'short'

        # ARIMA sanity check: an opposing sign blocks low-confidence trades.
        if _arima_vetoes(row, direction, thr):
            out.append(no_trade)
            continue

        # ATR and volatility estimate (rolling-std fallback when sigma_t is NaN).
        atr = row['atr_14']
        sigma = row['sigma_t'] if (row['sigma_t'] == row['sigma_t']) else fallback_sigma.iloc[i]

        # Step distance: the larger of the ATR-based and volatility-based
        # distances, ignoring whichever one is NaN.
        distances = [d for d in (TP_ATR_K * atr, 2.0 * sigma * row['close']) if d == d]
        step = max(distances) if distances else float('nan')
        # Without a finite, positive distance TP/SL would be NaN or equal to
        # the entry price; do not emit such an order.
        if not np.isfinite(step) or step <= 0:
            out.append(no_trade)
            continue

        # Take-profit and stop-loss; the SL distance is scaled by SL_ATR_K / TP_ATR_K.
        if direction == 'long':
            base_tp = row['close'] + step
            base_sl = row['close'] - step * (SL_ATR_K / TP_ATR_K)
        else:
            base_tp = row['close'] - step
            base_sl = row['close'] + step * (SL_ATR_K / TP_ATR_K)

        # Monte Carlo filter: TP must be sufficiently more likely than SL.
        if (sigma == sigma) and sigma > 0:
            mu_loc = df['logret'].iloc[max(0, i-50):i].mean()
            max_arr, min_arr = mc_hit_probs(df['close'].iloc[:i+1], mu=mu_loc, sigma=sigma, H=H, n_paths=1500)
            if direction == 'long':
                p_hit_tp = (max_arr >= base_tp).mean()
                p_hit_sl = (min_arr <= base_sl).mean()
            else:
                p_hit_tp = (min_arr <= base_tp).mean()
                p_hit_sl = (max_arr >= base_sl).mean()
            if p_hit_tp < p_hit_sl + MARGIN_HIT:
                out.append(no_trade)
                continue

        # Position sizing: capped Kelly fraction, further limited by the
        # volatility target. R is the reward-to-risk ratio of the order.
        R = abs((base_tp - row['close']) / max(abs(row['close'] - base_sl), 1e-9))
        prob = row['proba_up'] if direction == 'long' else row['proba_down']
        f_kelly = kelly_lite(prob, R=R, f_max=F_MAX)
        size_vol = TARGET_VOL / max(sigma, 1e-6) if (sigma == sigma) else F_MAX
        size = float(np.clip(min(f_kelly, size_vol), 0.0, F_MAX))

        signal = 1 if direction == 'long' else -1
        out.append((signal, base_tp, base_sl, size))

    # Assign computed columns back to the DataFrame.
    df['signal']   = [o[0] for o in out]
    df['tp_price'] = [o[1] for o in out]
    df['sl_price'] = [o[2] for o in out]
    df['size']     = [o[3] for o in out]

    return df


In [None]:
# NOTE: duplicates the imports of the first code cell; kept so the cell can
# still be run on its own.
import pandas as pd, numpy as np

# Load the raw bars; the "time" column is parsed as datetime.
df = pd.read_csv("../../data/SBER_dataset_5m.csv", parse_dates=["time"])

# Placeholder: a random "up" probability per row
# (replace with the real model probabilities later).
df["proba_up"] = np.random.uniform(0.1, 0.9, len(df))
df["proba_down"] = 1.0 - df["proba_up"]

# Optional surrogate pred_class built from the target label (-1/0/1 -> 0/1/2,
# anything else -> 1). Only added when the dataset actually has a "label"
# column; previously a missing column raised KeyError.
# NOTE(review): this is derived from the target itself, so it must not be used
# as if it were a model prediction.
if "label" in df.columns:
    df["pred_class"] = df["label"].map({-1:0, 0:1, 1:2}).fillna(1).astype(int)

df.to_csv("signals_raw.csv", index=False)
print("✅ Created signals_raw.csv (synthetic proba_up/down added)")


In [None]:
# Input CSV with the model signals (adjust the path as needed).
IN_CSV = "signals_raw.csv"
# Output CSV where the generated orders are saved.
OUT_CSV = "orders_overlays.csv"

In [None]:
# Load the prepared signals, run the overlay filters and save the resulting
# orders to OUT_CSV.
df_in = pd.read_csv(IN_CSV, parse_dates=['time'])
df_out = overlays_postprocess(df_in)
df_out.to_csv(OUT_CSV, index=False)
print("Saved:", OUT_CSV)

# Display the last rows as a quick check of the added columns.
df_out.tail()

In [None]:
df = df_out.copy()
df = df.sort_values('time').reset_index(drop=True)

# Close-to-close simple returns.
df['ret_1'] = df['close'].pct_change()

# Toy scheme: the signal of one bar (+1 long, -1 short, 0 flat) is held over
# the next bar only; TP/SL levels and position size are ignored here.
df['signal_shift'] = df['signal'].shift(1).fillna(0)
df['str_ret'] = df['signal_shift'] * df['ret_1']    # no leverage, rough estimate

# Cumulative growth of one unit of capital.
bh = (1+df['ret_1'].fillna(0)).cumprod()
st = (1+df['str_ret'].fillna(0)).cumprod()

# Explicit figure/axes so the plot stands alone with title and axis labels.
fig, ax = plt.subplots()
ax.plot(df['time'], bh, label='Buy&Hold')
ax.plot(df['time'], st, label='Overlay Strategy (no leverage)')
ax.set(
    title='Cumulative Returns (toy check)',
    xlabel='Time',
    ylabel='Growth of 1 unit of capital',
)
ax.legend()
ax.grid(True)
plt.show()
