# Investigación de barridas en NDXm

Este cuaderno explora patrones de microestructura intradía y eventos de **barrida bajista** sobre NDXm. Objetivos principales:

- Cargar datos OHLCV intradía desde los NPZ existentes en el pipeline.
- Construir features microestructurales (mechas, rango, TR/ATR, hora) y retornos futuros a varios horizontes; el volumen relativo se evalúa al definir la barrida.
- Definir formalmente el evento de "barrida bajista".
- Analizar estadísticamente los retornos posteriores a las barridas.
- Conectar con el motor de backtesting para probar señales simples derivadas de las barridas.
- Dejar preparado un esquema básico de separación **train/test** por años.

## Setup e imports

In [None]:
import sys, os
# The string literal "__file__" (not the variable) is used here: its dirname is
# "", so PROJECT_ROOT resolves to the parent of the current working directory.
# NOTE(review): presumably chosen because notebook kernels lack __file__ — confirm.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname("__file__"), os.pardir)
)
# Register the project root once so that `src.*` packages can be imported.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.data.feeds import NPZOHLCVFeed
from src.engine.core import BacktestConfig, run_backtest_with_signals
from src.analytics.metrics import equity_curve_metrics, trades_metrics
from src.analytics.reporting import trades_to_dataframe

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Select the plotting theme by name.
# NOTE(review): the "seaborn-v0_8-*" names appear to need matplotlib >= 3.6 — confirm.
plt.style.use("seaborn-v0_8-darkgrid")


## Carga de datos

In [None]:
# Instrument and calendar years to load.
SYMBOL = "NDXm"
YEARS = [2020, 2021, 2022]  # adjustable

# Read the requested years through the project's NPZ feed.
feed = NPZOHLCVFeed(SYMBOL)
ohlcv = feed.load_years(YEARS)

# The feed's `ts` values are interpreted as epoch seconds for the index.
ts = pd.to_datetime(ohlcv.ts, unit="s")
df = pd.DataFrame(
    dict(
        open=ohlcv.o,
        high=ohlcv.h,
        low=ohlcv.l,
        close=ohlcv.c,
        volume=ohlcv.v,
    ),
    index=ts,
)
df = df.sort_index()

# Quick sanity check: first rows and summary statistics.
df_head = df.head()
df_stats = df.describe()

df_head, df_stats


## Construcción de features microestructurales

In [None]:
def add_microstructure_features(
    df: pd.DataFrame,
    atr_window: int = 14,
    horizons: tuple[int, ...] = (5, 10, 20),
) -> pd.DataFrame:
    """Return a copy of *df* extended with candle-microstructure columns.

    The input must have ``open``, ``high``, ``low`` and ``close`` columns and a
    datetime-like index (its ``.hour`` attribute is read). The input frame is
    not modified.

    Added columns:

    - ``range``: ``high - low``.
    - ``upper_wick`` / ``lower_wick``: distance from the candle body
      (max/min of open and close) to the high/low.
    - ``tr``: true range, the largest of ``high - low``,
      ``|high - prev_close|`` and ``|low - prev_close|``.
    - ``atr``: simple rolling mean of ``tr`` over ``atr_window`` rows.
    - ``hour``: hour of the index timestamp.
    - ``fwd_{h}m`` for every ``h`` in ``horizons``: close-to-close return
      ``h`` rows ahead.

    Args:
        df: OHLC(V) bars indexed by timestamp.
        atr_window: Number of rows in the ATR rolling mean.
        horizons: Forward-return horizons, expressed in rows.

    Returns:
        A new DataFrame with the original columns plus the ones listed above.
    """
    out = df.copy()

    body = out[["open", "close"]]
    out["range"] = out["high"] - out["low"]
    out["upper_wick"] = out["high"] - body.max(axis=1)
    out["lower_wick"] = body.min(axis=1) - out["low"]

    prev_close = out["close"].shift(1)
    tr_components = pd.concat(
        [
            out["high"] - out["low"],
            (out["high"] - prev_close).abs(),
            (out["low"] - prev_close).abs(),
        ],
        axis=1,
    )
    # The row-wise max skips NaN, so the first row (no previous close) falls
    # back to plain high - low.
    out["tr"] = tr_components.max(axis=1)
    out["atr"] = out["tr"].rolling(atr_window).mean()

    out["hour"] = out.index.hour

    # The shift is in rows, not clock time.
    # NOTE(review): the "m" suffix is only "minutes" on gap-free 1-minute bars — confirm.
    for horizon in horizons:
        out[f"fwd_{horizon}m"] = out["close"].shift(-horizon) / out["close"] - 1.0

    return out


# Feature table used by the cells below (default ATR window).
df_feat = add_microstructure_features(df)
# Cell output: preview of the first rows.
df_feat.head(n=5)


## Definición de barrida

In [None]:
def compute_sweep_flag(
    df: pd.DataFrame,
    wick_threshold: float = 0.6,
    vol_window: int = 20,
    vol_quantile: float = 0.8,
    hour_window: tuple[int, int] | None = None,
) -> pd.Series:
    '''
    Marca como True las velas que consideramos 'barrida bajista':
    - Mecha inferior grande: lower_wick > wick_threshold * range
    - Volumen alto: volume > rolling quantile(vol_window, vol_quantile)
    - (Opcional) restringido a una ventana horaria [h0, h1]
    '''
    vol_rolling = df["volume"].rolling(vol_window)
    vol_p = vol_rolling.quantile(vol_quantile)

    cond_wick = df["lower_wick"] > df["range"] * wick_threshold
    cond_vol = df["volume"] > vol_p

    if hour_window is not None:
        h0, h1 = hour_window
        cond_hour = (df["hour"] >= h0) & (df["hour"] <= h1)
    else:
        cond_hour = True

    is_sweep = cond_wick & cond_vol & cond_hour
    return is_sweep.fillna(False)


def build_long_signal_from_sweep(is_sweep: pd.Series) -> pd.Series:
    """Convert a boolean sweep mask into an int8 long signal (1 = sweep, 0 = none)."""
    mask = is_sweep.to_numpy()
    values = np.where(mask, np.int8(1), np.int8(0)).astype(np.int8)
    return pd.Series(values, index=is_sweep.index, dtype=np.int8)


# Flag sweeps with the default thresholds and store the derived long signal.
is_sweep = compute_sweep_flag(df_feat)
df_feat["signal_long"] = build_long_signal_from_sweep(is_sweep)
# Cell output: the columns involved in the sweep definition.
df_feat.loc[:, ["range", "lower_wick", "signal_long"]].head()


## Análisis estadístico de barridas

In [None]:
# Forward-return distributions for sweep vs. non-sweep candles.
fwd_cols = ["fwd_5m", "fwd_10m", "fwd_20m"]
sweep_stats = df_feat.loc[is_sweep, fwd_cols].describe()
no_sweep_stats = df_feat.loc[~is_sweep, fwd_cols].describe()

# Rows with no forward return (NaN) are excluded: `NaN > 0` is False, so
# leaving them in would count them as "not up" and bias the probabilities.
has_fwd_10m = df_feat["fwd_10m"].notna()
prob_up_sweep = (df_feat.loc[is_sweep & has_fwd_10m, "fwd_10m"] > 0).mean()
prob_up_no_sweep = (df_feat.loc[~is_sweep & has_fwd_10m, "fwd_10m"] > 0).mean()

# Mean 10-row forward return for each hour of the day.
mean_by_hour = df_feat.groupby("hour")["fwd_10m"].mean()

print("Prob. fwd_10m > 0 tras barrida:", prob_up_sweep)
print("Prob. fwd_10m > 0 sin barrida:", prob_up_no_sweep)

ax = mean_by_hour.plot(kind="bar", figsize=(10, 4))
ax.set_title("Retorno medio a 10m por hora del día")
plt.show()

sweep_stats, no_sweep_stats


## Helper para backtest desde señal

In [None]:
def run_backtest_from_signal(df_feat: pd.DataFrame, signal: pd.Series,
                             sl_atr: float = 1.5,
                             tp_atr: float = 3.0,
                             max_trade_minutes: int = 60,
                             initial_cash: float = 10_000):
    """Run the project backtest engine on a per-bar signal and collect metrics.

    Args:
        df_feat: Frame with ``open``, ``high``, ``low``, ``close``, ``volume``
            and ``atr`` columns, indexed by timestamp.
        signal: Per-bar signal with one value per row of ``df_feat``, in the
            same order (values are passed positionally, not aligned by index).
        sl_atr: Forwarded to ``BacktestConfig.sl_atr``.
        tp_atr: Forwarded to ``BacktestConfig.tp_atr``.
        max_trade_minutes: Forwarded to ``BacktestConfig.max_trade_minutes``.
        initial_cash: Forwarded to ``BacktestConfig.initial_cash``.

    Returns:
        ``(result, metrics)``: the raw engine result and a flat dict that
        merges the equity-curve metrics with the trade metrics (the latter
        under a ``trades_`` key prefix).

    Raises:
        ValueError: If ``signal`` and ``df_feat`` have different lengths.
    """
    if len(signal) != len(df_feat):
        raise ValueError(
            f"signal has {len(signal)} rows but df_feat has {len(df_feat)}"
        )

    # Epoch seconds, whatever the index resolution is (ns, us, ms or s).
    # Dividing the raw int64 view by 10**9 is only correct for nanoseconds.
    ts_arr = df_feat.index.values.astype("datetime64[s]").astype("int64")
    o_arr = df_feat["open"].to_numpy()
    h_arr = df_feat["high"].to_numpy()
    l_arr = df_feat["low"].to_numpy()
    c_arr = df_feat["close"].to_numpy()
    v_arr = df_feat["volume"].to_numpy()
    sig_arr = signal.to_numpy()
    # NOTE(review): the first rows of a rolling ATR are NaN — confirm the
    # engine tolerates NaN in this array.
    atr_arr = df_feat["atr"].to_numpy()

    # Costs are disabled here: zero commission and zero slippage.
    cfg = BacktestConfig(
        initial_cash=initial_cash,
        sl_atr=sl_atr,
        tp_atr=tp_atr,
        max_trade_minutes=max_trade_minutes,
        commission_per_trade=0.0,
        slippage_ticks=0,
    )

    result = run_backtest_with_signals(
        ts_arr, o_arr, h_arr, l_arr, c_arr, v_arr,
        sig_arr,
        atr_arr,
        cfg,
    )

    # The analytics helpers are treated as optional: each is used only when
    # its name is not None.
    metrics = {}
    if equity_curve_metrics is not None:
        eqm = equity_curve_metrics(result.equity_curve)
        metrics.update(eqm.to_dict())

    if trades_metrics is not None and trades_to_dataframe is not None:
        tdf = trades_to_dataframe(result)
        tmet = trades_metrics(tdf)
        metrics.update({f"trades_{k}": v for k, v in tmet.to_dict().items()})

    return result, metrics


## Ejemplo de backtest de barrida

In [None]:
# Restrict the sweep definition to the 08:00-10:59 hours and backtest it.
is_sweep = compute_sweep_flag(
    df_feat,
    wick_threshold=0.6,
    vol_quantile=0.8,
    hour_window=(8, 10),
)
signal = build_long_signal_from_sweep(is_sweep)
result, metrics = run_backtest_from_signal(df_feat, signal)

# Cell output: metrics as a labelled Series.
pd.Series(metrics)


## Separación train/test por años

In [None]:
def split_by_years(df: pd.DataFrame, train_years, test_years):
    """Split *df* into independent train and test copies by index year.

    Rows whose index year is in ``train_years`` go to the first frame, rows
    whose year is in ``test_years`` go to the second. Both are copies.
    """
    index_years = df.index.year
    train_part, test_part = (
        df.loc[index_years.isin(wanted)].copy()
        for wanted in (train_years, test_years)
    )
    return train_part, test_part


df_train, df_test = split_by_years(
    df_feat,
    train_years=[2020, 2021],
    test_years=[2022],
)

# Example: calibrate parameters on df_train and keep df_test for validation.
len(df_train), len(df_test)


## Notas y siguientes pasos

- Ajustar parámetros de definición de barrida (mecha, volumen, ventana horaria) y medir estabilidad.
- Experimentar con filtros adicionales (tendencia intradía, volatilidad previa, distancias a VWAP si está disponible).
- Extender señales para posiciones cortas o estrategias de reversión.
- Validar en `df_test` y documentar resultados clave.