In [1]:
from qresearch.portfolio.weights import TopKBookConfig
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Literal, Tuple, Optional


def momentum_scores(prices: pd.DataFrame, lookback: int = 20) -> pd.DataFrame:
    """
    Trailing simple return used as a momentum score.

    Each cell holds ``price[t] / price[t - lookback] - 1``. The result has the
    same index and columns as ``prices``. Infinite values, whether present in
    the input or produced by the division, are converted to NaN.
    """
    non_finite = [np.inf, -np.inf]
    clean = prices.replace(non_finite, np.nan)
    reference = clean.shift(lookback)
    return clean.div(reference).sub(1.0).replace(non_finite, np.nan)

### 下载指数数据

In [2]:
import akshare as ak

# Fetch the current spot quotes for the "沪深重要指数" index group from AKShare
# and print them, to look up the index codes used in the sections below.
# This is a live network call, so the printed table changes from run to run.
stock_zh_index_spot_em_df = ak.stock_zh_index_spot_em(symbol="沪深重要指数")
print(stock_zh_index_spot_em_df)

    序号      代码       名称       最新价   涨跌幅    涨跌额         成交量           成交额  \
0    1  000001     上证指数   4151.24  0.27  11.34   823026748  1.365526e+12   
1    2  399001     深证成指  14342.89  0.09  12.98   883025460  1.599884e+12   
2    3  899050     北证50   1562.45 -0.16  -2.49    10993221  2.720108e+10   
3    4  399006     创业板指   3323.56 -0.57 -19.04   267660982  7.493185e+11   
4    5  000680     科创综指   1880.57 -0.47  -8.89    55340046  3.200188e+11   
5    6  000688     科创50   1554.80 -0.08  -1.18    15543610  1.046132e+11   
6    7  399330    深证100   5847.99 -0.27 -15.86    73652958  2.867423e+11   
7    8  000300    沪深300   4717.99  0.26  12.30   382255602  8.294141e+11   
8    9  000016     上证50   3060.56  0.27   8.17    96366780  2.526225e+11   
9   10  399673    创业板50   3463.33 -0.37 -12.84    39127050  2.368073e+11   
10  11  000888  上证综合全收益   4758.59  0.27  13.00   823026748  1.365526e+12   
11  12  399750    深主板50   9078.60 -0.18 -16.66    50884167  1.027487e+11   
12  13  8996

### 大小盘与成长价值
- 创业板指数可作为成长指数，上证50作为价值指数

In [3]:
# Index code -> short label for the "small cap / growth" leg.
INDEX_SMALL = {
    '000852': 'CSI1000',
    '000905': 'CSI500',
    '399006': 'ChiNext',
}

# Index code -> short label for the "large cap / value" leg.
# NOTE(review): code 000016 is listed as 上证50 (SSE 50) in the spot table printed
# above, yet it is labelled 'CSI50' here. Later cells select it by that label,
# so the label is load-bearing; confirm the intended name before renaming.
INDEX_LARGE = {
    '000300': 'CSI300',
    '000016': 'CSI50',
}

# All tracked indices, code -> label.
INDEX = INDEX_SMALL | INDEX_LARGE
# Inverse of INDEX. NOTE(review): despite its name this maps label -> code
# (e.g. 'CSI300' -> '000300'), not code -> label.
TICK_TO_NAME_MAP = {v: k for k, v in INDEX.items()}

def build_pair_prices(df_map: dict, large: str, small: str) -> pd.DataFrame:
    """
    Build a two-column close-price table for one (large, small) index pair.

    ``df_map[name]`` must be an AKShare index history frame containing at
    least the columns '日期' (date) and '收盘' (close). The input frames are
    not modified.

    Returns a DataFrame indexed by ``Date`` (ascending) with columns
    ``[large, small]``, restricted to dates present in both frames and with
    rows dropped where either close is missing or non-numeric.

    Raises KeyError when either name is absent from ``df_map``.
    """
    if large not in df_map or small not in df_map:
        raise KeyError(f"Missing in df_map: {large if large not in df_map else ''} {small if small not in df_map else ''}")

    def _close_leg(name: str) -> pd.DataFrame:
        # One leg: parsed date plus the close column renamed to the index label.
        leg = df_map[name].copy()
        leg["Date"] = pd.to_datetime(leg["日期"])
        return leg[["Date", "收盘"]].rename(columns={"收盘": name})

    # Inner join keeps only the trading days both indices share.
    joined = _close_leg(large).merge(_close_leg(small), how="inner", on="Date")
    px = joined.set_index("Date").sort_index()

    # Coerce to numeric; anything unparsable becomes NaN and is dropped below.
    for column in (large, small):
        px[column] = pd.to_numeric(px[column], errors="coerce")

    return px.dropna()

In [4]:
import time


def fetch_index_history(symbol, start_date="20000101", end_date="20260127",
                        retries=3, backoff_sec=2.0):
    """
    Download the daily history of one index from AKShare, retrying on
    connection failures.

    Parameters
    ----------
    symbol : index code, e.g. '000300'.
    start_date, end_date : 'YYYYMMDD' strings forwarded to AKShare.
    retries : total number of attempts (values below 1 are treated as 1).
    backoff_sec : base wait; attempt k sleeps ``backoff_sec * k`` before retrying.

    Returns whatever ``ak.index_zh_a_hist`` returns. Re-raises the last
    error once all attempts are used up.
    """
    attempts = max(1, int(retries))
    for attempt in range(1, attempts + 1):
        try:
            return ak.index_zh_a_hist(symbol=symbol, period="daily",
                                      start_date=start_date, end_date=end_date)
        except OSError:
            # The failure recorded for this cell was a dropped connection.
            # OSError covers the built-in ConnectionError; presumably it also
            # covers the HTTP client's connection errors — verify against the
            # installed client library.
            if attempt == attempts:
                raise
            time.sleep(backoff_sec * attempt)


# label -> history frame. Built in a single expression so df_map is only bound
# once every download has succeeded, never left half-filled.
df_map = {name: fetch_index_history(code) for code, name in INDEX.items()}

  0%|          | 0/17 [00:00<?, ?it/s]

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

### 选择大小盘

In [None]:
# Labels of the pair rotated in the headline backtest (keys of df_map).
LARGE, SMALL = 'CSI50', 'ChiNext'

# Reuse the notebook's pair builder instead of repeating the merge by hand.
# The result is still a Date-indexed frame with columns [LARGE, SMALL] on the
# common trading days; it is additionally sorted by date, coerced to numeric,
# and free of NaN rows, matching what the pair sweep further down uses.
prices = build_pair_prices(df_map, LARGE, SMALL)

In [5]:
from qresearch.backtest.portfolio import backtest_weights, run_one
from typing import Any, Dict
from dataclasses import field
from matplotlib import pyplot as plt
from qresearch.backtest.metrics import perf_summary, TRADING_DAYS, drawdown_series_from_equity, yearly_returns

# Allowed rebalance scheduling modes for ExperimentConfig.rebalance_mode.
RebalanceMode = Literal["calendar", "fixed_h"]


# The data starts on 2005-01-04, but the market was immature before 2010,
# so the default start date is set to 2010 or later.

@dataclass(frozen=True)
class ExperimentConfig:
    # Immutable bundle of backtest settings; in this notebook it is passed to
    # run_one, build_benchmark_weights and run_pair.

    # First date of the sample (string; compared against the price index).
    start: str = "2010-01-01"
    # Last date of the sample; None means no upper bound.
    end: Optional[str] = None
    
    # schedule
    rebalance_mode: RebalanceMode = "fixed_h"
    rebalance: str = "W-FRI"     # used if calendar
    H: int = 5                   # used if fixed_h
    offset: int = 0

    # Number of top-scored assets to hold (see the "Top-{top_k}" plot title).
    top_k: int = 1
    # Trading fee — presumably in basis points per unit of turnover; confirm in run_one.
    fee_bps: float = 2.0

    # Benchmark construction; see build_benchmark_weights for both modes.
    benchmark_mode: Literal["equal_weight_all", "single_ticker"] = "single_ticker"
    # Must name a price column when benchmark_mode == "single_ticker".
    benchmark_ticker: Optional[str] = None
    # Annual risk-free rate.
    rf_annual: float = 0.0

    # registry-style
    # NOTE(review): the cells in this notebook compute scores themselves and pass
    # them to run_one, so it is unclear whether signal_params is consulted.
    signal_name: str = "mom_ret"
    signal_params: Dict[str, Any] = field(default_factory=lambda: {"lookback": 21, "skip": 0})


def build_benchmark_weights(prices: pd.DataFrame, cfg: ExperimentConfig) -> pd.DataFrame:
    """
    Build a constant benchmark weight table with the shape of ``prices``.

    ``cfg.benchmark_mode`` selects the allocation:
      * "equal_weight_all": every column gets ``1 / n_columns``.
      * "single_ticker": ``cfg.benchmark_ticker`` gets 1.0, all others 0.0.

    Returns a DataFrame with the index and columns of ``prices``, holding the
    same weight row on every date.

    Raises ValueError for an unknown mode, or when "single_ticker" is
    selected and ``cfg.benchmark_ticker`` is empty or not a price column.
    """
    tickers = prices.columns.tolist()
    mode = cfg.benchmark_mode

    if mode == "equal_weight_all":
        row = pd.Series(1.0 / len(tickers), index=tickers)
    elif mode == "single_ticker":
        target = cfg.benchmark_ticker
        if not target or target not in tickers:
            raise ValueError("benchmark_ticker must be in prices.columns")
        row = pd.Series(0.0, index=tickers)
        row[target] = 1.0
    else:
        raise ValueError(f"Unknown benchmark_mode: {cfg.benchmark_mode}")

    # Repeat the single weight row once per date.
    repeated = np.tile(row.values, (len(prices), 1))
    return pd.DataFrame(repeated, index=prices.index, columns=tickers)


def plot_compare(eq_s: pd.Series, eq_b: pd.Series, ret_s: pd.Series, ret_b: pd.Series, title: str) -> None:
    """
    Show three strategy-vs-benchmark figures, in this order:
    equity curves, drawdown curves, and a bar chart of calendar-year returns.

    eq_s / eq_b are the strategy and benchmark equity series, ret_s / ret_b
    their return series, and ``title`` is the heading of the equity figure.
    Each figure is displayed with ``plt.show()``; nothing is returned.
    """

    def _two_line_chart(strategy, benchmark, strategy_label, heading, size):
        # One figure with the strategy and benchmark series overlaid.
        fig, ax = plt.subplots(figsize=size)
        ax.plot(strategy.index, strategy.values, label=strategy_label)
        ax.plot(benchmark.index, benchmark.values, label="Benchmark")
        ax.set_title(heading)
        ax.legend()
        fig.tight_layout()
        plt.show()

    # equity
    _two_line_chart(eq_s, eq_b, "Strategy (Net)", title, (11, 5))

    # drawdown
    _two_line_chart(
        drawdown_series_from_equity(eq_s),
        drawdown_series_from_equity(eq_b),
        "Strategy",
        "Drawdown",
        (11, 4),
    )

    # yearly returns, strategy and benchmark side by side
    yearly = pd.concat(
        [
            yearly_returns(ret_s).rename("Strategy"),
            yearly_returns(ret_b).rename("Benchmark"),
        ],
        axis=1,
    ).sort_index()

    fig, ax = plt.subplots(figsize=(10, 5))
    yearly.plot(kind="bar", ax=ax)
    ax.set_title("Calendar-Year Returns")
    ax.set_ylabel("Return")
    ax.set_xlabel("Year")
    fig.tight_layout()
    plt.show()

# Headline run: fixed-horizon rebalancing every 2 bars, hold the single
# best-scored index, 2 bps fee, equal-weight benchmark, 1.5% annual risk-free rate.
cfg = ExperimentConfig(
    H=2,
    top_k=1,
    fee_bps=2.0,
    benchmark_mode="equal_weight_all",
    rf_annual=0.015,
)

# market_data = download_market_data(tickers, start=cfg.start, end=cfg.end)
# `prices` comes from the pair-selection cell above; that cell (and the download
# cell before it) must have run successfully, otherwise this line raises NameError.
# Only the lower bound cfg.start is applied here; cfg.end is not used.
price_df = prices[prices.index >= cfg.start]

# 20-bar momentum, computed on the already date-filtered prices.
scores = momentum_scores(price_df, lookback=20)

# `out` is reused by the plotting and attribution cells below
# (keys read in this notebook: "stats", "strat", "bench").
out = run_one(price_df, scores=scores, cfg=cfg)
print(out["stats"])
print(yearly_returns(out['strat'].net_ret))

plot_compare(
    eq_s=out["strat"].equity_net,
    eq_b=out["bench"].equity_net,
    ret_s=out["strat"].net_ret,
    ret_b=out["bench"].net_ret,
    title=f"Top-{cfg.top_k} {cfg.signal_name} Rotation vs Benchmark",
)


NameError: name 'prices' is not defined

In [None]:
# Strategy equity against each index rebased to 1.0 on the first sample date.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(out['strat'].equity_net, label='Strategy')

rebased = price_df / price_df.iloc[0]
# One line per index, labelled by its own column name, so the legend cannot get
# out of step with the column order.
for name in rebased.columns:
    ax.plot(rebased.index, rebased[name], label=name)

ax.set_title('Cumulative Return')
ax.set_xlabel('Date')
ax.set_ylabel('Growth of 1')
ax.legend()
fig.tight_layout()
plt.show()

### 不同大小盘股指数回测

In [None]:
def run_pair(df_map: dict, large: str, small: str, cfg: ExperimentConfig, lookback: int = 20) -> dict:
    """
    Backtest the momentum rotation on one (large, small) index pair.

    Prices come from ``build_pair_prices`` and are restricted to
    ``[cfg.start, cfg.end]`` (no upper bound when ``cfg.end`` is empty).
    Scores are ``momentum_scores`` with the given ``lookback``.

    Returns a dict with:
      * "row": flat summary of the strategy (pair labels, observation count,
        headline stats, mean turnover, share of days with ~zero exposure),
      * "out": the raw ``run_one`` result,
      * "prices": the filtered price table that was backtested.
    """
    prices_pair = build_pair_prices(df_map, large, small)

    # Date window from the config.
    keep = prices_pair.index >= pd.Timestamp(cfg.start)
    if cfg.end:
        keep = keep & (prices_pair.index <= pd.Timestamp(cfg.end))
    prices_pair = prices_pair.loc[keep]

    pair_scores = momentum_scores(prices_pair, lookback=lookback)
    out = run_one(prices_pair, scores=pair_scores, cfg=cfg)

    strat = out["strat"]
    stats = out["stats"]

    def _strategy_stat(metric: str) -> float:
        # Value of one metric from the "Strategy" column of the stats table.
        return float(stats.loc[metric, "Strategy"])

    # Diagnostics that make pairs comparable side by side.
    row = {"LARGE": large, "SMALL": small, "n_obs": len(strat.net_ret)}
    for metric in ("ann_return_geo", "ann_vol", "sharpe", "max_dd"):
        row[metric] = _strategy_stat(metric)
    row["total_return"] = float(strat.equity_net.iloc[-1] - 1.0)
    row["turnover_mean"] = float(strat.turnover.mean())
    row["cash_ratio"] = float((strat.exposure < 1e-9).mean())

    return {"row": row, "out": out, "prices": prices_pair}

def sweep_pairs(df_map: dict,
                large_list: list[str],
                small_list: list[str],
                cfg: ExperimentConfig,
                lookback: int = 20,
                drop_same: bool = True) -> pd.DataFrame:
    """
    Run ``run_pair`` for every (large, small) combination and tabulate results.

    Pairs with identical labels are skipped when ``drop_same`` is true. A pair
    whose backtest raises is kept in the table as a row holding only the pair
    labels and an "error" message, so one bad pair does not abort the sweep.

    Returns one row per pair, sorted by Sharpe and then geometric annual
    return (both descending, NaN last) whenever a "sharpe" column exists.
    """
    combos = [
        (big, tiny)
        for big in large_list
        for tiny in small_list
        if not (drop_same and big == tiny)
    ]

    records = []
    for big, tiny in combos:
        try:
            result = run_pair(df_map, big, tiny, cfg=cfg, lookback=lookback)
            records.append(result["row"])
        except Exception as exc:
            records.append({"LARGE": big, "SMALL": tiny, "error": str(exc)})

    table = pd.DataFrame(records)
    if "sharpe" not in table.columns:
        # Every pair failed (or there were none): nothing to rank.
        return table
    return table.sort_values(["sharpe", "ann_return_geo"], ascending=False, na_position="last")

# Candidate labels for each leg, taken from the index tables.
large_list = list(INDEX_LARGE.values())   # ['CSI300', 'CSI50']
small_list = list(INDEX_SMALL.values())   # ['CSI1000', 'CSI500', 'ChiNext']

# NOTE: this rebinds the notebook-level `cfg` used by the headline run above.
# Here the sample starts at the first data date (2005-01-04) and H is 1.
# `out` from the headline run is left untouched.
cfg = ExperimentConfig(
    start="2005-01-04",
    end=None,
    H=1,
    top_k=1,
    fee_bps=2.0,
    benchmark_mode="equal_weight_all",
    rf_annual=0.015
)

# One row per (large, small) pair, best Sharpe first.
df_pairs = sweep_pairs(df_map, large_list, small_list, cfg=cfg, lookback=20)
print(df_pairs)

### 收益归因：到底是“少数牛市段贡献”还是“长期稳定小幅赚钱”

In [None]:
def _logret(r: pd.Series) -> pd.Series:
    r = r.fillna(0.0).replace([np.inf, -np.inf], 0.0)
    return np.log1p(r)

def _ann_geo_from_logsum(logsum: float, n_days: int, trading_days: int = 252) -> float:
    if n_days <= 0:
        return np.nan
    return float(np.exp(logsum * (trading_days / n_days)) - 1.0)

def _simple_from_logsum(logsum: float) -> float:
    return float(np.exp(logsum) - 1.0)

def attribution_by_year(net_ret: pd.Series, trading_days: int = 252) -> pd.DataFrame:
    """
    Split the strategy's total log return by calendar year.

    ``net_ret`` is a return series with a datetime index. ``trading_days`` is
    accepted but not used by this function.

    Returns a frame indexed by ``year`` and sorted from best to worst year with
      * log_sum: sum of log returns within the year,
      * simple_return: the same expressed as a simple return,
      * share_of_total_log: the year's fraction of the full-sample log return
        (NaN for every year when the full-sample total is exactly 0).
    """
    log_ret = _logret(net_ret)
    by_year = log_ret.groupby(log_ret.index.year).sum()
    grand_total = by_year.sum()

    if grand_total != 0:
        share = by_year / grand_total
    else:
        share = np.nan

    table = pd.DataFrame({
        "log_sum": by_year,
        "simple_return": by_year.apply(_simple_from_logsum),
        "share_of_total_log": share,
    })
    table.index.name = "year"
    return table.sort_values("log_sum", ascending=False)

def top_rolling_windows(net_ret: pd.Series, window: int = 252, top_n: int = 10) -> pd.DataFrame:
    """
    Find the ``top_n`` rolling windows with the largest cumulative log return.

    Parameters
    ----------
    net_ret : return series; its index labels are reported as start/end.
    window : window length in observations.
    top_n : maximum number of windows to return.

    Returns
    -------
    DataFrame with columns start, end, log_sum, simple_return, sorted by
    log_sum descending. The frame is empty (but keeps those columns) when the
    series has fewer than ``window`` observations.

    Windows are ranked by end date with no spacing constraint, so the top
    entries may overlap almost entirely.
    """
    columns = ["start", "end", "log_sum", "simple_return"]

    g = _logret(net_ret)
    roll = g.rolling(window).sum()

    # Rank on positions rather than labels, so the window start can be found by
    # plain offset even when the index contains duplicate labels.
    by_position = pd.Series(roll.to_numpy(), index=range(len(roll)))
    best = by_position.nlargest(top_n)  # incomplete (NaN) windows are excluded
    if best.empty:
        # Nothing to rank; sorting an empty, column-less frame would raise KeyError.
        return pd.DataFrame(columns=columns)

    rows = []
    for end_pos, value in best.items():
        logsum = float(value)
        rows.append({
            "start": g.index[end_pos - window + 1],
            "end": g.index[end_pos],
            "log_sum": logsum,
            "simple_return": _simple_from_logsum(logsum),
        })
    return pd.DataFrame(rows, columns=columns).sort_values("log_sum", ascending=False)

def state_contribution(weights_used: pd.DataFrame, net_ret: pd.Series, eps: float = 1e-9, trading_days: int = 252) -> pd.DataFrame:
    """
    Attribute the strategy's log return to the position held on each day.

    Each day is labelled "CASH" when the summed weights are below ``eps``,
    otherwise with the column carrying the largest weight. Weights are aligned
    to ``net_ret``'s index; missing weights and returns count as 0.

    Returns one row per state, sorted by log_sum descending, with columns
    days, mean_daily_ret, log_sum, simple_return, share_of_total_log (NaN when
    the overall log sum is exactly 0) and ann_geo_in_state (the state's log sum
    annualised over its own day count using ``trading_days``).
    """
    # Align both inputs on the return index.
    weights = weights_used.reindex(net_ret.index).fillna(0.0)
    returns = net_ret.reindex(weights.index).fillna(0.0)

    gross = weights.sum(axis=1)
    in_cash = gross < eps
    invested = gross >= eps

    # Per-day state label.
    labels = pd.Series(index=weights.index, dtype="object")
    labels[in_cash] = "CASH"
    if invested.any():
        labels.loc[invested] = weights.loc[invested].idxmax(axis=1)

    daily = pd.DataFrame({"state": labels, "log_g": _logret(returns), "ret": returns})
    by_state = daily.groupby("state")

    summary = pd.DataFrame({
        "days": by_state.size(),
        "mean_daily_ret": by_state["ret"].mean(),
        "log_sum": by_state["log_g"].sum(),
    })
    overall_log = summary["log_sum"].sum()
    summary["simple_return"] = summary["log_sum"].apply(_simple_from_logsum)
    summary["share_of_total_log"] = summary["log_sum"] / overall_log if overall_log != 0 else np.nan
    summary["ann_geo_in_state"] = summary.apply(
        lambda rec: _ann_geo_from_logsum(rec["log_sum"], int(rec["days"]), trading_days),
        axis=1,
    )

    return summary.sort_values("log_sum", ascending=False)


In [None]:
# All three reports use `out` from the headline backtest cell, not the pair sweep.

# Per-year contribution, best year first; then how much of the total log return
# the four best years account for.
yr_attr = attribution_by_year(out["strat"].net_ret)
print(yr_attr.head(15))
print("Top-4 years share (log):", float(yr_attr["share_of_total_log"].head(4).sum()))

# The twelve best 252-observation rolling windows.
top_win = top_rolling_windows(out["strat"].net_ret, window=252, top_n=12)
print(top_win)

# Contribution by held position (each index, or CASH).
state_attr = state_contribution(out["strat"].weights_used, out["strat"].net_ret)
print(state_attr)

In [None]:
def zero_out_windows(net_ret: pd.Series, windows: list[tuple[str, str]]) -> pd.Series:
    """
    Return a copy of ``net_ret`` with returns set to 0 inside the given windows.

    ``windows`` is a list of (start, end) date strings; both ends are
    inclusive. The input series is left unchanged.
    """
    flattened = net_ret.copy()
    stamps = flattened.index
    for first_day, last_day in windows:
        lower, upper = pd.Timestamp(first_day), pd.Timestamp(last_day)
        inside = (stamps >= lower) & (stamps <= upper)
        flattened.loc[inside] = 0.0
    return flattened

def quick_stats(net_ret: pd.Series, trading_days: int = 252) -> dict:
    """
    Headline statistics for a return series.

    Returns a dict with ann_geo (annualised geometric return over all
    observations), ann_vol, sharpe (mean over standard deviation, annualised,
    with no risk-free adjustment), max_dd (most negative drawdown of the
    compounded equity) and total_return.
    """
    log_ret = _logret(net_ret)
    ann_geo = _ann_geo_from_logsum(float(log_ret.sum()), int(log_ret.shape[0]), trading_days)

    daily_std = net_ret.std()
    annualizer = np.sqrt(trading_days)
    vol = float(daily_std * annualizer)
    # The small constant keeps the ratio finite for a flat series.
    sharpe = float((net_ret.mean() / (daily_std + 1e-12)) * annualizer)

    equity = (1.0 + net_ret.fillna(0.0)).cumprod()
    worst_drawdown = (equity / equity.cummax() - 1.0).min()

    return {
        "ann_geo": ann_geo,
        "ann_vol": vol,
        "sharpe": sharpe,
        "max_dd": float(worst_drawdown),
        "total_return": float(equity.iloc[-1] - 1.0),
    }

# Example windows: replace with your true 4 segments
# Date ranges (inclusive) whose returns are zeroed out, to see how much of the
# result depends on them. These are placeholders; the far-future end date of the
# last entry makes it open-ended.
windows = [
    ("2006-04-01", "2007-12-31"),
    ("2009-01-01", "2011-12-31"),
    ("2015-01-01", "2015-12-31"),
    ("2024-01-01", "2099-12-31"),
]

# Zeroed days stay in the series, so both stat sets cover the same number of observations.
r_ex = zero_out_windows(out["strat"].net_ret, windows)
print("Original:", quick_stats(out["strat"].net_ret))
print("Exclude windows:", quick_stats(r_ex))


### 参数敏感度测试

In [None]:
def run_rotation_once(
    prices: pd.DataFrame,
    lookback: int,
    mode: str,
    H: int = 1,
    rebalance: str = "W-FRI",
    fee_bps: float = 2.0,
    rf_annual: float = 0.015,
    top_k: int = 1,
    use_absolute_filter: bool = True,
) -> dict:
    """
    Run one momentum-rotation backtest for a single parameter combination.

    Parameters
    ----------
    prices : close prices, one column per asset.
    lookback : momentum lookback passed to ``momentum_scores``.
    mode : schedule mode forwarded to ``TopKBookConfig`` ("fixed_h" or "calendar").
    H, rebalance : schedule settings for the two modes; only the one matching
        ``mode`` is reported in the result.
    fee_bps, rf_annual : forwarded to ``backtest_weights``.
    top_k, use_absolute_filter : forwarded to ``TopKBookConfig``.

    Returns
    -------
    dict with the parameters (lookback, mode, H, rebalance) and the metrics
    ann_geo, ann_vol, sharpe, max_dd, turnover_mean, cash_ratio, total_return.
    All annualised metrics use ``TRADING_DAYS``.
    """
    # The weight builder was called here without ever being imported in this
    # notebook. NOTE(review): assumed to live next to TopKBookConfig — confirm
    # the module path.
    from qresearch.portfolio.weights import build_topk_ls_weights

    # scores
    scores = momentum_scores(prices, lookback=lookback)

    # weights
    wcfg = TopKBookConfig(
        mode=mode,
        rebalance=rebalance,
        H=H,
        offset=0,
        top_k=top_k,
        use_absolute_filter=use_absolute_filter,
    )
    w = build_topk_ls_weights(prices=prices, scores=scores, cfg=wcfg)

    # backtest
    bt = backtest_weights(
        prices=prices,
        weights=w,
        fee_bps=fee_bps,
        rf_annual=rf_annual,
        long_only=True,
        allow_leverage=False,
        max_gross=1.01,
    )

    # metrics
    r = bt.net_ret
    eq = bt.equity_net
    dd = (eq / eq.cummax() - 1.0).min()
    # One annualisation constant for return, volatility and Sharpe; previously
    # the geometric return silently used quick_stats' own default instead.
    stats = quick_stats(r, trading_days=TRADING_DAYS)

    return {
        "lookback": lookback,
        "mode": mode,
        "H": H if mode == "fixed_h" else np.nan,
        "rebalance": rebalance if mode == "calendar" else None,
        "ann_geo": stats["ann_geo"],
        "ann_vol": stats["ann_vol"],
        "sharpe": stats["sharpe"],
        # Drawdown and total return are taken from the backtest's own equity curve.
        "max_dd": float(dd),
        "turnover_mean": float(bt.turnover.mean()),
        "cash_ratio": float((bt.exposure < 1e-9).mean()),
        "total_return": float(eq.iloc[-1] - 1.0),
    }

def robustness_sweep(
    prices: pd.DataFrame,
    lookbacks=(10, 20, 40, 60, 120),
    fixed_Hs=(1, 5, 10, 20),
    calendar_freqs=("W-FRI", "M"),
    fee_bps: float = 2.0,
    use_absolute_filter: bool = True,
) -> pd.DataFrame:
    """
    Backtest the rotation over a grid of lookbacks and rebalance schedules.

    Every lookback is combined first with each fixed horizon in ``fixed_Hs``
    (mode "fixed_h") and then with each frequency in ``calendar_freqs``
    (mode "calendar"). ``fee_bps`` and ``use_absolute_filter`` are shared by
    all runs.

    Returns one row per configuration as produced by ``run_rotation_once``,
    sorted by Sharpe and then ann_geo (descending), with a fresh 0..n-1 index.
    """
    # Schedule-specific arguments for every run, fixed-horizon grid first.
    grid = [
        {"lookback": lb, "mode": "fixed_h", "H": horizon}
        for lb in lookbacks
        for horizon in fixed_Hs
    ]
    grid += [
        {"lookback": lb, "mode": "calendar", "rebalance": freq}
        for lb in lookbacks
        for freq in calendar_freqs
    ]

    records = []
    for spec in grid:
        records.append(run_rotation_once(
            prices=prices,
            fee_bps=fee_bps,
            use_absolute_filter=use_absolute_filter,
            **spec,
        ))

    table = pd.DataFrame(records)
    return table.sort_values(["sharpe", "ann_geo"], ascending=False).reset_index(drop=True)

# Strongly recommended: for a strict two-asset (large cap / small cap) rotation,
# keep only those two columns in prices.
# prices = prices[[LARGE, SMALL]]

# NOTE: the sweep runs on the full `prices` table, not on the date-filtered
# `price_df` used by the headline backtest.
sweep = robustness_sweep(
    prices=prices,
    lookbacks=(10, 20, 30, 40, 60, 120),
    fixed_Hs=(1, 5, 10, 20),
    calendar_freqs=("W-FRI", "ME"),
    fee_bps=2.0,
    use_absolute_filter=True,
)

print(sweep.head(20))

# "Plateau" check: share of configurations with Sharpe above 0.8 and above 1.0.
print("Configs Sharpe>0.8:", (sweep["sharpe"] > 0.8).mean())
print("Configs Sharpe>1.0:", (sweep["sharpe"] > 1.0).mean())

# A simple pivot gives a direct view of Sharpe across lookback / H combinations.
pivot = sweep.query("mode=='fixed_h'").pivot_table(index="lookback", columns="H", values="sharpe", aggfunc="mean")
print(pivot)
