In [1]:
from __future__ import annotations

from dataclasses import dataclass
from typing import Callable, Dict, Iterable, Literal, Optional, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from qresearch.data.utils import get_processed_dir, save_market_data_to_csv, load_market_data_from_csv
from qresearch.data.yfinance import download_market_data


# ============================================================
# 0) Core utilities: naming + label/feature building
# ============================================================
def ensure_names(close_df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a chronologically sorted copy of a wide close table with canonical axis names.

    The index is converted with ``pd.to_datetime`` and named "date"; the column
    axis is named "ticker". The returned frame is a copy of the sorted input.
    """
    ordered = close_df.sort_index().copy()
    ordered.index = pd.to_datetime(ordered.index).rename("date")
    ordered.columns = ordered.columns.rename("ticker")
    return ordered


def make_fwd_return(close_df: pd.DataFrame, H: int = 5) -> pd.DataFrame:
    """
    Forward simple return over H rows: close[t + H] / close[t] - 1.

    The input is first normalised with ``ensure_names``. The last H rows have no
    future price and therefore come out as NaN.
    """
    prices = ensure_names(close_df)
    future_prices = prices.shift(-H)
    return future_prices / prices - 1.0


def rsi_wilder(close: pd.DataFrame, period: int = 14) -> pd.DataFrame:
    """
    RSI computed column-wise with exponential smoothing of gains and losses.

    Gains and losses are the positive and negative parts of the row-to-row
    difference, each smoothed with ``ewm(alpha=1/period, adjust=False)``.
    A zero average loss yields RSI = 100; the first row is NaN because it has
    no previous price.
    """
    change = close.diff()
    gains = change.clip(lower=0.0)
    losses = (-change).clip(lower=0.0)

    smoothing = 1 / period
    avg_gain = gains.ewm(alpha=smoothing, adjust=False).mean()
    avg_loss = losses.ewm(alpha=smoothing, adjust=False).mean()

    rel_strength = avg_gain / avg_loss
    return 100.0 - (100.0 / (1.0 + rel_strength))


def make_features_default(close_df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """
    Build the default price-based feature set from a wide close table.

    Returns a dict mapping feature name -> wide frame (same shape as the
    normalised close table):
      - close_ma_diff_{5,10,20}: close relative to its 5/10/20-row moving average
      - max_growth_5: close relative to the close 5 rows earlier
      - rsi: ``rsi_wilder`` with period 14
      - max_drawdown_120: rolling 120-row minimum of the drawdown from the
        rolling 120-row maximum
    """
    prices = ensure_names(close_df)

    def _gap_to_moving_average(window: int) -> pd.DataFrame:
        # Relative distance of the close from its own rolling mean.
        return prices / prices.rolling(window).mean() - 1.0

    # Drawdown versus the trailing 120-row peak; its own rolling minimum is the feature.
    trailing_peak = prices.rolling(120).max()
    drawdown = prices / trailing_peak - 1.0

    # Key order is part of the contract: it defines the feature column order downstream.
    return {
        "close_ma_diff_5": _gap_to_moving_average(5),
        "max_growth_5": prices / prices.shift(5) - 1.0,
        "close_ma_diff_10": _gap_to_moving_average(10),
        "rsi": rsi_wilder(prices, period=14),
        "close_ma_diff_20": _gap_to_moving_average(20),
        "max_drawdown_120": drawdown.rolling(120).min(),
    }


def wide_to_panel(features: Dict[str, pd.DataFrame], y_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Convert wide feature dict + wide y into a long panel:
    columns: date, ticker, <feature...>, y
    """
    parts = []
    for name, df in features.items():
        s = df.stack(future_stack=True).rename(name)
        parts.append(s)

    X = pd.concat(parts, axis=1)
    y = y_wide.stack(future_stack=True).rename("y")

    panel = pd.concat([X, y], axis=1)
    panel.index = panel.index.set_names(["date", "ticker"])
    panel = panel.reset_index()
    panel["date"] = pd.to_datetime(panel["date"])
    return panel


# ============================================================
# 1) Cross-sectional preprocessing
# ============================================================
def cs_winsorize_zscore(panel: pd.DataFrame, feature_cols: list[str],
                       q_lo: float = 0.01, q_hi: float = 0.99) -> pd.DataFrame:
    """
    Per-date winsorisation followed by per-date z-scoring of each feature column.

    For every date, values are clipped to the [q_lo, q_hi] quantiles of that
    date's cross-section, then centred and scaled by the mean and (sample)
    standard deviation of the clipped values. Works on a copy; the input panel
    is not modified.
    """
    result = panel.copy()
    for col in feature_cols:
        per_day = result.groupby("date")[col]
        lower = per_day.transform(lambda values: values.quantile(q_lo))
        upper = per_day.transform(lambda values: values.quantile(q_hi))
        clipped = result[col].clip(lower, upper)

        # Moments are taken on the clipped values, grouped by the same dates.
        clipped_by_day = clipped.groupby(result["date"])
        centre = clipped_by_day.transform("mean")
        scale = clipped_by_day.transform("std")
        result[col] = (clipped - centre) / scale
    return result


def flip_negative_ic(panel: pd.DataFrame, feature_cols: list[str]) -> Tuple[pd.DataFrame, Dict[str, float]]:
    """
    Orient every feature so that its pooled Spearman IC against ``y`` is non-negative.

    The IC is computed once per feature over all rows of ``panel`` (pooled, not
    per date). Features with a strictly negative IC are negated in the returned
    copy. Returns ``(oriented_panel, ic_map)`` where ``ic_map`` holds the IC
    measured before any flip.
    """
    oriented = panel.copy()
    pooled: Dict[str, float] = {}
    for col in feature_cols:
        rho = oriented[col].corr(oriented["y"], method="spearman")
        pooled[col] = np.nan if rho is None else float(rho)
        # NaN compares False, so features with an undefined IC are left as-is.
        if rho is not None and rho < 0:
            oriented[col] = -oriented[col]
    return oriented, pooled


# ============================================================
# 2) Walk-forward split (2y train / 1m test + purge H)
# ============================================================
def walk_forward_splits(
    dates: Iterable[pd.Timestamp],
    train_years: int = 2,
    test_months: int = 1,
    H: int = 5,
    min_train_days: int = 100,
    min_test_days: int = 5,
):
    """
    Yield rolling (train_dates, test_dates) pairs for walk-forward evaluation.

    The first test window starts ``train_years`` after the earliest date and each
    window spans ``test_months`` calendar months. For every window the training
    set covers the ``train_years`` before the first test date and ends H
    positions (in the sorted unique date index) before it, so that no training
    label built from an H-step forward return reaches into the test window.

    Parameters
    ----------
    dates : iterable of timestamps; duplicates and ordering are normalised here.
    train_years, test_months : window lengths as calendar offsets.
    H : purge gap, counted in positions of the unique date index.
    min_train_days, min_test_days : splits with fewer dates are skipped.

    Yields
    ------
    (train_dates, test_dates) : two ``pd.DatetimeIndex`` objects.

    Raises
    ------
    ValueError : if ``test_months < 1`` (the window would never advance) or
        ``H < 0`` (the training end would move into the test window).
    """
    if test_months < 1:
        raise ValueError("test_months must be >= 1")
    if H < 0:
        raise ValueError("H must be >= 0")

    dates = pd.DatetimeIndex(sorted(pd.DatetimeIndex(dates).unique()))
    if len(dates) == 0:
        return

    start = dates.min()
    end = dates.max()

    cur_test_start = start + pd.DateOffset(years=train_years)
    while cur_test_start < end:
        cur_test_end = cur_test_start + pd.DateOffset(months=test_months)

        test_dates = dates[(dates >= cur_test_start) & (dates < cur_test_end)]
        if len(test_dates) == 0:
            cur_test_start = cur_test_end
            continue

        test_start = test_dates.min()
        pos = dates.get_indexer([test_start])[0]
        train_end_pos = pos - H  # purge H positions before the first test date
        if train_end_pos < 0:
            # Fewer than H dates precede the test window: nothing survives the
            # purge. Clamping to position 0 would let dates[0] into the training
            # set even though its forward label overlaps the test window.
            cur_test_start = cur_test_end
            continue
        train_end = dates[train_end_pos]

        train_start = test_start - pd.DateOffset(years=train_years)
        train_dates = dates[(dates >= train_start) & (dates <= train_end)]

        if len(train_dates) >= min_train_days and len(test_dates) >= min_test_days:
            yield train_dates, test_dates

        cur_test_start = cur_test_end


# ============================================================
# 3) Model adapters (single interface)
# ============================================================
@dataclass(frozen=True)
class FitPredictContext:
    """Immutable bundle of workflow settings handed to every fit/predict adapter."""
    # Feature column names, in the order used to build the X_train / X_test arrays.
    feature_cols: list[str]
    # Top-k cut-off; the LightGBM adapter uses it as the NDCG evaluation position.
    top_k: int
    # Forward-return horizon that was used to build the label y.
    H: int


# Common call signature of every adapter registered in MODEL_REGISTRY.
# The adapters take nine positional arguments (two frames, five arrays, the
# context and the parameter dict) and return one score per test row.
FitPredictFn = Callable[
    [
        pd.DataFrame,        # train_df
        pd.DataFrame,        # test_df
        np.ndarray,          # X_train
        np.ndarray,          # y_train
        np.ndarray,          # group_train
        np.ndarray,          # X_test
        np.ndarray,          # group_test
        FitPredictContext,   # ctx
        dict,                # params
    ],
    np.ndarray,
]
"""
Signature:
    predict = fit_predict(
        train_df, test_df,
        X_train, y_train, group_train,
        X_test, group_test,
        ctx, params
    )
"""

def make_relevance_from_y(
    panel: pd.DataFrame,
    n_bins: int = 5,
    col_y: str = "y",
    col_date: str = "date",
    min_per_day: int = 20,
) -> pd.Series:
    """
    Grade continuous forward returns into integer relevance labels, date by date.

    Within each date the finite values of ``col_y`` are ranked (ties share the
    average rank) and the ranks are cut into ``n_bins`` equal-width rank buckets,
    so a higher return maps to a higher grade in 0 .. n_bins - 1.

    Rows get a missing grade when their y is NaN/inf, or when their date has
    fewer than ``max(min_per_day, n_bins)`` finite values.

    Returns a nullable ``Int64`` Series aligned to ``panel.index``.

    Raises
    ------
    ValueError : if ``n_bins < 2``.
    """
    if n_bins < 2:
        raise ValueError("n_bins must be >= 2")

    required = max(min_per_day, n_bins)
    top_grade = n_bins - 1

    def _grade_cross_section(day_y: pd.Series) -> pd.Series:
        # Only finite returns take part in the ranking.
        finite = day_y.replace([np.inf, -np.inf], np.nan).dropna()

        # Too small a cross-section: leave the whole date ungraded.
        if len(finite) < required:
            return pd.Series(index=finite.index, data=np.nan)

        # Rank-based bucketing is used instead of quantile cuts so heavy ties
        # cannot collapse or drop buckets.
        ranks = finite.rank(method="average")
        bucket_width = max(len(ranks) / n_bins, 1.0)
        grades = np.floor((ranks - 1) / bucket_width).clip(0, top_grade)
        return pd.Series(index=finite.index, data=grades.astype(np.int64))

    graded = (
        panel.groupby(panel[col_date], sort=False)[col_y]
        .apply(_grade_cross_section)
        # apply() prepends the group key as an index level; drop it so the
        # result is keyed by the original row labels again.
        .reset_index(level=0, drop=True)
        # Rows removed inside the helper come back as missing values.
        .reindex(panel.index)
    )

    # Nullable integer dtype keeps the missing grades without falling back to float.
    return graded.astype("Int64")


def fit_predict_xgb_ranker(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    X_train: np.ndarray,
    y_train: np.ndarray,
    group_train: np.ndarray,
    X_test: np.ndarray,
    group_test: np.ndarray,
    ctx: FitPredictContext,
    params: dict,
) -> np.ndarray:
    """
    Adapter: fit an ``xgboost.XGBRanker`` on the training rows and score the test rows.

    ``params`` is forwarded unchanged as keyword arguments to the ``XGBRanker``
    constructor; ``group_train`` is passed as the query group sizes. The
    remaining arguments (``train_df``, ``test_df``, ``group_test``, ``ctx``)
    exist only to satisfy the shared adapter signature and are not used here.
    """
    from xgboost import XGBRanker

    ranker = XGBRanker(**params)
    ranker.fit(X_train, y_train, group=group_train)
    scores = ranker.predict(X_test)
    return scores


def fit_predict_lgb_lambdarank(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    X_train: np.ndarray,
    y_train: np.ndarray,
    group_train: np.ndarray,
    X_test: np.ndarray,
    group_test: np.ndarray,
    ctx: FitPredictContext,
    params: dict,
) -> np.ndarray:
    """
    Adapter: train a LightGBM LambdaRank booster and score the test rows.

    Early stopping is driven by a validation set carved from the END of the
    training data (its last query groups, i.e. the most recent ones when the
    rows are ordered by date), using the real relevance labels. The previous
    implementation validated on the test features with all-zero labels, for
    which the NDCG metric cannot improve, so early stopping always fired and
    the model was cut back to its first iteration.

    Recognised keys in ``params``:
      - "lgb_params": dict merged over the defaults (objective=lambdarank,
        metric=ndcg, ndcg_eval_at=[ctx.top_k], verbosity=-1)
      - "num_boost_round": maximum number of boosting rounds (default 2000)
      - "early_stopping_rounds": patience (default 50); falsy disables early stopping
      - "valid_fraction": share of training query groups held out for early
        stopping (default 0.2); 0 disables early stopping

    ``X_train`` rows must be ordered consistently with ``group_train`` (group
    sizes in row order). ``train_df``, ``test_df`` and ``group_test`` are part
    of the shared adapter signature and are not used.

    Returns one score per row of ``X_test``.
    """
    import lightgbm as lgb

    # Caller-supplied settings win over the defaults.
    lgb_params = {
        "objective": "lambdarank",
        "metric": "ndcg",
        "verbosity": -1,
        "ndcg_eval_at": [ctx.top_k],
        **params.get("lgb_params", {}),
    }
    num_boost_round = params.get("num_boost_round", 2000)
    early_stopping_rounds = params.get("early_stopping_rounds", 50)
    valid_fraction = float(params.get("valid_fraction", 0.2))

    features = np.asarray(X_train)
    # Plain float array: also normalises object arrays produced by nullable pandas ints.
    labels = np.asarray(y_train, dtype=np.float64)
    groups = np.asarray(group_train, dtype=np.int64)

    # Number of trailing query groups reserved for validation (at least one
    # group must remain for fitting).
    n_groups = len(groups)
    n_valid_groups = 0
    if early_stopping_rounds and valid_fraction > 0 and n_groups >= 2:
        n_valid_groups = min(max(int(round(n_groups * valid_fraction)), 1), n_groups - 1)

    if n_valid_groups == 0:
        # Nothing to validate on: train the requested number of rounds.
        dtrain = lgb.Dataset(features, label=labels, group=groups, free_raw_data=True)
        booster = lgb.train(lgb_params, dtrain, num_boost_round=num_boost_round)
        return booster.predict(X_test)

    fit_groups = groups[:-n_valid_groups]
    valid_groups = groups[-n_valid_groups:]
    n_fit_rows = int(fit_groups.sum())

    dtrain = lgb.Dataset(
        features[:n_fit_rows],
        label=labels[:n_fit_rows],
        group=fit_groups,
        free_raw_data=True,
    )
    dvalid = lgb.Dataset(
        features[n_fit_rows:],
        label=labels[n_fit_rows:],
        group=valid_groups,
        reference=dtrain,
        free_raw_data=True,
    )

    booster = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)],
    )
    return booster.predict(X_test, num_iteration=booster.best_iteration)


# Lookup table from model name to its fit/predict adapter.
# run_rank_workflow rejects any model_name that is not a key of this dict.
MODEL_REGISTRY: Dict[str, FitPredictFn] = {
    "xgb_ranker": fit_predict_xgb_ranker,
    "lgb_lambdarank": fit_predict_lgb_lambdarank,
}


# ============================================================
# 4) Shared evaluation
# ============================================================
def eval_rankic(oos: pd.DataFrame) -> Dict[str, object]:
    """
    Daily rank IC of the out-of-sample scores.

    For each date the Spearman correlation between ``score`` and ``y`` is
    computed. Returns the per-date series ("rankic_by_day"), its mean
    ("rankic_mean") and mean divided by population standard deviation
    ("rankic_ir", NaN when the standard deviation is not positive).
    """
    def _daily_spearman(day: pd.DataFrame) -> float:
        return day["score"].corr(day["y"], method="spearman")

    per_day = oos.groupby("date", group_keys=False)[["score", "y"]].apply(_daily_spearman)

    average = float(per_day.mean())
    dispersion = float(per_day.std(ddof=0))
    if dispersion > 0:
        info_ratio = float(average / dispersion)
    else:
        info_ratio = np.nan

    return {"rankic_by_day": per_day, "rankic_mean": average, "rankic_ir": info_ratio}


def eval_topk_proxy(oos: pd.DataFrame, top_k: int) -> Dict[str, object]:
    """
    Diagnostic only: average forward return ``y`` of the ``top_k`` highest scores per date.

    Returns the per-date series ("topk_fwd") and its cumulative product of
    (1 + value) with missing days treated as 0 ("topk_fwd_eq").
    """
    def _mean_y_of_best(day: pd.DataFrame) -> float:
        return day.nlargest(top_k, "score")["y"].mean()

    picked = oos.groupby("date", group_keys=False)[["score", "y"]].apply(_mean_y_of_best)
    compounded = (1 + picked.fillna(0)).cumprod()
    return {"topk_fwd": picked, "topk_fwd_eq": compounded}


# ============================================================
# 5) The unified workflow (minimal duplication)
# ============================================================
def run_rank_workflow(
    close_df: pd.DataFrame,
    model_name: Literal["xgb_ranker", "lgb_lambdarank"],
    model_params: dict,
    H: int = 5,
    top_k: int = 7,
    train_years: int = 2,
    test_months: int = 1,
    preprocess: bool = True,
    align_feature_sign: bool = True,
) -> Dict[str, object]:
    """
    Unified walk-forward workflow for cross-sectional ranking models.

    Steps: build features and forward returns from ``close_df`` -> long panel ->
    (LightGBM only) per-date integer relevance grades -> drop incomplete rows ->
    optional per-date winsorize/z-score -> optional sign alignment by pooled IC
    -> walk-forward fit/predict -> rank-IC and top-k diagnostics.

    Parameters
    ----------
    close_df:
        Wide close price table: index = date, columns = ticker.
    model_name:
        Key of MODEL_REGISTRY selecting the fit/predict adapter.
    model_params:
        Passed unchanged to the adapter. This function itself reads
        ``relevance_bins`` (default 5) when ``model_name == "lgb_lambdarank"``.
    H:
        Forward-return horizon in rows; also the purge gap of the splits.
    top_k:
        Cut-off for ``eval_topk_proxy``; also forwarded to the adapter via the context.
    train_years / test_months:
        Walk-forward window definition.
    preprocess:
        Apply per-date winsorization + z-scoring to the features.
    align_feature_sign:
        Negate features whose pooled Spearman IC against y is negative.
        NOTE: the pooled IC is measured on the whole panel, i.e. including
        dates that are later used as test periods.

    Returns
    -------
    dict with:
        - pooled_ic: feature -> pooled IC (None when align_feature_sign is False)
        - oos: long frame (date, ticker, score, y) of out-of-sample predictions
        - oos_wide: date x ticker score table
        - rankic_by_day, rankic_mean, rankic_ir
        - topk_fwd, topk_fwd_eq
        - the run settings (model_name, model_params, H, top_k, train_years,
          test_months, preprocess, align_feature_sign)

    Raises
    ------
    KeyError : unknown ``model_name``.
    RuntimeError : no walk-forward split produced any prediction.
    """
    if model_name not in MODEL_REGISTRY:
        raise KeyError(f"Unknown model_name={model_name}. Available: {list(MODEL_REGISTRY.keys())}")

    fit_predict = MODEL_REGISTRY[model_name]
    # LambdaRank trains on integer relevance grades; the other adapter uses raw y.
    needs_relevance = model_name == "lgb_lambdarank"

    # -------------------------
    # 1) Build a single shared panel: (date, ticker) rows with features + y
    # -------------------------
    feats = make_features_default(close_df)
    y_wide = make_fwd_return(close_df, H=H)

    panel = wide_to_panel(feats, y_wide)  # columns: date, ticker, <features...>, y
    feature_cols = list(feats.keys())

    # Keep only rows with a finite label (drops NaN and +/-inf in y).
    panel = panel[np.isfinite(panel["y"].to_numpy())].copy()

    # dropna() below does not remove +/-inf, so turn infinite feature values
    # (e.g. from a zero price in a denominator) into NaN first.
    panel[feature_cols] = panel[feature_cols].replace([np.inf, -np.inf], np.nan)

    # -------------------------
    # 2) LambdaRank needs integer relevance grades per query group (date)
    # -------------------------
    if needs_relevance:
        n_bins = int(model_params.get("relevance_bins", 5))
        panel["rel"] = make_relevance_from_y(panel, n_bins=n_bins, col_y="y", col_date="date")

    # -------------------------
    # 3) Drop rows with missing features / label (rolling warm-up, end of
    #    sample) and, for LambdaRank, rows without a relevance grade
    # -------------------------
    drop_cols = ["y"] + feature_cols
    if needs_relevance:
        drop_cols += ["rel"]

    panel = panel.dropna(subset=drop_cols)

    # -------------------------
    # 4) Optional cross-sectional preprocessing (per date)
    # -------------------------
    if preprocess:
        panel = cs_winsorize_zscore(panel, feature_cols)

    pooled_ic = None
    if align_feature_sign:
        panel, pooled_ic = flip_negative_ic(panel, feature_cols)

    # Chronologically ordered unique dates drive the splits.
    all_dates = pd.DatetimeIndex(panel["date"].unique()).sort_values()

    ctx = FitPredictContext(feature_cols=feature_cols, top_k=top_k, H=H)

    # -------------------------
    # 5) Walk-forward: train -> predict -> collect OOS scores
    # -------------------------
    oos_rows = []

    for train_dates, test_dates in walk_forward_splits(
        all_dates,
        train_years=train_years,
        test_months=test_months,
        H=H,
    ):
        # Sorting by date keeps the rows contiguous per query group, matching
        # the order of the group-size arrays below.
        train = panel[panel["date"].isin(train_dates)].sort_values(["date", "ticker"])
        test = panel[panel["date"].isin(test_dates)].sort_values(["date", "ticker"])

        # Guard: skip empty splits (can happen if dates get filtered by NA drops)
        if len(train) == 0 or len(test) == 0:
            continue

        # Group sizes: one query group per date
        group_train = train.groupby("date").size().to_numpy()
        group_test = test.groupby("date").size().to_numpy()

        X_train = train[feature_cols].to_numpy()
        X_test = test[feature_cols].to_numpy()

        # y stays the continuous forward return for evaluation.
        y_test = test["y"].to_numpy()

        if needs_relevance:
            # "rel" is a nullable Int64 column; request a plain int64 array so
            # the adapter never receives an object-dtype label vector.
            y_train_model = train["rel"].to_numpy(dtype=np.int64)
        else:
            y_train_model = train["y"].to_numpy()

        score = fit_predict(
            train, test,
            X_train, y_train_model, group_train,
            X_test, group_test,
            ctx, model_params,
        )

        tmp = test[["date", "ticker"]].copy()
        tmp["score"] = score
        tmp["y"] = y_test
        oos_rows.append(tmp)

    if not oos_rows:
        raise RuntimeError("No OOS rows produced. Check splits/NA filtering/universe coverage.")

    oos = (
        pd.concat(oos_rows, ignore_index=True)
        .sort_values(["date", "score"], ascending=[True, False])
    )

    # Wide score table: date x ticker, usable directly as a signal.
    oos_wide = oos.pivot(index="date", columns="ticker", values="score").sort_index()

    rank_metrics = eval_rankic(oos)
    topk_metrics = eval_topk_proxy(oos, top_k=top_k)

    return {
        "pooled_ic": pooled_ic,
        "oos": oos,
        "oos_wide": oos_wide,
        **rank_metrics,
        **topk_metrics,
        "model_name": model_name,
        "model_params": model_params,
        "H": H,
        "top_k": top_k,
        "train_years": train_years,
        "test_months": test_months,
        "preprocess": preprocess,
        "align_feature_sign": align_feature_sign,
    }


In [2]:
import yfinance as yf

# Directory holding the processed data files used throughout this notebook.
processed_dir = get_processed_dir()

# Index constituents table; its "Stock Code" column is used to build the ticker universe.
hsci_stocks = pd.read_csv(f'{processed_dir}/hsci_components.csv')

# Date range passed to the market-data downloads.
START_DATE = "2009-01-01"
END_DATE   = "2026-01-15"

def code_int_to_hk(code: int) -> str:
    """Format a numeric stock code as a ticker: zero-padded to at least 4 digits plus the ".HK" suffix."""
    digits = str(int(code)).zfill(4)
    return digits + ".HK"

# Ticker list for the whole universe, one ".HK" ticker per constituent row.
UNIVERSE = hsci_stocks["Stock Code"].apply(code_int_to_hk).tolist()

# Download step is disabled; prices are read from the cached CSV further down.
# price_df = download_market_data(UNIVERSE, start=START_DATE, end=END_DATE)

In [3]:
# save_market_data_to_csv(price_df, f'{processed_dir}/hsci_ohlc.csv')

In [4]:
# Load the cached market data for the universe and preview its close matrix.
hsci_md = load_market_data_from_csv(f'{processed_dir}/hsci_ohlc.csv')
hsci_md.close.head()

Loading MarketData from /Users/henrywzh/Desktop/Quant/Research/data/processed/hsci_ohlc.csv...


Ticker,0883.HK,0857.HK,1088.HK,0386.HK,1171.HK,1898.HK,2883.HK,3668.HK,2386.HK,0639.HK,...,1052.HK,9699.HK,2510.HK,0636.HK,1341.HK,2169.HK,2570.HK,3677.HK,2582.HK,1333.HK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-02,2.767038,3.167,5.170709,1.308784,0.736292,3.657679,4.749063,,,0.425364,...,0.82963,,,,,,,,,
2009-01-05,2.985776,3.386929,5.34901,1.387944,0.807467,3.959227,5.000485,,,0.434272,...,0.854555,,,,,,,,,
2009-01-06,2.989422,3.439713,5.47382,1.374751,0.830782,4.18818,4.853823,,,0.414229,...,0.868798,,,,,,,,,
2009-01-07,2.880052,3.276964,5.444103,1.324616,0.809921,3.953643,4.679226,,,0.412002,...,0.858115,,,,,,,,,
2009-01-08,2.686833,3.105419,5.25986,1.245456,0.742428,3.573914,4.211303,,,0.42091,...,0.811827,,,,,,,,,


In [5]:
# Benchmark series (^HSI) over the same date range as the universe.
hsi_md = download_market_data("^HSI", START_DATE, END_DATE)

# Extract close matrices
close = hsci_md.close.sort_index()
# close.to_csv(f'{grandparent}/data/processed/hsci_close.csv')
benchmark_close = hsi_md.close.sort_index()

# Common calendar (optional but recommended): keep only dates present in both
# the universe closes and the benchmark closes.
common_dates = close.index.intersection(benchmark_close.index)
close_df = close.loc[common_dates]
benchmark_close = benchmark_close.loc[common_dates]

[-] Initializing download for 1 tickers...
    Date Range: 2009-01-01 -> 2026-01-15


[*********************100%***********************]  1 of 1 completed

[✓] Download complete. Shape: (4196, 5)
[-] Extracting and aligning components...
[✓] MarketData object created..





In [6]:
# from qresearch.backtest.buckets import make_tearsheet
# 
# lgb_params = {
#   "relevance_bins": 5,
#   "lgb_params": {
#     "learning_rate": 0.05,
#     "num_leaves": 31,
#     "min_data_in_leaf": 300,
#     "feature_fraction": 0.8,
#     "bagging_fraction": 0.8,
#     "bagging_freq": 1,
#     "lambda_l2": 2.0,
#     "min_gain_to_split": 0.0,
#   },
#   "num_boost_round": 800,
# }
# 
# out = run_rank_workflow(
#     close_df=close_df,
#     model_name="lgb_lambdarank",
#     model_params=lgb_params,
#     H=5,
#     top_k=10,
# )
# 
# signal_ml = out["oos_wide"]  # date x ticker model scores
# 
# rep = make_tearsheet(
#     price_df=price_df,
#     signal=signal_ml,
#     H=5,
#     n_buckets=20,
#     entry_mode="next_close",
#     benchmark_price=hsi,  # optional
#     benchmark_name="^HSI",
# )

In [7]:
# Import only the names this notebook uses instead of `import *`, so the origin
# of every name stays visible and nothing defined earlier is silently shadowed.
from qresearch.signals.signals_sweep import SignalTestConfig, sweep_signals

# H = 5

# Each entry is (signal_name, params); the lists are concatenated into `tests` below.

# Moving-average distance at several lookbacks.
test_ma_diff = [
    ("ma_diff", {"lookback": window, "skip": 0})
    for window in (5, 20, 50, 120, 200, 250)
]

# Plain momentum at several lookbacks, plus the 12-1 variant (252 rows, skipping the last 21).
test_mom_ret = [
    ("mom_ret", {"lookback": window, "skip": 0})
    for window in (5, 21, 63, 126)
]
test_mom_ret.append(("mom_12_1", {"lookback": 252, "skip": 21}))

# Three OHLC-based momentum signals for every lookback, grouped by lookback.
test_ohlc_mom = [
    (signal_name, {"lookback": window})
    for window in (5, 10, 21, 63, 126, 252)
    for signal_name in ("on_minus_id", "overnight_mom", "intraday_mom")
]

test_rsi = [("rsi", {"lookback": window}) for window in (5, 12, 24, 60)]

# Trend signal at three lookbacks, plus a 252-row variant that skips the last 21 rows.
test_trend_annret_r2 = [
    ("trend_annret_r2", {"lookback": window, "ann_factor": 252})
    for window in (21, 126, 252)
]
test_trend_annret_r2.append(
    ("trend_annret_r2", {"lookback": 252, "ann_factor": 252, "skip": 21})
)

tests = test_ma_diff + test_mom_ret + test_trend_annret_r2 + test_ohlc_mom + test_rsi

  return float(eq ** (freq / len(r)) - 1.0)
  return float(eq ** (freq / len(r)) - 1.0)


In [39]:
# Sweep every entry of `tests` with H=5 and 20 buckets, then display the summary table.
# NOTE(review): the saved output below lists parameter sets (e.g. ma_diff with
# lookback 150/180, rsi with lookback 50) that are not in `tests` as defined in
# this notebook -- the output looks stale; re-run to refresh.
cfg = SignalTestConfig(H=5, n_buckets=20, use_price_floor=True)
summary = sweep_signals(hsci_md, tests, cfg)
summary

  return float(eq ** (freq / len(r)) - 1.0)
  return float(eq ** (freq / len(r)) - 1.0)


Unnamed: 0,signal,params,ic_mean,icir,hit_rate,mono,top_annret,top_sharpe,top_maxdd,ls_annret,ls_sharpe,ls_maxdd
19,trend_annret_r2,"{'lookback': 252, 'ann_factor': 252}",0.030003,0.171665,0.604061,0.581955,0.295998,1.004055,-0.429166,0.171682,0.671295,-0.689475
15,mom_12_1,"{'lookback': 252, 'skip': 21}",0.025566,0.141087,0.582908,0.52782,0.268724,0.911702,-0.482102,0.14456,0.594515,-0.655989
9,ma_diff,"{'lookback': 250, 'skip': 0}",0.024087,0.131057,0.574144,0.571429,0.272005,0.929788,-0.429907,0.105817,0.484588,-0.727508
8,ma_diff,"{'lookback': 180, 'skip': 0}",0.020173,0.11169,0.596513,0.593985,0.29488,0.987734,-0.486185,0.126608,0.54366,-0.738571
7,ma_diff,"{'lookback': 150, 'skip': 0}",0.01818,0.103264,0.585909,0.724812,0.336154,1.074477,-0.469736,0.195007,0.729147,-0.753514
18,trend_annret_r2,"{'lookback': 126, 'ann_factor': 252}",0.017649,0.104421,0.561425,0.538346,0.258268,0.899971,-0.480629,0.107434,0.49274,-0.715912
29,rsi,{'lookback': 60},0.013687,0.084193,0.570738,0.466165,0.314864,1.1532,-0.341944,0.153543,0.674054,-0.480962
14,mom_ret,"{'lookback': 126, 'skip': 0}",0.012725,0.07215,0.571956,0.309774,0.328499,1.036491,-0.469423,0.128238,0.547841,-0.720653
6,ma_diff,"{'lookback': 120, 'skip': 0}",0.012641,0.07194,0.571779,0.249624,0.345307,1.067949,-0.489061,0.115744,0.512308,-0.666071
28,rsi,{'lookback': 50},0.011485,0.071681,0.564536,0.33985,0.309361,1.119839,-0.368065,0.110729,0.532633,-0.482112


In [40]:
# Same sweep with H=10. Note that `cfg` and `summary` are overwritten by each sweep cell.
cfg = SignalTestConfig(H=10, n_buckets=20, use_price_floor=True)
summary = sweep_signals(hsci_md, tests, cfg)
summary

  return float(eq ** (freq / len(r)) - 1.0)
  return float(eq ** (freq / len(r)) - 1.0)


Unnamed: 0,signal,params,ic_mean,icir,hit_rate,mono,top_annret,top_sharpe,top_maxdd,ls_annret,ls_sharpe,ls_maxdd
19,trend_annret_r2,"{'lookback': 252, 'ann_factor': 252}",0.034712,0.193082,0.597964,0.595489,0.291524,1.011386,-0.428468,0.16705,0.649723,-0.667917
15,mom_12_1,"{'lookback': 252, 'skip': 21}",0.029651,0.158353,0.56266,0.536842,0.277657,0.941407,-0.444957,0.121561,0.52285,-0.619323
9,ma_diff,"{'lookback': 250, 'skip': 0}",0.026967,0.142146,0.581218,0.547368,0.279234,0.972835,-0.427003,0.104846,0.478468,-0.704453
8,ma_diff,"{'lookback': 180, 'skip': 0}",0.022747,0.122556,0.59601,0.485714,0.31778,1.055622,-0.447972,0.155948,0.613118,-0.700373
7,ma_diff,"{'lookback': 150, 'skip': 0}",0.02091,0.115799,0.584158,0.57594,0.33504,1.098336,-0.448613,0.198619,0.727705,-0.751691
18,trend_annret_r2,"{'lookback': 126, 'ann_factor': 252}",0.018273,0.108234,0.576355,0.475188,0.284131,0.98191,-0.487457,0.12552,0.544985,-0.67408
29,rsi,{'lookback': 60},0.015924,0.096073,0.578692,0.604511,0.323264,1.240963,-0.41985,0.161084,0.700336,-0.399888
14,mom_ret,"{'lookback': 126, 'skip': 0}",0.014752,0.081782,0.571429,0.442105,0.35331,1.119202,-0.459014,0.171913,0.664677,-0.62473
28,rsi,{'lookback': 50},0.013548,0.083072,0.570048,0.330827,0.333932,1.253796,-0.431255,0.16739,0.71935,-0.487871
6,ma_diff,"{'lookback': 120, 'skip': 0}",0.013323,0.074326,0.55774,0.333835,0.359805,1.130868,-0.475793,0.156132,0.619913,-0.589194


In [41]:
# Same sweep with H=20. Note that `cfg` and `summary` are overwritten by each sweep cell.
cfg = SignalTestConfig(H=20, n_buckets=20, use_price_floor=True)
summary = sweep_signals(hsci_md, tests, cfg)
summary

  return float(eq ** (freq / len(r)) - 1.0)
  return float(eq ** (freq / len(r)) - 1.0)


Unnamed: 0,signal,params,ic_mean,icir,hit_rate,mono,top_annret,top_sharpe,top_maxdd,ls_annret,ls_sharpe,ls_maxdd
19,trend_annret_r2,"{'lookback': 252, 'ann_factor': 252}",0.039134,0.218455,0.627551,0.894737,0.301257,1.030022,-0.424485,0.165971,0.656345,-0.661213
15,mom_12_1,"{'lookback': 252, 'skip': 21}",0.033387,0.181908,0.635897,0.639098,0.27835,0.947223,-0.445795,0.07828,0.419293,-0.709937
9,ma_diff,"{'lookback': 250, 'skip': 0}",0.027875,0.15049,0.591837,0.654135,0.268001,0.911987,-0.440924,0.092941,0.458917,-0.727074
8,ma_diff,"{'lookback': 180, 'skip': 0}",0.026833,0.146157,0.62,0.757895,0.299395,1.000867,-0.436431,0.115627,0.53168,-0.758223
7,ma_diff,"{'lookback': 150, 'skip': 0}",0.022946,0.130374,0.60199,0.809023,0.335641,1.078944,-0.450704,0.162796,0.642673,-0.795385
29,rsi,{'lookback': 60},0.019881,0.122876,0.601942,0.766917,0.2603,1.052978,-0.371727,0.109089,0.54889,-0.377516
18,trend_annret_r2,"{'lookback': 126, 'ann_factor': 252}",0.018696,0.115937,0.60396,0.584962,0.263619,0.9203,-0.496843,0.123594,0.545111,-0.664101
28,rsi,{'lookback': 50},0.016682,0.104429,0.606796,0.78797,0.253073,1.024086,-0.370518,0.089662,0.479918,-0.481046
14,mom_ret,"{'lookback': 126, 'skip': 0}",0.016071,0.092361,0.59901,0.666165,0.311123,1.002491,-0.463058,0.149343,0.611851,-0.695034
6,ma_diff,"{'lookback': 120, 'skip': 0}",0.015811,0.092154,0.571429,0.545865,0.323673,1.038828,-0.456869,0.14175,0.590356,-0.676931


In [14]:
# Re-read the constituents table and count the constituents per industry label.
hsci_components = pd.read_csv(get_processed_dir() / 'hsci_components.csv')
hsci_components['Industry'].value_counts()

Industry
非必需性消費    104
醫療保健業      73
工業         60
地產建築業      59
金融業        48
資訊科技業      47
必需性消費      37
公用事業       27
原材料業       23
能源業        16
電訊業         8
綜合企業        6
Name: count, dtype: int64

## Market Cap

In [23]:
# 1. Build the ticker list from the constituents table.
tickers = hsci_components['Stock Code'].apply(code_int_to_hk)

# 2. Fetch the data ticker by ticker
data = []
for ticker in tickers:
    try:
        stock = yf.Ticker(ticker)
        # Read the market cap from the ticker's info mapping
        # (presumably dict-like, so a missing key gives None -- verify against yfinance)
        mkt_cap = stock.info.get('marketCap')
        data.append({'Ticker': ticker, 'Market Cap': mkt_cap})
        print(f"Fetched {ticker}: {mkt_cap}")
    except Exception as e:
        # Best effort: a failing ticker is reported and left out of `data`.
        print(f"Error fetching {ticker}: {e}")

# 3. Convert the collected rows to a DataFrame and display it
mkt_cp_df = pd.DataFrame(data)
mkt_cp_df

Unnamed: 0,Ticker,Market Cap
0,0883.HK,1076075954176
1,0857.HK,2020873863168
2,1088.HK,1117549363200
3,0386.HK,793553928192
4,1171.HK,156522496000
...,...,...
503,2169.HK,3080000000
504,2570.HK,6173595648
505,3677.HK,23091965952
506,2582.HK,4188262656


In [25]:
# 1. Prepare the Shares Estimation Series
# Index the market-cap snapshot by 'Ticker' so it aligns with the columns of hsci_md.close
current_data = mkt_cp_df.set_index('Ticker')

# Take the last row of the close matrix as the "current" price of every stock.
# NOTE(review): this assumes the last row is concurrent with the market-cap
# snapshot and is non-NaN for every ticker -- confirm; a NaN here makes the
# whole column of historical_mcap NaN.
latest_prices = hsci_md.close.iloc[-1]

# Calculate Shares Outstanding (Series)
# Note: This automatically aligns indices. If a ticker is in one but not the other, it becomes NaN.
estimated_shares = current_data['Market Cap'] / latest_prices
estimated_shares.name = 'Estimated Shares'

# 2. Calculate Historical Market Cap (The Matrix Operation)
# Multiply every price column by that ticker's estimated share count (axis=1 aligns on columns).
# The share count is a single constant per ticker applied to the whole history.
historical_mcap = hsci_md.close.mul(estimated_shares, axis=1)

# ----------------------------------------------------
# Check the result
historical_mcap

Ticker,0883.HK,0857.HK,1088.HK,0386.HK,1171.HK,1898.HK,2883.HK,3668.HK,2386.HK,0639.HK,...,1052.HK,9699.HK,2510.HK,0636.HK,1341.HK,2169.HK,2570.HK,3677.HK,2582.HK,1333.HK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-02,1.374674e+11,7.748314e+11,1.389736e+11,2.154752e+11,1.042952e+10,5.770609e+10,4.100511e+10,,,2.178641e+09,...,1.385069e+09,,,,,,,,,
2009-01-05,1.483343e+11,8.286388e+11,1.437658e+11,2.285079e+11,1.143771e+10,6.246352e+10,4.317597e+10,,,2.224267e+09,...,1.426681e+09,,,,,,,,,
2009-01-06,1.485155e+11,8.415528e+11,1.471203e+11,2.263359e+11,1.176798e+10,6.607565e+10,4.190963e+10,,,2.121608e+09,...,1.450458e+09,,,,,,,,,
2009-01-07,1.430819e+11,8.017350e+11,1.463216e+11,2.180817e+11,1.147247e+10,6.237542e+10,4.040211e+10,,,2.110203e+09,...,1.432625e+09,,,,,,,,,
2009-01-08,1.334828e+11,7.597651e+11,1.413697e+11,2.050490e+11,1.051644e+10,5.638456e+10,3.636189e+10,,,2.155828e+09,...,1.355347e+09,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2026-01-08,1.027389e+12,1.949923e+12,1.099810e+12,7.688583e+11,1.546811e+11,1.738592e+11,6.320349e+10,3.517899e+10,3.297380e+10,1.649228e+10,...,7.612924e+09,9.649294e+09,1.355430e+10,1.276316e+10,1.039567e+09,3.080000e+09,5.477917e+09,2.197503e+10,3.478722e+09,1.035507e+10
2026-01-09,1.040306e+12,1.981729e+12,1.103573e+12,7.721510e+11,1.539728e+11,1.748058e+11,6.406692e+10,3.562730e+10,3.361615e+10,1.690203e+10,...,7.612924e+09,1.076780e+10,1.411907e+10,1.285082e+10,1.000583e+09,3.122192e+09,5.456041e+09,2.249453e+10,3.434375e+09,1.046746e+10
2026-01-12,1.040306e+12,1.984175e+12,1.103573e+12,7.606263e+11,1.532646e+11,1.735437e+11,6.389424e+10,3.594376e+10,3.121805e+10,1.654350e+10,...,7.696400e+09,1.064352e+10,1.413568e+10,1.286836e+10,1.091545e+09,2.953425e+09,5.639804e+09,2.254649e+10,3.695526e+09,1.100694e+10
2026-01-13,1.067133e+12,2.011088e+12,1.114324e+12,7.655655e+11,1.542561e+11,1.733859e+11,6.622551e+10,3.649755e+10,3.216016e+10,1.690203e+10,...,7.629620e+09,1.029732e+10,1.425195e+10,1.286836e+10,1.208496e+09,2.953425e+09,5.512920e+09,2.262441e+10,3.629006e+09,9.291087e+09


In [37]:
# Target files for the market-cap matrix and the estimated share counts.
mkt_cap_path = get_processed_dir() / 'hsci_mkt_cp.csv'
estimated_shares_path = get_processed_dir() / 'hsci_estimated_shares.csv'

In [38]:
# historical_mcap.to_csv(mkt_cap_path)
# estimated_shares.to_csv(estimated_shares_path)