In [1]:
from pathlib import Path
import json
import random
import numpy as np
import pandas as pd
import pyarrow.dataset as ds
import xgboost as xgb

# config
DEVICE = "cuda" # set to "cpu" if needed
ROOT = Path("processed_data2")
TARGET = "stock_ret"
ID_COLS = ["gvkey","iid","excntry"]
NON_FEATURES = {"id","date","gvkey","iid","excntry","year","month","char_date","char_eom","ret_eom","stock_ret","prc","y","m"}
TRAIN_START = 2005
OOS_START, OOS_END = 2015, 2025
LOG_DIR = Path("train_logs")
LOG_DIR.mkdir(exist_ok=True)

# tuning config
random.seed(0)
TUNE_YEAR = 2018
N_TRIALS = 20
WARM_ROUNDS = 200
EARLY_STOP = 30
PRUNE_MARGIN = 0.01

In [4]:
# Read factor_char_list.csv into a set
factor_chars = set(pd.read_csv("factor_char_list.csv", header=None).iloc[:,0].astype(str).str.strip())

# Write to non-features.txt every row in column_names.txt but not in factor_char_list.csv
with open("non-features.txt", "w") as out:
    for col in cols:
        if col not in factor_chars:
            out.write(col + "\n")

NameError: name 'cols' is not defined

In [10]:
for year in range(TRAIN_START, OOS_END + 1):
    for month in months_in(year):
        df = pd.read_parquet(ROOT / f"y={year}" / f"m={month}" / "part-0.parquet")
        idx = df['stock_ret'].abs().idxmax()
        val = df.loc[idx, 'stock_ret']
        print(f"Year {year} Month {month}: max magnitude stock_ret = {val}")

In [11]:
with open("column_names.txt","r") as f:
    cols = [c.strip() for c in f if c.strip()]
    FEATURES = [c for c in cols if c not in NON_FEATURES]

def dataset():
    return ds.dataset(str(ROOT), format="parquet", partitioning="hive")

def months_in(year):
    base = ROOT / f"y={year}"
    if not base.exists():
        return []
    return [m for m in range(1,13) if (base / f"m={m}").exists()]

class ArrowMonthIter(xgb.core.DataIter):
    def __init__(self, y_start, y_end, features, target, batch_size=131072):
        super().__init__()
        self.y_start = y_start
        self.y_end = y_end
        self.features = features
        self.target = target
        self.batch_size = batch_size
        self._plan = [(y, m) for y in range(y_start, y_end+1) for m in months_in(y)]
        self._d = None
        self._idx = 0
        self._batches = None
    def reset(self):
        self._d = dataset()
        self._idx = 0
        self._batches = None
    def next(self, input_data):
        while True:
            if self._batches is None:
                if self._idx >= len(self._plan):
                    return 0
                y, m = self._plan[self._idx]
                self._idx += 1
                flt = (ds.field("y")==y) & (ds.field("m")==m)
                cols = self.features + [self.target]
                scn = self._d.scanner(columns=cols, filter=flt, batch_size=self.batch_size)
                self._batches = iter(scn.to_batches())
            try:
                rb = next(self._batches)
            except StopIteration:
                self._batches = None
                continue
            X = np.column_stack([rb.column(n).to_numpy(zero_copy_only=False) for n in self.features]).astype(np.float32, copy=False)
            y = rb.column(self.target).to_numpy(zero_copy_only=False).astype(np.float32, copy=False)
            input_data(data=X, label=y)
            return 1

def _build_dmats(train_end, val_start, val_end, max_bin):
    it_tr = ArrowMonthIter(TRAIN_START, train_end, FEATURES, TARGET)
    it_va = ArrowMonthIter(val_start,   val_end,   FEATURES, TARGET)
    dtr = xgb.QuantileDMatrix(it_tr, missing=np.nan, max_bin=max_bin)
    dva = xgb.QuantileDMatrix(it_va, ref=dtr, missing=np.nan, max_bin=max_bin)
    return dtr, dva

def _train_until(dtr, dva, params, max_rounds):
    bst = xgb.train(params, dtr, num_boost_round=max_rounds, evals=[(dva,"val")], early_stopping_rounds=EARLY_STOP, verbose_eval=False)
    rmse = float(bst.best_score)
    best_it = bst.best_iteration if bst.best_iteration is not None else max_rounds - 1
    yhat = bst.predict(dva, iteration_range=(0, best_it + 1))
    sigma_pred = float(np.std(yhat))
    return bst, rmse, best_it, sigma_pred

def tune_once():
    y = TUNE_YEAR
    base = {
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "device": DEVICE,
        "eval_metric": "rmse",
        "lambda": 5.0,
        "alpha": 0.0,
        "seed": 0,
        "nthread": -1,
        "colsample_bytree": 0.8
    }

    mb_candidates = [256, 512]
    mcw_candidates = [1, 20]
    depths = [4, 6]
    etas = [0.03, 0.05]
    subs = [0.7, 0.9]

    trials = [{
        "max_depth": random.choice(depths),
        "eta": random.choice(etas),
        "subsample": random.choice(subs),
        "min_child_weight": random.choice(mcw_candidates),
        "max_bin": random.choice(mb_candidates)
    } for _ in range(N_TRIALS)]

    dmat_cache = {}
    def get_dmats(mb):
        if mb not in dmat_cache:
            dmat_cache[mb] = _build_dmats(y-3, y-2, y-1, mb)
        return dmat_cache[mb]

    best_rmse = float("inf")
    chosen = None
    for t in trials:
        params = dict(base, **t)
        dtr, dva = get_dmats(t["max_bin"])
        _, warm_rmse, _, _ = _train_until(dtr, dva, params, WARM_ROUNDS)
        if chosen is not None and warm_rmse > best_rmse * (1.0 + PRUNE_MARGIN):
            continue
        bst, rmse, best_it, sig = _train_until(dtr, dva, params, 2000)
        if rmse < best_rmse:
            best_rmse = rmse
            chosen = {"params": params, "best_iteration": best_it, "sigma_pred_val": sig}

    if chosen is None:
        chosen = {"params": dict(base, max_depth=6, eta=0.05, subsample=0.9, min_child_weight=20, max_bin=256),
                  "best_iteration": 1000, "sigma_pred_val": None}

    (LOG_DIR / "global_best.json").write_text(json.dumps(chosen, indent=2))
    print("tuned_params:", chosen)
    return chosen

def get_best_params():
    p = LOG_DIR / "global_best.json"
    if p.exists():
        return json.loads(p.read_text())
    return tune_once()

def fit_for_oos_year(oos_year):
    best = get_best_params()
    mb = best["params"]["max_bin"]
    dtr, dva = _build_dmats(oos_year - 3, oos_year - 2, oos_year - 1, mb)
    params = best["params"]
    bst = xgb.train(params, dtr, num_boost_round=5000, evals=[(dva,"val")],
                    early_stopping_rounds=EARLY_STOP, verbose_eval=False)
    return bst

def predict_year(bst, year, out_root: Path):
    out_root.mkdir(exist_ok=True)
    for m in months_in(year):
        d = dataset()
        flt = (ds.field("y")==year) & (ds.field("m")==m)
        cols = FEATURES + ID_COLS + ["y","m"]
        rbatches = d.scanner(columns=cols, filter=flt, batch_size=131072).to_batches()
        rows = []
        preds = []
        for rb in rbatches:
            X = np.column_stack([rb.column(name).to_numpy(zero_copy_only=False) for name in FEATURES]).astype(np.float32, copy=False)
            dm = xgb.DMatrix(X, missing=np.nan)
            p = bst.predict(dm)
            ids = {k: rb.column(k).to_numpy(zero_copy_only=False) for k in (ID_COLS + ["y","m"])}
            rows.append(ids)
            preds.append(p.astype(np.float32, copy=False))
        if not rows:
            continue
        ids = {k: np.concatenate([r[k] for r in rows]) for k in rows[0].keys()}
        out = pd.DataFrame(ids)
        out["pred_ret_t1"] = np.concatenate(preds)
        out = out.groupby(ID_COLS + ["y","m"], as_index=False)["pred_ret_t1"].mean()
        sigma_month = float(out["pred_ret_t1"].std())
        print(f"year {year} m {m} sigma_pred_oos {sigma_month:.6f}")
        pdir = out_root / f"y={int(year)}" / f"m={int(m)}"
        pdir.mkdir(parents=True, exist_ok=True)
        out.to_parquet(pdir / "part-0.parquet", index=False)

In [12]:
for year in range(OOS_START, OOS_END+1):
    bst = fit_for_oos_year(year)
    predict_year(bst, year, Path("oos_preds"))
    print(f"year {year}: prediction pass done")

ArrowTypeError: Unable to merge: Field month has incompatible types: int16 vs int32