#### Model Structure
- XGB Classifier
- Fine tuning

In [2]:
import json, gzip
import numpy as np
import xgboost as xgb
import pandas as pd
from pathlib import Path
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import GroupShuffleSplit
import optuna
from sklearn.model_selection import StratifiedGroupKFold
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def parse_json_line(obj):
    (tid, tdata), = obj.items()
    (pos_key, contexts), = tdata.items()
    pos = int(pos_key) if isinstance(pos_key, str) else pos_key
    (ctx7, reads), = contexts.items()
    arr = np.asarray(reads, dtype=float)
    return tid, pos, ctx7, arr

BASE_IDX = {"A":0, "C":1, "G":2, "T":3}
def onehot28(seq7: str) -> np.ndarray:
    out = np.zeros((7,4), dtype=np.int8)
    s = (seq7 or "").upper()
    for i in range(min(7, len(s))):
        j = BASE_IDX.get(s[i], -1)
        if j >= 0:
            out[i, j] = 1
    return out.ravel()

def aggregate_9(arr: np.ndarray) -> np.ndarray:
    if arr.size == 0:
        return np.zeros(45, dtype=np.float32)
    mean = arr.mean(axis=0)
    std  = arr.std(axis=0, ddof=0)
    mn   = arr.min(axis=0)
    mx   = arr.max(axis=0)
    med  = np.median(arr, axis=0)
    return np.concatenate([mean, std, mn, mx, med]).astype(np.float32, copy=False)

NUM_COLS = [
    "dwell_m1","sd_m1","mean_m1",
    "dwell_0","sd_0","mean_0",
    "dwell_p1","sd_p1","mean_p1",
]
FEATURE_NAMES = (
    [f"mean_{c}"   for c in NUM_COLS] +
    [f"std_{c}"    for c in NUM_COLS] +
    [f"min_{c}"    for c in NUM_COLS] +
    [f"max_{c}"    for c in NUM_COLS] +
    [f"median_{c}" for c in NUM_COLS] +
    [f"ctx_{i}"    for i in range(28)]
)


def build_dataset_from_json_objects(json_objects, label_dict, transcript_to_gene):
    """
    json_objects: iterable of parsed per-line dicts
    label_dict: {(transcript_id, position): 0/1}
    Returns X (N,73), y (N,), plus ids for later mapping.
    """
    X_rows, y_rows, ids = [], [], []
    for obj in json_objects:
        tid, pos, ctx7, arr = parse_json_line(obj)
        feats45 = aggregate_9(arr)
        ctx28   = onehot28(ctx7)
        X_rows.append(np.concatenate([feats45, ctx28]))
        y_rows.append(label_dict.get((tid, int(pos)), None))
        gene = transcript_to_gene.get((tid, None))
        ids.append((gene, tid, int(pos)))
    X = np.asarray(X_rows, dtype=np.float32)
    y = np.asarray(y_rows)
    return X, y, ids

def make_data_splits(X, y, ids, test_size=0.3, val_size=0.5, random_state=4262):
    mask = ~pd.isna(y)
    X_tr = X[mask]
    y_tr = y[mask].astype(int)

    groups = np.array([
        str(gene) if gene is not None else tid
        for gene, tid, pos in ids
    ])[mask]

    gss = GroupShuffleSplit(test_size=test_size, random_state=random_state)
    train_idx, temp_idx = next(gss.split(X_tr, y_tr, groups=groups))

    gss2 = GroupShuffleSplit(test_size=val_size, random_state=random_state)
    val_idx, test_idx = next(gss2.split(X_tr[temp_idx], y_tr[temp_idx], groups=groups[temp_idx]))

    train_genes = set(groups[train_idx])
    val_genes   = set(groups[temp_idx][val_idx])
    test_genes  = set(groups[temp_idx][test_idx])
    assert len(train_genes & val_genes) == 0, "Gene overlap between train and val!"
    assert len(train_genes & test_genes) == 0, "Gene overlap between train and test!"
    assert len(val_genes & test_genes) == 0, "Gene overlap between val and test!"
    print(f"Split complete: {len(train_genes)} train genes, {len(val_genes)} val genes, {len(test_genes)} test genes")

    return {
        "X_train": X[train_idx], "y_train": y[train_idx],
        "X_val": X[temp_idx][val_idx], "y_val": y[temp_idx][val_idx],
        "X_test": X[temp_idx][test_idx], "y_test": y[temp_idx][test_idx],
        "ids_train": [ids[i] for i in train_idx],
        "ids_val": [ids[i] for i in temp_idx[val_idx]],
        "ids_test": [ids[i] for i in temp_idx[test_idx]]
    }
    

def iter_json_lines(path: str):
    """Stream NDJSON from .json or .json.gz"""
    p = Path(path)
    if p.suffix == ".gz":
        with gzip.open(p, "rt", encoding="utf-8", errors="replace") as f:
            for line in f:
                s = line.strip()
                if s:
                    yield json.loads(s)
    else:
        with open(p, "r", encoding="utf-8") as f:
            for line in f:
                s = line.strip()
                if s:
                    yield json.loads(s)

In [5]:
df_labels = pd.read_csv("data_task1/data.info.labelled.csv")
label_dict = { (r.transcript_id, int(r.transcript_position)): int(r.label) for r in df_labels.itertuples(index=False)}
transcript_to_gene = dict(zip(df_labels['transcript_id'], df_labels['gene_id']))

train_json_iter = iter_json_lines("data/dataset0.json.gz")
X, y, ids = build_dataset_from_json_objects(train_json_iter, label_dict, transcript_to_gene)
data = make_data_splits(X, y, ids)

X_train = data['X_train']
X_val = data['X_val']
X_test = data['X_test']

y_train = data['y_train']
y_val = data['y_val']
y_test = data['y_test']

ids_train = data['ids_train']
ids_val = data['ids_val']
ids_test = data['ids_test']

Split complete: 3733 train genes, 800 val genes, 800 test genes


### Fine Tuning

In [6]:
def train_and_validate_optuna(X_train, y_train, X_val, y_val, n_trials=40):
    def objective(trial):
        params = {
            "max_depth": trial.suggest_int("max_depth", 4, 7),
            "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.1, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 400, 800),
            "subsample": trial.suggest_float("subsample", 0.7, 0.95),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 0.95),
            "scale_pos_weight": trial.suggest_int("scale_pos_weight", 10, 30),
            "min_child_weight": trial.suggest_int("min_child_weight", 2, 6),
            "gamma": trial.suggest_float("gamma", 0.05, 0.3),
            "reg_alpha": trial.suggest_float("reg_alpha", 0, 1.5),
            "reg_lambda": trial.suggest_float("reg_lambda", 2, 8),

            "tree_method": "hist",
            "eval_metric": "aucpr",
            "random_state": 4262,
            "n_jobs": -1,
        }

        clf = xgb.XGBClassifier(**params)
        clf.fit(X_train, y_train)

        y_pred = clf.predict_proba(X_val)[:, 1]
        auprc = average_precision_score(y_val, y_pred)
        auroc = roc_auc_score(y_val, y_pred)
        score = 0.7 * auprc + 0.3 * auroc

        trial.set_user_attr("auprc", auprc)
        trial.set_user_attr("auroc", auroc)
        return score

    study = optuna.create_study(direction="maximize", study_name="xgb_optuna_cpu")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    best_params = study.best_params
    print(f"Best params: {best_params}")
    print(f"AUPRC: {study.best_trial.user_attrs['auprc']:.4f}, AUROC: {study.best_trial.user_attrs['auroc']:.4f}")
    return best_params


In [7]:
def predict_with_model(model, X_test, y_test=None):
    y_pred = model.predict_proba(X_test)[:, 1]
    if y_test is not None:
        auprc = average_precision_score(y_test, y_pred)
        auroc = roc_auc_score(y_test, y_pred)
        print(f"Test AUPRC: {auprc:.4f}, AUROC: {auroc:.4f}")
        return y_pred, auprc, auroc
    return y_pred

In [8]:
def train_final_groupkfold_with_oof(X, y, ids, params, n_splits=5):
    groups = np.array([gene if gene is not None else tid for gene, tid, pos in ids])
    gkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=4262)

    models = []
    oof_pred = np.zeros(len(y), dtype=float)
    fold_metrics = []

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]

        clf = xgb.XGBClassifier(
            **params,
            tree_method="hist", 
            eval_metric="aucpr", random_state=42 + fold, n_jobs=-1
        )
        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False
        )
        models.append(clf)

        oof_scores = clf.predict_proba(X_va)[:, 1]
        oof_pred[va_idx] = oof_scores

        auprc = average_precision_score(y_va, oof_scores)
        auroc = roc_auc_score(y_va, oof_scores)
        fold_metrics.append((auprc, auroc))
        print(f"Fold {fold}: AUPRC={auprc:.4f}, AUROC={auroc:.4f}")

    oof_auprc = average_precision_score(y, oof_pred)
    oof_auroc = roc_auc_score(y, oof_pred)
    print(f"OOF AUPRC={oof_auprc:.4f}, OOF AUROC={oof_auroc:.4f}")

    return models, oof_pred, fold_metrics


In [9]:
def predict_ensemble(models, X):
    preds = np.zeros(X.shape[0])
    for clf in models:
        preds += clf.predict_proba(X)[:, 1]
    preds /= len(models)
    return preds

In [10]:
best_params = train_and_validate_optuna(X_train, y_train, X_val, y_val, n_trials=40)

[I 2025-11-02 14:57:28,767] A new study created in memory with name: xgb_optuna_cpu
Best trial: 0. Best value: 0.596088:   2%|▎         | 1/40 [00:02<01:46,  2.73s/it]

[I 2025-11-02 14:57:31,496] Trial 0 finished with value: 0.5960882641309806 and parameters: {'max_depth': 7, 'learning_rate': 0.07959435007505138, 'n_estimators': 454, 'subsample': 0.7220072109584907, 'colsample_bytree': 0.848803002568344, 'scale_pos_weight': 24, 'min_child_weight': 4, 'gamma': 0.19451601673668528, 'reg_alpha': 1.4514797766918224, 'reg_lambda': 2.302106046312251}. Best is trial 0 with value: 0.5960882641309806.


Best trial: 1. Best value: 0.607168:   5%|▌         | 2/40 [00:06<02:09,  3.40s/it]

[I 2025-11-02 14:57:35,373] Trial 1 finished with value: 0.6071676140948652 and parameters: {'max_depth': 7, 'learning_rate': 0.043191343686409756, 'n_estimators': 676, 'subsample': 0.7629564017716223, 'colsample_bytree': 0.8489213004347792, 'scale_pos_weight': 17, 'min_child_weight': 4, 'gamma': 0.1476689164249455, 'reg_alpha': 0.10642421829493942, 'reg_lambda': 7.4535018945653935}. Best is trial 1 with value: 0.6071676140948652.


Best trial: 1. Best value: 0.607168:   8%|▊         | 3/40 [00:10<02:16,  3.70s/it]

[I 2025-11-02 14:57:39,419] Trial 2 finished with value: 0.5976664939137026 and parameters: {'max_depth': 7, 'learning_rate': 0.06323409146914999, 'n_estimators': 687, 'subsample': 0.7456526275926706, 'colsample_bytree': 0.8044082475996461, 'scale_pos_weight': 29, 'min_child_weight': 3, 'gamma': 0.16132891236339703, 'reg_alpha': 1.3724838536946333, 'reg_lambda': 6.939297888648533}. Best is trial 1 with value: 0.6071676140948652.


Best trial: 1. Best value: 0.607168:  10%|█         | 4/40 [00:12<01:43,  2.87s/it]

[I 2025-11-02 14:57:41,009] Trial 3 finished with value: 0.5916384415260141 and parameters: {'max_depth': 4, 'learning_rate': 0.07497257719375995, 'n_estimators': 551, 'subsample': 0.8209819278199126, 'colsample_bytree': 0.838069409622193, 'scale_pos_weight': 25, 'min_child_weight': 2, 'gamma': 0.08096970646132669, 'reg_alpha': 1.1169894283871658, 'reg_lambda': 7.41255915607997}. Best is trial 1 with value: 0.6071676140948652.


Best trial: 1. Best value: 0.607168:  12%|█▎        | 5/40 [00:13<01:23,  2.40s/it]

[I 2025-11-02 14:57:42,576] Trial 4 finished with value: 0.5936270989744559 and parameters: {'max_depth': 5, 'learning_rate': 0.03311821216046574, 'n_estimators': 422, 'subsample': 0.878595647519249, 'colsample_bytree': 0.7792945590183631, 'scale_pos_weight': 30, 'min_child_weight': 2, 'gamma': 0.2464747594929519, 'reg_alpha': 0.6356504992290007, 'reg_lambda': 7.2958261295555005}. Best is trial 1 with value: 0.6071676140948652.


Best trial: 1. Best value: 0.607168:  15%|█▌        | 6/40 [00:15<01:13,  2.16s/it]

[I 2025-11-02 14:57:44,282] Trial 5 finished with value: 0.5945444147831738 and parameters: {'max_depth': 4, 'learning_rate': 0.060598482398223096, 'n_estimators': 596, 'subsample': 0.833450498975388, 'colsample_bytree': 0.920663694356275, 'scale_pos_weight': 13, 'min_child_weight': 6, 'gamma': 0.07424456628597308, 'reg_alpha': 0.9829478625510004, 'reg_lambda': 4.564473771121869}. Best is trial 1 with value: 0.6071676140948652.


Best trial: 1. Best value: 0.607168:  18%|█▊        | 7/40 [00:17<01:14,  2.26s/it]

[I 2025-11-02 14:57:46,748] Trial 6 finished with value: 0.5989179041812103 and parameters: {'max_depth': 6, 'learning_rate': 0.07942573969928236, 'n_estimators': 542, 'subsample': 0.7509454772658051, 'colsample_bytree': 0.7569363733863066, 'scale_pos_weight': 12, 'min_child_weight': 3, 'gamma': 0.2996544129312797, 'reg_alpha': 0.39784159304337763, 'reg_lambda': 4.891539394527886}. Best is trial 1 with value: 0.6071676140948652.


Best trial: 1. Best value: 0.607168:  20%|██        | 8/40 [00:20<01:12,  2.27s/it]

[I 2025-11-02 14:57:49,050] Trial 7 finished with value: 0.5968552806172562 and parameters: {'max_depth': 5, 'learning_rate': 0.049607642590673226, 'n_estimators': 655, 'subsample': 0.7203688395486995, 'colsample_bytree': 0.7585152635092159, 'scale_pos_weight': 27, 'min_child_weight': 3, 'gamma': 0.2458292230116954, 'reg_alpha': 0.7770031242325595, 'reg_lambda': 4.0354757475589444}. Best is trial 1 with value: 0.6071676140948652.


Best trial: 1. Best value: 0.607168:  22%|██▎       | 9/40 [00:22<01:08,  2.22s/it]

[I 2025-11-02 14:57:51,135] Trial 8 finished with value: 0.5993381067400885 and parameters: {'max_depth': 6, 'learning_rate': 0.06418532435419208, 'n_estimators': 462, 'subsample': 0.917743529448872, 'colsample_bytree': 0.8093652621594376, 'scale_pos_weight': 19, 'min_child_weight': 6, 'gamma': 0.1453377131079615, 'reg_alpha': 0.7088138650721865, 'reg_lambda': 2.3762775496667627}. Best is trial 1 with value: 0.6071676140948652.


Best trial: 1. Best value: 0.607168:  25%|██▌       | 10/40 [00:24<01:08,  2.27s/it]

[I 2025-11-02 14:57:53,538] Trial 9 finished with value: 0.5977216226790623 and parameters: {'max_depth': 5, 'learning_rate': 0.05087480177102686, 'n_estimators': 680, 'subsample': 0.8189728544699728, 'colsample_bytree': 0.7379126400191841, 'scale_pos_weight': 19, 'min_child_weight': 5, 'gamma': 0.14507034435946575, 'reg_alpha': 0.7840801690647352, 'reg_lambda': 6.697223249608281}. Best is trial 1 with value: 0.6071676140948652.


Best trial: 10. Best value: 0.608967:  28%|██▊       | 11/40 [00:29<01:25,  2.97s/it]

[I 2025-11-02 14:57:58,073] Trial 10 finished with value: 0.6089671552579154 and parameters: {'max_depth': 7, 'learning_rate': 0.03628941294279358, 'n_estimators': 777, 'subsample': 0.779093668134451, 'colsample_bytree': 0.8968086534202462, 'scale_pos_weight': 16, 'min_child_weight': 5, 'gamma': 0.12432908393529907, 'reg_alpha': 0.011261766631205386, 'reg_lambda': 6.105666996416587}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 10. Best value: 0.608967:  30%|███       | 12/40 [00:33<01:37,  3.47s/it]

[I 2025-11-02 14:58:02,701] Trial 11 finished with value: 0.6084168216762438 and parameters: {'max_depth': 7, 'learning_rate': 0.036369686975115674, 'n_estimators': 795, 'subsample': 0.7783854041977095, 'colsample_bytree': 0.898576265741575, 'scale_pos_weight': 16, 'min_child_weight': 5, 'gamma': 0.09601202991024227, 'reg_alpha': 0.060023474785748876, 'reg_lambda': 5.936797728039791}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 10. Best value: 0.608967:  32%|███▎      | 13/40 [00:37<01:36,  3.58s/it]

[I 2025-11-02 14:58:06,532] Trial 12 finished with value: 0.6077740602589481 and parameters: {'max_depth': 6, 'learning_rate': 0.030611070154611984, 'n_estimators': 789, 'subsample': 0.7819258888009828, 'colsample_bytree': 0.9182012103553936, 'scale_pos_weight': 15, 'min_child_weight': 5, 'gamma': 0.10907642049943699, 'reg_alpha': 0.014968245081901269, 'reg_lambda': 5.822782399969007}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 10. Best value: 0.608967:  35%|███▌      | 14/40 [00:42<01:41,  3.90s/it]

[I 2025-11-02 14:58:11,159] Trial 13 finished with value: 0.60353349202021 and parameters: {'max_depth': 7, 'learning_rate': 0.03754728287963414, 'n_estimators': 797, 'subsample': 0.7991118764325762, 'colsample_bytree': 0.8798803454160865, 'scale_pos_weight': 22, 'min_child_weight': 5, 'gamma': 0.050267192279713935, 'reg_alpha': 0.2576899207278527, 'reg_lambda': 5.8104654734892005}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 10. Best value: 0.608967:  38%|███▊      | 15/40 [00:46<01:42,  4.08s/it]

[I 2025-11-02 14:58:15,677] Trial 14 finished with value: 0.6069904357110413 and parameters: {'max_depth': 7, 'learning_rate': 0.09884296921293731, 'n_estimators': 732, 'subsample': 0.861615969306073, 'colsample_bytree': 0.9499968775529022, 'scale_pos_weight': 15, 'min_child_weight': 5, 'gamma': 0.1102051366594012, 'reg_alpha': 0.3338939096277604, 'reg_lambda': 5.975088679990488}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 10. Best value: 0.608967:  40%|████      | 16/40 [00:50<01:32,  3.85s/it]

[I 2025-11-02 14:58:18,994] Trial 15 finished with value: 0.6063621128999128 and parameters: {'max_depth': 6, 'learning_rate': 0.04044162898939562, 'n_estimators': 744, 'subsample': 0.7831836905736084, 'colsample_bytree': 0.8835018272182061, 'scale_pos_weight': 10, 'min_child_weight': 6, 'gamma': 0.11235377680718558, 'reg_alpha': 0.4854678813429273, 'reg_lambda': 3.6435866328291473}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 10. Best value: 0.608967:  42%|████▎     | 17/40 [00:54<01:30,  3.95s/it]

[I 2025-11-02 14:58:23,186] Trial 16 finished with value: 0.6057703598951665 and parameters: {'max_depth': 7, 'learning_rate': 0.03521928972810725, 'n_estimators': 745, 'subsample': 0.8581655628001583, 'colsample_bytree': 0.706486220646416, 'scale_pos_weight': 16, 'min_child_weight': 4, 'gamma': 0.19202002966155235, 'reg_alpha': 0.1748521376615425, 'reg_lambda': 6.122386581321157}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 10. Best value: 0.608967:  45%|████▌     | 18/40 [00:57<01:19,  3.60s/it]

[I 2025-11-02 14:58:25,966] Trial 17 finished with value: 0.5998663017056405 and parameters: {'max_depth': 6, 'learning_rate': 0.04307769377132303, 'n_estimators': 620, 'subsample': 0.9479120018992986, 'colsample_bytree': 0.8837147398763955, 'scale_pos_weight': 21, 'min_child_weight': 5, 'gamma': 0.08427122439429818, 'reg_alpha': 0.029356957450411253, 'reg_lambda': 5.098071154227484}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 10. Best value: 0.608967:  48%|████▊     | 19/40 [01:01<01:21,  3.87s/it]

[I 2025-11-02 14:58:30,455] Trial 18 finished with value: 0.6032286089984671 and parameters: {'max_depth': 7, 'learning_rate': 0.03049393914812707, 'n_estimators': 762, 'subsample': 0.7033376685311576, 'colsample_bytree': 0.9168287315860947, 'scale_pos_weight': 18, 'min_child_weight': 6, 'gamma': 0.11058913959997452, 'reg_alpha': 0.5088976016611771, 'reg_lambda': 6.519641761304763}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 10. Best value: 0.608967:  50%|█████     | 20/40 [01:05<01:14,  3.71s/it]

[I 2025-11-02 14:58:33,792] Trial 19 finished with value: 0.6039047175238978 and parameters: {'max_depth': 6, 'learning_rate': 0.04600026294560859, 'n_estimators': 717, 'subsample': 0.7970144443049696, 'colsample_bytree': 0.9400309620033362, 'scale_pos_weight': 12, 'min_child_weight': 5, 'gamma': 0.18785347476403824, 'reg_alpha': 0.230039440923695, 'reg_lambda': 5.3581279379500195}. Best is trial 10 with value: 0.6089671552579154.


Best trial: 20. Best value: 0.609735:  52%|█████▎    | 21/40 [01:09<01:16,  4.00s/it]

[I 2025-11-02 14:58:38,482] Trial 20 finished with value: 0.609735019006661 and parameters: {'max_depth': 7, 'learning_rate': 0.03793545463887945, 'n_estimators': 789, 'subsample': 0.7595681679033139, 'colsample_bytree': 0.8675763224991409, 'scale_pos_weight': 14, 'min_child_weight': 4, 'gamma': 0.12781693454634455, 'reg_alpha': 0.955439676811904, 'reg_lambda': 7.934369633006156}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  55%|█████▌    | 22/40 [01:14<01:16,  4.27s/it]

[I 2025-11-02 14:58:43,369] Trial 21 finished with value: 0.6071468224018953 and parameters: {'max_depth': 7, 'learning_rate': 0.03749199306603081, 'n_estimators': 799, 'subsample': 0.7636845431004183, 'colsample_bytree': 0.8718214593220284, 'scale_pos_weight': 14, 'min_child_weight': 4, 'gamma': 0.13165422566750648, 'reg_alpha': 0.9929919000371957, 'reg_lambda': 7.84529606714598}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  57%|█████▊    | 23/40 [01:19<01:15,  4.41s/it]

[I 2025-11-02 14:58:48,118] Trial 22 finished with value: 0.608359896182954 and parameters: {'max_depth': 7, 'learning_rate': 0.0345768874491686, 'n_estimators': 766, 'subsample': 0.7371562697332793, 'colsample_bytree': 0.9008009395922879, 'scale_pos_weight': 10, 'min_child_weight': 4, 'gamma': 0.053249873854139804, 'reg_alpha': 1.1416110307350422, 'reg_lambda': 6.501347844648919}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  60%|██████    | 24/40 [01:23<01:10,  4.38s/it]

[I 2025-11-02 14:58:52,407] Trial 23 finished with value: 0.6075587293527976 and parameters: {'max_depth': 7, 'learning_rate': 0.03991056461885927, 'n_estimators': 710, 'subsample': 0.773597769388053, 'colsample_bytree': 0.8635802191368299, 'scale_pos_weight': 17, 'min_child_weight': 4, 'gamma': 0.09923323517214501, 'reg_alpha': 0.9587943130913603, 'reg_lambda': 7.97635538640456}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  62%|██████▎   | 25/40 [01:27<01:01,  4.12s/it]

[I 2025-11-02 14:58:55,947] Trial 24 finished with value: 0.6013943664324491 and parameters: {'max_depth': 6, 'learning_rate': 0.053996412764964696, 'n_estimators': 783, 'subsample': 0.7934044175449321, 'colsample_bytree': 0.8183667305537421, 'scale_pos_weight': 13, 'min_child_weight': 5, 'gamma': 0.12736811239737308, 'reg_alpha': 1.28412398302405, 'reg_lambda': 3.1050736202836724}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  65%|██████▌   | 26/40 [01:31<00:56,  4.05s/it]

[I 2025-11-02 14:58:59,824] Trial 25 finished with value: 0.6008941096127945 and parameters: {'max_depth': 7, 'learning_rate': 0.046213964539600375, 'n_estimators': 630, 'subsample': 0.8118378185023829, 'colsample_bytree': 0.900975039367527, 'scale_pos_weight': 16, 'min_child_weight': 3, 'gamma': 0.17575161139744114, 'reg_alpha': 0.5851037249857456, 'reg_lambda': 5.415556519500345}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  68%|██████▊   | 27/40 [01:35<00:52,  4.06s/it]

[I 2025-11-02 14:59:03,905] Trial 26 finished with value: 0.604096451251082 and parameters: {'max_depth': 6, 'learning_rate': 0.03339920719543355, 'n_estimators': 762, 'subsample': 0.8425112962930783, 'colsample_bytree': 0.8609340124878594, 'scale_pos_weight': 20, 'min_child_weight': 5, 'gamma': 0.1304893397850349, 'reg_alpha': 0.8803146676945055, 'reg_lambda': 4.514688742866218}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  70%|███████   | 28/40 [01:39<00:49,  4.16s/it]

[I 2025-11-02 14:59:08,284] Trial 27 finished with value: 0.6066887416365493 and parameters: {'max_depth': 7, 'learning_rate': 0.03755585387467653, 'n_estimators': 724, 'subsample': 0.7291136652666698, 'colsample_bytree': 0.8994343329382851, 'scale_pos_weight': 14, 'min_child_weight': 4, 'gamma': 0.22315079440515295, 'reg_alpha': 0.3742678989143774, 'reg_lambda': 7.150321059971974}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  72%|███████▎  | 29/40 [01:42<00:40,  3.68s/it]

[I 2025-11-02 14:59:10,849] Trial 28 finished with value: 0.6007693320262164 and parameters: {'max_depth': 5, 'learning_rate': 0.030178200121022625, 'n_estimators': 699, 'subsample': 0.7583589350910245, 'colsample_bytree': 0.8347299693261271, 'scale_pos_weight': 11, 'min_child_weight': 6, 'gamma': 0.08989793441676691, 'reg_alpha': 0.14432535308433303, 'reg_lambda': 6.398881598224122}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  75%|███████▌  | 30/40 [01:45<00:35,  3.59s/it]

[I 2025-11-02 14:59:14,217] Trial 29 finished with value: 0.6033836284738834 and parameters: {'max_depth': 7, 'learning_rate': 0.04027490835628313, 'n_estimators': 560, 'subsample': 0.7151481078862456, 'colsample_bytree': 0.8503030298769512, 'scale_pos_weight': 22, 'min_child_weight': 5, 'gamma': 0.06951258030743693, 'reg_alpha': 1.181098150606498, 'reg_lambda': 5.5640805468127725}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  78%|███████▊  | 31/40 [01:50<00:35,  3.90s/it]

[I 2025-11-02 14:59:18,857] Trial 30 finished with value: 0.6040135909951605 and parameters: {'max_depth': 7, 'learning_rate': 0.057848324194623275, 'n_estimators': 761, 'subsample': 0.8029059203799065, 'colsample_bytree': 0.9277263750271689, 'scale_pos_weight': 17, 'min_child_weight': 4, 'gamma': 0.16574982825832094, 'reg_alpha': 1.4808111222942753, 'reg_lambda': 6.815478663516886}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  80%|████████  | 32/40 [01:54<00:33,  4.13s/it]

[I 2025-11-02 14:59:23,513] Trial 31 finished with value: 0.6085363674866182 and parameters: {'max_depth': 7, 'learning_rate': 0.034194701310782484, 'n_estimators': 772, 'subsample': 0.7328237501168033, 'colsample_bytree': 0.9024561784153297, 'scale_pos_weight': 10, 'min_child_weight': 4, 'gamma': 0.05238869086889375, 'reg_alpha': 1.2758542023985124, 'reg_lambda': 6.436290425315649}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  82%|████████▎ | 33/40 [01:59<00:30,  4.30s/it]

[I 2025-11-02 14:59:28,197] Trial 32 finished with value: 0.6089294000823848 and parameters: {'max_depth': 7, 'learning_rate': 0.032832468663248904, 'n_estimators': 774, 'subsample': 0.7400124781293365, 'colsample_bytree': 0.8926629731782473, 'scale_pos_weight': 12, 'min_child_weight': 4, 'gamma': 0.06407328310837371, 'reg_alpha': 1.272668153864378, 'reg_lambda': 6.1457755985198}. Best is trial 20 with value: 0.609735019006661.


Best trial: 20. Best value: 0.609735:  85%|████████▌ | 34/40 [02:03<00:26,  4.36s/it]

[I 2025-11-02 14:59:32,717] Trial 33 finished with value: 0.6072216148093054 and parameters: {'max_depth': 7, 'learning_rate': 0.03309411110717471, 'n_estimators': 744, 'subsample': 0.7012761459844228, 'colsample_bytree': 0.8873614208407535, 'scale_pos_weight': 11, 'min_child_weight': 3, 'gamma': 0.06284143673209246, 'reg_alpha': 1.3313612015050054, 'reg_lambda': 7.65899304782117}. Best is trial 20 with value: 0.609735019006661.


Best trial: 34. Best value: 0.612534:  88%|████████▊ | 35/40 [02:07<00:21,  4.24s/it]

[I 2025-11-02 14:59:36,655] Trial 34 finished with value: 0.6125336125053799 and parameters: {'max_depth': 7, 'learning_rate': 0.04302079441911812, 'n_estimators': 663, 'subsample': 0.7368038358606985, 'colsample_bytree': 0.8536161518163243, 'scale_pos_weight': 12, 'min_child_weight': 4, 'gamma': 0.062000706600374725, 'reg_alpha': 1.2122729646581127, 'reg_lambda': 7.036593954451487}. Best is trial 34 with value: 0.6125336125053799.


Best trial: 34. Best value: 0.612534:  90%|█████████ | 36/40 [02:11<00:15,  3.94s/it]

[I 2025-11-02 14:59:39,901] Trial 35 finished with value: 0.6063452274763026 and parameters: {'max_depth': 7, 'learning_rate': 0.04364504768983369, 'n_estimators': 521, 'subsample': 0.7430142690276451, 'colsample_bytree': 0.8485008730082928, 'scale_pos_weight': 13, 'min_child_weight': 3, 'gamma': 0.21632130756818124, 'reg_alpha': 1.0626208842185048, 'reg_lambda': 7.14238669495872}. Best is trial 34 with value: 0.6125336125053799.


Best trial: 34. Best value: 0.612534:  92%|█████████▎| 37/40 [02:13<00:09,  3.33s/it]

[I 2025-11-02 14:59:41,800] Trial 36 finished with value: 0.5920445880252614 and parameters: {'max_depth': 4, 'learning_rate': 0.04129298533350287, 'n_estimators': 659, 'subsample': 0.7642601264519884, 'colsample_bytree': 0.8336267255714631, 'scale_pos_weight': 14, 'min_child_weight': 4, 'gamma': 0.07401796696188394, 'reg_alpha': 1.208041485780766, 'reg_lambda': 7.538753056992475}. Best is trial 34 with value: 0.6125336125053799.


Best trial: 34. Best value: 0.612534:  95%|█████████▌| 38/40 [02:16<00:06,  3.29s/it]

[I 2025-11-02 14:59:45,009] Trial 37 finished with value: 0.6041920770424677 and parameters: {'max_depth': 6, 'learning_rate': 0.03231896848916329, 'n_estimators': 694, 'subsample': 0.7506815720455653, 'colsample_bytree': 0.8606713869751514, 'scale_pos_weight': 12, 'min_child_weight': 4, 'gamma': 0.1589079254631877, 'reg_alpha': 1.3950270183034, 'reg_lambda': 6.999213974504679}. Best is trial 34 with value: 0.6125336125053799.


Best trial: 34. Best value: 0.612534:  98%|█████████▊| 39/40 [02:19<00:03,  3.35s/it]

[I 2025-11-02 14:59:48,496] Trial 38 finished with value: 0.6007521612544894 and parameters: {'max_depth': 7, 'learning_rate': 0.04504969323197328, 'n_estimators': 578, 'subsample': 0.7149862370123943, 'colsample_bytree': 0.8492981012197636, 'scale_pos_weight': 15, 'min_child_weight': 3, 'gamma': 0.08208402729772607, 'reg_alpha': 0.8730906463951471, 'reg_lambda': 7.427243399622747}. Best is trial 34 with value: 0.6125336125053799.


Best trial: 34. Best value: 0.612534: 100%|██████████| 40/40 [02:23<00:00,  3.59s/it]

[I 2025-11-02 14:59:52,453] Trial 39 finished with value: 0.6068843229180942 and parameters: {'max_depth': 7, 'learning_rate': 0.0695766453338071, 'n_estimators': 658, 'subsample': 0.744914668663445, 'colsample_bytree': 0.8009817153731591, 'scale_pos_weight': 11, 'min_child_weight': 2, 'gamma': 0.12446886092628372, 'reg_alpha': 1.0764585928418406, 'reg_lambda': 6.234701277345773}. Best is trial 34 with value: 0.6125336125053799.
Best params: {'max_depth': 7, 'learning_rate': 0.04302079441911812, 'n_estimators': 663, 'subsample': 0.7368038358606985, 'colsample_bytree': 0.8536161518163243, 'scale_pos_weight': 12, 'min_child_weight': 4, 'gamma': 0.062000706600374725, 'reg_alpha': 1.2122729646581127, 'reg_lambda': 7.036593954451487}
AUPRC: 0.4793, AUROC: 0.9234





In [11]:
X_final = np.concatenate([X_train]) ##previously we include validation data, but now we remove it to avoid overfitting
y_final = np.concatenate([y_train])
ids_final = ids_train
models, oof_pred, _ = train_final_groupkfold_with_oof(
    X_final, y_final, ids_final, best_params, n_splits=5
)

Fold 1: AUPRC=0.4984, AUROC=0.9261
Fold 2: AUPRC=0.4677, AUROC=0.9245
Fold 3: AUPRC=0.4896, AUROC=0.9328
Fold 4: AUPRC=0.4995, AUROC=0.9247
Fold 5: AUPRC=0.4924, AUROC=0.9202
OOF AUPRC=0.4878, OOF AUROC=0.9256


In [12]:
y_pred = predict_ensemble(models, X_test)
auprc = average_precision_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred)
print(f"Final Test AUPRC={auprc:.4f}, AUROC={auroc:.4f}")


Final Test AUPRC=0.5312, AUROC=0.9333


In [13]:
with open("models/best_params.json", "w") as f:
    json.dump(best_params, f, indent=4)

In [14]:
Path("models/final").mkdir(parents=True, exist_ok=True)
for i, model in enumerate(models):
    joblib.dump(model, f"models/final/xgb_fold{i+1}.pkl")

from datetime import datetime
import json
from pathlib import Path

meta = {
    "feature_names": FEATURE_NAMES,
    "threshold": 0.5,
    "model_type": "xgb_ensemble_5fold",
    "n_folds": len(models),
    "fold_models": [f"xgb_fold{i+1}.pkl" for i in range(len(models))],
    "best_params": best_params,              # useful provenance
    "created_at": datetime.utcnow().isoformat() + "Z"
}

Path("models").mkdir(parents=True, exist_ok=True)
with open("models/metadata.json", "w") as f:
    json.dump(meta, f, indent=2)
print("Wrote models/metadata.json")


Wrote models/metadata.json


In [15]:
## sanity check before proceeding
def predict_ensemble_dataset(models, data_path, label_dict=None, transcript_to_gene=None, output_csv="predictions/pred_dataset0.csv"):
    preds = np.array([m.predict_proba(X)[:,1] for m in models])
    y_pred = preds.mean(axis=0)  # mean ensemble
    print(y_pred.shape)

    pred_df = pd.DataFrame({
        "gene_id": [gene for gene, tid, pos in ids],
        "transcript_id": [tid for gene, tid, pos in ids],
        "transcript_position": [pos for gene, tid, pos in ids],
        "score": y_pred
    })
    Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
    pred_df.to_csv(output_csv, index=False)
    print(f"Saved predictions to {output_csv}")
    
    if label_dict:
        mask = ~pd.isna(y)
        if mask.sum() > 0:
            y_true = y[mask].astype(int)
            y_pred_masked = y_pred[mask]
            auprc = average_precision_score(y_true, y_pred_masked)
            auroc = roc_auc_score(y_true, y_pred_masked)
            print(f"len before masking: {len(y)}; len after masking: {len(y_true)}")
            print(f"[Sanity check on labeled data] AUPRC={auprc:.4f}, AUROC={auroc:.4f}")

    return pred_df


In [27]:
data_path = "data_task1/dataset_0.json"
pred_df = predict_ensemble_dataset(models, data_path, label_dict, transcript_to_gene)

(121838,)
Saved predictions to predictions/pred_dataset0.csv
len before masking: 121838; len after masking: 121838
[Sanity check on labeled data] AUPRC=0.8648, AUROC=0.9802


In [28]:
X.shape

(121838, 73)