**Competition Name:**  
*CIBMTR - Equity in post-HCT Survival Predictions*

**Objective:**  
The competition seeks to improve the prediction of transplant survival rates for allogeneic hematopoietic stem cell transplantation (HCT) patients. The emphasis is on generating predictions that are not only accurate but also equitable across diverse racial and demographic groups.

# Training Code

**Libraries and Load Train Data**

In [7]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from lightgbm import log_evaluation
from sklearn.model_selection import StratifiedKFold
from lifelines import NelsonAalenFitter
from lifelines.utils import concordance_index
import joblib
import os

# Load Data
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

# Ensure model directory exists
MODEL_DIR = "/content/models"
os.makedirs(MODEL_DIR, exist_ok=True)

**Feature Engineering**

In [8]:
# Feature Engineering
def add_features(df):
    df['donor_age_hct_diff'] = df['donor_age'] - df['age_at_hct']
    df['comorbidity_karnofsky_ratio'] = df['comorbidity_score'] / (df['karnofsky_score'] + 1)
    if 'efs_time' in df.columns:
        df['efs_time_log'] = np.log1p(df['efs_time'])
    df['year_hct_adjusted'] = df['year_hct'] - 2000
    df['is_cyto_score_same'] = (df['cyto_score'] == df['cyto_score_detail']).astype(int)
    return df

train = add_features(train)
test = add_features(test)

**Encode Categorical Features**

In [9]:
# Feature Selection
RMV = ["ID", "efs", "efs_time", "y", "efs_time_log"]
FEATURES = [c for c in train.columns if c not in RMV]

# Encode categorical features
for col in train.select_dtypes(include=['object', 'category']).columns:
    train[col] = train[col].astype('category').cat.codes
    test[col] = test[col].astype('category').cat.codes

**Nelson-Aalen Target Transformation**

In [10]:
# Nelson-Aalen Target Transformation
def create_nelson(data):
    naf = NelsonAalenFitter(nelson_aalen_smoothing=0)
    naf.fit(durations=data['efs_time'], event_observed=data['efs'])
    return naf.cumulative_hazard_at_times(data['efs_time']).values * -1

train["y_nel"] = create_nelson(train)
train.loc[train.efs == 0, "y_nel"] = (-(-train.loc[train.efs == 0, "y_nel"])**0.5)

**Pairwise Logit Transform**

In [11]:
# Pairwise Logit Transform
def logit_transform(y, eps=2e-2, eps_mul=1.1):
    y = (y - y.min() + eps) / (y.max() - y.min() + eps_mul * eps)
    return np.log(y / (1 - y))

train["y_transformed"] = logit_transform(train["y_nel"])

**Stratified K-Folds**

In [12]:
# Stratified K-Folds
FOLDS = 15
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
train["fold"] = -1
for fold, (_, val_idx) in enumerate(skf.split(train, train["race_group"])):
    train.loc[val_idx, "fold"] = fold

**LGB MODEL PARAMETERS**

In [13]:
# Model Parameters
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.04,
    "max_depth": 9,
    "num_leaves": 64,
    "subsample": 0.6,
    "colsample_bytree": 0.6,
    "random_state": 42
}

**XGB MODEL PARAMETERS**

In [14]:
xgb_params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.02,
    "max_depth": 6,
    "colsample_bytree": 0.6,
    "subsample": 0.8,
    "n_estimators": 6000,
    "min_child_weight": 3,
    "early_stopping_rounds": 500,
    "random_state": 42,
    "gamma": 0.2,
    "reg_alpha": 0.1,
    "reg_lambda": 0.8
}

**CATBOOST MODEL PARAMETERS**

In [15]:
cat_params = {
    "loss_function": "RMSE",
    "learning_rate": 0.045,
    "depth": 8,
    "iterations": 5000,
    "random_strength": 0.2,
    "l2_leaf_reg": 5,
    "bagging_temperature": 0.6,
    "random_seed": 42
}

**Train & Save Models**

In [16]:
# Train & Save Models
oof_preds = np.zeros(len(train))

for fold in range(FOLDS):
    print(f"Training Fold {fold+1}")
    x_train = train.loc[train.fold != fold, FEATURES]
    y_train = train.loc[train.fold != fold, "y_transformed"]
    x_valid = train.loc[train.fold == fold, FEATURES]
    y_valid = train.loc[train.fold == fold, "y_transformed"]

    # XGBoost
    model_xgb = xgb.XGBRegressor(**xgb_params)
    model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=500)
    joblib.dump(model_xgb, f"{MODEL_DIR}/xgb_fold{fold}.pkl")
    oof_preds[train.index[train.fold == fold]] += model_xgb.predict(x_valid) * 0.4

    # LightGBM
    model_lgb = lgb.LGBMRegressor(**lgb_params)
    model_lgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], callbacks=[log_evaluation(500)])
    joblib.dump(model_lgb, f"{MODEL_DIR}/lgb_fold{fold}.pkl")
    oof_preds[train.index[train.fold == fold]] += model_lgb.predict(x_valid) * 0.4

    # CatBoost
    model_cat = cb.CatBoostRegressor(**cat_params, verbose=500)
    model_cat.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=500)
    model_cat.save_model(f"{MODEL_DIR}/cat_fold{fold}.cbm")
    oof_preds[train.index[train.fold == fold]] += model_cat.predict(x_valid) * 0.2

Training Fold 1
[0]	validation_0-rmse:2.33474
[500]	validation_0-rmse:2.08393
[1000]	validation_0-rmse:2.07002
[1500]	validation_0-rmse:2.07291
[1629]	validation_0-rmse:2.07526
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1158
[LightGBM] [Info] Number of data points in the train set: 26880, number of used features: 61
[LightGBM] [Info] Start training from score -1.154759
0:	learn: 2.3321653	test: 2.3278996	best: 2.3278996 (0)	total: 65.5ms	remaining: 5m 27s
500:	learn: 1.8324950	test: 2.0759756	best: 2.0759270 (499)	total: 8.47s	remaining: 1m 16s
1000:	learn: 1.6255424	test: 2.0773012	best: 2.0753050 (534)	total: 17s	remaining: 1m 7s
1500:	learn: 1.4671445	test: 2.0830741	best: 2.0753050 (534)	total: 24s	remaining: 56s
2000:	learn: 1.3363716	test: 2.0877617	best: 2.0753050 (534)	

**Compute C-index**

In [17]:
# Compute C-index
c_index = concordance_index(train["efs_time"], -oof_preds, train["efs"])
print(f"\nOverall CV C-Index: {c_index:.4f}")


Overall CV C-Index: 0.6853
