In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the data: the two preprocessed train/test frames plus the competition's
# data dictionary and sample submission, all located under INPUT_DIR.
INPUT_DIR = "xxx"
df_train, df_test, data_dic, sample_sub = (
    pd.read_csv(INPUT_DIR + relative_path)
    for relative_path in (
        "input/02/df_train_02.csv",
        "input/02/df_test_02.csv",
        "Equity in Post-HCT Survival Predictions/data_dictionary.csv",  # data dictionary
        "Equity in Post-HCT Survival Predictions/sample_submission.csv",
    )
)
df_train.head()

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,target
0,N/A - non-malignant indication,No,Missing,No,-1.0,-1.0,No TBI,No,6.0,Bone marrow,...,No,Missing,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.458687
1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,...,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,0.847759
2,N/A - non-malignant indication,No,Missing,No,2.0,8.0,No TBI,No,6.0,Bone marrow,...,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.462424
3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,Bone marrow,...,Yes,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.456661
4,High,No,Missing,No,2.0,8.0,No TBI,No,6.0,Peripheral blood,...,No,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.464674


In [3]:
# Custom function to compute the Stratified Concordance Index (C-index)
def stratified_c_index(y_true, y_pred, groups):
    """Return mean minus standard deviation of the per-group concordance index.

    Within every group that has at least two samples, each unordered pair whose
    true values differ is "permissible". A permissible pair is concordant when
    the two predictions are strictly ordered the same way as the true values.
    Tied predictions therefore earn no credit (they are NOT scored 0.5).
    A group without any permissible pair contributes a score of 0.

    Parameters
    ----------
    y_true : array-like
        True target values, one per sample.
    y_pred : array-like
        Predicted values, aligned positionally with ``y_true``.
    groups : array-like
        Group label of each sample, aligned positionally with ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(c) - std(c)`` over the per-group scores ``c``; NaN when no
        group contains at least two samples (including empty input).
    """
    # Coerce to plain arrays so indexing is positional: lists and pandas Series
    # with a non-default index would otherwise fail or be read by label.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    groups = np.asarray(groups)

    c_indices = []
    for group in np.unique(groups):
        mask = groups == group
        if np.count_nonzero(mask) < 2:
            continue  # a single sample forms no pair

        true_g = y_true[mask]
        pred_g = y_pred[mask]
        concordant = 0
        permissible = 0

        # Compare sample i against every later sample j > i in one vectorised
        # step; this visits each unordered pair exactly once.
        for i in range(len(true_g) - 1):
            rest_true = true_g[i + 1:]
            rest_pred = pred_g[i + 1:]

            permissible += int(np.count_nonzero(true_g[i] != rest_true))

            same_order = (
                ((pred_g[i] > rest_pred) & (true_g[i] > rest_true))
                | ((pred_g[i] < rest_pred) & (true_g[i] < rest_true))
            )
            concordant += int(np.count_nonzero(same_order))

        c_indices.append(concordant / permissible if permissible > 0 else 0)

    if not c_indices:
        # Nothing to average; return NaN explicitly instead of relying on
        # np.mean([]) and its RuntimeWarning.
        return np.float64("nan")

    c_indices = np.array(c_indices)
    return np.mean(c_indices) - np.std(c_indices)

In [4]:
# Split the training frame into the regression target, the grouping column used
# for stratification and scoring, and the predictor matrix (every column except
# 'target'; 'race_group' therefore remains a feature as well).
# NOTE(review): the df_train.head() output lists an 'Unnamed: 0' column, which
# looks like a row index saved into the CSV; it stays in X as a feature here -
# confirm that this is intended.
y = df_train['target']
race_groups = df_train['race_group']
X = df_train.drop(columns='target')

In [5]:
# Names of the object/category-typed columns of X, passed to CatBoost as
# categorical features.
categorical_frame = X.select_dtypes(include=['object', 'category'])
cat_features = categorical_frame.columns.tolist()

In [6]:
# 5-fold cross-validation; the folds are stratified on race_groups.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Out-of-fold predictions: each row is filled by the fold that held it out.
final_predictions = np.zeros(len(X))

# Hyperparameters shared by the model trained in every fold.
catboost_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    loss_function='RMSE',
    random_seed=42,
    verbose=100,
    train_dir=None,
    # task_type='GPU',
)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, race_groups)):
    print(f"Fold {fold + 1}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    race_val = race_groups.iloc[val_idx]

    # Pool is CatBoost's own dataset container.
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

    # Fit on the training fold; the validation fold is the eval set and
    # training stops after 50 rounds without improvement on it.
    model = CatBoostRegressor(**catboost_params)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)

    # Predict the held-out rows and store them as out-of-fold predictions.
    y_val_pred = model.predict(X_val)
    final_predictions[val_idx] = y_val_pred

    # Score this fold with the custom metric.
    fold_score = stratified_c_index(y_val.values, y_val_pred, race_val.values)
    print(f"Stratified C-Index for Fold {fold + 1}: {fold_score}")

# Metric over all out-of-fold predictions.
overall_score = stratified_c_index(y.values, final_predictions, race_groups.values)
print(f"Overall Stratified C-Index: {overall_score}")

Fold 1
0:	learn: 0.1754974	test: 0.1768809	best: 0.1768809 (0)	total: 97.7ms	remaining: 1m 37s
100:	learn: 0.1587172	test: 0.1619467	best: 0.1619467 (100)	total: 3.15s	remaining: 28.1s
200:	learn: 0.1551626	test: 0.1600619	best: 0.1600619 (200)	total: 6.12s	remaining: 24.3s
300:	learn: 0.1522975	test: 0.1589364	best: 0.1589364 (300)	total: 9.1s	remaining: 21.1s
400:	learn: 0.1505312	test: 0.1584711	best: 0.1584651 (396)	total: 12.3s	remaining: 18.4s
500:	learn: 0.1490942	test: 0.1581192	best: 0.1581192 (500)	total: 15.3s	remaining: 15.2s
600:	learn: 0.1477940	test: 0.1577767	best: 0.1577767 (600)	total: 18.2s	remaining: 12.1s
700:	learn: 0.1465929	test: 0.1575668	best: 0.1575657 (697)	total: 21.4s	remaining: 9.11s
800:	learn: 0.1455752	test: 0.1575021	best: 0.1574783 (762)	total: 24.3s	remaining: 6.03s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.157478254
bestIteration = 762

Shrink model to first 763 iterations.
Stratified C-Index for Fold 1: 0.6391847687820773