In [None]:
!pip install scikit-survival pandas scikit-learn xgbse

In [None]:
import pandas as pd
import numpy as np
from sksurv.util import Surv
from sksurv.ensemble import RandomSurvivalForest
from xgbse import XGBSEDebiasedBCE
from xgbse.converters import convert_to_structured
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Read the training and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/equity-post-HCT-survival-predictions/train.csv",
        "/kaggle/input/equity-post-HCT-survival-predictions/test.csv",
    )
)

In [None]:
# Columns excluded from the model inputs: the event flag and event time
# (together they form the survival target) and the `ID` column.
columns_to_drop = ["efs", "efs_time", "ID"]

X_train = train_df.drop(columns=columns_to_drop)
# Structured (event, time) target in the format scikit-survival estimators take.
y_train = Surv.from_dataframe("efs", "efs_time", train_df)

# Select every numeric dtype, not only the 64-bit ones: a column that is not
# listed in either group would be silently dropped by the ColumnTransformer.
num_features = X_train.select_dtypes(include="number").columns.tolist()
cat_features = X_train.select_dtypes(include=["object"]).columns.tolist()

preprocessor = ColumnTransformer([
    # Numeric columns: fill gaps with the median, then standardise.
    ("num", Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), num_features),
    # Categorical columns: treat missing values as their own "Missing" level,
    # then one-hot encode with rare levels (< 5% of rows) grouped together.
    ("cat", Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("encoder", OneHotEncoder(
            drop="first",
            min_frequency=0.05,
            sparse_output=False,
            # With `drop="first"`, "ignore" would encode an unseen category
            # as all zeros, i.e. indistinguishable from the dropped reference
            # level. Route unseen categories to the infrequent bucket instead
            # whenever the feature has one.
            handle_unknown="infrequent_if_exist"
        ))
    ]), cat_features)
])

In [None]:
# Fit the preprocessing steps on the training features and encode them.
X_train_processed = preprocessor.fit_transform(X_train)

# Target for the xgbse model, built from the event times and event flags.
y_train_xgb = convert_to_structured(
    train_df.loc[:, "efs_time"],
    train_df.loc[:, "efs"],
)

In [None]:
import pandas as pd
from sksurv.util import Surv
from sksurv.ensemble import RandomSurvivalForest

# Random survival forest with a fixed seed, using all available cores.
rsf = RandomSurvivalForest(
    random_state=42,
    n_jobs=-1,
    n_estimators=100,
    max_depth=5,
    max_samples=0.5,
)

rsf.fit(X_train_processed, y_train)

# Score the fitted forest separately on the rows of each race group.
# NOTE(review): these scores are computed on the data the forest was trained on.
race_groups = train_df["race_group"]
c_indices = dict.fromkeys(race_groups.unique())
for race in c_indices:
    mask = race_groups == race
    c_indices[race] = rsf.score(X_train_processed[mask], y_train[mask])
print("RSF C-indices:", c_indices)

In [None]:
from xgbse import XGBSEDebiasedBCE

# Parameters forwarded to the native XGBoost booster that xgbse trains.
# The number of boosting rounds is not a booster parameter (the scikit-learn
# style `n_estimators` key is not used by the native API); it is passed to
# `fit` as `num_boost_round` below. `seed` is the native name of the RNG seed.
xgb_params = {
    "objective": "survival:cox",
    "tree_method": "hist",
    "seed": 42,
}

xgb_model = XGBSEDebiasedBCE(xgb_params=xgb_params)

xgb_model.fit(
    X_train_processed,
    y_train_xgb,
    num_boost_round=100,
)

In [None]:

from xgbse.metrics import concordance_index

# Score of the random survival forest on the full training set.
# NOTE(review): this is an in-sample score, not a held-out estimate.
rsf_score = rsf.score(X_train_processed, y_train)
print(f"RSF C-index: {rsf_score:.3f}")

# Concordance of the xgbse model's predictions on the same training rows.
# NOTE(review): despite the name, `risk_scores` holds whatever
# `xgb_model.predict` returns — presumably survival curves; confirm.
risk_scores = xgb_model.predict(X_train_processed)
xgb_score = concordance_index(y_train_xgb, risk_scores)
print(f"XGBoost C-index: {xgb_score:.3f}")

In [None]:
# Per-race-group score of the random survival forest on the training rows,
# keyed by race group in order of first appearance.
race_groups = train_df["race_group"]
c_indices = {
    race: rsf.score(X_train_processed[mask], y_train[mask])
    for race, mask in ((r, race_groups == r) for r in race_groups.unique())
}

print("C-indices per race:", c_indices)

In [None]:
# Apply the already-fitted preprocessing to the test features
# (everything except the `ID` column, if present).
X_test = test_df.drop(columns=["ID"], errors="ignore")
X_test_processed = preprocessor.transform(X_test)

# Predictions of the random survival forest for the test rows.
test_risk_scores = rsf.predict(X_test_processed)

# Two-column submission file: the test IDs next to their predictions.
submission = test_df[["ID"]].assign(prediction=test_risk_scores)
submission.to_csv("submission.csv", index=False)