In [1]:
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
Installing collected packages: autograd
Successfully installed autograd-1.7.0
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l- \ done
[?25h  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4030 sha256=ce938ee45a7a9eb02df5aaf8ee585970a29209324b8228d1cc5809014d79de2e
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-meta
Successfully installed interface-meta-1.3.0


# Libraries 

In [2]:
import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings('ignore')

# Load Data

In [3]:
train_data = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
test_data  = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')

# Combine Target for Survival Analysis

In [4]:

from lifelines import KaplanMeierFitter

# Function to calculate Kaplan-Meier survival probabilities
def calculate_survival_probabilities(df, time_col, event_col):
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    return kmf.survival_function_at_times(df[time_col]).values

# Preprocess the dataset
def preprocess_survival_data(df, time_col='efs_time', event_col='efs'):
    df['target'] = calculate_survival_probabilities(df, time_col, event_col)
    df.loc[df[event_col] == 0, 'target'] -= 0.2  # Adjust for censored data
    # df['efs_time2'] = df[time_col].where(df[event_col] == 1, -df[time_col])  # Negative for censored
    return df

# Apply preprocessing
df = preprocess_survival_data(train_data)

In [5]:
# Add an indicator column
train_data['Dataset'] = 'train'
test_data['Dataset'] = 'test'

# Concatenate train and test
df = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)

# 1. Exploratory Data Analysis (EDA)

## 1.1: Load and Understand the Dataset

In [6]:
df

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time,target,Dataset
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356,0.258687,train
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672,0.847759,train
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,...,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793,0.262424,train
3,3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349,0.256661,train
4,4,High,No,,No,2.0,8.0,No TBI,No,6.0,...,MEL,8.0,No,2.0,No,10.0,0.0,16.223,0.264674,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28798,28798,N/A - non-malignant indication,No,Poor,No,1.0,4.0,No TBI,No,3.0,...,MEL,4.0,No,1.0,No,5.0,0.0,52.351,0.258404,train
28799,28799,N/A - pediatric,No,,No,2.0,8.0,No TBI,No,6.0,...,MEL,8.0,No,2.0,Yes,10.0,0.0,25.158,0.260616,train
28800,28800,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,"N/A, Mel not given",8.0,No,2.0,No,10.0,,,,test
28801,28801,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,,,,test


In [7]:
df.isnull().sum()

ID                   0
dri_score          154
psych_disturb     2062
cyto_score        8070
diabetes          2119
                  ... 
hla_low_res_10    5064
efs                  3
efs_time             3
target               3
Dataset              0
Length: 62, dtype: int64

In [8]:
df.info

<bound method DataFrame.info of           ID                       dri_score psych_disturb    cyto_score  \
0          0  N/A - non-malignant indication            No           NaN   
1          1                    Intermediate            No  Intermediate   
2          2  N/A - non-malignant indication            No           NaN   
3          3                            High            No  Intermediate   
4          4                            High            No           NaN   
...      ...                             ...           ...           ...   
28798  28798  N/A - non-malignant indication            No          Poor   
28799  28799                 N/A - pediatric            No           NaN   
28800  28800  N/A - non-malignant indication            No           NaN   
28801  28801                    Intermediate            No  Intermediate   
28802  28802  N/A - non-malignant indication            No           NaN   

      diabetes  hla_match_c_high  hla_high_res_8       

In [9]:
df.describe()

Unnamed: 0,ID,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,...,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,efs,efs_time,target
count,28803.0,24182.0,22973.0,25533.0,23519.0,21639.0,23604.0,24606.0,26003.0,26160.0,...,26413.0,24715.0,28326.0,27933.0,25150.0,25451.0,23739.0,28800.0,28800.0,28800.0
mean,14401.0,1.764536,6.876899,5.143422,5.109316,8.617358,1.736909,5.160449,1.757836,1.715329,...,1.709121,1.699656,1.702252,83.832743,6.903579,1.707163,8.664855,0.539306,23.237678,0.51405
std,8314.854238,0.431929,1.56428,1.207722,1.214126,1.905083,0.447668,1.203202,0.435436,0.451266,...,0.458243,0.465164,1.994403,11.028433,1.56497,0.461163,1.882687,0.498461,24.799748,0.260578
min,0.0,0.0,2.0,2.0,0.0,3.0,0.0,2.0,0.0,1.0,...,0.0,0.0,0.0,40.0,2.0,0.0,4.0,0.0,0.333,0.252727
25%,7200.5,2.0,6.0,4.0,4.0,7.0,1.0,4.0,2.0,1.0,...,1.0,1.0,0.0,70.0,6.0,1.0,7.0,0.0,5.61975,0.259063
50%,14401.0,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,...,2.0,2.0,1.0,90.0,8.0,2.0,10.0,1.0,9.7965,0.500025
75%,21601.5,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,...,2.0,2.0,2.0,90.0,8.0,2.0,10.0,1.0,35.1,0.749937
max,28802.0,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,...,2.0,2.0,10.0,100.0,8.0,2.0,10.0,1.0,156.819,0.999965


# 2: Data Preprocessing 

## 2.1: Explore Missing Values

In [10]:
pd.set_option('display.max_columns', None)
df.head()  # Display the first few rows with all columns

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,vent_hist,renal_issue,pulm_severe,prim_disease_hct,hla_high_res_6,cmv_status,hla_high_res_10,hla_match_dqb1_high,tce_imm_match,hla_nmdp_6,hla_match_c_low,rituximab,hla_match_drb1_low,hla_match_dqb1_low,prod_type,cyto_score_detail,conditioning_intensity,ethnicity,year_hct,obesity,mrd_hct,in_vivo_tcd,tce_match,hla_match_a_high,hepatic_severe,donor_age,prior_tumor,hla_match_b_low,peptic_ulcer,age_at_hct,hla_match_a_low,gvhd_proph,rheum_issue,sex_match,hla_match_b_high,race_group,comorbidity_score,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time,target,Dataset
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,Bone marrow,No,No,No,IEA,6.0,+/+,,2.0,,6.0,2.0,No,2.0,2.0,BM,,,Not Hispanic or Latino,2016,No,,Yes,,2.0,No,,No,2.0,No,9.942,2.0,FKalone,No,M-F,2.0,More than one race,0.0,90.0,No,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356,0.258687,train
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,No,No,No,AML,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,PB,Intermediate,MAC,Not Hispanic or Latino,2008,No,Positive,No,Permissive,2.0,No,72.29,No,2.0,No,43.705,2.0,Other GVHD Prophylaxis,No,F-F,2.0,Asian,3.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672,0.847759,train
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,HIS,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,,,Not Hispanic or Latino,2019,No,,Yes,,2.0,No,,No,2.0,No,33.997,2.0,Cyclophosphamide alone,No,F-M,2.0,More than one race,0.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793,0.262424,train
3,3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,ALL,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,Intermediate,MAC,Not Hispanic or Latino,2009,No,Positive,No,Permissive,2.0,No,29.23,No,2.0,No,43.245,2.0,FK+ MMF +- others,No,M-M,2.0,White,0.0,90.0,Yes,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349,0.256661,train
4,4,High,No,,No,2.0,8.0,No TBI,No,6.0,Peripheral blood,No,No,No,MPN,6.0,+/+,10.0,2.0,,5.0,2.0,No,2.0,2.0,PB,,MAC,Hispanic or Latino,2018,No,,Yes,,2.0,No,56.81,No,2.0,No,29.74,2.0,TDEPLETION +- other,No,M-F,2.0,American Indian or Alaska Native,1.0,90.0,No,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223,0.264674,train


In [11]:
# Separate numerical and categorical columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Display results
print("Numerical Columns:")
print(numerical_columns)

print("\nCategorical Columns:")
print(categorical_columns)

Numerical Columns:
Index(['ID', 'hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6',
       'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high',
       'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age',
       'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high',
       'comorbidity_score', 'karnofsky_score', 'hla_low_res_8',
       'hla_match_drb1_high', 'hla_low_res_10', 'efs', 'efs_time', 'target'],
      dtype='object')

Categorical Columns:
Index(['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
       'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
       'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab',
       'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
       'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
       'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue

In [12]:
# # Handling missing values for numerical columns
# for column in numerical_columns:
#     # Replace missing values in numerical columns with the mean
#     # df[column].fillna(df[column].mean(), inplace=True)
#     # df[column].fillna(df[column].mode()[0], inplace=True)
#     df[column].fillna(0, inplace=True)  # Replace 0 with any desired constant
#     # df[column].fillna(method='ffill', inplace=True)
#     # df[column].interpolate(method='nearest', inplace=True)

# Handling missing values for categorical columns
for column in categorical_columns:
    # Replace missing values in categorical columns with 'unknown'
    df[column].fillna('unknown', inplace=True)

# 3.Machine Learning 

## Split Combined Data into Train and Test Sets

In [13]:
# Split the concatenated dataset back into the original train and test datasets
# Retain relevant columns while dropping unnecessary ones
train_data = df[df['Dataset'] == 'train'].drop(columns=['Dataset', 'ID'])  # Train set without metadata
test_data = df[df['Dataset'] == 'test'].drop(columns=['Dataset', 'efs', 'efs_time', 'target'])  # Test set without target-related columns

# Separate features (X) and target (y) for the training set
X = train_data.drop(columns=['efs', 'efs_time', 'target'])  # Feature columns for training
y = train_data[['target']]  # Target variable for training

## Catboost

In [14]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from lifelines.utils import concordance_index  # Ensure lifelines is installed: pip install lifelines

# Initialize K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store fold-specific results
fold_scores = []  # RMSE for each fold
fold_c_indices = []  # C-index for each fold

# Perform cross-validation
for train_index, val_index in kf.split(X):
    # Split the dataset into training and validation sets for the current fold
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Identify categorical features for CatBoost
    cat_features = list(X.select_dtypes(include=['object', 'category']).columns)
    
    # Create CatBoost Pool objects for training and validation
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)
    
    # Initialize the CatBoost Regressor
    model = CatBoostRegressor(
        iterations=1000,            # Number of boosting iterations
        learning_rate=0.05,         # Learning rate for gradient boosting
        depth=6,                    # Depth of the tree
        l2_leaf_reg=3,              # Regularization parameter
        loss_function='RMSE',       # Loss function (Root Mean Squared Error)
        random_seed=42,             # Set random seed for reproducibility
        verbose=100                 # Output training progress every 100 iterations
    )

    # Train the model with early stopping
    model.fit(
        train_pool,
        eval_set=val_pool,
        verbose=50,                 # Output evaluation results every 50 iterations
        early_stopping_rounds=100   # Stop if no improvement after 100 iterations
    )

    # Make predictions on the validation set
    y_pred = model.predict(X_val)
    
    # Evaluate RMSE (Root Mean Squared Error)
    fold_score = mean_squared_error(y_val, y_pred, squared=False)
    fold_scores.append(fold_score)

    # Calculate Concordance Index (C-index) for survival analysis
    c_index = concordance_index(y_val, y_pred)
    fold_c_indices.append(c_index)

    # Print metrics for the current fold
    print(f"Fold RMSE: {fold_score}")
    print(f"Fold C-index: {c_index}")

# Summary of cross-validation results
print(f"Mean RMSE: {np.mean(fold_scores)}")
print(f"Standard Deviation of RMSE: {np.std(fold_scores)}")
print(f"Mean C-index: {np.mean(fold_c_indices)}")
print(f"Standard Deviation of C-index: {np.std(fold_c_indices)}")

0:	learn: 0.2591116	test: 0.2587041	best: 0.2587041 (0)	total: 136ms	remaining: 2m 15s
50:	learn: 0.2384332	test: 0.2391574	best: 0.2391574 (50)	total: 2.95s	remaining: 54.9s
100:	learn: 0.2348140	test: 0.2365617	best: 0.2365617 (100)	total: 5.7s	remaining: 50.8s
150:	learn: 0.2325142	test: 0.2353738	best: 0.2353738 (150)	total: 8.51s	remaining: 47.9s
200:	learn: 0.2306488	test: 0.2344964	best: 0.2344964 (200)	total: 11.4s	remaining: 45.3s
250:	learn: 0.2287282	test: 0.2337313	best: 0.2337313 (250)	total: 14.3s	remaining: 42.7s
300:	learn: 0.2269494	test: 0.2332598	best: 0.2332545 (299)	total: 17.3s	remaining: 40.1s
350:	learn: 0.2256943	test: 0.2329264	best: 0.2329264 (350)	total: 20.2s	remaining: 37.3s
400:	learn: 0.2245738	test: 0.2326835	best: 0.2326835 (400)	total: 23.4s	remaining: 34.9s
450:	learn: 0.2235912	test: 0.2325350	best: 0.2325192 (447)	total: 26.3s	remaining: 32s
500:	learn: 0.2226048	test: 0.2323138	best: 0.2323053 (498)	total: 29.2s	remaining: 29.1s
550:	learn: 0.2217

# Submission

In [15]:
# Preprocess test data
test_features = test_data.drop(columns=['ID'], errors='ignore')  # Drop unnecessary columns

In [16]:
# Predict using the trained CatBoost model
test_data['prediction'] = model.predict(test_features)

In [17]:
test_data['prediction']

28800    0.331219
28801    0.617860
28802    0.254847
Name: prediction, dtype: float64

In [18]:
# Create submission file
submission = test_data[['ID', 'prediction']]  # Include 'id' and the predicted target column
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")

Submission file created: submission.csv
