In [2]:
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl
!pip install lifelines

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
autograd is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4031 sha256=63ac9c097a53eb859d84297beabd0adb9074afd2aa3fa016728e5d82bffb5bac
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-meta
Success

In [3]:
# Cell 1: Import and Load Data
import pandas as pd
import numpy as np

train_data = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
test_data = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')


import warnings

warnings.filterwarnings('ignore')

In [4]:
# Cell 2: Preprocessing with Kaplan-Meier
from lifelines import KaplanMeierFitter

# Function to calculate Kaplan-Meier survival probabilities
def calculate_survival_probabilities(df, time_col, event_col):
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    return kmf.survival_function_at_times(df[time_col]).values

# Preprocess the dataset
def preprocess_survival_data(df, time_col='efs_time', event_col='efs'):
    df['target'] = calculate_survival_probabilities(df, time_col, event_col)
    df.loc[df[event_col] == 0, 'target'] -= 0.2  # Adjust for censored data
    return df

# Apply preprocessing
df = preprocess_survival_data(train_data)

In [5]:
# Cell 3: Combine Train and Test Data
# Add an indicator column
train_data['Dataset'] = 'train'
test_data['Dataset'] = 'test'

# Concatenate train and test
df = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)

In [6]:
# Cell 4: Handle Missing Values
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns
# Handling missing values for categorical columns
for column in categorical_columns:
    # Replace missing values in categorical columns with 'unknown'
    df[column].fillna('unknown', inplace=True)

In [7]:
# Cell 4: Handle Missing Values
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns
# Handling missing values for categorical columns
for column in categorical_columns:
    # Replace missing values in categorical columns with 'unknown'
    df[column].fillna('unknown', inplace=True)

In [8]:
# Cell 5: Split Data Back into Train and Test
from sklearn.model_selection import train_test_split

# Split concatenated data back into train and test
train_data = df[df['Dataset'] == 'train'].drop(columns=['Dataset', 'ID'])
test_data = df[df['Dataset'] == 'test'].drop(columns=['Dataset', 'efs', 'efs_time', 'target'])

# Define features and target
target_column = 'target'
X = train_data.drop(columns=['efs', 'efs_time', 'target'])  # Features
y = train_data[['target']]

In [9]:
# Cell 6: XBNet Implementation
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lifelines.utils import concordance_index
from catboost import CatBoostRegressor
import numpy as np

class TreeExtractor:
    """Extract tree-based features using CatBoost"""
    def __init__(self, cat_features, iterations=100, learning_rate=0.05, depth=6):
        self.cat_features = cat_features
        self.model = CatBoostRegressor(
            iterations=iterations,
            learning_rate=learning_rate,
            depth=depth,
            l2_leaf_reg=3,
            loss_function='RMSE',
            random_seed=42,
            verbose=0
        )
        self.leaf_indices = None
    
    def fit(self, X, y):
        # Handle categorical features
        X_copy = X.copy()
        for col in self.cat_features:
            X_copy[col] = X_copy[col].fillna("Unknown")
            
        self.model.fit(X_copy, y)
        self.leaf_indices = self.model.calc_leaf_indexes(X_copy)
        return self.leaf_indices
    
    def transform(self, X):
        X_copy = X.copy()
        for col in self.cat_features:
            X_copy[col] = X_copy[col].fillna("Unknown")
            
        return self.model.calc_leaf_indexes(X_copy)


class XBNet(nn.Module):
    """XBNet: combines tree-based features with neural network"""
    def __init__(self, num_trees, num_leaves, hidden_dim=64, dropout_rate=0.3):
        super(XBNet, self).__init__()
        
        # Embedding for tree leaves
        self.leaf_embed = nn.Embedding(num_leaves, hidden_dim)
        
        # Neural network layers
        self.network = nn.Sequential(
            nn.Linear(num_trees * hidden_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, x):
        # Embed leaf indices from trees
        embeds = self.leaf_embed(x)
        # Flatten embeddings
        flat = embeds.view(embeds.size(0), -1)
        # Feed through neural network
        output = self.network(flat)
        return output


def preprocess_data_for_xbnet(X, cat_features):
    """Prepare data for XBNet by handling categorical variables"""
    # Ensure categorical features are properly encoded
    for col in cat_features:
        X[col] = X[col].astype('category')
    return X


def train_xbnet_model(X_train, y_train, X_val, y_val, cat_features, device='cpu'):
    """Train an XBNet model with tree-based feature extraction"""
    # Step 1: Extract tree-based features
    tree_extractor = TreeExtractor(cat_features=cat_features)
    leaf_indices_train = tree_extractor.fit(X_train, y_train)
    leaf_indices_val = tree_extractor.transform(X_val)
    
    # Get model parameters
    num_trees = leaf_indices_train.shape[1]
    num_leaves = int(tree_extractor.model.get_param('max_leaves')) * 2  # Ensure enough embedding capacity
    
    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(leaf_indices_train, dtype=torch.long).to(device)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
    X_val_tensor = torch.tensor(leaf_indices_val, dtype=torch.long).to(device)
    y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).to(device)
    
    # Create data loaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    
    # Initialize XBNet model
    model = XBNet(num_trees=num_trees, num_leaves=num_leaves).to(device)
    
    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Training loop
    epochs = 50
    best_rmse = float('inf')
    early_stop_count = 0
    patience = 10
    
    for epoch in range(epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
        
        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val_tensor)
            val_loss = criterion(val_outputs, y_val_tensor)
            val_rmse = torch.sqrt(val_loss).item()
            
            # Calculate predictions for C-index
            val_preds = val_outputs.cpu().numpy()
            val_c_index = concordance_index(y_val.values, val_preds)
            
            print(f"Epoch {epoch+1}/{epochs}, RMSE: {val_rmse:.4f}, C-index: {val_c_index:.4f}")
            
            # Early stopping
            if val_rmse < best_rmse:
                best_rmse = val_rmse
                early_stop_count = 0
            else:
                early_stop_count += 1
                
            if early_stop_count >= patience:
                print("Early stopping triggered")
                break
    
    return model, tree_extractor, best_rmse, val_c_index


def xbnet_predict(model, tree_extractor, X, device='cpu'):
    """Generate predictions with a trained XBNet model"""
    # Extract tree features
    leaf_indices = tree_extractor.transform(X)
    
    # Convert to tensor
    X_tensor = torch.tensor(leaf_indices, dtype=torch.long).to(device)
    
    # Make predictions
    model.eval()
    with torch.no_grad():
        predictions = model(X_tensor).cpu().numpy()
    
    return predictions

In [12]:
# Cell 7: Cross-Validation and Model Evaluation
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import torch

# Set device 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Store results
fold_scores = []
fold_c_indices = []

# Get categorical feature names
cat_features = list(X.select_dtypes(include=['object', 'category']).columns)

# Iterate through each fold
for fold, (train_index, val_index) in enumerate(kf.split(X)):
    print(f"\n--- Fold {fold+1} ---")
    
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Preprocess data
    X_train = preprocess_data_for_xbnet(X_train, cat_features)
    X_val = preprocess_data_for_xbnet(X_val, cat_features)
    
    # Train XBNet model
    model, tree_extractor, rmse, c_index = train_xbnet_model(
        X_train, y_train, X_val, y_val, cat_features, device
    )
    
    # Store results
    fold_scores.append(rmse)
    fold_c_indices.append(c_index)
    
    print(f"Fold {fold+1} RMSE: {rmse:.4f}")
    print(f"Fold {fold+1} C-index: {c_index:.4f}")

# Summary of results
print("\n--- Final Results ---")
print(f"Mean RMSE: {np.mean(fold_scores):.4f}")
print(f"Standard Deviation of RMSE: {np.std(fold_scores):.4f}")
print(f"Mean C-index: {np.mean(fold_c_indices):.4f}")
print(f"Standard Deviation of C-index: {np.std(fold_c_indices):.4f}")

Using device: cpu

--- Fold 1 ---


TypeError: Cannot setitem on a Categorical with a new category (Unknown), set the categories first

In [11]:
# Cell 8: Train Final Model and Make Predictions
import torch

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Get categorical feature names
cat_features = list(X.select_dtypes(include=['object']).columns)

# Preprocess data
X_processed = preprocess_data_for_xbnet(X, cat_features)
test_processed = preprocess_data_for_xbnet(test_data, cat_features)

# Train the final model on all training data
print("Training final model on all data...")
final_model, final_tree_extractor, _, _ = train_xbnet_model(
    X_processed, y, X_processed, y, cat_features, device
)

# Generate predictions for the test set
print("Generating predictions for test data...")
test_predictions = xbnet_predict(final_model, final_tree_extractor, test_processed, device)

# Create submission dataframe
submission = pd.DataFrame({
    'ID': test_data.index if 'ID' not in test_data.columns else test_data['ID'],
    'predicted_survival': test_predictions.flatten()
})

# Save predictions to CSV
submission.to_csv('xbnet_survival_predictions.csv', index=False)
print("Predictions saved to 'xbnet_survival_predictions.csv'")

Training final model on all data...


TypeError: Cannot setitem on a Categorical with a new category (Unknown), set the categories first