In [1]:

from preprocessing import DataLoader
import os
import pandas as pd

base_path = os.path.dirname(os.getcwd()) 
loader = DataLoader(base_path)
loader.load_all_data()

In [2]:
# Expressions-Daten
exprs_data = loader.exprs_data

# Originale pData
pdata_original = loader.pdata_original

# Imputierte pData
pdata_imputed = loader.pdata_imputed

# All Genes Daten
all_genes_data = loader.all_genes_data

# Common Genes Daten
common_genes_data = loader.common_genes_data

# Intersection Daten
intersection_data = loader.intersection_data

# Merged originale pData
merged_pdata_original = loader.merged_pdata_original

# Merged imputierte pData
merged_pdata_imputed = loader.merged_pdata_imputed

In [None]:
all_genes_data['all_genes.csv'].shape[1]

In [3]:
exprs = loader.common_genes_data['common_genes_knn_imputed.csv']
pdata = loader.merged_pdata_imputed['merged_imputed_pData.csv']
clinical_features = [
    'GLEASON_SCORE',
    'PATH_T_STAGE', 'CLIN_T_STAGE',
    'PRE_OPERATIVE_PSA', 'AGE'
]

In [7]:
from preprocessing import DataLoader
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
#from sksurv.linear_model import CoxnetSurvivalAnalysis
#from sksurv.util import Surv
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
#from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis, ComponentwiseGradientBoostingSurvivalAnalysis
from sklearn.ensemble import RandomForestClassifier
from sksurv.metrics import concordance_index_censored
import xgboost as xgb
import torch
import torch.nn as nn
import warnings
warnings.filterwarnings('ignore')
from lifelines.utils import concordance_index



ModuleNotFoundError: No module named 'sksurv'

In [4]:
import numpy as np
import torch
import torch.nn as nn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lifelines.utils import concordance_index
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def prepare_data(data, categorical_features, numeric_features):
    """Prepare data by encoding categorical variables and scaling numeric ones"""
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(drop='first', sparse=False), categorical_features)
        ])
    
    X_transformed = preprocessor.fit_transform(data)
    
    # Get feature names after transformation
    cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
    all_feature_names = numeric_features.tolist() + cat_feature_names.tolist()
    
    return X_transformed, preprocessor, all_feature_names

class SimpleDeepSurv(nn.Module):
    def __init__(self, in_features, hidden_size=32):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 1)
        )
        
    def forward(self, x):
        return self.net(x)

def train_deepsurv(X, time, event, epochs=10, lr=0.01):
    X_tensor = torch.FloatTensor(X)
    
    # Initialize model
    model = SimpleDeepSurv(X.shape[1])
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    # Training loop
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        risk_scores = model(X_tensor)
        
        # Calculate negative log likelihood loss
        log_risk = risk_scores
        censored_likelihood = log_risk.squeeze()
        uncensored_likelihood = log_risk.squeeze()
        
        # Cox loss
        loss = -torch.sum(uncensored_likelihood[event == 1] - 
                         torch.log(torch.sum(torch.exp(log_risk)[time >= time[event == 1][:, None]])))
        
        loss.backward()
        optimizer.step()
    
    return model

def predict_deepsurv(model, X):
    model.eval()
    with torch.no_grad():
        return -model(torch.FloatTensor(X)).numpy().flatten()

class SimpleRSF:
    def __init__(self, n_estimators=100):
        self.model = RandomForestRegressor(
            n_estimators=n_estimators,
            min_samples_split=10,
            min_samples_leaf=5,
            max_features='sqrt',
            random_state=42
        )
        
    def fit(self, X, time, event):
        self.model.fit(X, -time)
        return self
        
    def predict_risk(self, X):
        return self.model.predict(X)

class SimpleGBSurvival:
    def __init__(self, n_estimators=100):
        self.model = xgb.XGBRegressor(
            n_estimators=n_estimators,
            max_depth=3,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )
        
    def fit(self, X, time, event):
        self.model.fit(X, -time)
        return self
        
    def predict_risk(self, X):
        return self.model.predict(X)

def evaluate_model(risk_scores, time, event):
    return concordance_index(time, -risk_scores, event)

In [5]:
# Define feature types for clinical data only
categorical_features = ['PATH_T_STAGE', 'CLIN_T_STAGE']
numeric_clinical_features = ['GLEASON_SCORE', 'PRE_OPERATIVE_PSA', 'AGE']

# First prepare clinical data
clinical_data = pdata[categorical_features + numeric_clinical_features]
clinical_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_clinical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ])

# Transform clinical data
X_clinical = clinical_preprocessor.fit_transform(clinical_data)

# Scale expression data separately
expression_scaler = StandardScaler()
X_expression = expression_scaler.fit_transform(exprs)

# Combine clinical and expression data
X_combined = np.hstack([X_clinical, X_expression])

time = pdata['MONTH_TO_BCR'].values
event = pdata['BCR_STATUS'].values


In [6]:
# Train and evaluate RSF
rsf_model = SimpleRSF(n_estimators=100)
rsf_model.fit(X_combined, time, event)
rsf_risks = rsf_model.predict_risk(X_combined)
rsf_cindex = evaluate_model(rsf_risks, time, event)
print(f"RSF C-index: {rsf_cindex:.3f}")

RSF C-index: 0.940


In [15]:
# Train and evaluate Gradient Boosting
gb_model = SimpleGBSurvival(n_estimators=100)
gb_model.fit(X_combined, time, event)
gb_risks = gb_model.predict_risk(X_combined)
gb_cindex = evaluate_model(gb_risks, time, event)
print(f"Gradient Boosting C-index: {gb_cindex:.3f}")

Gradient Boosting C-index: 0.933


In [14]:
from sklearn.model_selection import train_test_split

# Train-test split
X_train, X_test, time_train, time_test, event_train, event_test = train_test_split(
    X_combined, time, event, 
    test_size=0.2, 
    random_state=42,
    stratify=event  # Stratifizierung nach Events
)

class SimpleDeepSurv(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.3),
            nn.Linear(32, 1)
        )
        
    def forward(self, x):
        return self.net(x)

def train_deepsurv(X_train, X_val, time_train, time_val, event_train, event_val, 
                   epochs=50, lr=0.001, batch_size=32):
    # Convert to tensors
    X_train = torch.FloatTensor(X_train)
    time_train = torch.FloatTensor(time_train)
    event_train = torch.FloatTensor(event_train)
    X_val = torch.FloatTensor(X_val)
    
    # Initialize model
    model = SimpleDeepSurv(X_train.shape[1])
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    
    # Training loop
    best_val_cindex = 0
    best_model = None
    patience = 10
    patience_counter = 0
    
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        
        # Forward pass
        risk_scores = model(X_train).squeeze()
        
        # Calculate loss
        loss = negative_log_likelihood(risk_scores, time_train, event_train)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        # Validation
        model.eval()
        with torch.no_grad():
            val_risks = model(X_val).squeeze().numpy()
            val_cindex = concordance_index(time_val, -val_risks, event_val)
        model.train()
        
        # Learning rate scheduling
        scheduler.step(loss)
        
        # Early stopping check
        if val_cindex > best_val_cindex:
            best_val_cindex = val_cindex
            best_model = model.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1
            
        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break
            
        if (epoch + 1) % 5 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Val C-index: {val_cindex:.3f}')
    
    # Load best model
    if best_model is not None:
        model.load_state_dict(best_model)
    
    return model

def predict_deepsurv(model, X):
    model.eval()
    with torch.no_grad():
        return model(torch.FloatTensor(X)).squeeze().numpy()

# Print dataset sizes
print("Dataset sizes:")
print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")
print(f"Training events: {event_train.sum()}")
print(f"Test events: {event_test.sum()}")

# Train model
deepsurv_model = train_deepsurv(X_train, X_test, time_train, time_test, event_train, event_test)

# Evaluate on both train and test
train_risks = predict_deepsurv(deepsurv_model, X_train)
test_risks = predict_deepsurv(deepsurv_model, X_test)

train_cindex = concordance_index(time_train, -train_risks, event_train)
test_cindex = concordance_index(time_test, -test_risks, event_test)

print(f"\nTraining C-index: {train_cindex:.3f}")
print(f"Test C-index: {test_cindex:.3f}")

# Additional diagnostics on test set
print("\nTest Set Risk Score Distribution:")
print(f"Min: {test_risks.min():.3f}")
print(f"Max: {test_risks.max():.3f}")
print(f"Mean: {test_risks.mean():.3f}")
print(f"Std: {test_risks.std():.3f}")

# Compare risk scores for events vs non-events in test set
test_event_risks = test_risks[event_test == 1]
test_non_event_risks = test_risks[event_test == 0]
print("\nTest Set Mean Risk Scores:")
print(f"Events: {test_event_risks.mean():.3f} ± {test_event_risks.std():.3f}")
print(f"Non-Events: {test_non_event_risks.mean():.3f} ± {test_non_event_risks.std():.3f}")

Dataset sizes:
Training samples: 872
Test samples: 219
Training events: 241
Test events: 60
Epoch [5/50], Loss: 5.8293, Val C-index: 0.732
Epoch [10/50], Loss: 5.4569, Val C-index: 0.757
Epoch [15/50], Loss: 5.1732, Val C-index: 0.762
Epoch [20/50], Loss: 4.9876, Val C-index: 0.763
Epoch [25/50], Loss: 4.7655, Val C-index: 0.760
Epoch [30/50], Loss: 4.5655, Val C-index: 0.764
Epoch [35/50], Loss: 4.4006, Val C-index: 0.770
Epoch [40/50], Loss: 4.2487, Val C-index: 0.776
Epoch [45/50], Loss: 4.2363, Val C-index: 0.769
Early stopping at epoch 49

Training C-index: 0.982
Test C-index: 0.765

Test Set Risk Score Distribution:
Min: -5.259
Max: 4.633
Mean: 0.109
Std: 1.853

Test Set Mean Risk Scores:
Events: 1.395 ± 1.657
Non-Events: -0.376 ± 1.683


#### Gradient Boosting
performed better with merged data than with intersection


In [None]:


def prepare_clinical_features(pdata):
    """
    Enhanced clinical feature preparation
    """
    # Add more potentially relevant clinical features
    clinical_features = [
        'GLEASON_SCORE',
        'PATH_T_STAGE', 'CLIN_T_STAGE', 
        'PRE_OPERATIVE_PSA', 'AGE'
    ]
    
    clinical_data = pdata[clinical_features].copy()
    
    # Enhanced categorical handling
    categorical_features = ['PATH_T_STAGE', 'CLIN_T_STAGE']
    clinical_data = pd.get_dummies(
        clinical_data, 
        columns=categorical_features, 
        drop_first=True,
        prefix=categorical_features
    )
    
    # More sophisticated missing value imputation
    numerical_columns = ['GLEASON_SCORE', 
                        'PRE_OPERATIVE_PSA', 'AGE']
    
    # Use different strategies for different types of missing values
    for col in numerical_columns:
        if col in ['GLEASON_SCORE']:
            clinical_data[col] = clinical_data[col].fillna(clinical_data[col].mode()[0])
        else:
            clinical_data[col] = clinical_data[col].fillna(clinical_data[col].median())
    
    # Add interaction terms for important clinical features
    clinical_data['GLEASON_PSA_INTERACTION'] = clinical_data['GLEASON_SCORE'] * clinical_data['PRE_OPERATIVE_PSA']
    
    return clinical_data

def select_important_genes(exprs_data, y, n_features=1000):
    """
    Pre-select important genetic features
    """
    selector = SelectFromModel(
        GradientBoostingSurvivalAnalysis(
            n_estimators=500,
            learning_rate=0.01,
            max_depth=3
        ),
        max_features=n_features
    )
    selector.fit(exprs_data, y)
    return selector.transform(exprs_data), selector.get_feature_names_out()

def run_boosting_comparison_with_clinical(exprs_data, pdata, survival_data):
    """
    Enhanced gradient boosting with extensive tuning
    """
    # Prepare clinical features
    clinical_data = prepare_clinical_features(pdata)
    
    # Create survival array
    y = np.array([(status, time) for status, time in 
                  zip(survival_data['BCR_STATUS'], survival_data['MONTH_TO_BCR'])],
                 dtype=[('status', bool), ('time', float)])
    
    # Pre-select important genes
    selected_exprs, selected_genes = select_important_genes(exprs_data, y)
    
    # Combine features
    combined_features = pd.concat([
        pd.DataFrame(selected_exprs, index=exprs_data.index, columns=selected_genes),
        clinical_data
    ], axis=1)
    
    # Create cross-validation strategy
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Create pipeline
    pipeline = Pipeline([
        ('scaler', RobustScaler()),
        ('gb', GradientBoostingSurvivalAnalysis())
    ])
    
    # Define parameter grid
    param_grid = {
        'gb__n_estimators': [500, 1000],
        'gb__learning_rate': [0.01, 0.005],
        'gb__max_depth': [3, 5],
        'gb__min_samples_split': [5, 10],
        'gb__min_samples_leaf': [3, 5],
        'gb__subsample': [0.8, 0.8],
        'gb__max_features': ['sqrt', None]
    }
    
    # Perform grid search
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=lambda estimator, X, y: concordance_index_censored(
            y['status'],
            y['time'],
            estimator.predict(X)
        )[0],
        n_jobs=-1,  # Use all available cores
        verbose=2
    )
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        combined_features, y, test_size=0.2, random_state=42, stratify=y['status']
    )
    
    # Fit grid search
    grid_search.fit(X_train, y_train)
    
    # Get best model
    best_model = grid_search.best_estimator_
    
    # Get predictions
    pred_risk = best_model.predict(X_test)
    c_index = concordance_index_censored(y_test['status'], y_test['time'], pred_risk)[0]
    
    # Get feature importance
    feature_importance = pd.DataFrame({
        'feature': combined_features.columns,
        'importance': best_model.named_steps['gb'].feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Print detailed results
    print("\nBest parameters:", grid_search.best_params_)
    print("\nCross-validation results:")
    cv_results = pd.DataFrame(grid_search.cv_results_)
    print(cv_results[['params', 'mean_test_score', 'std_test_score']])
    
    return best_model, c_index, feature_importance, grid_search

# Run with enhanced monitoring
model, c_index, feature_importance, grid_search = run_boosting_comparison_with_clinical(
    exprs, pdata, survival_data
)

# Enhanced visualization
def plot_detailed_results(feature_importance, grid_search, c_index):
    plt.figure(figsize=(15, 10))
    
    # Feature importance plot
    plt.subplot(2, 1, 1)
    top_features = feature_importance.head(20)
    plt.barh(top_features['feature'], top_features['importance'])
    plt.title(f'Top 20 Most Important Features (C-index: {c_index:.4f})')
    plt.xlabel('Importance')
    
    # Learning curves
    plt.subplot(2, 1, 2)
    cv_results = pd.DataFrame(grid_search.cv_results_)
    plt.plot(cv_results['mean_test_score'], label='Mean CV Score')
    plt.fill_between(
        range(len(cv_results)),
        cv_results['mean_test_score'] - cv_results['std_test_score'],
        cv_results['mean_test_score'] + cv_results['std_test_score'],
        alpha=0.3
    )
    plt.title('Cross-validation Learning Curves')
    plt.xlabel('Parameter Combination')
    plt.ylabel('C-index')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

plot_detailed_results(feature_importance, grid_search, c_index)

#### Including Batch effects. Curretnly not trained with sufficent params but interestingly Gleason Score is now relevant
C-Index:0,739, dauert 750min

In [None]:
class BatchAwareSurvivalModel:
    def __init__(self):
        self.scaler = StandardScaler()
        self.base_model = GradientBoostingSurvivalAnalysis(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=3
        )
        
    def _extract_batch_info(self, data):
        """Extrahiert Batch-Information aus den Index-Namen"""
        return data.index.map(lambda x: x.split('.')[0])
    
    def _center_within_batch(self, X, batch_ids):
        """Zentriert Features innerhalb jedes Batches"""
        X_centered = X.copy()
        for batch in np.unique(batch_ids):
            mask = batch_ids == batch
            X_centered.loc[mask] = self.scaler.fit_transform(X.loc[mask])
        return X_centered
    
    def fit(self, X, y):
        """
        X: Feature-Matrix (gene expression + clinical)
        y: Survival data (status, time)
        """
        # Batch-Information extrahieren
        batch_ids = self._extract_batch_info(X)
        
        # Batch-Korrektur durchführen
        X_corrected = self._center_within_batch(X, batch_ids)
        
        # Basis-Modell fitten
        self.base_model.fit(X_corrected, y)
        
    def predict(self, X):
        # Batch-Korrektur für Vorhersage
        batch_ids = self._extract_batch_info(X)
        X_corrected = self._center_within_batch(X, batch_ids)
        return self.base_model.predict(X_corrected)
    
    def get_feature_importance(self):
        return pd.Series(
            self.base_model.feature_importances_,
            index=self.base_model.feature_names_in_
        )

def evaluate_model_with_cv(X, y, n_splits=5):
    """Evaluiert das Modell mit Cross-Validation"""
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    c_indices = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
        # Daten für diesen Fold aufteilen
        X_train = X.iloc[train_idx]
        X_test = X.iloc[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]
        
        # Modell trainieren und evaluieren
        model = BatchAwareSurvivalModel()
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        
        # C-Index berechnen
        c_index = concordance_index_censored(
            y_test['status'],
            y_test['time'],
            pred
        )[0]
        
        c_indices.append(c_index)
        print(f"Fold {fold+1} C-index: {c_index:.3f}")
    
    return np.mean(c_indices), np.std(c_indices)

def plot_results(feature_importance, c_index_mean, c_index_std):
    """Visualisiert die Ergebnisse"""
    plt.figure(figsize=(12, 6))
    
    # Top 20 wichtigste Features
    top_features = feature_importance.sort_values(ascending=True)[-20:]
    
    plt.barh(range(len(top_features)), top_features)
    plt.yticks(range(len(top_features)), top_features.index)
    plt.xlabel('Feature Importance')
    plt.title(f'Top 20 Features (C-index: {c_index_mean:.3f} ± {c_index_std:.3f})')
    plt.tight_layout()
    plt.show()

# Hauptausführung
def main():
    # Daten laden
    exprs = loader.common_genes_data['common_genes_knn_imputed.csv']
    pdata = loader.merged_pdata_imputed['merged_imputed_pData.csv']
    
    # Survival Array erstellen
    y = np.array([(status, time) for status, time in 
                  zip(pdata['BCR_STATUS'], pdata['MONTH_TO_BCR'])],
                 dtype=[('status', bool), ('time', float)])
    
    # Klinische Features vorbereiten
    clinical_features = ['GLEASON_SCORE', 'PATH_T_STAGE', 'PRE_OPERATIVE_PSA']
    clinical_data = pd.get_dummies(
        pdata[clinical_features], 
        columns=['PATH_T_STAGE']
    )
    
    # Genetische und klinische Daten kombinieren
    combined_features = pd.concat([exprs, clinical_data], axis=1)
    
    # Modell evaluieren
    c_index_mean, c_index_std = evaluate_model_with_cv(combined_features, y)
    
    # Finales Modell für Feature Importance
    final_model = BatchAwareSurvivalModel()
    final_model.fit(combined_features, y)
    feature_importance = final_model.get_feature_importance()
    
    # Ergebnisse plotten
    plot_results(feature_importance, c_index_mean, c_index_std)
    
    print(f"\nOverall C-index: {c_index_mean:.3f} (±{c_index_std:.3f})")
    print("\nTop 10 most important features:")
    print(feature_importance.sort_values(ascending=False).head(10))

if __name__ == "__main__":
    from preprocessing import DataLoader
    import os
    
    # DataLoader initialisieren
    base_path = os.path.dirname(os.getcwd())
    loader = DataLoader(base_path)
    loader.load_all_data()
    
    main()

### Ensemle mit RSF und Compontent wise GB 0,67
(Params noch nicht gefixed)

In [None]:


class SurvivalEnsemble:
    def __init__(self):
        # Initialisiere beide Modelle mit einfachen Parametern
        self.rsf = RandomSurvivalForest(
            n_estimators=1000,
            max_depth=4,
            n_jobs=-1,
            random_state=42,
            min_samples_split=5
        )
        
        self.cgb = ComponentwiseGradientBoostingSurvivalAnalysis(
            n_estimators=1000,
            learning_rate=0.01,
            random_state=42
        )
        
        self.scaler = StandardScaler()
        
    def fit(self, X, y):
        # Standardisiere Features
        X_scaled = pd.DataFrame(
            self.scaler.fit_transform(X),
            index=X.index,
            columns=X.columns
        )
        
        # Fitte beide Modelle
        self.rsf.fit(X_scaled, y)
        self.cgb.fit(X_scaled, y)
        
    def predict(self, X):
        # Standardisiere Features
        X_scaled = pd.DataFrame(
            self.scaler.transform(X),
            index=X.index,
            columns=X.columns
        )
        
        # Hole Vorhersagen
        pred_rsf = self.rsf.predict(X_scaled)
        pred_cgb = self.cgb.predict(X_scaled)
        
        # Kombiniere Vorhersagen (einfacher Durchschnitt)
        return (pred_rsf + pred_cgb) / 2

def evaluate_ensemble():
    # Daten laden
    exprs = loader.common_genes_data['common_genes_knn_imputed.csv']
    pdata = loader.merged_pdata_imputed['merged_imputed_pData.csv']
    
    # Survival Array erstellen
    y = np.array([(status, time) for status, time in 
                  zip(pdata['BCR_STATUS'], pdata['MONTH_TO_BCR'])],
                 dtype=[('status', bool), ('time', float)])
    
    # Klinische Features vorbereiten
    clinical_features = ['GLEASON_SCORE', 'PATH_T_STAGE', 'PRE_OPERATIVE_PSA']
    clinical_data = pd.get_dummies(
        pdata[clinical_features], 
        columns=['PATH_T_STAGE']
    )
    
    # Features kombinieren
    X = pd.concat([exprs, clinical_data], axis=1)
    
    # Cross-Validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    c_indices = []
    
    # Progress Tracking
    total_folds = 5
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
        print(f"\nStarting Fold {fold}/{total_folds} ({fold/total_folds*100:.1f}% complete)")
        
        # Split data
        X_train = X.iloc[train_idx]
        X_test = X.iloc[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]
        
        # Train ensemble
        print("Training Random Survival Forest...")
        ensemble = SurvivalEnsemble()
        ensemble.fit(X_train, y_train)
        
        # Make predictions
        print("Making predictions...")
        pred = ensemble.predict(X_test)
        
        # Calculate c-index
        c_index = concordance_index_censored(
            y_test['status'],
            y_test['time'],
            pred
        )[0]
        
        c_indices.append(c_index)
        print(f"Fold {fold} C-index: {c_index:.3f}")
    
    # Final results
    mean_c_index = np.mean(c_indices)
    std_c_index = np.std(c_indices)
    
    print("\nFinal Results:")
    print(f"Mean C-index: {mean_c_index:.3f} (±{std_c_index:.3f})")
    print("Individual fold C-indices:", c_indices)
    
    return mean_c_index, std_c_index

if __name__ == "__main__":
    from preprocessing import DataLoader
    import os
    
    # DataLoader initialisieren
    base_path = os.path.dirname(os.getcwd())
    loader = DataLoader(base_path)
    loader.load_all_data()
    
    # Daten laden
    common_genes_data = loader.common_genes_data
    
    # Evaluation durchführen
    mean_c_index, std_c_index = evaluate_ensemble()

#### Multi Task Learning


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader as TorchDataLoader
from torch.optim.lr_scheduler import OneCycleLR
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored

class DeepMTLSurvival(nn.Module):
    def __init__(self, clinical_dim, genomic_dim):
        super().__init__()
        
        # Clinical pathway
        self.clinical_net = nn.Sequential(
            nn.Linear(clinical_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Genomic pathway
        self.genomic_net = nn.Sequential(
            nn.Linear(genomic_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Combined pathway
        self.combined_net = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        
        # Output heads
        self.time_out = nn.Linear(16, 1)
        self.event_out = nn.Linear(16, 1)
        
    def forward(self, clinical, genomic):
        # Process clinical data
        clinical_features = self.clinical_net(clinical)
        
        # Process genomic data
        genomic_features = self.genomic_net(genomic)
        
        # Combine features
        combined = torch.cat([clinical_features, genomic_features], dim=1)
        shared_features = self.combined_net(combined)
        
        # Generate predictions
        time_pred = self.time_out(shared_features)
        event_pred = torch.sigmoid(self.event_out(shared_features))
        
        return time_pred, event_pred, clinical_features, genomic_features

class MTLSurvivalLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.time_criterion = nn.MSELoss()
        self.event_criterion = nn.BCELoss()
        
    def forward(self, time_pred, event_pred, time_true, event_true, 
                time_pred_b=None, event_pred_b=None, 
                time_true_b=None, event_true_b=None, lam=1.0):
        if time_pred_b is None:  # No mixup
            time_loss = self.time_criterion(time_pred, time_true.view(-1, 1))
            event_loss = self.event_criterion(event_pred, event_true.view(-1, 1).float())
            return time_loss + event_loss
        else:  # Mixup
            time_loss = lam * self.time_criterion(time_pred, time_true.view(-1, 1)) + \
                       (1 - lam) * self.time_criterion(time_pred_b, time_true_b.view(-1, 1))
            event_loss = lam * self.event_criterion(event_pred, event_true.view(-1, 1).float()) + \
                        (1 - lam) * self.event_criterion(event_pred_b, event_true_b.view(-1, 1).float())
            return time_loss + event_loss

class SurvivalDataset(Dataset):
    def __init__(self, clinical_data, genomic_data, time, event):
        self.clinical_data = torch.FloatTensor(clinical_data)
        self.genomic_data = torch.FloatTensor(genomic_data)
        self.time = torch.FloatTensor(time)
        self.event = torch.FloatTensor(event)
        
    def __len__(self):
        return len(self.time)
    
    def __getitem__(self, idx):
        return {
            'clinical': self.clinical_data[idx],
            'genomic': self.genomic_data[idx],
            'time': self.time[idx],
            'event': self.event[idx]
        }

class DeepMTLSurvivalAnalysis:
    def __init__(self, clinical_dim, genomic_dim, device='cuda', **kwargs):
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        self.model = DeepMTLSurvival(clinical_dim, genomic_dim).to(self.device)
        self.criterion = MTLSurvivalLoss()
        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=kwargs.get('lr', 0.001),
            weight_decay=kwargs.get('weight_decay', 1e-5)
        )
    
    def mixup_data(self, clinical, genomic, time, event, alpha=0.2):
        if alpha > 0:
            lam = np.random.beta(alpha, alpha)
        else:
            lam = 1

        batch_size = clinical.size()[0]
        index = torch.randperm(batch_size).to(self.device)

        mixed_clinical = lam * clinical + (1 - lam) * clinical[index]
        mixed_genomic = lam * genomic + (1 - lam) * genomic[index]
        
        return mixed_clinical, mixed_genomic, time, time[index], event, event[index], lam
        
    def fit(self, train_loader, val_loader=None, epochs=100):
        print("Starting training...")
        self.scheduler = OneCycleLR(
            self.optimizer,
            max_lr=0.001,
            epochs=epochs,
            steps_per_epoch=len(train_loader),
            pct_start=0.3
        )
        
        best_val_loss = float('inf')
        patience_counter = 0
        
        for epoch in range(epochs):
            self.model.train()
            train_loss = 0
            
            for batch_idx, batch in enumerate(train_loader):
                self.optimizer.zero_grad()
                
                clinical = batch['clinical'].to(self.device)
                genomic = batch['genomic'].to(self.device)
                time = batch['time'].to(self.device)
                event = batch['event'].to(self.device)
                
                if np.random.random() > 0.5:
                    (clinical_mixed, genomic_mixed, 
                     time_a, time_b, 
                     event_a, event_b, 
                     lam) = self.mixup_data(clinical, genomic, 
                                          time, event)
                    
                    time_pred_mixed, event_pred_mixed, _, _ = self.model(
                        clinical_mixed, genomic_mixed
                    )
                    
                    loss = self.criterion(
                        time_pred_mixed, event_pred_mixed,
                        time_a, event_a,
                        time_pred_mixed, event_pred_mixed,
                        time_b, event_b,
                        lam
                    )
                else:
                    time_pred, event_pred, _, _ = self.model(
                        clinical, genomic
                    )
                    
                    loss = self.criterion(time_pred, event_pred, time, event)
                
                loss.backward()
                
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), 
                    max_norm=1.0
                )
                
                self.optimizer.step()
                self.scheduler.step()
                
                train_loss += loss.item()
                
                if batch_idx % 10 == 0:
                    print(f'Epoch: {epoch+1}/{epochs}, Batch: {batch_idx}/{len(train_loader)}, '
                          f'Loss: {loss.item():.4f}')
            
            avg_train_loss = train_loss / len(train_loader)
            print(f'Epoch {epoch+1}/{epochs} - Average Training Loss: {avg_train_loss:.4f}')
            
            if val_loader is not None:
                val_loss = self.evaluate(val_loader)
                print(f'Validation Loss: {val_loss:.4f}')
                
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    patience_counter = 0
                else:
                    patience_counter += 1
                
                if patience_counter >= 15:
                    print("Early stopping triggered")
                    break
                
    def predict(self, loader):
        print("Making predictions...")
        self.model.eval()
        time_preds = []
        event_preds = []
        
        with torch.no_grad():
            for batch in loader:
                clinical = batch['clinical'].to(self.device)
                genomic = batch['genomic'].to(self.device)
                
                time_pred, event_pred, _, _ = self.model(clinical, genomic)
                
                time_preds.append(time_pred.cpu())
                event_preds.append(event_pred.cpu())
        
        return (
            torch.cat(time_preds).numpy(),
            torch.cat(event_preds).numpy()
        )
    
    def evaluate(self, loader):
        self.model.eval()
        total_loss = 0
        
        with torch.no_grad():
            for batch in loader:
                clinical = batch['clinical'].to(self.device)
                genomic = batch['genomic'].to(self.device)
                time = batch['time'].to(self.device)
                event = batch['event'].to(self.device)
                
                time_pred, event_pred, _, _ = self.model(clinical, genomic)
                
                loss = self.criterion(time_pred, event_pred, time, event)
                total_loss += loss.item()
        
        return total_loss / len(loader)

def evaluate_mtl_survival(data_loader):
    print("Starting evaluation...")
    
    # Load data
    exprs = data_loader.common_genes_data['common_genes_knn_imputed.csv']
    pdata = data_loader.merged_pdata_imputed['merged_imputed_pData.csv']
    
    print(f"Data loaded - Expression shape: {exprs.shape}, Clinical shape: {pdata.shape}")
    
    clinical_features = ['GLEASON_SCORE', 'PATH_T_STAGE', 'PRE_OPERATIVE_PSA']
    clinical_data = pd.get_dummies(
        pdata[clinical_features], 
        columns=['PATH_T_STAGE']
    )
    
    print(f"Clinical features after one-hot encoding: {clinical_data.columns.tolist()}")
    
    clinical_scaler = StandardScaler()
    genomic_scaler = StandardScaler()
    
    X_clinical = clinical_scaler.fit_transform(clinical_data)
    X_genomic = genomic_scaler.fit_transform(exprs)
    
    time = pdata['MONTH_TO_BCR'].values
    event = pdata['BCR_STATUS'].values
    
    print(f"Data prepared - Clinical shape: {X_clinical.shape}, Genomic shape: {X_genomic.shape}")
    print(f"Time shape: {time.shape}, Event shape: {event.shape}")
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    c_indices = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_clinical), 1):
        print(f"\nProcessing Fold {fold}/5")
        
        X_clinical_train = X_clinical[train_idx]
        X_clinical_test = X_clinical[test_idx]
        X_genomic_train = X_genomic[train_idx]
        X_genomic_test = X_genomic[test_idx]
        time_train = time[train_idx]
        time_test = time[test_idx]
        event_train = event[train_idx]
        event_test = event[test_idx]
        
        print(f"Training set sizes - Clinical: {X_clinical_train.shape}, Genomic: {X_genomic_train.shape}")
        print(f"Test set sizes - Clinical: {X_clinical_test.shape}, Genomic: {X_genomic_test.shape}")
        
        train_dataset = SurvivalDataset(
            X_clinical_train, X_genomic_train, 
            time_train, event_train
        )
        test_dataset = SurvivalDataset(
            X_clinical_test, X_genomic_test,
            time_test, event_test
        )
        
        train_loader = TorchDataLoader(
            train_dataset, 
            batch_size=64,
            shuffle=True,
            drop_last=True
        )
        test_loader = TorchDataLoader(
            test_dataset, 
            batch_size=64,
            shuffle=False
        )
        
        model = DeepMTLSurvivalAnalysis(
            clinical_dim=X_clinical.shape[1],
            genomic_dim=X_genomic.shape[1],
            lr=0.001,
            weight_decay=1e-5
        )
        
        print(f"\nTraining model for fold {fold}")
        model.fit(train_loader, epochs=100)
        
        print(f"\nMaking predictions for fold {fold}")
        time_pred, _ = model.predict(test_loader)
        
        c_index = concordance_index_censored(
            event_test.astype(bool),
            time_test,
            -time_pred.squeeze()
        )[0]
        
        c_indices.append(c_index)
        print(f"Fold {fold} C-index: {c_index:.3f}")
    
    print("\nFinal Results:")
    print(f"Mean C-index: {np.mean(c_indices):.3f} (±{np.std(c_indices):.3f})")
    print("Individual fold C-indices:", c_indices)
    
    return np.mean(c_indices), np.std(c_indices)

if __name__ == "__main__":
    from preprocessing import DataLoader
    import os
    
    print("Starting program...")
    base_path = os.path.dirname(os.getcwd())
    print(f"Base path: {base_path}")
    
    loader = DataLoader(base_path)
    print("DataLoader initialized")
    
    loader.load_all_data()
    print("All data loaded")
    
    mean_c_index, std_c_index = evaluate_mtl_survival(loader)
    print(f"\nFinal Results:")
    print(f"Mean C-index: {mean_c_index:.3f} (±{std_c_index:.3f})")