# Example notebook: BaseModel and child classes

In [1]:
from new_structure.src.models.base_model import BaseSurvivalModel
from new_structure.src.models.rsf import RSFModel
from new_structure.src.models.cox import PenCoxModel
from new_structure.src.models.boosting import GBModel
from new_structure.src.utils import utils

import numpy as np 
import pandas as pd
from source.preprocessing import DataLoader 
from sklearn.preprocessing import StandardScaler
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis
import os
base_path = os.path.dirname(os.getcwd()) 

In [2]:
# load the data
dl = DataLoader("C:/Users/laeti/OneDrive/Dokumente/LMU/StatisticalConsulting")
dl.load_all_data()

In [3]:
# prepare example data
intersection_exprs_df = pd.DataFrame(dl.intersection_data['exprs_intersect.csv'])
merged_pData_imputed = pd.DataFrame(dl.merged_pdata_imputed['merged_imputed_pData.csv'])

y = utils.create_surv_y(merged_pData_imputed['BCR_STATUS'], merged_pData_imputed['MONTH_TO_BCR'])
X = intersection_exprs_df

## Generelles

Manche der Pipeline Steps, die der fit_model Funktion übergeben werden MÜSSEN auf eine bestimmte Art und Weise benannt werden: 
- Schritt zur feature selection: 'feat_sel'
- Schritt zur Modellierung/zum Model: 'model'

Die Schritte zur Datentransfromation (z.B. standard scaler etc.) müssen nicht speziell benannt werden

## Beispiele

### Gradient boosting

In [6]:
# instantiate GBModel, which inherits from BaseModel
gb = GBModel()

In [10]:
# create steps for sksurv pipeline, necessary to fit the model
pipe_steps_gb = [('gb', GradientBoostingSurvivalAnalysis())]
params_cv = {
        'gb__n_estimators': [2],
        'gb__learning_rate': [0.1],
        'gb__max_depth': [3]
    }

path = os.path.join(base_path, 'examples_new_struct')
gb.fit_model(X, y, fname= 'gb_example', path =  path, pipeline_steps = pipe_steps_gb, params_cv = params_cv, refit = True, save_model = True) 


Fitting 9 folds for each of 1 candidates, totalling 9 fits

Best parameters: {'gb__learning_rate': 0.1, 'gb__max_depth': 3, 'gb__n_estimators': 2}

Cross-validation results:
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0      18.094693      1.193542         0.122473        0.039382   

   param_gb__learning_rate  param_gb__max_depth  param_gb__n_estimators  \
0                      0.1                    3                       2   

                                              params  split0_test_score  \
0  {'gb__learning_rate': 0.1, 'gb__max_depth': 3,...           0.614306   

   split1_test_score  split2_test_score  split3_test_score  split4_test_score  \
0           0.571788           0.476091           0.571704             0.7023   

   split5_test_score  split6_test_score  split7_test_score  split8_test_score  \
0           0.620368           0.843594           0.629852            0.60783   

   mean_test_score  std_test_score  rank_test_score  
0        

OSError: Cannot save file into a non-existent directory: 'c:\Users\laeti\OneDrive\Dokumente\LMU\examples_new_struct'

## Deep Surv
##### Ich habe nochmal nen eigenen import gemacht. Die Cohorten für die cohortenweise CV bestimme ich extra und c-index mache ich auch extra im deep surv. Müssen wir dann noch zusammen führen.



In [2]:
def get_cohorts_from_index(index):
   cohort_names = [idx.split('.')[0] for idx in index]
   
   unique_cohorts = sorted(list(set(cohort_names)))
   cohort_to_id = {cohort: idx for idx, cohort in enumerate(unique_cohorts)}
   
   cohorts = np.array([cohort_to_id[cohort] for cohort in cohort_names])
   
   for cohort, id in cohort_to_id.items():
       n_samples = sum(cohorts == id)
       print(f"{cohort}: {n_samples} Samples (ID: {id})")
   
   return cohorts

In [3]:
base_path = os.path.dirname(os.getcwd()) 
base_path = os.path.join(base_path, 'code')
loader = DataLoader(base_path)
loader.load_all_data()

In [7]:
from new_structure.src.models.deep_surv import DeepSurvModel

In [4]:
import numpy as np
import pandas as pd
import itertools
from new_structure.src.models.deep_surv import DeepSurvModel
from sklearn.model_selection import LeaveOneGroupOut

# Lade die Daten
exprs = loader.common_genes_data['common_genes_knn_imputed.csv']
pdata = loader.merged_pdata_imputed['merged_imputed_pData.csv']

n_samples = len(pdata)
survival_data = np.zeros(n_samples, dtype={'names': ('time', 'status'), 'formats': ('f8', 'bool')})
survival_data['time'] = pdata['MONTH_TO_BCR'].values
survival_data['status'] = pdata['BCR_STATUS'].astype(bool).values
y = survival_data
X = exprs

cohorts = get_cohorts_from_index(X.index)

param_grid = {
   'hidden_layers': [[32,16], [64,32]],
   'learning_rate': [0.001, 0.0001],
   'batch_size': [32, 64],
   'n_epochs': [50]
}

results = []
param_names = param_grid.keys()
param_values = param_grid.values()
param_combinations = list(itertools.product(*param_values))

logo = LeaveOneGroupOut()

for params in param_combinations:
   param_dict = dict(zip(param_names, params))
   print(f"\nTraining with parameters: {param_dict}")
   
   fold_c_indices = []
   cohort_names = []
   
   # Leave-One-Group-Out cross validation
   for train_idx, test_idx in logo.split(X, y, groups=cohorts):
       cohort_name = X.index[test_idx[0]].split('.')[0]
       print(f"\nTesting on cohort {cohort_name}")
       
       # Split data
       X_train = X.iloc[train_idx]
       y_train = y[train_idx]
       X_test = X.iloc[test_idx]
       y_test = y[test_idx]
       
       # Train model
       model = DeepSurvModel(**param_dict)
       model.fit_model(X_train, y_train)
       
       # Evaluate on test set
       predictions = model.predict_model(X_test)
       c_index = model.calculate_c_index(y_test, predictions.flatten())  # Hier die Änderung
       fold_c_indices.append(c_index)
       cohort_names.append(cohort_name)
       
       print(f"Test cohort {cohort_name} C-index: {c_index:.4f}")
   
   # Calculate mean and std of c-index across cohorts
   mean_c_index = np.mean(fold_c_indices)
   std_c_index = np.std(fold_c_indices)
   
   print(f"\nMean C-index across cohorts: {mean_c_index:.4f} (±{std_c_index:.4f})")
   
   # Store results
   results.append({
       **param_dict,
       'mean_c_index': mean_c_index,
       'std_c_index': std_c_index,
       'cohort_scores': fold_c_indices,
       'cohort_names': cohort_names
   })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Sort by mean c-index
print("\nGrid Search Results:")
print(results_df.sort_values('mean_c_index', ascending=False))



Atlanta_2014_Long: 100 Samples (ID: 0)
Belfast_2018_Jain: 248 Samples (ID: 1)
CPC_GENE_2017_Fraser: 73 Samples (ID: 2)
CPGEA_2020_Li: 120 Samples (ID: 3)
CamCap_2016_Ross_Adams: 112 Samples (ID: 4)
CancerMap_2017_Luca: 133 Samples (ID: 5)
DKFZ_2018_Gerhauser: 82 Samples (ID: 6)
MSKCC_2010_Taylor: 131 Samples (ID: 7)
Stockholm_2016_Ross_Adams: 92 Samples (ID: 8)

Training with parameters: {'hidden_layers': [32, 16], 'learning_rate': 0.001, 'batch_size': 32, 'n_epochs': 50}

Testing on cohort Atlanta_2014_Long
Epoch 0: Loss = 694.6953
Epoch 10: Loss = 480.2270
Epoch 20: Loss = 424.1550
Epoch 30: Loss = 379.3379
Epoch 40: Loss = 290.8111
Test cohort Atlanta_2014_Long C-index: 0.6855

Testing on cohort Belfast_2018_Jain
Epoch 0: Loss = 727.2180
Epoch 10: Loss = 581.7438
Epoch 20: Loss = 492.0078
Epoch 30: Loss = 423.3386
Epoch 40: Loss = 402.7075
Test cohort Belfast_2018_Jain C-index: 0.6507

Testing on cohort CPC_GENE_2017_Fraser
Epoch 0: Loss = 719.9620
Epoch 10: Loss = 548.8331
Epoch 20