In [None]:
! pip install scikit-learn==1.1.3
! pip install scikit-survival
! pip install lifelines
! pip install scikit-optimize

In [None]:
import numpy as np
import pandas as pd
import os
import random
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sksurv.datasets import load_veterans_lung_cancer
from sksurv.ensemble import RandomSurvivalForest, ComponentwiseGradientBoostingSurvivalAnalysis
from sksurv.svm import FastSurvivalSVM
from sksurv.metrics import concordance_index_censored
from sksurv.nonparametric import kaplan_meier_estimator
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
from lifelines.plotting import add_at_risk_counts
import matplotlib.pyplot as plt
import matplotlib as mlp
from lifelines import CoxPHFitter
from hotelling.stats import hotelling_t2


In [None]:
# Colab only: mount Google Drive at /content/drive so files stored there are reachable
# through the regular file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Global matplotlib styling applied to every figure produced in this notebook
font_size = 16
font = dict(weight='bold', size=font_size)

# Font size handed explicitly to plot titles and axis labels in the model cells
font_label = font_size

mlp.rcParams.update({
    'figure.figsize': (8, 6),
    'figure.labelsize': 'large',
})
mlp.rc('font', **font)

In [None]:
# Feature-set CSV base names (read as Data/<name>.csv by the model cells)
dataset_names = [
    'LC_DF_SCT',
    'LC_DF_SPT',
    'LC_RF_CT',
    'LC_RF_PT',
    'LC_RF+DF_CT',
    'LC_RF+DF_PT',
]

# Display labels, paired with dataset_names by position; used in plot titles and output file names.
# NOTE(review): 'LC_RF+DF_CT' is labelled 'HRF-PET Plus DRF-CT' while 'LC_RF_CT' is 'HRF-CT' -
# confirm whether the fifth label should read 'HRF-CT Plus DRF-CT'.
result_names = [
    'DRF-CT',
    'DRF-PET',
    'HRF-CT',
    'HRF-PET',
    'HRF-PET Plus DRF-CT',
    'HRF-PET Plus DRF-PET',
]

# Algorithms

In [None]:
# Survival models and their hyper-parameters. Every estimator that takes a seed is
# created with random_state=42.

# Random Survival Forest
rsf = RandomSurvivalForest(
    n_estimators=10,
    max_features="sqrt",
    min_samples_leaf=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)

# Fast Survival SVM
fssvm = FastSurvivalSVM(
    tol=1e-6,
    max_iter=512,
    random_state=42,
)

# Component-wise Gradient Boosting Survival Analysis (library defaults apart from the seed)
cwgbsa = ComponentwiseGradientBoostingSurvivalAnalysis(random_state=42)

# Cox proportional hazards model from lifelines, created with penalizer=0.1
cph = CoxPHFitter(penalizer=0.1)


In [None]:
# Five-fold cross-validation splitter used by the model cells on the training portion;
# rows are shuffled with a fixed seed before being divided into folds.
kf = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [None]:
# Change into the project folder. The model cells use the relative paths Data/, Results/
# and Plots/, so they resolve against this directory.
# NOTE(review): the path below looks like a placeholder - replace it with the real folder.
cd path_to_directy

## 1. Random Survival Forest

In [None]:
################################# Random Survival Forest #################################
# For each feature set: min-max scale and PCA-reduce the features, cross-validate the RSF
# model with `kf` on the first 80% of rows ("internal"), score the held-out last 20%
# ("external") with every fold model, and report C-indexes, log-rank p-values and a
# Kaplan-Meier plot of the median-split risk groups.


def _logrank_p_by_median(times, events, scores):
    """Log-rank p-value between the high- and low-risk halves of one cohort.

    Patients are split at the 50th percentile of `scores` (high risk: score >= median),
    the same rule used for the Kaplan-Meier groups further down.

    Parameters
    ----------
    times : array-like of follow-up durations.
    events : array-like of event indicators (truthy = event observed).
    scores : array-like of predicted risk scores, one per patient.

    Returns
    -------
    float
        The p-value, or NaN when the split leaves one group empty (the test is
        undefined in that case).
    """
    times = np.asarray(times, dtype=float)
    events = np.asarray(events, dtype=bool)
    scores = np.asarray(scores, dtype=float)
    is_high = scores >= np.percentile(scores, 50)
    if is_high.all() or not is_high.any():
        return np.nan
    return logrank_test(times[is_high], times[~is_high],
                        event_observed_A=events[is_high],
                        event_observed_B=events[~is_high]).p_value


# Output folders are relative to the working directory; create them so the CSV and
# figure writes below do not fail when they are missing.
for out_dir in ('Results', 'Plots'):
    os.makedirs(out_dir, exist_ok=True)

# One summary row per feature set for the RSF algorithm
df_reported_results_rsf = pd.DataFrame(columns=['Dataset', 'SRA', 'Mean C-Index Internal', 'Std C-Index Internal', 'Mean C-Index External',
                                                'Std C-Index External', '5-Folds P-Value', 'P-value External', 'Combined C-Index', 'Combined P-value'])

for index_name in range(len(dataset_names)):
    dataset_name = dataset_names[index_name]
    result_name = result_names[index_name]

    # Feature matrix of this feature set (no header row) and the shared outcome table
    X = pd.read_csv(os.path.join('Data', '{}.csv'.format(dataset_name)), header=None)
    y = pd.read_csv(os.path.join('Data', 'COX_OUTCOME.csv'))
    assert len(X) == len(y), "feature matrix and outcome table must have one row per patient"

    # Follow-up in years; only used for the x-axis ticks of the Kaplan-Meier plot
    years = y['Duration'].astype('float') / 365

    # NOTE(review): 'Censor' is cast to bool and used as the event indicator - presumably
    # 1 means the event was observed; confirm against the data dictionary.
    event = y['Censor'].astype(bool).to_numpy()
    time = y['Duration'].to_numpy()

    # Sequential hold-out: first 80% of rows for cross-validation, last 20% as external test
    train_size = int(len(X) * 0.8)
    event_train, event_test_external = event[:train_size], event[train_size:]
    time_train, time_test_external = time[:train_size], time[train_size:]

    # Scale to [0, 1] and reduce to 10 principal components; both transforms are fitted on
    # the training portion only and then applied to the external portion.
    # NOTE(review): they are fitted before the folds are drawn, so each internal validation
    # fold has already influenced the scaler/PCA it is evaluated with.
    min_max_scaler = MinMaxScaler()
    pca = PCA(n_components=10, random_state=42)
    X_train = pca.fit_transform(min_max_scaler.fit_transform(X.iloc[:train_size]))
    X_test = pca.transform(min_max_scaler.transform(X.iloc[train_size:]))

    # Structured (event, time) arrays in the layout scikit-survival estimators are fitted with
    surv_dtype = [('Censor', bool), ('Duration', float)]
    y_train = np.array(list(zip(event_train, time_train)), dtype=surv_dtype)
    y_test = np.array(list(zip(event_test_external, time_test_external)), dtype=surv_dtype)

    fold_rows = []
    # Out-of-fold risk score of every training row, stored at the row's own position.
    # kf shuffles, so fold order differs from row order; writing by index keeps the
    # scores aligned with y_train / X_train.
    risk_scores_folds = np.full(X_train.shape[0], np.nan)
    # External-cohort risk scores, one array per fold model
    risk_scores_test_external = []

    for train_index, test_index in kf.split(X_train):
        X_train_in, X_test_in = X_train[train_index], X_train[test_index]
        y_train_in, y_test_in = y_train[train_index], y_train[test_index]
        event_test_internal = y_test_in['Censor']
        time_test_internal = y_test_in['Duration']

        # Re-fit the shared estimator on this fold's training rows
        rsf.fit(X_train_in, y_train_in)

        ################################# Predicting survival - Internal Test #################################
        risk_scores_internal = rsf.predict(X_test_in)
        risk_scores_folds[test_index] = risk_scores_internal
        c_index_internal = concordance_index_censored(event_test_internal, time_test_internal, risk_scores_internal)[0]
        # Log-rank test between the predicted high- and low-risk halves of the fold
        p_value_internal = _logrank_p_by_median(time_test_internal, event_test_internal, risk_scores_internal)

        ################################# Predicting survival - External Test #################################
        risk_scores_external = rsf.predict(X_test)
        risk_scores_test_external.append(risk_scores_external)
        c_index_external = concordance_index_censored(event_test_external, time_test_external, risk_scores_external)[0]
        p_value_external = _logrank_p_by_median(time_test_external, event_test_external, risk_scores_external)

        fold_rows.append([c_index_internal, c_index_external, p_value_internal, p_value_external])

    # Per-fold metrics of this feature set, written to Results/
    df_internal_results_rsf = pd.DataFrame(fold_rows, columns=['C-Index internal', 'C-Index External', 'P-Value Internal', 'P-Value External'])
    df_internal_results_rsf.to_csv(os.path.join('Results', 'RSF_{}-Results.csv'.format(result_name)))

    ############################ P-Value #########################################################################

    # Log-rank test on the whole training portion, split by the out-of-fold risk scores
    p_value_train = _logrank_p_by_median(time_train, event_train, risk_scores_folds)

    ############################################ Save Results #####################################################

    mean_cindex_internal = df_internal_results_rsf['C-Index internal'].mean()
    std_cindex_internal = df_internal_results_rsf['C-Index internal'].std()
    mean_cindex_external = df_internal_results_rsf['C-Index External'].mean()
    std_cindex_external = df_internal_results_rsf['C-Index External'].std()
    # NOTE(review): this reports the smallest of the per-fold external p-values, which is
    # optimistic - consider a median or a single test on averaged scores.
    pvalue_external = df_internal_results_rsf['P-Value External'].min()

    ############################################## Draw Kaplan Mier ##############################################

    # Hotelling T2 test between the PCA features of the high- and low-risk groups.
    # Every patient appears once: out-of-fold scores for the training rows and the first
    # fold model's scores for the external rows. Index 2 of the result is printed
    # (presumably the p-value - confirm against the hotelling package documentation).
    risk_scores_combine_for_hotelling = np.concatenate([risk_scores_folds, risk_scores_test_external[0]])
    threshold_for_hotelling = np.percentile(risk_scores_combine_for_hotelling, 50)
    X_for_hotelling = np.vstack([X_train, X_test])
    high_risk_for_h = X_for_hotelling[risk_scores_combine_for_hotelling >= threshold_for_hotelling]
    low_risk_for_h = X_for_hotelling[risk_scores_combine_for_hotelling < threshold_for_hotelling]

    print(hotelling_t2(high_risk_for_h, low_risk_for_h)[2])

    # Combined cohort: out-of-fold training scores followed by the external scores of
    # every fold model, with the external outcomes repeated once per fold model to match.
    # NOTE(review): each external patient therefore enters the combined log-rank test once
    # per fold, which inflates the sample size behind the combined p-value.
    n_fold_models = len(risk_scores_test_external)
    risk_scores_combined = np.concatenate([risk_scores_folds] + risk_scores_test_external)
    event_combined = np.concatenate([event_train, np.tile(event_test_external, n_fold_models)])
    time_combined = np.concatenate([time_train, np.tile(time_test_external, n_fold_models)])

    threshold = np.percentile(risk_scores_combined, 50)
    is_high_risk = risk_scores_combined >= threshold
    high_risk_event, high_risk_time = event_combined[is_high_risk], time_combined[is_high_risk]
    low_risk_event, low_risk_time = event_combined[~is_high_risk], time_combined[~is_high_risk]

    # Reset per feature set so that a skipped comparison is reported as NaN instead of
    # raising NameError or silently reusing the previous feature set's values.
    c_index_combined = np.nan
    pvalue_combined = np.nan

    if len(low_risk_time) > 0 and len(high_risk_time) > 0:
        # Log-rank test between the two risk groups of the combined cohort
        pvalue_combined = logrank_test(low_risk_time, high_risk_time,
                                       event_observed_A=low_risk_event, event_observed_B=high_risk_event).p_value

        # C-index over the combined cohort
        c_index_combined = concordance_index_censored(event_combined, time_combined, risk_scores_combined)[0]

        # Kaplan-Meier curves of both groups, with durations divided by 365 for the "years" axis
        kmf1 = KaplanMeierFitter()
        kmf2 = KaplanMeierFitter()
        kmf1.fit(high_risk_time.astype('float') / 365, high_risk_event, label='High Risk')
        kmf2.fit(low_risk_time.astype('float') / 365, low_risk_event, label='Low Risk')

        fig, ax = plt.subplots()
        kmf1.plot(ax=ax, color='r', label='High Risk', show_censors=True, censor_styles={'ms': 6, 'marker': '|'})
        kmf2.plot(ax=ax, color='g', label='Low Risk', show_censors=True, censor_styles={'ms': 6, 'marker': '|'})

        ax.set_title("RSF , " + result_name, fontsize=font_label, fontweight='bold')
        ax.set_xlabel("Time (Years)", fontsize=font_label, fontweight='bold')
        ax.set_ylabel("Survival probability", fontsize=font_label, fontweight='bold')
        ax.set_xticks(np.arange(int(years.min()), int(years.max()) + 1, 1.0))
        ax.grid(axis='both', which='both', color='lightgray', linestyle='-', linewidth=0.5, zorder=-1000)

        fig.savefig(os.path.join('Plots', 'RSF_{}-KaplanMier.jpg'.format(result_name)), dpi=300)
        # Render the figure in the notebook, then release it so figures do not pile up across the loop
        plt.show()
        plt.close(fig)

    #########################################################################################################

    # Summary row for this feature set
    df_reported_results_rsf.loc[len(df_reported_results_rsf)] = [result_name, 'RSF', mean_cindex_internal, std_cindex_internal,
                                                                 mean_cindex_external, std_cindex_external, p_value_train, pvalue_external,
                                                                 c_index_combined, pvalue_combined]

## Fast Survival SVM

In [None]:
################################# Fast Survival SVM #################################
# For each feature set: min-max scale and PCA-reduce the features, cross-validate the
# Fast Survival SVM with `kf` on the first 80% of rows ("internal"), score the held-out
# last 20% ("external") with every fold model, and report C-indexes, log-rank p-values
# and a Kaplan-Meier plot of the median-split risk groups.


def _logrank_p_by_median(times, events, scores):
    """Log-rank p-value between the high- and low-risk halves of one cohort.

    Patients are split at the 50th percentile of `scores` (high risk: score >= median),
    the same rule used for the Kaplan-Meier groups further down.

    Parameters
    ----------
    times : array-like of follow-up durations.
    events : array-like of event indicators (truthy = event observed).
    scores : array-like of predicted risk scores, one per patient.

    Returns
    -------
    float
        The p-value, or NaN when the split leaves one group empty (the test is
        undefined in that case).
    """
    times = np.asarray(times, dtype=float)
    events = np.asarray(events, dtype=bool)
    scores = np.asarray(scores, dtype=float)
    is_high = scores >= np.percentile(scores, 50)
    if is_high.all() or not is_high.any():
        return np.nan
    return logrank_test(times[is_high], times[~is_high],
                        event_observed_A=events[is_high],
                        event_observed_B=events[~is_high]).p_value


# The figure folder is relative to the working directory; create it so savefig below
# does not fail when it is missing.
os.makedirs('Plots', exist_ok=True)

# One summary row per feature set for the FSSVM algorithm
df_reported_results_fssvm = pd.DataFrame(columns=['Dataset', 'SRA', 'Mean C-Index Internal', 'Std C-Index Internal', 'Mean C-Index External',
                                                  'Std C-Index External', '5-Folds P-Value', 'P-value External', 'Combined C-Index', 'Combined P-value'])

for index_name in range(len(dataset_names)):
    dataset_name = dataset_names[index_name]
    result_name = result_names[index_name]

    # Feature matrix of this feature set (no header row) and the shared outcome table
    X = pd.read_csv(os.path.join('Data', '{}.csv'.format(dataset_name)), header=None)
    y = pd.read_csv(os.path.join('Data', 'COX_OUTCOME.csv'))
    assert len(X) == len(y), "feature matrix and outcome table must have one row per patient"

    # Follow-up in years; only used for the x-axis ticks of the Kaplan-Meier plot
    years = y['Duration'].astype('float') / 365

    # NOTE(review): 'Censor' is cast to bool and used as the event indicator - presumably
    # 1 means the event was observed; confirm against the data dictionary.
    event = y['Censor'].astype(bool).to_numpy()
    time = y['Duration'].to_numpy()

    # Sequential hold-out: first 80% of rows for cross-validation, last 20% as external test
    train_size = int(len(X) * 0.8)
    event_train, event_test_external = event[:train_size], event[train_size:]
    time_train, time_test_external = time[:train_size], time[train_size:]

    # Scale to [0, 1] and reduce to 10 principal components; both transforms are fitted on
    # the training portion only and then applied to the external portion.
    # NOTE(review): they are fitted before the folds are drawn, so each internal validation
    # fold has already influenced the scaler/PCA it is evaluated with.
    min_max_scaler = MinMaxScaler()
    pca = PCA(n_components=10, random_state=42)
    X_train = pca.fit_transform(min_max_scaler.fit_transform(X.iloc[:train_size]))
    X_test = pca.transform(min_max_scaler.transform(X.iloc[train_size:]))

    # Structured (event, time) arrays in the layout scikit-survival estimators are fitted with
    surv_dtype = [('Censor', bool), ('Duration', float)]
    y_train = np.array(list(zip(event_train, time_train)), dtype=surv_dtype)
    y_test = np.array(list(zip(event_test_external, time_test_external)), dtype=surv_dtype)

    fold_rows = []
    # Out-of-fold risk score of every training row, stored at the row's own position.
    # kf shuffles, so fold order differs from row order; writing by index keeps the
    # scores aligned with y_train / X_train.
    risk_scores_folds = np.full(X_train.shape[0], np.nan)
    # External-cohort risk scores, one array per fold model
    risk_scores_test_external = []

    for train_index, test_index in kf.split(X_train):
        X_train_in, X_test_in = X_train[train_index], X_train[test_index]
        y_train_in, y_test_in = y_train[train_index], y_train[test_index]
        event_test_internal = y_test_in['Censor']
        time_test_internal = y_test_in['Duration']

        # Re-fit the shared estimator on this fold's training rows
        fssvm.fit(X_train_in, y_train_in)

        ################################# Predicting survival - Internal Test #################################
        risk_scores_internal = fssvm.predict(X_test_in)
        risk_scores_folds[test_index] = risk_scores_internal
        c_index_internal = concordance_index_censored(event_test_internal, time_test_internal, risk_scores_internal)[0]
        # Log-rank test between the predicted high- and low-risk halves of the fold
        p_value_internal = _logrank_p_by_median(time_test_internal, event_test_internal, risk_scores_internal)

        ################################# Predicting survival - External Test #################################
        risk_scores_external = fssvm.predict(X_test)
        risk_scores_test_external.append(risk_scores_external)
        c_index_external = concordance_index_censored(event_test_external, time_test_external, risk_scores_external)[0]
        p_value_external = _logrank_p_by_median(time_test_external, event_test_external, risk_scores_external)

        fold_rows.append([c_index_internal, c_index_external, p_value_internal, p_value_external])

    # Per-fold metrics of this feature set. The CSV export is disabled for this model;
    # uncomment the next line (and make sure Results/ exists) to write it.
    df_internal_results_fssvm = pd.DataFrame(fold_rows, columns=['C-Index internal', 'C-Index External', 'P-Value Internal', 'P-Value External'])
    # df_internal_results_fssvm.to_csv(os.path.join('Results', 'FSSVM_{}-Results.csv'.format(str(result_names[index_name]))))

    ############################ P-Value #########################################################################

    # Log-rank test on the whole training portion, split by the out-of-fold risk scores
    p_value_train = _logrank_p_by_median(time_train, event_train, risk_scores_folds)

    ############################################ Save Results #####################################################

    mean_cindex_internal = df_internal_results_fssvm['C-Index internal'].mean()
    std_cindex_internal = df_internal_results_fssvm['C-Index internal'].std()
    mean_cindex_external = df_internal_results_fssvm['C-Index External'].mean()
    std_cindex_external = df_internal_results_fssvm['C-Index External'].std()
    # NOTE(review): this reports the smallest of the per-fold external p-values, which is
    # optimistic - consider a median or a single test on averaged scores.
    pvalue_external = df_internal_results_fssvm['P-Value External'].min()

    ############################################## Draw Kaplan Mier ##############################################

    # Hotelling T2 test between the PCA features of the high- and low-risk groups.
    # Every patient appears once: out-of-fold scores for the training rows and the first
    # fold model's scores for the external rows. Index 2 of the result is printed
    # (presumably the p-value - confirm against the hotelling package documentation).
    risk_scores_combine_for_hotelling = np.concatenate([risk_scores_folds, risk_scores_test_external[0]])
    threshold_for_hotelling = np.percentile(risk_scores_combine_for_hotelling, 50)
    X_for_hotelling = np.vstack([X_train, X_test])
    high_risk_for_h = X_for_hotelling[risk_scores_combine_for_hotelling >= threshold_for_hotelling]
    low_risk_for_h = X_for_hotelling[risk_scores_combine_for_hotelling < threshold_for_hotelling]

    print(hotelling_t2(high_risk_for_h, low_risk_for_h)[2])

    # Combined cohort: out-of-fold training scores followed by the external scores of
    # every fold model, with the external outcomes repeated once per fold model to match.
    # NOTE(review): each external patient therefore enters the combined log-rank test once
    # per fold, which inflates the sample size behind the combined p-value.
    n_fold_models = len(risk_scores_test_external)
    risk_scores_combined = np.concatenate([risk_scores_folds] + risk_scores_test_external)
    event_combined = np.concatenate([event_train, np.tile(event_test_external, n_fold_models)])
    time_combined = np.concatenate([time_train, np.tile(time_test_external, n_fold_models)])

    threshold = np.percentile(risk_scores_combined, 50)
    is_high_risk = risk_scores_combined >= threshold
    high_risk_event, high_risk_time = event_combined[is_high_risk], time_combined[is_high_risk]
    low_risk_event, low_risk_time = event_combined[~is_high_risk], time_combined[~is_high_risk]

    # Reset per feature set so that a skipped comparison is reported as NaN instead of
    # raising NameError or silently reusing the previous feature set's values.
    c_index_combined = np.nan
    pvalue_combined = np.nan

    if len(low_risk_time) > 0 and len(high_risk_time) > 0:
        # Log-rank test between the two risk groups of the combined cohort
        pvalue_combined = logrank_test(low_risk_time, high_risk_time,
                                       event_observed_A=low_risk_event, event_observed_B=high_risk_event).p_value

        # C-index over the combined cohort
        c_index_combined = concordance_index_censored(event_combined, time_combined, risk_scores_combined)[0]

        # Kaplan-Meier curves of both groups, with durations divided by 365 for the "years" axis
        kmf1 = KaplanMeierFitter()
        kmf2 = KaplanMeierFitter()
        kmf1.fit(high_risk_time.astype('float') / 365, high_risk_event, label='High Risk')
        kmf2.fit(low_risk_time.astype('float') / 365, low_risk_event, label='Low Risk')

        fig, ax = plt.subplots()
        kmf1.plot(ax=ax, color='r', label='High Risk', show_censors=True, censor_styles={'ms': 6, 'marker': '|'})
        kmf2.plot(ax=ax, color='g', label='Low Risk', show_censors=True, censor_styles={'ms': 6, 'marker': '|'})

        ax.set_title("FSVM , " + result_name, fontsize=font_label, fontweight='bold')
        ax.set_xlabel("Time (Years)", fontsize=font_label, fontweight='bold')
        ax.set_ylabel("Survival probability", fontsize=font_label, fontweight='bold')
        ax.set_xticks(np.arange(int(years.min()), int(years.max()) + 1, 1.0))
        ax.grid(axis='both', which='both', color='lightgray', linestyle='-', linewidth=0.5, zorder=-1000)

        fig.savefig(os.path.join('Plots', 'FSSVM_{}-KaplanMier.jpg'.format(result_name)), dpi=300)
        # Render the figure in the notebook, then release it so figures do not pile up across the loop
        plt.show()
        plt.close(fig)

    #########################################################################################################

    # Summary row for this feature set
    df_reported_results_fssvm.loc[len(df_reported_results_fssvm)] = [result_name, 'FSSVM', mean_cindex_internal, std_cindex_internal,
                                                                     mean_cindex_external, std_cindex_external, p_value_train, pvalue_external,
                                                                     c_index_combined, pvalue_combined]

## Component-wise Gradient Boosting Survival Analysis

In [None]:
################################# Component-wise Gradient Boosting Survival Analysis #################################

# Data frame collecting the reported results of the CWGBSA algorithm
df_reported_results_cwgbsa = pd.DataFrame(columns=['Dataset', 'SRA', 'Mean C-Index Internal', 'Std C-Index Internal', 'Mean C-Index External',
                                                   'Std C-Index External', '5-Folds P-Value', 'P-value External', 'Combined C-Index', 'Combined P-value'])

# Compute the evaluation statistics for all datasets with the CWGBSA algorithm
for index_name in range(6):
    # Load the feature matrix of this dataset and the shared survival outcome table
    X = pd.read_csv(os.path.join('Data', '{}.csv'.format(str(dataset_names[index_name]))), header=None)
    y = pd.read_csv(os.path.join('Data', 'COX_OUTCOME.csv'))

    # Convert days to years
    years = y['Duration'].astype('float') / 365

    # Convert the structured array y to a boolean array
    event = y['Censor'].astype(bool)
    time = y['Duration'].values


    # Prepare the data for the model
    train_size = int(len(X) * 0.8)

    # Split Data
    X_train = X[:train_size]
    X_test = X[train_size:]
    event_train = event[:train_size]
    time_train = time[:train_size]
    event_test_external = event[train_size:]
    time_test_external = time[train_size:]

    # Dimention Reduction with PCA
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # Define PCA
    pca = PCA(n_components=10)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    y_train = np.array([(e, t) for e, t in zip(event_train, time_train)],
                        dtype=[('Censor', bool), ('Duration', float)])

    y_test = np.array([(e, t) for e, t in zip(event_test_external, time_test_external)],
                       dtype=[('Censor', bool), ('Duration', float)])

    # Add Algorithm internal and external Results to its Data Frame
    df_internal_results_cwgbsa = pd.DataFrame(columns=['C-Index internal', 'C-Index External', 'P-Value Internal', 'P-Value External'])

    # Risk Scores for 5-fold and external test
    risk_scores_folds = []
    risk_scores_test_external = []

    # Fit the model
    for train_index, test_index in kf.split(X_train):
        X_train_in, X_test_in = X_train[train_index], X_train[test_index]
        y_train_in, y_test_in = y_train[train_index], y_train[test_index]

        # Determine internal Event and Time
        event_test_internal = np.array([tup[0] for tup in y_test_in])
        time_test_internal = np.array([tup[1] for tup in y_test_in])

        # Train Model
        cwgbsa.fit(X_train_in, y_train_in)

        ################################# Predicting survival - Internal Test #################################

        # Predict
        risk_scores_internal = cwgbsa.predict(X_test_in)
        for item in risk_scores_internal.tolist():
            risk_scores_folds.append(item)

        # C-Index
        result_internal = concordance_index_censored(event_test_internal, time_test_internal, risk_scores_internal)
        c_index_internal = result_internal[0]

        # Log-rank Test
        group_labels_internal = np.random.choice([0, 1], size=X_test_in.shape[0], replace=True)  # Example groups
        idx_internal = group_labels_internal == 1
        time1_internal, event1_internal = time_test_internal[idx_internal], event_test_internal[idx_internal]
        time2_internal, event2_internal = time_test_internal[~idx_internal], event_test_internal[~idx_internal]

        # Compute log-rank test
        test_result_internal = logrank_test(time1_internal, time2_internal, event_observed_A=event1_internal, event_observed_B=event2_internal)
        # Compute P-Value test
        p_value_internal = test_result_internal.p_value

        ################################# Predicting survival - External Test #################################

        # Predict
        risk_scores_external = cwgbsa.predict(X_test)
        for item in risk_scores_external.tolist():
            risk_scores_test_external.append(item)

        # C-Index
        result_external = concordance_index_censored(event_test_external, time_test_external, risk_scores_external)
        c_index_external = result_external[0]

        # Log-rank Test (Example for two hypothetical groups)
        group_labels_external = np.random.choice([0, 1], size=X_test.shape[0], replace=True)  # Example groups
        idx_external = group_labels_external == 1
        time1_external, event1_external = time_test_external[idx_external], event_test_external[idx_external]
        time2_external, event2_external = time_test_external[~idx_external], event_test_external[~idx_external]

        # Compute log-rank test
        test_result_external = logrank_test(time1_external, time2_external, event_observed_A=event1_external, event_observed_B=event2_external)
        # Compute P-Value test
        p_value_external = test_result_external.p_value

        # Add Internal and External Results to Data Frame
        df_internal_results_cwgbsa.loc[len(df_internal_results_cwgbsa)] = [c_index_internal, c_index_external, p_value_internal, p_value_external]

    # Add Internal and External Results to Data Frame
    # df_internal_results_cwgbsa.to_csv(os.path.join('Results', 'CWGBSA_{}-Results.csv'.format(str(result_names[index_name]))))


    ############################ P-Value #########################################################################

    # Log-rank Test
    group_labels = np.random.choice([0, 1], size=X_train.shape[0], replace=True)  # Example groups
    idx = group_labels == 1
    time1, event1 = time_train[idx], event_train[idx]
    time2, event2 = time_train[~idx], event_train[~idx]

    # Compute log-rank test
    train_result = logrank_test(time1, time2, event_observed_A=event1, event_observed_B=event2)
    # Compute P-Value test
    p_value_train = train_result.p_value

    ############################################ Save Results #####################################################

    # Add C-Indexes and P-values to its Data Frame
    mean_cindex_internal = df_internal_results_cwgbsa['C-Index internal'].mean()
    std_cindex_internal = df_internal_results_cwgbsa['C-Index internal'].std()
    mean_cindex_external = df_internal_results_cwgbsa['C-Index External'].mean()
    std_cindex_external = df_internal_results_cwgbsa['C-Index External'].std()
    pvalue_external = df_internal_results_cwgbsa['P-Value External'].min()

    ############################################## Draw Kaplan Mier ##############################################
    # Hotelling T-test
    risk_scores_combine_for_hotelling = np.array(risk_scores_folds + risk_scores_test_external[:40])
    threshold_for_hotelling  = np.percentile(risk_scores_combine_for_hotelling, 50)

    y_for_hotelling = np.array(y_train.tolist() + y_test.tolist())
    X_for_hotelling = np.array(X_train.tolist() + X_test.tolist())

    high_risk_for_h = X_for_hotelling[risk_scores_combine_for_hotelling >= threshold_for_hotelling]
    low_risk_for_h = X_for_hotelling[risk_scores_combine_for_hotelling < threshold_for_hotelling]

    print(hotelling_t2(high_risk_for_h, low_risk_for_h)[2])

    # Determine high and low risk groups
    risk_scores_combined = np.array(risk_scores_folds + risk_scores_test_external)
    threshold = np.percentile(risk_scores_combined, 50)

    y_combined = np.array(y_train.tolist() + (y_test.tolist() * 5))

    high_risk = y_combined[risk_scores_combined >= threshold]
    low_risk = y_combined[risk_scores_combined < threshold]

    high_risk_event = np.array([tup[0] for tup in high_risk])
    high_risk_time = np.array([tup[1] for tup in high_risk])
    low_risk_event = np.array([tup[0] for tup in low_risk])
    low_risk_time = np.array([tup[1] for tup in low_risk])

    if len(low_risk) > 0 and len(high_risk) > 0:
        # Calculate the Kaplan-Meier estimates for the two groups
        time_high, survival_prob_high = kaplan_meier_estimator(high_risk_event.astype(bool), high_risk_time)
        time_low, survival_prob_low = kaplan_meier_estimator(low_risk_event.astype(bool), low_risk_time)

        # Compute P-value combined data
        results_pvalue_combined = logrank_test(low_risk_time, high_risk_time,
                                               event_observed_A=low_risk_event, event_observed_B=high_risk_event)
        pvalue_combined = results_pvalue_combined.p_value

        # Compute c-index combined data
        event_combined = np.array([tup[0] for tup in y_combined])
        time_combined = np.array([tup[1] for tup in y_combined])
        results_cindex_combined = concordance_index_censored(event_combined.astype(bool), time_combined, risk_scores_combined)
        c_index_combined = results_cindex_combined[0]

        kmf1 = KaplanMeierFitter()
        kmf2 = KaplanMeierFitter()

        kmf1.fit(high_risk_time.astype('float') / 365, high_risk_event, label='High Risk')

        kmf2.fit(low_risk_time.astype('float') / 365, low_risk_event, label='Low Risk')

        fig = plt.figure()
        ax = fig.add_subplot(111)

        ax = kmf1.plot(color='r', label='High Risk',show_censors=True, censor_styles={'ms': 6, 'marker': '|'})
        ax = kmf2.plot(color='g', label='Low Risk',show_censors=True, censor_styles={'ms': 6, 'marker': '|'})
        plt.title("CWGB , "+result_names[index_name], fontsize=font_label, fontweight='bold')
        plt.xlabel("Time (Years)", fontsize=font_label, fontweight='bold')
        plt.ylabel("Survival probability", fontsize=font_label, fontweight='bold')
        plt.xticks(np.arange(int(years.min()), int(years.max()) + 1, 1.0))
        plt.yticks(np.arange(0.0, 1.0, 0.1))
        ax.grid(axis='both', which='both', color='lightgray', linestyle='-', linewidth=0.5,zorder=-1000)

        # add_at_risk_counts(kmf1, kmf2 , ax=ax)
        # plt.tight_layout()

        fig.savefig(os.path.join('Plots', 'CWGBSA_{}-KaplanMier.jpg'.format(str(result_names[index_name]))), dpi=300)

    #########################################################################################################

    # Add Reported Results to its Data Frame
    df_reported_results_cwgbsa.loc[len(df_reported_results_cwgbsa)] = [result_names[index_name], 'CWGBSA', mean_cindex_internal, std_cindex_internal,
                                                                       mean_cindex_external, std_cindex_external, p_value_train, pvalue_external,
                                                                       c_index_combined, pvalue_combined]

## CoxPH

In [None]:
################################# Cox Proportional Hazards (CoxPH) #################################
# For every feature set: scale + PCA the features, run the K-fold loop of `kf` on the first
# 80% of the rows, score the remaining 20% (external test) with every fold model, then
# stratify patients at the median risk score and draw Kaplan-Meier curves.
# `kf` and `cph` come from an earlier cell (presumably a KFold splitter and a lifelines
# CoxPHFitter -- TODO confirm).

# Summary table: one row per dataset with the reported CoxPH results
df_reported_results_cph = pd.DataFrame(columns=['Dataset', 'SRA', 'Mean C-Index Internal', 'Std C-Index Internal', 'Mean C-Index External',
                                                'Std C-Index External', '5-Folds P-Value', 'P-value External', 'Combined C-Index', 'Combined P-value'])

# The outcome table does not depend on the feature set, so it is read once for all datasets
y = pd.read_csv(os.path.join('Data', 'COX_OUTCOME.csv'))

# Convert days to years (only used for the x-axis ticks of the plots)
years = y['Duration'].astype('float') / 365

# Event indicator as booleans and follow-up time as a plain array
event = y['Censor'].astype(bool)
time = y['Duration'].values

# Compute statistics for all datasets with the CoxPH algorithm
for index_name in range(len(dataset_names)):
    # Load the feature matrix of the current dataset (the CSV has no header row)
    X = pd.read_csv(os.path.join('Data', '{}.csv'.format(str(dataset_names[index_name]))), header=None)

    # Positional (unshuffled) split: first 80% of the rows for training, the rest as external test
    train_size = int(len(X) * 0.8)

    X_train = X[:train_size]
    X_test = X[train_size:]
    event_train = event[:train_size]
    time_train = time[:train_size]
    event_test_external = event[train_size:]
    time_test_external = time[train_size:]

    # Scale to [0, 1]; the scaler is fitted on the training rows only
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # Dimension reduction with PCA, also fitted on the training rows only
    pca = PCA(n_components=10)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    # Structured (event, time) arrays
    y_train = np.array([(e, t) for e, t in zip(event_train, time_train)],
                        dtype=[('Censor', bool), ('Duration', float)])

    y_test = np.array([(e, t) for e, t in zip(event_test_external, time_test_external)],
                       dtype=[('Censor', bool), ('Duration', float)])

    # Per-fold internal and external results of this dataset
    df_internal_results_cph = pd.DataFrame(columns=['C-Index internal', 'C-Index External', 'P-Value Internal', 'P-Value External'])

    # Out-of-fold risk scores, stored at the position of the training row they belong to so
    # that they stay aligned with y_train / X_train even if `kf` shuffles the rows.
    # Rows never placed in a test fold stay NaN (cannot happen with a KFold splitter).
    risk_scores_folds = np.full(len(X_train), np.nan)
    # One array of external-test risk scores per fold model
    risk_scores_test_external = []

    # Fit one model per fold
    for train_index, test_index in kf.split(X_train):
        X_train_in, X_test_in = X_train[train_index], X_train[test_index]
        y_train_in, y_test_in = y_train[train_index], y_train[test_index]

        # Internal event indicator and time of the held-out fold
        event_test_internal = y_test_in['Censor']
        time_test_internal = y_test_in['Duration']

        # Fit the Cox Proportional Hazards model on the in-fold training rows
        cph.fit(pd.concat([pd.DataFrame(X_train_in), pd.DataFrame(y_train_in)], axis=1), duration_col='Duration', event_col='Censor')

        ################################# Predicting survival - Internal Test #################################

        # Predict
        risk_scores_internal = np.asarray(cph.predict_partial_hazard(X_test_in), dtype=float).ravel()
        risk_scores_folds[test_index] = risk_scores_internal

        # C-Index
        result_internal = concordance_index_censored(event_test_internal, time_test_internal, risk_scores_internal)
        c_index_internal = result_internal[0]

        # Log-rank Test
        # NOTE(review): the two groups are drawn at random and do not depend on the model,
        # and no seed is set here, so this p-value is neither a measure of model quality nor
        # reproducible. Kept unchanged for comparability with the other algorithm cells.
        group_labels_internal = np.random.choice([0, 1], size=X_test_in.shape[0], replace=True)  # Example groups
        idx_internal = group_labels_internal == 1
        time1_internal, event1_internal = time_test_internal[idx_internal], event_test_internal[idx_internal]
        time2_internal, event2_internal = time_test_internal[~idx_internal], event_test_internal[~idx_internal]

        # Compute log-rank test
        test_result_internal = logrank_test(time1_internal, time2_internal, event_observed_A=event1_internal, event_observed_B=event2_internal)
        # Compute P-Value test
        p_value_internal = test_result_internal.p_value

        ################################# Predicting survival - External Test #################################

        # Predict
        risk_scores_external = np.asarray(cph.predict_partial_hazard(X_test), dtype=float).ravel()
        risk_scores_test_external.append(risk_scores_external)

        # C-Index
        result_external = concordance_index_censored(event_test_external, time_test_external, risk_scores_external)
        c_index_external = result_external[0]

        # Log-rank Test (example for two hypothetical groups; see the NOTE(review) above)
        group_labels_external = np.random.choice([0, 1], size=X_test.shape[0], replace=True)  # Example groups
        idx_external = group_labels_external == 1
        time1_external, event1_external = time_test_external[idx_external], event_test_external[idx_external]
        time2_external, event2_external = time_test_external[~idx_external], event_test_external[~idx_external]

        # Compute log-rank test
        test_result_external = logrank_test(time1_external, time2_external, event_observed_A=event1_external, event_observed_B=event2_external)
        # Compute P-Value test
        p_value_external = test_result_external.p_value

        # Add internal and external results of this fold to the data frame
        df_internal_results_cph.loc[len(df_internal_results_cph)] = [c_index_internal, c_index_external, p_value_internal, p_value_external]

    # Save the per-fold results of this dataset
    df_internal_results_cph.to_csv(os.path.join('Results', 'cph_{}-Results.csv'.format(str(result_names[index_name]))))


    ############################ P-Value #########################################################################

    # Log-rank Test on the training rows (random example groups; see the NOTE(review) above)
    group_labels = np.random.choice([0, 1], size=X_train.shape[0], replace=True)  # Example groups
    idx = group_labels == 1
    time1, event1 = time_train[idx], event_train[idx]
    time2, event2 = time_train[~idx], event_train[~idx]

    # Compute log-rank test
    train_result = logrank_test(time1, time2, event_observed_A=event1, event_observed_B=event2)
    # Compute P-Value test
    p_value_train = train_result.p_value

    ############################################ Save Results #####################################################

    # Aggregate the per-fold C-indexes; the external p-value is the smallest one over the folds
    mean_cindex_internal = df_internal_results_cph['C-Index internal'].mean()
    std_cindex_internal = df_internal_results_cph['C-Index internal'].std()
    mean_cindex_external = df_internal_results_cph['C-Index External'].mean()
    std_cindex_external = df_internal_results_cph['C-Index External'].std()
    pvalue_external = df_internal_results_cph['P-Value External'].min()

    ############################################## Draw Kaplan Mier ##############################################
    # One row was recorded per fold, so this is the number of fold models that scored X_test
    n_folds = len(df_internal_results_cph)

    # Hotelling T-test: every patient once (out-of-fold scores + external scores of the first
    # fold model), compared between the high- and low-risk halves in PCA feature space
    risk_scores_combine_for_hotelling = np.concatenate([risk_scores_folds] + risk_scores_test_external[:1])
    threshold_for_hotelling = np.percentile(risk_scores_combine_for_hotelling, 50)

    X_for_hotelling = np.vstack([X_train, X_test])

    high_risk_for_h = X_for_hotelling[risk_scores_combine_for_hotelling >= threshold_for_hotelling]
    low_risk_for_h = X_for_hotelling[risk_scores_combine_for_hotelling < threshold_for_hotelling]

    print(hotelling_t2(high_risk_for_h, low_risk_for_h)[2])

    # Determine high and low risk groups: out-of-fold scores plus the external scores of
    # every fold model, so each external patient appears once per fold
    risk_scores_combined = np.concatenate([risk_scores_folds] + risk_scores_test_external)
    threshold = np.percentile(risk_scores_combined, 50)

    # Keep the structured dtype so the fields stay addressable by name
    y_combined = np.concatenate([y_train] + [y_test] * n_folds)

    high_risk = y_combined[risk_scores_combined >= threshold]
    low_risk = y_combined[risk_scores_combined < threshold]

    high_risk_event = high_risk['Censor']
    high_risk_time = high_risk['Duration']
    low_risk_event = low_risk['Censor']
    low_risk_time = low_risk['Duration']

    # Defaults for the summary row: without them an empty risk group would raise a NameError
    # or silently reuse the values of the previous dataset / algorithm cell
    c_index_combined = np.nan
    pvalue_combined = np.nan

    if len(low_risk) > 0 and len(high_risk) > 0:
        # Compute P-value combined data (log-rank test between the two risk groups)
        results_pvalue_combined = logrank_test(low_risk_time, high_risk_time,
                                               event_observed_A=low_risk_event, event_observed_B=high_risk_event)
        pvalue_combined = results_pvalue_combined.p_value

        # Compute c-index combined data
        results_cindex_combined = concordance_index_censored(y_combined['Censor'], y_combined['Duration'], risk_scores_combined)
        c_index_combined = results_cindex_combined[0]

        # Kaplan-Meier fits per risk group, with time converted from days to years
        kmf1 = KaplanMeierFitter()
        kmf2 = KaplanMeierFitter()

        kmf1.fit(high_risk_time / 365, high_risk_event, label='High Risk')

        kmf2.fit(low_risk_time / 365, low_risk_event, label='Low Risk')

        fig = plt.figure()
        ax = fig.add_subplot(111)

        # Draw both curves on the same, explicitly passed axes
        kmf1.plot(ax=ax, color='r', label='High Risk', show_censors=True, censor_styles={'ms': 6, 'marker': '|'})
        kmf2.plot(ax=ax, color='g', label='Low Risk', show_censors=True, censor_styles={'ms': 6, 'marker': '|'})
        plt.title("COXR , "+result_names[index_name], fontsize=font_label, fontweight='bold')
        plt.xlabel("Time (Years)", fontsize=font_label, fontweight='bold')
        plt.ylabel("Survival probability", fontsize=font_label, fontweight='bold')
        plt.xticks(np.arange(int(years.min()), int(years.max()) + 1, 1.0))
        plt.yticks(np.arange(0.0, 1.0, 0.1))
        ax.grid(axis='both', which='both', color='lightgray', linestyle='-', linewidth=0.5,zorder=-1000)

        fig.savefig(os.path.join('Plots', 'cph_{}-KaplanMier.jpg'.format(str(result_names[index_name]))), dpi=300)

    #########################################################################################################

    # Add the reported results of this dataset to the summary data frame
    df_reported_results_cph.loc[len(df_reported_results_cph)] = [result_names[index_name], 'COXPH', mean_cindex_internal, std_cindex_internal,
                                                                 mean_cindex_external, std_cindex_external, p_value_train, pvalue_external,
                                                                 c_index_combined, pvalue_combined]

# Save All Results

In [None]:
# Stack the per-algorithm summary tables row-wise into one table with a fresh 0..n-1 index
reported_frames = [
    df_reported_results_rsf,
    df_reported_results_cwgbsa,
    df_reported_results_fssvm,
    df_reported_results_cph,
]
df_all_results = pd.concat(reported_frames, axis=0, ignore_index=True)

# Write the merged table to disk, then show it as the cell output
all_results_path = os.path.join('Results', 'All Reported Results-2.csv')
df_all_results.to_csv(all_results_path)
df_all_results