In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.util import Surv
from sksurv.metrics import concordance_index_censored
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import random
from scipy import stats

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)


In [2]:
def concordance_index_scorer(estimator, X, y):
    """
    Computes the concordance index for a survival model's predictions.

    Parameters:
        estimator: Trained survival model with a .predict() method.
        X: Feature matrix for prediction.
        y: Structured array with fields 'event' and 'time'.

    Returns:
        Concordance index (float)
    """
    event, time = y["event"], y["time"]
    prediction = estimator.predict(X)
    return concordance_index_censored(event, time, prediction)[0]

def confidence_interval(cindex_scores, confidence=0.95):
    """
    Computes the mean and confidence interval for a list of C-index scores.

    Parameters:
        cindex_scores: List or array of C-index values.
        confidence: Confidence level (default is 95%).

    Returns:
        Tuple: (mean, lower_bound, upper_bound)
    """
    mean = np.mean(cindex_scores)
    sem = stats.sem(cindex_scores)
    ci = stats.t.interval(confidence, df=len(cindex_scores)-1, loc=mean, scale=sem)
    print(f"Mean C-index: {mean:.2f}, {int(confidence*100)}% CI: ({ci[0]:.2f}, {ci[1]:.2f})")
    

def clinical_data(train_cohort, test_cohort, clinical_f=None):
    """
    Encodes categorical clinical variables for training and testing datasets.

    Parameters:
    - train_cohort: DataFrame for training data
    - test_cohort: DataFrame for test data
    - clinical_f: Not used, kept for compatibility

    Returns:
    - clinical_train_df: DataFrame with encoded clinical features for training
    - clinical_test_df: DataFrame with encoded clinical features for testing
    """
    
    # Define mappings
    sex_map = {'Female': 1, 'Male': 0}
    smoking_map = {
        'Former smoker': 0,
        'Non smoker': 1,
        'Smoker': 2,
        'Passive smoker': 3
    }
    histo_map = {
        'Carcinoid tumor': 0,
        'Small cell carcinoma': 1,
        'Large cell neuroendocrine carcinoma': 2
    }

    # Apply mappings
    def encode_feature(df, column, mapping):
        return df[column].map(mapping).values.reshape(-1, 1)

    train_sex = encode_feature(train_cohort, 'sex', sex_map)
    test_sex = encode_feature(test_cohort, 'sex', sex_map)

    train_smoking = encode_feature(train_cohort, 'smoking_habit', smoking_map)
    test_smoking = encode_feature(test_cohort, 'smoking_habit', smoking_map)

    train_histo = encode_feature(train_cohort, 'Histological subtype 1_x', histo_map)
    test_histo = encode_feature(test_cohort, 'Histological subtype 1_x', histo_map)

    # Concatenate features
    clinical_train = np.hstack([train_sex, train_smoking, train_histo])
    clinical_test = np.hstack([test_sex, test_smoking, test_histo])

    # Create DataFrames
    columns = ['sex', 'smoking_habit', 'Histological subtype 1_x']
    clinical_train_df = pd.DataFrame(clinical_train, columns=columns)
    clinical_test_df = pd.DataFrame(clinical_test, columns=columns)

    #print('clinical_train_df shape:', clinical_train_df.shape)
    return clinical_train_df, clinical_test_df



def fit_and_score_features(X, y):
    """
    Scores each feature independently using a univariate CoxnetSurvivalAnalysis.

    Parameters:
    - X: 2D array-like, shape (n_samples, n_features)
    - y: structured array with fields ('event', 'time')

    Returns:
    - scores: array of concordance scores for each feature
    """
    n_features = X.shape[1]
    scores = np.empty(n_features)
    model = CoxnetSurvivalAnalysis(l1_ratio=0.1, alpha_min_ratio=0.001)

    for j in range(n_features):
        Xj = X[:, j:j+1]
        model.fit(Xj, y)
        scores[j] = model.score(Xj, y)

    return scores

def remove_highly_correlated_features(X, y, target_column, threshold=0.9):
    """
    Remove features from X that are highly correlated with each other,
    keeping the one more correlated with the target.

    Parameters:
    - X: pd.DataFrame, feature matrix
    - y: pd.Series or pd.DataFrame, target variable
    - target_column: str, name of the target column
    - threshold: float, correlation threshold for dropping features

    Returns:
    - X_new: pd.DataFrame, with selected features
    """
    # Combine features and target into one DataFrame
    df = pd.concat([X, y], axis=1)

    # Compute correlation matrix
    correlation_matrix = df.corr()

    # Extract upper triangle of the correlation matrix
    upper_triangle = correlation_matrix.where(
        np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
    )

    # Identify pairs with high correlation
    high_corr_pairs = [
        (col1, col2) for col1 in upper_triangle.columns
        for col2 in upper_triangle.index
        if upper_triangle.at[col2, col1] > threshold
    ]

    # Decide which columns to drop
    columns_to_drop = set()
    for col1, col2 in high_corr_pairs:
        if abs(correlation_matrix.at[target_column, col1]) >= abs(correlation_matrix.at[target_column, col2]):
            columns_to_drop.add(col2)
        else:
            columns_to_drop.add(col1)

    return columns_to_drop

In [3]:
#Load dataset
df = pd.read_csv('data/radiomics_clinical.csv')

# Filter out missing or zero survival times
column_name = 'O.S. (2022)' #'PFS (2022)'
df = df[df[column_name].notna() & (df[column_name] != 0)].reset_index(drop=True)

# Prepare features and labels
X = df.iloc[:, :-9]
y = pd.DataFrame(df[column_name])
vital = pd.DataFrame(df['Vital status'].to_list(), columns=['status'])
vital['status'] = vital['status'].replace({'Alive': 1, 'Deceased': 0})
clinical = df[['sex', 'smoking_habit', 'Histological subtype 1_x']]



In [4]:
X.head()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,diagnostics_Image-interpolated_Mean,diagnostics_Image-interpolated_Minimum,diagnostics_Image-interpolated_Maximum,diagnostics_Mask-interpolated_VoxelNum,diagnostics_Mask-interpolated_VolumeNum,diagnostics_Mask-interpolated_Mean,...,gradient_gldm_DependenceNonUniformity,gradient_gldm_DependenceNonUniformityNormalized,gradient_gldm_DependenceVariance,gradient_gldm_GrayLevelNonUniformity,gradient_gldm_LargeDependenceEmphasis,gradient_gldm_LargeDependenceHighGrayLevelEmphasis,gradient_gldm_LargeDependenceLowGrayLevelEmphasis,gradient_gldm_SmallDependenceEmphasis,gradient_gldm_SmallDependenceHighGrayLevelEmphasis,gradient_gldm_SmallDependenceLowGrayLevelEmphasis
0,-292.160982,3071,150197,1,-0.390559,-1.636785,1.809719,150197,1,0.143349,...,85355.740428,0.568292,23.3296,150197.0,632.572348,632.572348,632.572348,0.002328,0.002328,0.002328
1,-339.754633,3071,14460,1,-0.381562,-1.480772,2.163564,14460,1,0.108508,...,5061.876902,0.350061,32.907438,14460.0,565.469986,565.469986,565.469986,0.002988,0.002988,0.002988
2,-538.652739,3071,7100,2,-0.429011,-0.979874,4.157713,7100,2,0.181918,...,1799.929859,0.253511,37.839214,7100.0,508.949296,508.949296,508.949296,0.003723,0.003723,0.003723
3,-505.063316,3071,857,1,0.704429,-1.074868,1.547384,857,1,1.104793,...,128.841307,0.15034,29.418985,857.0,426.390898,426.390898,426.390898,0.003484,0.003484,0.003484
4,-418.799287,3071,5324,1,-0.402432,-1.231057,1.789626,5324,1,0.516721,...,1515.024418,0.284565,26.324517,5324.0,543.793013,543.793013,543.793013,0.002547,0.002547,0.002547


In [5]:
clinical.head()

Unnamed: 0,sex,smoking_habit,Histological subtype 1_x
0,Female,Former smoker,Carcinoid tumor
1,Female,Passive smoker,Carcinoid tumor
2,Female,Former smoker,Carcinoid tumor
3,Female,Former smoker,Carcinoid tumor
4,Male,Former smoker,Carcinoid tumor


In [6]:
# Initialize results
result_all = [{'seed': 0, 'cv_cindex_rad': 0, 'test_cindex_rad': 0, 'hr_rad': 0,
               'best_features': 0, 'cv_cindex_clin': 0, 'test_cindex_clin': 0, 'hr_clin': 0}]
seeds = [1,10,99,999,99999]#,99,999,9999,99999]


In [7]:
#seeds=[1]
seeds

[1, 10, 99, 999, 99999]

In [8]:
# Main loop
for seed in seeds:
    # Split data
    X_train, X_test, y_train, y_test, clin_train, clin_test, vital_train, vital_test = train_test_split(
        X, y, clinical, vital, test_size=0.30, random_state=seed, stratify=vital)

    # Convert to DataFrames with proper columns
    X_train = pd.DataFrame(X_train, columns=X.columns).reset_index(drop=True)
    X_test = pd.DataFrame(X_test, columns=X.columns).reset_index(drop=True)
    clin_train = pd.DataFrame(clin_train, columns=clinical.columns).reset_index(drop=True)
    clin_test = pd.DataFrame(clin_test, columns=clinical.columns).reset_index(drop=True)
    y_train = pd.DataFrame(y_train, columns=y.columns).reset_index(drop=True)
    y_test = pd.DataFrame(y_test, columns=y.columns).reset_index(drop=True)
    vital_train = pd.DataFrame(vital_train, columns=vital.columns).reset_index(drop=True)
    vital_test = pd.DataFrame(vital_test, columns=vital.columns).reset_index(drop=True)

    # Prepare survival labels
    data_y_train = Surv.from_arrays(event=vital_train.status.tolist(), time=y_train[column_name].tolist())
    data_y_test = Surv.from_arrays(event=vital_test.status.tolist(), time=y_test[column_name].tolist())

    # Feature selection
    columns_to_drop = remove_highly_correlated_features(X_train, y_train, target_column=column_name)
    X_train_filtered = X_train.drop(columns=columns_to_drop, errors='ignore')
    X_test_filtered = X_test.drop(columns=columns_to_drop, errors='ignore')

    # Radiomics model pipeline
    pipe = Pipeline([
        ("scaler", StandardScaler()), 
        ("select", SelectKBest(fit_and_score_features, k=10)),
        ("model", CoxnetSurvivalAnalysis()),
    ])

    param_grid = {
        "select__k": np.arange(1, 15),
        "model__l1_ratio": [ 0.3, 0.4, 0.5,0.6],
    }

    cv = KFold(n_splits=3, random_state=1, shuffle=True)
    rad_model = GridSearchCV(pipe, param_grid, return_train_score=True, cv=cv, scoring=concordance_index_scorer)
    rad_model.fit(X_train_filtered, data_y_train)

    # Evaluate radiomics model
    best_rad = rad_model.best_estimator_
    rad_cv = rad_model.best_score_
    predicted_risk = best_rad.predict(X_test_filtered)
    rad_test_score = concordance_index_censored(
        vital_test.status.astype(bool).to_numpy(), y_test[column_name].to_numpy(), predicted_risk)
    hr_rad = np.mean(np.exp(best_rad.named_steps["model"].coef_))

    # Feature importance
    best_k = rad_model.best_params_['select__k']
    ranking_feat = fit_and_score_features(X_train_filtered.values, data_y_train)
    top_features = list(pd.Series(ranking_feat, index=X_train_filtered.columns)
                        .sort_values(ascending=False)[:best_k].keys())

    x_train_fs = X_train_filtered[top_features]
    x_test_fs = X_test_filtered[top_features]

    # Clinical model
    clinical_f = 'all_exceptage'
    clinical_train, clinical_test = clinical_data(clin_train, clin_test, clinical_f)

    comb_train = pd.concat([x_train_fs, clinical_train], axis=1)
    comb_test = pd.concat([x_test_fs, clinical_test], axis=1)

    pipe_clin = Pipeline([
        ("scaler", StandardScaler()),
        ("model", CoxnetSurvivalAnalysis()),
    ])

    param_grid_clin = {
        "model__l1_ratio": [0.0003, 0.0004,0.0005, 0.0007, 0.0009],
    }
    comb_model = GridSearchCV(pipe_clin, param_grid_clin, return_train_score=True, cv=cv, scoring=concordance_index_scorer)
    comb_model.fit(comb_train, data_y_train)

    # Evaluate combined model
    best_comb = comb_model.best_estimator_
    predicted_risk_comb = best_comb.predict(comb_test)
    comb_test_score = concordance_index_censored(
        vital_test.status.astype(bool).to_numpy(), y_test[column_name].to_numpy(), predicted_risk_comb)
    hr_cl = np.mean(np.exp(best_comb.named_steps["model"].coef_))
    comb_cv = comb_model.best_score_

    # Save results
    result_all.append({
        'seed': seed,
        'cv_cindex_rad': rad_cv,
        'test_cindex_rad': rad_test_score[0],
        'hr_rad': hr_rad,
        'best_features': len(top_features),
        'cv_cindex_clin': comb_cv,
        'test_cindex_clin': comb_test_score[0],
        'hr_clin': hr_cl
    })


In [9]:
x_train_fs.shape

(58, 7)

In [10]:
rad_model.best_params_

{'model__l1_ratio': 0.3, 'select__k': 7}

In [11]:
# Save to CSV
df_results = pd.DataFrame(result_all[1:])
df_results.to_csv('result_all_os.csv', index=False)

In [12]:
np.mean(df_results.cv_cindex_rad), np.std(df_results.cv_cindex_rad),np.mean(df_results.test_cindex_rad),np.std(df_results.test_cindex_rad)

(0.6305077444441154,
 0.0447219052111049,
 0.6360222043143647,
 0.06507369516957667)

In [13]:
df_results

Unnamed: 0,seed,cv_cindex_rad,test_cindex_rad,hr_rad,best_features,cv_cindex_clin,test_cindex_clin,hr_clin
0,1,0.624718,0.736842,0.780743,2,0.64868,0.705263,0.974214
1,10,0.704702,0.533019,1.064028,9,0.671829,0.589623,1.01345
2,99,0.616061,0.632075,0.945235,3,0.656157,0.679245,0.997767
3,999,0.566129,0.652582,0.981007,4,0.696774,0.638498,1.009788
4,99999,0.640929,0.625592,0.9095,7,0.67681,0.606635,1.006254


In [14]:

confidence_interval(df_results.cv_cindex_rad)
confidence_interval(df_results.test_cindex_rad)

Mean C-index: 0.63, 95% CI: (0.57, 0.69)
Mean C-index: 0.64, 95% CI: (0.55, 0.73)


In [15]:
np.mean(df_results.cv_cindex_clin), np.std(df_results.cv_cindex_clin),np.mean(df_results.test_cindex_clin),np.std(df_results.test_cindex_clin)

(0.6700500217894989,
 0.016809269548769417,
 0.6438527612190491,
 0.04330959928234484)

In [16]:
confidence_interval(df_results.cv_cindex_clin)
confidence_interval(df_results.test_cindex_clin)



Mean C-index: 0.67, 95% CI: (0.65, 0.69)
Mean C-index: 0.64, 95% CI: (0.58, 0.70)


In [17]:
np.mean(df_results.hr_rad),np.mean(df_results.hr_clin),

(0.9361025920304366, 1.0002944785689898)