In [1]:
import os
import sys
from pathlib import Path
import yaml
import pickle
import argparse
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lifelines.utils import concordance_index
from lifelines import CoxPHFitter

from coxkan import CoxKAN
from coxkan.utils import bootstrap_metric, set_seed

SEED = set_seed(42)

In [2]:
### Reusable functions for the notebook

def true_cindex(df):
    global sim_config, duration_col, event_col, covariates
    lph = sim_config['log_partial_hazard'](**df[covariates])
    return concordance_index(df[duration_col], -lph, df[event_col])

def cph_cindex(df):
    global cph
    return cph.score(df, scoring_method='concordance_index')

def cph_reg_cindex(df):
    global cph_reg
    return cph_reg.score(df, scoring_method='concordance_index')

def cph_formula(cph):
    coefficients = cph.params_
    terms = []
    for covariate, coefficient in coefficients.items():
        term = f"{coefficient:.4f} * {covariate}"
        terms.append(term)
    expression = " + ".join(terms)
    return expression

In [3]:
# open CoxKAN/data/TCGA/genomics/{name}_genomics.csv
name = 'LUAD'

genomics_df = pd.read_csv(f'./TCGA/genomics/{name}_genomics.csv')
genomic_labels_train_df = pd.read_csv(f'./TCGA/{name}_labels/train.csv')
genomic_labels_test_df = pd.read_csv(f'./TCGA/{name}_labels/test.csv')

#concat train and test labels
genomic_labels_df = pd.concat([genomic_labels_train_df, genomic_labels_test_df])

genomic_labels_df = genomic_labels_df[['case_id', 'dss_cr_survival_days', 'dss_cr_censorship']]

# fill all empty dss_cr_censorship with zeros
genomic_labels_df['dss_cr_censorship'] = genomic_labels_df['dss_cr_censorship'].fillna(0)

#get rid of appended string after the final '_' in the sample column
genomics_df['sample'] = genomics_df['sample'].str.split('-').str[:-1].str.join('-')

# merge the genomic data with the labels based on the sample column in genomics_df and case_id column in genomic_labels_df
genomics_df = pd.merge(genomics_df, genomic_labels_df, left_on='sample', right_on='case_id', how='inner')

# drop any column beginning with 'Unnamed'
genomics_df = genomics_df.loc[:, ~genomics_df.columns.str.contains('^Unnamed')]

#same for 'case_id'
genomics_df = genomics_df.loc[:, ~genomics_df.columns.str.contains('^case_id')]

print(f"Genomics dataframe shape: {genomics_df.shape}")


Genomics dataframe shape: (538, 4171)


In [4]:
correlations = {}

#for each non-duration and non-event column, calculate the univariate CoxPH p-value
for col in genomics_df.columns:
    if 'sample' in col:
        continue
    if col not in ['dss_cr_survival_days', 'dss_cr_censorship']:
        train_df_univariate = genomics_df[['dss_cr_survival_days', 'dss_cr_censorship', col]].copy()
        
        # drop cols with nans
        train_df_univariate = train_df_univariate.dropna()
        # fit the CoxPH model
        cph = CoxPHFitter()
        try:
            cph.fit(train_df_univariate, duration_col='dss_cr_survival_days', event_col='dss_cr_censorship', formula=f"{col}")
            p_value = cph.summary['p'].values[0]
            
            # append to the list
            correlations[col] = p_value
        except Exception as e:
            p_value = np.inf
            correlations[col] = p_value
            print(f"Error fitting CoxPH model for {col}: {e}")
        
        # get the p-value

        
        print(f"Univariate CoxPH p-value for {col}: {p_value}")

        
# filter the genomics_df to only include features with p-values < 0.05
filtered_genomics_df = genomics_df.copy()
for col, p_value in correlations.items():
    if p_value >= 0.05:
        filtered_genomics_df = filtered_genomics_df.drop(columns=[col])

print(f"Filtered genomics dataframe shape: {filtered_genomics_df.shape}")

Univariate CoxPH p-value for A2M: 0.22046914373131415
Univariate CoxPH p-value for AAAS: 0.1389791676474925
Univariate CoxPH p-value for AADAT: 0.4838511007107471
Univariate CoxPH p-value for ABAT: 0.7352437203502353
Univariate CoxPH p-value for ABCA1: 0.00833664866570198
Univariate CoxPH p-value for ABCA2: 1.6860600986309297e-05
Univariate CoxPH p-value for ABCA3: 0.7969088491075322
Univariate CoxPH p-value for ABCA4: 0.0033906422700804902
Univariate CoxPH p-value for ABCA5: 0.5189853150987529
Univariate CoxPH p-value for ABCA6: 0.10509937482479388
Univariate CoxPH p-value for ABCA8: 0.9400792306096518
Univariate CoxPH p-value for ABCA9: 0.03004297364580446
Univariate CoxPH p-value for ABCB1: 0.3409629006453053
Univariate CoxPH p-value for ABCB11: 0.22883029880682343
Univariate CoxPH p-value for ABCB4: 0.4558846231027003
Univariate CoxPH p-value for ABCB6: 0.14274511106217538
Univariate CoxPH p-value for ABCB7: 0.6824307436693885
Univariate CoxPH p-value for ABCB8: 0.0631403136325118



  return (X - mean) / std


Error fitting CoxPH model for TSSK2: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.
Univariate CoxPH p-value for TSSK2: inf
Univariate CoxPH p-value for TST: 0.011263014961793598
Univariate CoxPH p-value for TTC37: 0.08544453733189636
Univariate CoxPH p-value for TTC39A: 0.2813468135135507
Univariate CoxPH p-value for TTC39B: 0.4690092739430185
Univariate CoxPH p-value for TTK: 0.16236843252793912
Univariate CoxPH p-value for TTPA: 0.03204006779414318
Univariate CoxPH p-value for TTR: 0.9816187188783811
Univariate CoxPH p-value for TUBA3C: 0.863556294623483
Univariate CoxPH p-value for TUBA4A: 0.7736522907383969
Univariate CoxPH p-value for TUBB: 0.7675685341145884
Univariate CoxPH p-value for TUBB2A: 0.2510577491129572
Univariate CoxPH p-value for TU

In [5]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
# train a random forest regressor to impute missing values
imp.fit(filtered_genomics_df.drop(columns=['sample']))
imputed_genomics = imp.transform(filtered_genomics_df.drop(columns=['sample']))

In [6]:
# combine imputed_genomics with the sample column to create new df imputed_genomics_df
imputed_genomics_df = pd.DataFrame(imputed_genomics, columns=filtered_genomics_df.columns[1:])
imputed_genomics_df['sample'] = filtered_genomics_df['sample'].values
print(imputed_genomics_df.shape)
# check for multicollinearity in imputed_genomics_df
corr = imputed_genomics_df.corr(numeric_only=True).abs()
high_corr_pairs = (
    corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
         .stack()
         .loc[lambda s: s > 0.5]              # pick your own threshold
)

# for the highly correlated pairs, drop the one with the lower p-value
for col1, col2 in high_corr_pairs.index:
    #if not all of col1 and col2 are in imputed_genomics_df, skip
    if col1 not in imputed_genomics_df.columns or col2 not in imputed_genomics_df.columns:
        continue
        
    if correlations[col1] < correlations[col2]:
        imputed_genomics_df = imputed_genomics_df.drop(columns=[col2])
    else:
        imputed_genomics_df = imputed_genomics_df.drop(columns=[col1])
        
final_genomics_df = imputed_genomics_df.copy()
print(f"Final genomics dataframe shape: {final_genomics_df.shape}")


(538, 1256)
Final genomics dataframe shape: (538, 432)


In [7]:
#split into test and train based on genomic_labels_train_df
train_df = final_genomics_df[final_genomics_df['sample'].isin(genomic_labels_train_df['case_id'])].copy()
test_df = final_genomics_df[final_genomics_df['sample'].isin(genomic_labels_test_df['case_id'])].copy()
print(f"Train dataframe shape: {train_df.shape}")
print(f"Test dataframe shape: {test_df.shape}")

# convert duration column to name 'duration', and event column to name 'event'
# in the event col, convert all values of 1 to 0, and all other values to 1
train_df.rename(columns={'dss_cr_survival_days': 'duration'}, inplace=True)
train_df.rename(columns={'dss_cr_censorship': 'event'}, inplace=True)
train_df['event'] = train_df['event'].replace({0:1,1:0})
test_df.rename(columns={'dss_cr_survival_days': 'duration'}, inplace=True)
test_df.rename(columns={'dss_cr_censorship': 'event'}, inplace=True)
test_df['event'] = test_df['event'].replace({0:1,1:0})

#for train and test, delete duplicate rowswith the same sample
train_df = train_df.drop_duplicates(subset=['sample'], keep='first')
test_df = test_df.drop_duplicates(subset=['sample'], keep='first')

# where -1 exists in the event column, remove row
train_df = train_df[train_df['event'] != -1]
test_df = test_df[test_df['event'] != -1]

#print test and train shapes
print(f"Train dataframe shape: {train_df.shape}")
print(f"Test dataframe shape: {test_df.shape}")

#make sample column the index
train_df.set_index('sample', inplace=True)
test_df.set_index('sample', inplace=True)

#print how many unique samples (in index) are in train_df and test_df
all_samples = train_df.index.tolist() + test_df.index.tolist()
print(f"Number of rows: {len(all_samples)}")
print(f"Number of unique samples: {len(set(all_samples))}")

#save to {name}_train.csv within folder CoxKAN/data/TCGA
train_df.to_csv(f'./TCGA/{name}_train.csv')
test_df.to_csv(f'./TCGA/{name}_test.csv')

print(f"Train dataframe shape: {train_df.shape}")
print(f"Test dataframe shape: {test_df.shape}")

Train dataframe shape: (422, 432)
Test dataframe shape: (116, 432)
Train dataframe shape: (289, 432)
Test dataframe shape: (97, 432)
Number of rows: 386
Number of unique samples: 386
Train dataframe shape: (289, 431)
Test dataframe shape: (97, 431)
