In [None]:
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from sklearn.decomposition import PCA
import warnings
# Allow up to 500 columns to be shown when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [None]:
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)


# Specify data locations

In [None]:
# Path to the CSV holding the clinical features and outcome data.
# NOTE(review): this comment used to name "clinical_data_cleaned.csv", but the path
# points at "clinical_data_KNN.csv" (presumably a KNN-imputed version) - confirm.
clinical_data_path = "../CSV/clinical_data_KNN.csv"

# Path to the CSV holding the extracted image features.
# NOTE(review): the file name ends in ".csv.csv" and mentions an autoencoder rather
# than KimiaNet - confirm the name is intended. This variable is not read by any
# other cell visible in this notebook.
features_path = "../CSV/features_autoencoder_2.csv.csv"

# Path to the CSV holding the extracted features merged with the clinical data.
features_with_clinical_data_path = "../CSV/features_with_clinical_data_1024_V3.csv"

# Select which features to use

In [None]:
# Feature set used for the analysis. Specify one of the following:
#   "ALL"       - extracted features together with the clinical features
#   "EXTRACTED" - extracted features only (the clinical columns are dropped after loading)
#   "CLINICAL"  - clinical features only
FEATURES = "ALL"

In [None]:
# Map every supported feature set to the CSV it is loaded from.
# "EXTRACTED" intentionally reads the combined file as well: the clinical
# columns are dropped from it when the data is loaded.
_DATA_PATH_BY_FEATURES = {
    "ALL": features_with_clinical_data_path,
    "EXTRACTED": features_with_clinical_data_path,
    "CLINICAL": clinical_data_path,
}

# Fail loudly on a typo (e.g. "all") instead of silently running the
# clinical-only analysis.
if FEATURES not in _DATA_PATH_BY_FEATURES:
    raise ValueError(
        "FEATURES must be one of %s, got %r"
        % (sorted(_DATA_PATH_BY_FEATURES), FEATURES)
    )

data = _DATA_PATH_BY_FEATURES[FEATURES]

In [None]:
# Clinical table, loaded separately from the modelling frame.
# NOTE(review): df_2 is not read by any other cell visible in this notebook.
df_2 = pd.read_csv(clinical_data_path)

# Clinical covariates that are removed when only the extracted features are wanted.
clinical_feature_cols = [
    "MYC IHC", "BCL2 IHC", "BCL6 IHC", "CD10 IHC", "MUM1 IHC",
    "HANS", "BCL6 FISH", "MYC FISH", "BCL2 FISH", "Age", "ECOG PS", "LDH",
    "EN", "Stage", "IPI Score", "IPI Risk Group (4 Class)", "RIPI Risk Group",
]

# "PFS" is dropped for every feature set; "EXTRACTED" additionally drops the
# clinical covariates.
cols_to_drop = ["PFS"]
if FEATURES == "EXTRACTED":
    cols_to_drop = cols_to_drop + clinical_feature_cols

# patient_id is made the index *before* the float32 cast so that the identifier
# keeps its original dtype; only the data columns are converted to float32.
df = (
    pd.read_csv(data, index_col="patient_id")
    .astype(np.float32)
    .drop(columns=cols_to_drop)
    .rename(columns={"Follow-up Status": "FUS"})
)
df

In [None]:
# Display pandas' summary statistics for the columns of df.
df.describe()

In [None]:
# Assign column names by position: the named clinical/outcome columns come
# first, followed by the extracted features relabelled "C0", "C1", "C2", ...
clinical_and_outcome_cols = [
    'MYC IHC', 'BCL2 IHC', 'BCL6 IHC', 'CD10 IHC', 'MUM1 IHC',
    'HANS', 'BCL6 FISH', 'MYC FISH', 'BCL2 FISH', 'Age', 'ECOG PS', 'LDH',
    'EN', 'Stage', 'IPI Score', 'IPI Risk Group (4 Class)', 'RIPI Risk Group',
    'OS', 'FUS',
]

# Only the outcome columns are named in the "EXTRACTED" case.
if FEATURES == "EXTRACTED":
    first_cols = ['OS', 'FUS']
else:
    first_cols = list(clinical_and_outcome_cols)

# Every column after the named ones is treated as an extracted feature;
# the clinical-only case has none.
if FEATURES in ("ALL", "EXTRACTED"):
    n_extracted = df.shape[1] - len(first_cols)
    last_cols = [f"C{idx}" for idx in range(n_extracted)]
else:
    last_cols = []

cols = first_cols + last_cols
df.columns = cols

In [None]:
# Drop all the columns with a variance of 0
final_df = df.loc[:, (df != df.iloc[0]).any()]
final_df


In [None]:
def PCAMethod(df, n_feats, random_state=42):
    """Replace the extracted-feature columns of ``df`` with principal components.

    Relies on two notebook globals: ``first_cols`` (columns kept unchanged) and
    ``last_cols`` (extracted-feature columns that are fed to the PCA).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``first_cols`` and ``last_cols``.
    n_feats : int
        Number of principal components to keep.
    random_state : int or None, default 42
        Seed forwarded to ``PCA`` so that solvers which use randomness give the
        same components on every run.

    Returns
    -------
    pandas.DataFrame
        The ``first_cols`` columns followed by ``PC1`` ... ``PC<n_feats>``, on a
        fresh 0..n-1 row index. When ``last_cols`` is empty there is nothing to
        decompose and only the ``first_cols`` columns are returned.
    """
    # The original row index is discarded, as before, so both parts line up
    # positionally when they are concatenated.
    kept = df[first_cols].reset_index(drop=True)

    if not last_cols:
        # Clinical-only run: PCA cannot be fitted on zero feature columns.
        return kept

    extracted = df[last_cols]
    pca = PCA(n_components=n_feats, random_state=random_state)
    pca.fit(extracted)
    pca_df = pd.DataFrame(
        pca.transform(extracted),
        columns=['PC{}'.format(i + 1) for i in range(n_feats)],
    )
    return pd.concat([kept, pca_df], axis=1)

In [None]:
# Number of principal components kept from the extracted features.
N_PCA_COMPONENTS = 128

final_df = PCAMethod(df, N_PCA_COMPONENTS)
final_df

In [None]:
# NUMBER_OF_FEATURES = 128
#
# new_df = df[last_cols]
# std = []
# for col in new_df.columns:
#     std.append(df[col].std())
# std = np.array(std)
# std_top_n = np.argsort(std)[-NUMBER_OF_FEATURES:]
#
# std_top_n_df = new_df.iloc[:, std_top_n]
# std_top_n_df

In [None]:
# final_df = pd.concat([df[first_cols], std_top_n_df], axis=1)
# final_df

# Preprocess the data (quantile transformation)

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Quantile-transform every column of final_df using 10 quantiles.
# NOTE(review): this is applied to the whole frame, so the OS and FUS outcome
# columns are transformed along with the covariates - confirm that is intended.
qt = QuantileTransformer(n_quantiles=10, random_state=42)
transformed_values = qt.fit(final_df).transform(final_df)
final_df = pd.DataFrame(transformed_values, columns=final_df.columns)
final_df

In [None]:
# Fit a penalised Cox proportional-hazards model on the full data set and
# report its concordance index on that same data.
survival_cols = dict(duration_col='OS', event_col='FUS')
cph = CoxPHFitter(penalizer=0.01)
cph.fit(final_df, show_progress=True, **survival_cols)

c_main = cph.score(final_df, scoring_method="concordance_index")
print("The C-index is: %.3f" % c_main)

In [None]:
# Plot the predicted survival functions for the first ten rows of final_df.
first_ten_rows = final_df.iloc[:10]
cph.predict_survival_function(first_ten_rows).plot()

In [None]:
# Print the summary of the model currently bound to `cph`.
cph.print_summary()

## Optimism computation

In [None]:
# Bootstrap the optimism of the C-index: refit the model on each bootstrap
# replicate, then score it on that replicate and on the original data.
np.random.seed(42)
c_b_boot, c_b_orig = [], []
num_bootstraps = 1000
bootstrap_size = len(final_df)

for i in range(num_bootstraps):
    # Sample a bootstrap replicate with replacement.
    choices = np.random.choice(np.arange(0, len(final_df)), size=bootstrap_size, replace=True)
    new_df = final_df.iloc[choices]

    # A separate name is used for the per-replicate model so the global `cph`
    # (the model fitted on the full data) is not overwritten; otherwise
    # re-running the plot/summary cells would show the last bootstrap model.
    cph_boot = CoxPHFitter(penalizer=0.01)
    cph_boot.fit(new_df, duration_col='OS', event_col='FUS')  # fit on bootstrap

    # Score on the bootstrap replicate the model was fitted on ...
    c_b_boot.append(cph_boot.score(new_df, scoring_method="concordance_index"))
    # ... and on the original data.
    c_b_orig.append(cph_boot.score(final_df, scoring_method="concordance_index"))

c_b_orig = np.array(c_b_orig)
c_b_boot = np.array(c_b_boot)

In [None]:
# Optimism: mean difference between the C-index on each bootstrap replicate
# and the C-index of the same model on the original data.
per_replicate_gap = c_b_boot - c_b_orig
o = per_replicate_gap.mean()
print("measure of optimism: %.3f" % o)

In [None]:
# Subtract the bootstrap optimism estimate from the C-index measured on the
# full data set.
c_final = c_main - o
print("optimism-corrected c-index is %.3f" % c_final)

# 95% Confidence Intervals


In [None]:
# Bootstrap a 95% confidence interval for the C-index: fit once on the
# original data, then score that model on bootstrap replicates.
np.random.seed(42)
c_indices = []
num_bootstraps = 1000
bootstrap_size = len(final_df)

# Use the same penalizer (0.01) as the main model and the optimism bootstrap,
# so the interval describes the model whose C-index and optimism are reported.
cph = CoxPHFitter(penalizer=0.01)  # fit on original data
cph.fit(final_df, duration_col='OS', event_col='FUS')

for i in range(num_bootstraps):
    choices = np.random.choice(np.arange(0, len(final_df)), size=bootstrap_size, replace=True)
    new_df = final_df.iloc[choices]

    c_index = cph.score(new_df, scoring_method="concordance_index")
    c_indices.append(c_index)

c_indices.sort()
# Take the 2.5th and 97.5th percentiles rather than fixed list positions, so
# the bounds stay correct if num_bootstraps is changed.
lo, hi = np.percentile(c_indices, [2.5, 97.5])

print("Confidence interval is: %.3f - %.3f" % (lo, hi))

In [None]:
# Shift both confidence bounds down by the optimism estimate.
adjusted_lo = lo - o
adjusted_hi = hi - o
print("Adjusting for optimism")
print("Adjusted confidence interval is: %.3f - %.3f" % (adjusted_lo, adjusted_hi))