# 3.1 - Survival analysis using TCGA samples

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import torch

import os
import sys
sys.path.append(f"../")
from sslcox.utils.model_evaluation import model_already_trained
from sslcox.data.load_datasets import load_TCGA_survival_data

In [2]:
import sys
sys.path.insert(0, '..')

In [3]:
EXPRESSIONS = 'LGG' # Change to 'KIRC' or 'LUAD'

# Directory-name conventions for the stored training results.
DS_DIR = f'{EXPRESSIONS}-optuna'
CV_DIR = lambda cv: f'CV-{cv}'
MODEL_DIR = lambda m: f'{m}-model-results'

# One dict per CV fold: model name -> {'X_train_latent': ..., 'X_test_latent': ...}.
training_results = []

results_root = f'../data/training-results/{DS_DIR}'

for fold_dir in sorted(os.listdir(results_root)):
    # Ignore dot-prefixed entries in the dataset directory.
    if fold_dir.startswith('.'):
        continue

    fold_results = {}

    for model_dir in os.listdir(f'{results_root}/{fold_dir}'):
        if model_dir.startswith('.'):
            continue

        model_name = model_dir.split('-model-results')[0]
        latent_dir = f'{results_root}/{fold_dir}/{model_dir}'

        # Only load folds/models that model_already_trained reports as done.
        trained = model_already_trained(
            DS_DIR, CV_DIR(fold_dir.split('-')[1]), MODEL_DIR(model_name)
        )
        if not trained:
            continue

        # Read the train split first, then the test split, both indexed by 'index'.
        fold_results[model_name] = {
            split: pd.read_csv(f'{latent_dir}/{split}.tsv', sep='\t', index_col=['index'])
            for split in ('X_train_latent', 'X_test_latent')
        }

    training_results.append(fold_results)

## Evaluating Survival

In [4]:
# Load the survival metadata table; its first column becomes the index.
metadata = pd.read_csv(f'../data/raw/{EXPRESSIONS.lower()}-survival-metadata.txt', sep='\t', index_col=0)

# All samples of the first fold's 'vae-cox' embedding (train + test rows).
merged = pd.concat((training_results[0]['vae-cox']['X_train_latent'], training_results[0]['vae-cox']['X_test_latent']), axis=0)

# Embedding rows are matched to metadata rows through the first 15 characters
# of their index. Compute the matched list once, with a set for O(1) lookups,
# instead of re-scanning the metadata index array for every sample (twice).
known_ids = set(metadata.index)
matched_barcodes = [x for x in merged.index.values if x[:15] in known_ids]

# Select metadata rows in embedding order; missing values are replaced by 0.
# NOTE(review): if a matched ID occurs more than once in the metadata index,
# .loc returns extra rows and the index assignment below raises a length
# mismatch; de-duplicate the metadata first if that ever happens.
metadata = metadata.loc[[x[:15] for x in matched_barcodes]].fillna(0)

# Re-index by the full embedding index so later joins line up with the latents.
metadata.index = matched_barcodes

In [5]:
from lifelines import CoxPHFitter
from tqdm import tqdm

# Survival endpoints to evaluate: name -> [duration series, event-indicator series].
survival_data = {
    'pfi': [metadata['PFI.time'], metadata['PFI']],
}

# Both are nested as [cv fold][model name][endpoint].
ridge_models = {}
performance = {}
SCORING_METHOD = 'concordance_index'

for cv in range(len(training_results)):
    ridge_models[cv] = {}
    performance[cv] = {}

    for model, values in tqdm(list(training_results[cv].items())):
        ridge_models[cv][model] = {}
        performance[cv][model] = {}

        # Standardise once per model (it does not depend on the endpoint),
        # using training-set statistics only so the test split stays unseen.
        X_train = values['X_train_latent']
        X_test = values['X_test_latent']

        X_train_mean, X_train_std = X_train.mean(axis=0), X_train.std(axis=0)
        # A constant latent dimension has std == 0; dividing by it would put
        # NaN/inf into the design matrix. Scale such columns by 1 instead,
        # which leaves them as an all-zero (centred) training column.
        X_train_std = X_train_std.replace(0, 1.0)

        X_train = (X_train - X_train_mean) / X_train_std
        X_test = (X_test - X_train_mean) / X_train_std

        for process, (y_time, y_event) in survival_data.items():
            # Inner join keeps only samples present in both the latents and
            # the metadata; the two appended columns are renamed time/event.
            train_set = pd.concat((X_train, y_time, y_event), axis=1, join='inner')
            train_set.columns = list(train_set.columns[:-2]) + ['time', 'event']
            test_set = pd.concat((X_test, y_time, y_event), axis=1, join='inner')
            test_set.columns = list(test_set.columns[:-2]) + ['time', 'event']

            # Penalised Cox proportional-hazards model fitted on the train split.
            estimator = CoxPHFitter(penalizer=0.1)
            estimator.fit(train_set, duration_col='time', event_col='event')

            ridge_models[cv][model][process] = estimator

            # Score on the held-out split with the configured scoring method.
            score = estimator.score(test_set, scoring_method=SCORING_METHOD)
            performance[cv][model][process] = score
        

100%|██████████| 5/5 [00:11<00:00,  2.21s/it]
100%|██████████| 5/5 [00:10<00:00,  2.07s/it]
100%|██████████| 5/5 [00:11<00:00,  2.31s/it]
100%|██████████| 5/5 [00:11<00:00,  2.37s/it]

100%|██████████| 5/5 [16:02<00:00, 192.57s/it]


In [6]:
# Regroup the scores: model name -> endpoint -> list with one score per CV fold.
scores_by_model = {}
for model_name in performance[0].keys():
    per_endpoint = {}
    for endpoint in survival_data.keys():
        per_endpoint[endpoint] = [fold_scores[model_name][endpoint] for fold_scores in performance.values()]
    scores_by_model[model_name] = per_endpoint

# Fix the presentation order of the models (a missing model raises KeyError).
model_order = ['vae-cox', 'vae-mse', 'vae-div', 'pca-emb', 'no-embedding']
results = {}
for model_name in model_order:
    results[model_name] = scores_by_model[model_name]

In [7]:
# Summary table: one row per model, one column per endpoint.
# Each cell is a (mean, std) tuple of the scores across CV folds.
summary_rows = []
for model_name in results.keys():
    summary_rows.append([
        (np.mean(results[model_name][endpoint]), np.std(results[model_name][endpoint]))
        for endpoint in survival_data.keys()
    ])

mean_results = pd.DataFrame(
    summary_rows,
    index=list(results.keys()),
    columns=list(survival_data.keys()),
)
mean_results.head()

Unnamed: 0,pfi
vae-cox,"(0.6922825172328875, 0.035951434459448255)"
vae-mse,"(0.6848588776841932, 0.03210406119370901)"
vae-div,"(0.6817379281679113, 0.031473613018448046)"
pca-emb,"(0.6804823768923391, 0.028633660942411973)"
no-embedding,"(0.6596952840011958, 0.023454854066439608)"


In [8]:
# Print each model's PFI score as LaTeX inline math: mean \pm standard error.
# The standard error is np.std (population std, ddof=0) divided by sqrt(n folds).
for name in results.keys():
    pfi_scores = results[name]['pfi']
    mean_score = np.mean(pfi_scores)
    standard_error = np.std(pfi_scores) / np.sqrt(len(pfi_scores))
    # Raw f-string: "\(", "\p" and "\)" are invalid escape sequences in a
    # normal string literal and trigger a warning on recent Python versions.
    print(name, rf"\({mean_score:.3f} \pm {standard_error:.3f}\)")

vae-cox \(0.692 \pm 0.016\)
vae-mse \(0.685 \pm 0.014\)
vae-div \(0.682 \pm 0.014\)
pca-emb \(0.680 \pm 0.013\)
no-embedding \(0.660 \pm 0.010\)
