# 1.0 - Clustering TCGA samples

In [1]:
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys

from sklearn.manifold import TSNE
from tqdm import tqdm

sys.path.insert(0, '..')
from sslcox.metrics.morans_measure import moran_measure
from sslcox.data.load_datasets import load_TCGA_clustering

In [2]:
def load_X_latent(tcga_project):
    """Load the training-set latent table of every model trained for a project.

    Scans ``../data/training-results/{tcga_project}-clustering-optuna/CV-1``
    and, for each sub-directory found there, reads its ``X_train_latent.tsv``
    (tab-separated, first column used as the index).

    Parameters
    ----------
    tcga_project : str
        Project identifier interpolated into the results path.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Maps the directory name truncated at ``'-model-results'`` to the
        table read from that directory. Keys are in sorted directory order.

    Raises
    ------
    FileNotFoundError
        If the results directory does not exist, or a model directory has
        no ``X_train_latent.tsv``.
    """
    DATA_DIR = f'../data/training-results/{tcga_project}-clustering-optuna/CV-1'

    X_latent_dict = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the dict
    # (and everything iterating over it) deterministic across machines.
    for name in sorted(os.listdir(DATA_DIR)):
        # Stray files in the results folder (e.g. `.DS_Store`) are not model
        # outputs; trying to read `<file>/X_train_latent.tsv` would crash.
        if not os.path.isdir(f'{DATA_DIR}/{name}'):
            continue

        model_key = name.split('-model-results')[0]
        X_latent_dict[model_key] = pd.read_csv(
            f'{DATA_DIR}/{name}/X_train_latent.tsv', sep='\t', index_col=0
        )

    return X_latent_dict

In [3]:
def bootstrap_moran_I(tsne_data, labels, M=100, seed=0, percent_distance=0.05):
    """Moran's measure on the full data plus its spread over bootstrap resamples.

    Draws ``M`` resamples of the rows (with replacement, same size as the
    input), evaluates ``moran_measure`` on each, and evaluates it once more
    on the untouched data.

    Parameters
    ----------
    tsne_data : numpy.ndarray
        Points to evaluate; indexed with an integer array along the first axis.
    labels : numpy.ndarray
        One label per row of ``tsne_data``; indexed with the same integer array.
    M : int, default 100
        Number of bootstrap resamples.
    seed : int, default 0
        Seed of the private random generator used to draw the resamples.
    percent_distance : float, default 0.05
        Forwarded unchanged to ``moran_measure``.

    Returns
    -------
    tuple
        ``(mu, std, std / sqrt(M))`` where ``mu`` is the measure on the full
        data and ``std`` is the standard deviation of the ``M`` resampled
        measures.

    Notes
    -----
    NOTE(review): the third element divides the bootstrap standard deviation
    by ``sqrt(M)``, so it shrinks as more resamples are drawn. The usual
    bootstrap standard error is the standard deviation itself (second
    element). It is kept as-is for backward compatibility; confirm which one
    should be reported.
    """
    # A private RandomState(seed) produces the same index stream as the former
    # `np.random.seed(seed)` + `np.random.choice`, but no longer resets the
    # process-wide NumPy RNG as a side effect of calling this function.
    # NOTE(review): assumes `moran_measure` does not itself draw from the
    # global NumPy RNG; verify.
    rng = np.random.RandomState(seed)
    n_samples = len(tsne_data)

    measures = []
    for _ in tqdm(range(M)):
        ids = rng.choice(np.arange(n_samples), size=n_samples)
        measures.append(
            moran_measure(tsne_data[ids], labels[ids], percent_distance=percent_distance)
        )

    mu = moran_measure(tsne_data, labels, percent_distance=percent_distance)
    spread = np.std(measures)

    return mu, spread, spread / np.sqrt(M)


In [4]:
def moran_I_data_for_TCGA(tcga_project, perplexity=50, M=100, random_state=0):
    """Embed every model's latent space with t-SNE and score it with Moran's measure.

    Steps: load the project's data and metadata, load each model's latent
    table, project each table to a t-SNE embedding, keep only the samples
    whose metadata value is one of the two classes listed for the project,
    and run ``bootstrap_moran_I`` on the embedding rows of those samples.

    Parameters
    ----------
    tcga_project : str
        One of ``'BRCA'``, ``'KIRP'``, ``'LGG'``, ``'STAD'``.
    perplexity : float, default 50
        Passed to ``TSNE``.
    M : int, default 100
        Number of bootstrap resamples passed to ``bootstrap_moran_I``.
    random_state : int, default 0
        Passed to ``TSNE``.

    Returns
    -------
    dict
        ``'expressions'`` and ``'meta'``: the pair returned by
        ``load_TCGA_clustering``; ``'tsne'``: model name -> embedding
        DataFrame; ``'X_moran_setup'``: model name -> (embedding rows of the
        retained samples with duplicated index labels dropped, filtered
        metadata); ``'X_moran_I'``: model name -> result of
        ``bootstrap_moran_I``.

    Raises
    ------
    KeyError
        If ``tcga_project`` has no class list below.
    """
    unique_values = {
        'BRCA': ['Positive', 'Negative'],
        'KIRP': ['Type 1', 'Type 2'],
        'LGG': ['NO', 'YES'],
        'STAD': ['STAD_CIN', 'STAD_MSI'],
    }
    # Look the project up first so an unsupported name fails immediately
    # rather than after data loading and several t-SNE fits.
    class_names = unique_values[tcga_project]

    exp, meta = load_TCGA_clustering(tcga_project)
    X_latent_dict = load_X_latent(tcga_project)

    X_tsne_dict = {}
    for name, latent in tqdm(X_latent_dict.items()):
        embedding = TSNE(
            init='pca', perplexity=perplexity, random_state=random_state
        ).fit_transform(latent.values)
        X_tsne_dict[name] = pd.DataFrame(embedding, index=latent.index)

    # Keep only samples annotated with one of the two classes of interest.
    # NOTE(review): presumably `meta` is a pandas Series indexed by sample id
    # (boolean-mask filtering and per-value iteration below rely on it) - confirm.
    meta_valid = meta[meta.isin(class_names)]

    # Each label is encoded as its position in `class_names`. The encoding is
    # the same for every model, so it is computed once.
    labels = np.array([class_names.index(value) for value in meta_valid.values])

    X_moran_I_setup = {}
    for name, tsne_frame in X_tsne_dict.items():
        selected = tsne_frame.loc[meta_valid.index]
        # Drop repeated index labels so the `.loc` lookup below returns
        # exactly one embedding row per metadata row.
        selected = selected.loc[~selected.index.duplicated()]
        X_moran_I_setup[name] = (selected, meta_valid)

    X_moran_I = {
        name: bootstrap_moran_I(
            tsne_data.loc[meta_data.index.values].values, labels, M=M
        )
        for name, (tsne_data, meta_data) in X_moran_I_setup.items()
    }

    return {
        'expressions': exp,
        'meta': meta,
        'tsne': X_tsne_dict,
        'X_moran_setup': X_moran_I_setup,
        'X_moran_I': X_moran_I
    }

## Evaluating

In [5]:
# Evaluate each project in turn (BRCA, then KIRP, then LGG) and keep the full
# result dictionary returned by `moran_I_data_for_TCGA` under the project name.
tcga_moran = {
    project: moran_I_data_for_TCGA(project)
    for project in ('BRCA', 'KIRP', 'LGG')
}

100%|██████████| 5/5 [00:10<00:00,  2.11s/it]
100%|██████████| 100/100 [00:02<00:00, 34.11it/s]
100%|██████████| 100/100 [00:02<00:00, 34.25it/s]
100%|██████████| 100/100 [00:02<00:00, 34.33it/s]
100%|██████████| 100/100 [00:02<00:00, 34.43it/s]
100%|██████████| 100/100 [00:02<00:00, 34.04it/s]
100%|██████████| 5/5 [00:04<00:00,  1.22it/s]
100%|██████████| 100/100 [00:00<00:00, 965.34it/s]
100%|██████████| 100/100 [00:00<00:00, 1058.46it/s]
100%|██████████| 100/100 [00:00<00:00, 1025.68it/s]
100%|██████████| 100/100 [00:00<00:00, 992.42it/s]
100%|██████████| 100/100 [00:00<00:00, 835.56it/s]
100%|██████████| 5/5 [00:06<00:00,  1.30s/it]
100%|██████████| 100/100 [00:00<00:00, 1605.86it/s]
100%|██████████| 100/100 [00:00<00:00, 1646.96it/s]
100%|██████████| 100/100 [00:00<00:00, 1011.54it/s]
100%|██████████| 100/100 [00:00<00:00, 1487.29it/s]
100%|██████████| 100/100 [00:00<00:00, 1340.14it/s]


In [6]:
## Latex table

# Keys of each project's 'X_moran_I' dict, in the row order of the table,
# and the display name printed for each of those rows.
model_order = ['vae-cox', 'vae-div', 'vae-mse', 'pca-emb', 'no-embedding']
model_names = ['VAE TDD', 'VAE DIV', 'VAE MSE', 'PCA', 'No Embedding']

# A single literal backslash; printed twice it forms the LaTeX row terminator `\\`.
end = '\\'

# Header row: one column per project, in the insertion order of `tcga_moran`.
header = ''.join(f' & {tcga}' for tcga in tcga_moran.keys())
print(header + end + end)

for display_name, model in zip(model_names, model_order):
    cells = []
    for moran_dict in tcga_moran.values():
        # Element 0 is the measure on the full data, element 2 the reported error.
        stats = moran_dict['X_moran_I'][model]
        # Raw f-string: `\(`, `\pm` and `\)` are LaTeX, not Python escapes.
        cells.append(rf' & \({stats[0]:.3f} \pm {stats[2]:.3f}\)')
    print(display_name + ''.join(cells) + end + end)


 & BRCA & KIRP & LGG\\
VAE TDD & \(0.596 \pm 0.005\) & \(0.178 \pm 0.006\) & \(0.352 \pm 0.008\)\\
VAE DIV & \(0.495 \pm 0.004\) & \(0.204 \pm 0.007\) & \(0.244 \pm 0.011\)\\
VAE MSE & \(0.407 \pm 0.004\) & \(0.144 \pm 0.006\) & \(0.229 \pm 0.010\)\\
PCA & \(0.614 \pm 0.004\) & \(0.325 \pm 0.007\) & \(0.301 \pm 0.011\)\\
No Embedding & \(0.569 \pm 0.005\) & \(0.268 \pm 0.007\) & \(0.300 \pm 0.011\)\\
