In [1]:
# Switch the working directory so that the relative paths used in later cells
# ('experiments/...', 'data/...', 'analysis/...') resolve.
# NOTE(review): this is an absolute, machine-specific path -- adjust it when
# running the notebook on another machine.
%cd /Users/gwg/dmcm
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

/Users/gwg/dmcm


In [2]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from   data import GTExConfig, GTExDataset, loader
from   models import DPCCA

In [3]:
cfg = GTExConfig()

# Experiment folder that holds the trained checkpoint ('model.pt').
# NOTE(review): the folder name encodes latent_dim-10; presumably this has to
# agree with the latent_dim passed to DPCCA below -- keep the two in sync.
directory = 'experiments/20190305_gtexv6_celeba/'\
            'gtex_mode-dpcca_seed-0_latent_dim-10_lr-0.0001_l1_coef-0.0_em_iters-1'

# map_location='cpu' remaps every stored tensor to the CPU regardless of the
# device it was saved from. The previous {'cuda:0': 'cpu'} mapping only
# handled tensors tagged 'cuda:0', so a checkpoint written from any other
# device would fail to load on a CPU-only machine.
state = torch.load(directory + '/model.pt', map_location='cpu')
model = DPCCA(cfg, latent_dim=10, use_gene_net=True)

# This notebook only runs inference, so switch the module to evaluation mode.
# It is a no-op unless the network contains mode-dependent layers such as
# dropout or batch normalisation.
model.eval()

# Kept as the last expression so the cell still displays the load result.
model.load_state_dict(state)

In [4]:
# Read Brian's tab-separated table and keep its column labels, minus the
# first column, as the array of names to match against.
df = pd.read_csv('data/gtexv8/brian_names.tsv', sep='\t')
BRIANS_NAMES = df.columns[1:].values

In [5]:
# Options forwarded to the project's loader factory; `directory` is the
# experiment folder defined when the checkpoint was loaded.
loader_kwargs = dict(
    batch_size=128,
    num_workers=1,
    pin_memory=False,
    directory=directory,
)
train_loader, test_loader = loader.get_data_loaders(cfg, **loader_kwargs)

# Only the dataset behind the training loader is used in the cells below.
dataset = train_loader.dataset

In [8]:
# Tally the number of samples per tissue in one pass, print each tally, then
# the smallest/largest tally and the number of distinct tissues.
unique_tissues, tissue_counts = np.unique(dataset.tissues, return_counts=True)

vals = []
for tissue, n in zip(unique_tissues, tissue_counts):
    print(tissue, n)
    vals.append(n)

print(min(vals), max(vals))
print(len(unique_tissues))

Adipose Tissue 5
Adrenal Gland 134
Bladder 4
Blood Vessel 47
Brain 172
Breast 5
Cervix Uteri 7
Colon 81
Esophagus 134
Fallopian Tube 4
Heart 188
Kidney 12
Liver 115
Lung 76
Muscle 369
Nerve 9
Ovary 88
Pancreas 166
Pituitary 51
Prostate 53
Salivary Gland 10
Skin 28
Small Intestine 59
Spleen 103
Stomach 106
Testis 44
Thyroid 65
Uterus 69
Vagina 17
4 369
29


In [6]:
# Reference vocabulary of tissue names in the underscore-separated format
# that convert_tiss_to_brian() produces. The sanity check in this cell tests
# converted names for membership in this list.
# NOTE(review): presumably these are the tissue identifiers used in Brian's
# files -- confirm against the source of that data.
BRIAN_TISS_NAMES = [
    'Adipose_Subcutaneous',
    'Adipose_Visceral_Omentum',
    'Adrenal_Gland',
    'Artery_Aorta',
    'Artery_Coronary',
    'Artery_Tibial',
    'Brain_Amygdala',
    'Brain_Anterior_cingulate_cortex_BA24',
    'Brain_Caudate_basal_ganglia',
    'Brain_Cerebellar_Hemisphere',
    'Brain_Cerebellum',
    'Brain_Cortex',
    'Brain_Frontal_Cortex_BA9',
    'Brain_Hippocampus',
    'Brain_Hypothalamus',
    'Brain_Nucleus_accumbens_basal_ganglia',
    'Brain_Putamen_basal_ganglia',
    'Brain_Spinal_cord_cervical_c-1',
    'Brain_Substantia_nigra',
    'Breast_Mammary_Tissue',
    'Cells_Cultured_fibroblasts',
    'Cells_EBV-transformed_lymphocytes',
    'Colon_Sigmoid',
    'Colon_Transverse',
    'Esophagus_Gastroesophageal_Junction',
    'Esophagus_Mucosa',
    'Esophagus_Muscularis',
    'Heart_Atrial_Appendage',
    'Heart_Left_Ventricle',
    'Kidney_Cortex',
    'Liver',
    'Lung',
    'Minor_Salivary_Gland',
    'Muscle_Skeletal',
    'Nerve_Tibial',
    'Ovary',
    'Pancreas',
    'Pituitary',
    'Prostate',
    'Skin_Not_Sun_Exposed_Suprapubic',
    'Skin_Sun_Exposed_Lower_leg',
    'Small_Intestine_Terminal_Ileum',
    'Spleen',
    'Stomach',
    'Testis',
    'Thyroid',
    'Uterus',
    'Vagina',
    'Whole_Blood'
]

# Coarse tissue labels (after normalisation) and the specific name each one
# is replaced with. No replacement value is itself a key, so one lookup is
# enough.
# NOTE(review): 'Cervix_Uteri' is sent to 'Uterus', the same name a plain
# 'Uterus' input keeps, so the two tissues become indistinguishable after
# conversion. 'Blood_Vessel' -> 'Whole_Blood' also looks like an
# approximation -- confirm both are intended.
_BRIAN_TISS_ALIASES = {
    'Muscle':          'Muscle_Skeletal',
    'Skin':            'Skin_Not_Sun_Exposed_Suprapubic',
    'Kidney':          'Kidney_Cortex',
    'Brain':           'Brain_Cerebellum',
    'Adipose_Tissue':  'Adipose_Subcutaneous',
    'Breast':          'Breast_Mammary_Tissue',
    'Heart':           'Heart_Atrial_Appendage',
    'Esophagus':       'Esophagus_Mucosa',
    'Colon':           'Colon_Sigmoid',
    'Cervix_Uteri':    'Uterus',
    'Nerve':           'Nerve_Tibial',
    'Small_Intestine': 'Small_Intestine_Terminal_Ileum',
    'Salivary_Gland':  'Minor_Salivary_Gland',
    'Blood_Vessel':    'Whole_Blood',
}


def convert_tiss_to_brian(tiss):
    """Convert a tissue label to the underscore-separated naming scheme.

    The label is normalised by turning every ' - ' separator and every
    remaining space into '_' and by dropping parentheses. If the normalised
    label is one of the coarse names in ``_BRIAN_TISS_ALIASES`` it is replaced
    by the associated specific name; otherwise it is returned as normalised.

    Args:
        tiss: Tissue label as a string, e.g. 'Adipose Tissue'.

    Returns:
        The converted tissue name as a string.
    """
    normalized = (
        tiss.replace(' - ', '_')
            .replace(' ', '_')
            .replace('(', '')
            .replace(')', '')
    )
    return _BRIAN_TISS_ALIASES.get(normalized, normalized)

# Sanity check: convert every distinct tissue in the dataset and split the
# results by whether the converted name appears in BRIAN_TISS_NAMES.
converted = [convert_tiss_to_brian(t) for t in np.unique(dataset.tissues)]
yes = [name for name in converted if name in BRIAN_TISS_NAMES]
no  = [name for name in converted if name not in BRIAN_TISS_NAMES]

print(yes)
print(no)

['Adipose_Subcutaneous', 'Adrenal_Gland', 'Whole_Blood', 'Brain_Cerebellum', 'Breast_Mammary_Tissue', 'Uterus', 'Colon_Sigmoid', 'Esophagus_Mucosa', 'Heart_Atrial_Appendage', 'Kidney_Cortex', 'Liver', 'Lung', 'Muscle_Skeletal', 'Nerve_Tibial', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate', 'Minor_Salivary_Gland', 'Skin_Not_Sun_Exposed_Suprapubic', 'Small_Intestine_Terminal_Ileum', 'Spleen', 'Stomach', 'Testis', 'Thyroid', 'Uterus', 'Vagina']
['Bladder', 'Fallopian_Tube']


In [7]:
# Keep only the samples whose GTEx name (the first two '-'-separated fields of
# the full sample name) appears in BRIANS_NAMES, and gather their two data
# views into dense tensors X1 and X2.

# A set gives constant-time membership tests; testing against the NumPy array
# directly scans the whole array for every sample.
brians_name_set = set(BRIANS_NAMES)

gtex_names = []
discard = []
tissues = []
labels = []
keep_inds = []

# First pass: decide which samples to keep, using metadata only.
for i in range(len(dataset)):

    full_name = dataset.names[i]
    parts     = full_name.split('-')
    gtex_name = '-'.join(parts[:2])

    if gtex_name in brians_name_set:
        keep_inds.append(i)
        gtex_names.append(gtex_name)
        tissues.append(dataset.tissues[i])
        labels.append(dataset.labels[i])
    else:
        discard.append(gtex_name)

# Size the buffers from the actual number of matches instead of a hard-coded
# count. torch.Tensor(n, ...) is uninitialised memory, so a count that is too
# large would leave garbage rows and one that is too small would raise
# IndexError while filling.
n  = len(keep_inds)
X1 = torch.Tensor(n, cfg.N_CHANNELS, cfg.IMG_SIZE, cfg.IMG_SIZE)
X2 = torch.Tensor(n, cfg.N_GENES)

# Second pass: load the data for the kept samples, in dataset order.
for j, i in enumerate(keep_inds):
    x1, x2 = dataset[i]
    X1[j] = x1
    X2[j] = x2

assert len(gtex_names) == len(tissues) == len(labels) == n

print(X1.shape, X2.shape, len(gtex_names), len(tissues), len(labels))
print(len(discard))

torch.Size([2093, 3, 128, 128]) torch.Size([2093, 18659]) 2093 2093 2093
128


In [13]:
# For every tissue, estimate the latent variables of its samples and write
# them to a tab-separated file with one column per sample (sorted by name)
# and one row per latent dimension.
latent_dim = 10

# Loop-invariant conversions, hoisted out of the per-tissue loop.
tissues_arr = np.array(tissues)
labels_arr  = np.array(labels)
names_arr   = np.array(gtex_names)

# Verify that Brians names are already sorted.
assert (np.array(sorted(BRIANS_NAMES)) == BRIANS_NAMES).all()

# Position of each name in Brian's header. setdefault keeps the first
# occurrence, matching what list.index() returns.
brian_pos = {}
for idx, name in enumerate(BRIANS_NAMES.tolist()):
    brian_pos.setdefault(name, idx)

# Output path -> tissue that produced it, used to report silent overwrites.
written = {}

for i, tiss in enumerate(np.unique(dataset.tissues)):

    inds = tissues_arr == tiss
    if not np.any(inds):
        # Every sample of this tissue was filtered out earlier; there is
        # nothing to encode, and an empty batch should not reach the model.
        print(i, 'skipping %s: no samples left after filtering' % tiss)
        continue

    X1_   = torch.Tensor(X1.detach().numpy()[inds])
    X2_   = torch.Tensor(X2.detach().numpy()[inds])
    labs  = labels_arr[inds]  # not used further in this cell
    names = names_arr[inds]

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        Zs = model.estimate_z_given_x([X1_, X2_], threshold=None).detach().numpy()

    # Sort samples by name. The previous reverse-then-np.flip round trip left
    # the rows in this same order, but np.flip without an axis reverses every
    # axis, so it also reversed the latent-dimension columns of Zs. Plain
    # indexing keeps row i of the output equal to latent dimension i.
    order        = names.argsort()
    names_sorted = names[order]
    Zs_sorted    = Zs[order]

    # This verifies that our columns are in the same order as Brian's,
    # ignoring instances where Brian has a column and we do not.
    curr_idx = -1
    for name in names_sorted:
        idx = brian_pos[name]
        assert idx >= curr_idx
        curr_idx = idx

    df = pd.DataFrame(columns=names_sorted, data=Zs_sorted.T)
    # NOTE(review): the rows are latent dimensions, not genes; the 'gene_id'
    # label presumably mimics an expression-matrix layout -- confirm.
    df.index.name = 'gene_id'

    brian_tiss = convert_tiss_to_brian(tiss)
    fname = 'analysis/expression_matrices/expression_matrix_%s_latent_dim-%s.txt' % (brian_tiss, latent_dim)

    # Different tissues can convert to the same name, in which case the later
    # file replaces the earlier one. Report it instead of losing it silently.
    if fname in written:
        print('WARNING: %s (from %r) overwrites the file already written for %r'
              % (fname, tiss, written[fname]))
    written[fname] = tiss

    print(i, fname)
    df.to_csv(fname, sep='\t')

0 analysis/expression_matrix_Adipose_Subcutaneous_latent_dim-10.txt
1 analysis/expression_matrix_Adrenal_Gland_latent_dim-10.txt
2 analysis/expression_matrix_Bladder_latent_dim-10.txt
559 559
600 600
635 635
3 analysis/expression_matrix_Whole_Blood_latent_dim-10.txt
2 2
9 9
13 13
24 24
25 25
29 29
31 31
32 32
33 33
53 53
65 65
75 75
121 121
124 124
125 125
138 138
141 141
143 143
147 147
154 154
157 157
555 555
556 556
587 587
591 591
593 593
598 598
606 606
610 610
611 611
623 623
626 626
652 652
656 656
664 664
677 677
695 695
699 699
706 706
721 721
751 751
829 829
4 analysis/expression_matrix_Brain_Cerebellum_latent_dim-10.txt
5 analysis/expression_matrix_Breast_Mammary_Tissue_latent_dim-10.txt
631 631
664 664
6 analysis/expression_matrix_Uterus_latent_dim-10.txt
715 715
749 749
7 analysis/expression_matrix_Colon_Sigmoid_latent_dim-10.txt
15 15
552 552
564 564
568 568
594 594
597 597
605 605
609 609
616 616
618 618
626 626
629 629
631 631
635 635
636 636
655 655
668 668
670 670
688