In [1]:
%cd /Users/gwg/dmcm
%matplotlib inline

/Users/gwg/dmcm


In [2]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from   data import GTExV8Config, GTExV8Dataset, loader
from   models import DPCCA

In [3]:
# Latent dimensionality of the run to analyse; it also selects the experiment
# directory below.
latent_dim = 10

# Dataset configuration with both embedding sizes overridden to 100.
cfg = GTExV8Config()
cfg.IMG_EMBED_DIM  = 100
cfg.GENE_EMBED_DIM = 100

# Directory of the sweep run whose name encodes the chosen latent_dim.
directory = (
    '/Users/gwg/dmcm/experiments/20190131_big_sweep/'
    'gtexv8_mode-dpcca_batch_size-128_seed-0_latent_dim-%s'
    '_lr-0.0001_l1_coef-0.5_em_iters-1_clip-1'
) % latent_dim

In [4]:
# Read Brian's tab-separated table; only its header row is used here.
df = pd.read_csv('data/gtexv8/brian_names.tsv', sep='\t')

# Column labels with the first one dropped.
# NOTE(review): presumably the first column is a row-label column and the rest
# are sample names -- confirm against the file.
BRIANS_NAMES = df.columns[1:].values

In [5]:
# Options forwarded to the project's loader factory.
loader_options = dict(
    batch_size=128,
    num_workers=1,
    pin_memory=False,
    directory=directory,
)
train_loader, test_loader = loader.get_data_loaders(cfg, **loader_options)

# The cells below read samples, labels and tissues from the dataset object
# behind the training loader.
# NOTE(review): presumably this dataset holds every sample rather than only
# the training split -- confirm in data/loader.
dataset = train_loader.dataset

In [6]:
from sklearn import preprocessing

# Flatten the tissue column into a 1-D array of tissue names, one per sample.
all_tissues = dataset.tissues_df.values.squeeze()

# Fit an encoder on those names (fit() returns the encoder itself) and map
# every sample's tissue to its integer code.
le = preprocessing.LabelEncoder().fit(all_tissues)
tissue_nums = le.transform(all_tissues)

# Tissue names in underscore-separated form. The sanity check at the end of
# this cell asserts that convert_tiss_to_brian() maps every tissue in the
# dataset onto one of these entries.
# NOTE(review): presumably these are the tissue identifiers used in Brian's
# files -- confirm.
BRIAN_TISS_NAMES = [
    'Adipose_Subcutaneous',
    'Adipose_Visceral_Omentum',
    'Adrenal_Gland',
    'Artery_Aorta',
    'Artery_Coronary',
    'Artery_Tibial',
    'Brain_Amygdala',
    'Brain_Anterior_cingulate_cortex_BA24',
    'Brain_Caudate_basal_ganglia',
    'Brain_Cerebellar_Hemisphere',
    'Brain_Cerebellum',
    'Brain_Cortex',
    'Brain_Frontal_Cortex_BA9',
    'Brain_Hippocampus',
    'Brain_Hypothalamus',
    'Brain_Nucleus_accumbens_basal_ganglia',
    'Brain_Putamen_basal_ganglia',
    'Brain_Spinal_cord_cervical_c-1',
    'Brain_Substantia_nigra',
    'Breast_Mammary_Tissue',
    'Cells_Cultured_fibroblasts',
    'Cells_EBV-transformed_lymphocytes',
    'Colon_Sigmoid',
    'Colon_Transverse',
    'Esophagus_Gastroesophageal_Junction',
    'Esophagus_Mucosa',
    'Esophagus_Muscularis',
    'Heart_Atrial_Appendage',
    'Heart_Left_Ventricle',
    'Kidney_Cortex',
    'Liver',
    'Lung',
    'Minor_Salivary_Gland',
    'Muscle_Skeletal',
    'Nerve_Tibial',
    'Ovary',
    'Pancreas',
    'Pituitary',
    'Prostate',
    'Skin_Not_Sun_Exposed_Suprapubic',
    'Skin_Sun_Exposed_Lower_leg',
    'Small_Intestine_Terminal_Ileum',
    'Spleen',
    'Stomach',
    'Testis',
    'Thyroid',
    'Uterus',
    'Vagina',
    'Whole_Blood'
]

def convert_tiss_to_brian(tiss):
    parts = tiss.split(' - ')
    tiss = '_'.join(parts)
    tiss = tiss.replace(' ', '_')
    tiss = tiss.replace(')', '').replace('(', '')
    return tiss

# Sanity check that our function works as expected: every tissue present in
# the dataset must convert to one of the names in BRIAN_TISS_NAMES. All
# offenders are collected first so that a failure reports every unmapped
# tissue instead of stopping silently at the first one.
known_brian_tissues = set(BRIAN_TISS_NAMES)
unmapped_tissues = [
    tiss for tiss in np.unique(all_tissues)
    if convert_tiss_to_brian(tiss) not in known_brian_tissues
]
assert not unmapped_tissues, \
    'Tissues with no match in BRIAN_TISS_NAMES: %s' % unmapped_tissues

In [7]:
n  = len(dataset)

# Preallocate storage for every image (X1) and expression vector (X2); each
# row is overwritten in the loop below.
X1 = torch.Tensor(n, cfg.N_CHANNELS, cfg.IMG_SIZE, cfg.IMG_SIZE)
X2 = torch.Tensor(n, cfg.N_GENES)

# Build the lookup once: testing `in` against the NumPy array would rescan the
# whole array for every sample.
brians_name_set = set(BRIANS_NAMES)

gtex_names = []
for i in range(n):
    x1, x2 = dataset[i]
    X1[i] = x1
    X2[i] = x2

    # Keep only the first two '-'-separated fields of the full sample name;
    # that shortened name is what must appear in BRIANS_NAMES.
    full_name = dataset.samples[i]
    gtex_name = '-'.join(full_name.split('-')[:2])
    assert gtex_name in brians_name_set, \
        'Sample %r (from %r) is missing from BRIANS_NAMES' % (gtex_name, full_name)
    gtex_names.append(gtex_name)

In [8]:
# Load the trained weights onto the CPU regardless of which device saved them
# (the previous mapping only remapped 'cuda:0').
state = torch.load(directory + '/model.pt', map_location='cpu')

# Build the model with the same latent_dim that selected `directory`, so the
# architecture cannot drift from the checkpoint when latent_dim is changed in
# the configuration cell.
# NOTE(review): the model is left in whatever mode DPCCA() constructs it in; if
# it contains dropout or batch-norm layers, model.eval() is probably wanted
# before computing embeddings -- confirm.
model = DPCCA(cfg, latent_dim=latent_dim, use_gene_net=True)
model.load_state_dict(state)

In [19]:
# Verify that Brians names are already sorted. The order check inside the loop
# relies on this, and it does not change per tissue, so check it once.
assert (np.array(sorted(BRIANS_NAMES)) == BRIANS_NAMES).all()

# Position of every name in Brian's header, built once. setdefault keeps the
# first occurrence, which is what list.index() would return.
brian_positions = {}
for pos, brian_name in enumerate(BRIANS_NAMES.tolist()):
    brian_positions.setdefault(brian_name, pos)

# Per-sample tissue labels and shortened sample names as arrays, so that they
# can be indexed with a boolean mask.
sample_tissues = np.array(dataset.labels)
sample_names   = np.array(gtex_names)

# For each tissue: embed its samples, order them by sample name, and write the
# latent variables as a tab-separated matrix (one column per sample).
for i, tiss in enumerate(np.unique(all_tissues)):

    mask  = sample_tissues == tiss
    X1_   = torch.Tensor(X1.detach().numpy()[mask])
    X2_   = torch.Tensor(X2.detach().numpy()[mask])
    names = sample_names[mask]
    Zs    = model.estimate_z_given_x([X1_, X2_], threshold=None).detach().numpy()
    print(Zs.shape)

    # Sort the samples (rows of Zs) by name. Index directly with the argsort
    # result: the previous np.flip(...) without an axis flipped *every* axis,
    # which silently reversed the latent-dimension columns as well.
    order        = names.argsort()
    names_sorted = names[order]
    Zs_sorted    = Zs[order]

    # This verifies that our columns are in the same order as Brian's,
    # ignoring instances where Brian has a column and we do not.
    curr_idx = -1
    for name in names_sorted:
        if name not in brian_positions:
            raise ValueError('%r is not in BRIANS_NAMES' % name)
        idx = brian_positions[name]
        assert idx > curr_idx
        curr_idx = idx

    # Transpose so that samples become columns and latent dimensions rows.
    # NOTE(review): the index is named 'gene_id' although the rows are latent
    # dimensions -- presumably to match a format expected downstream; confirm.
    df = pd.DataFrame(columns=names_sorted, data=Zs_sorted.T)
    df.index.name = 'gene_id'

    brian_tiss = convert_tiss_to_brian(tiss)
    fname = 'analysis/expression_matrix_%s_latent_dim-%s.txt' % (brian_tiss, latent_dim)
    print(i, fname)
    df.to_csv(fname, sep='\t')

(487, 30)
(487,)
