In [1]:
import re
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
npx = pd.read_csv('combined_pheno_forconsortium_v1_NPX.tsv',sep='\t',index_col=0,low_memory=False)
missingRatioProt = npx.apply(lambda x: x.isna().sum()/npx.shape[0],axis=0)
npx = npx.loc[:,list(missingRatioProt[missingRatioProt < .1].index)]
missingRatioSamp = npx.apply(lambda x: x.isna().sum()/npx.shape[1],axis=1)
npx = npx.loc[list(missingRatioSamp[missingRatioSamp < .2].index),:]
#npx = npx.dropna()

info = pd.read_csv('sampleInfo.csv',index_col=0)
info = info.loc[:,map(lambda x: re.search(r'in urine',x)==None,info.columns)]
info['Sex'] = info.apply(lambda x: 0 if x['Sex']=='F' else 1 if x['Sex']=='M' else np.nan,axis=1)
print(info.shape)
df = pd.concat([npx,info],join='inner',axis=1)
df

(52363, 63)


Unnamed: 0_level_0,GLO1:Q04760:OID20107:v1:Cardiometabolic,PAG1:Q9NWQ8:OID20108:v1:Cardiometabolic,ADAM15:Q13444:OID20109:v1:Cardiometabolic,USP8:P40818:OID20110:v1:Cardiometabolic,BMP6:P22004:OID20111:v1:Cardiometabolic,ITGB1BP2:Q9UKP3:OID20112:v1:Cardiometabolic,CTSH:P09668:OID20113:v1:Cardiometabolic,BAG6:P46379:OID20114:v1:Cardiometabolic,MSTN:O14793:OID20115:v1:Cardiometabolic,BOC:Q9BWV1:OID20116:v1:Cardiometabolic,...,Oestradiol,Phosphate,Rheumatoid factor,SHBG,Total bilirubin,Testosterone,Total protein,Triglycerides,Urate,Vitamin D
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5763561,-0.5539,-0.61835,-0.33685,-0.8604,0.2837,-0.82665,0.7754,-0.2148,1.2040,0.25545,...,,1.326,,76.58,14.96,13.185,67.83,0.770,254.3,45.3
1541419,-0.8117,-0.51535,-0.34215,-0.8946,-0.5164,-1.29915,-0.7953,-0.6748,0.5846,-0.29475,...,,1.512,,19.77,14.16,10.815,75.16,1.917,445.9,40.9
2845293,0.1584,0.42485,0.77525,1.5851,1.2331,1.12505,0.5892,0.5778,0.7071,0.68725,...,,1.084,,39.74,6.25,9.194,66.33,3.844,396.7,64.7
2178814,-0.6318,-0.10435,-0.06085,-1.1274,-0.4320,-1.67025,-0.1314,-0.5383,0.3361,0.10205,...,,,,,,,,,,
5084631,0.1961,-0.28665,0.81625,-0.1619,-0.1686,0.06795,0.5508,0.0556,0.5347,0.07815,...,,1.280,,41.39,10.05,14.113,73.15,3.248,300.7,65.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3310145,,,,,,,,,,,...,,1.206,17.3,50.28,10.10,1.694,75.97,1.643,332.1,35.4
2829755,,,,,,,,,,,...,,1.137,,28.87,6.94,0.385,78.00,1.816,314.8,31.1
6007204,,,,,,,,,,,...,,0.826,,46.94,9.75,1.410,70.33,0.725,281.3,49.3
1077460,,,,,,,,,,,...,,,,,,,,,,


In [3]:
torch.tensor(df.iloc[-2,:])

tensor([nan, nan, nan,  ..., nan, nan, nan], dtype=torch.float64)

In [4]:
torch.nan_to_num(torch.tensor(df.iloc[-2,:]))

tensor([0., 0., 0.,  ..., 0., 0., 0.], dtype=torch.float64)

In [5]:
class proteomicsDataset(Dataset):
    def __init__(self,exprMat):
        self.exprMat = exprMat.iloc[:,:-info.shape[1]].astype(float)
        self.labels = exprMat.iloc[:,-info.shape[1]:].astype(float)
        
    def __len__(self):
        return(self.labels.shape[0])
    
    def __getitem__(self,idx):
        #exprVector = np.array(self.exprMat.iloc[idx,:])
        #label = np.array(self.labels.iloc[idx,:])
        exprVector = torch.tensor(self.exprMat.iloc[idx,:], dtype=torch.float64)
        label = torch.tensor(self.labels.iloc[idx,:], dtype=torch.float64)
        return exprVector,label

In [6]:
dataset = proteomicsDataset(df)
print(len(dataset))

49583


In [7]:
dataset[200][0].shape

torch.Size([2931])

In [8]:
dataset[200][1].shape

torch.Size([63])

In [16]:
# Configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
INPUT_DIM = 2931
Z_DIM = 10
H_DIM = 200
NUM_EPOCHS = 20
BATCH_SIZE = 80
LR_RATE = 3e-4

cpu


In [10]:
train_loader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

In [11]:
class VariationalAutoEncoder(nn.Module):
    def __init__(self, input_dim, z_dim, h_dim=200):
        super().__init__()
        # encoder
        self.img_2hid = nn.Linear(input_dim, h_dim)

        # one for mu and one for stds, note how we only output
        # diagonal values of covariance matrix. Here we assume
        # the pixels are conditionally independent 
        self.hid_2mu = nn.Linear(h_dim, z_dim)
        self.hid_2sigma = nn.Linear(h_dim, z_dim)

        # decoder
        self.z_2hid = nn.Linear(z_dim, h_dim)
        self.hid_2img = nn.Linear(h_dim, input_dim)
        
        self.double()

    def encode(self, x):
        h = F.relu(self.img_2hid(x))
        mu = self.hid_2mu(h)
        sigma = self.hid_2sigma(h)
        return mu, sigma

    def decode(self, z):
        new_h = self.z_2hid(z)
        x = self.hid_2img(new_h)
        return x

    def forward(self, x):
        mu, sigma = self.encode(x)
        sigma = torch.exp(sigma)

        # Sample from latent distribution from encoder
        epsilon = torch.randn_like(sigma)
        z_reparametrized = mu + sigma*epsilon

        x = self.decode(z_reparametrized)
        return x, mu, sigma

In [17]:
# Define train function
def train(num_epochs, model, optimizer, loss_fn):
    # Start training
    for epoch in range(num_epochs):
        loop = tqdm(enumerate(train_loader))
        for i, (x, y) in loop:
            # Forward pass
            x = x.to(device).view(-1, INPUT_DIM)
            nan_in_x = torch.isnan(x)
            x = torch.nan_to_num(x)
            x_reconst, mu, sigma = model(x)

            # loss, formulas from https://www.youtube.com/watch?v=igP03FXZqgo&t=2182s
            reconst_loss = loss_fn(x_reconst[~nan_in_x], x[~nan_in_x])
            kl_div = - torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))

            # Backprop and optimize
            loss = reconst_loss + kl_div
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loop.set_postfix(loss=loss.item())

In [18]:
# Initialize model, optimizer, loss
model = VariationalAutoEncoder(INPUT_DIM, Z_DIM, H_DIM).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR_RATE)
loss_fn = nn.MSELoss(reduction="sum")

In [19]:
# Run training
train(NUM_EPOCHS, model, optimizer, loss_fn)

620it [00:42, 14.72it/s, loss=5.16e+4]
620it [00:42, 14.71it/s, loss=5.03e+4]
620it [00:42, 14.76it/s, loss=5.02e+4]
620it [00:42, 14.69it/s, loss=5.68e+4]
620it [00:41, 14.79it/s, loss=5.13e+4]
620it [00:40, 15.27it/s, loss=4.9e+4] 
620it [00:40, 15.19it/s, loss=5.7e+4] 
620it [00:41, 15.00it/s, loss=4.93e+4]
620it [00:40, 15.20it/s, loss=4.99e+4]
620it [00:40, 15.27it/s, loss=5.2e+4] 
620it [00:40, 15.13it/s, loss=4.82e+4]
620it [00:40, 15.34it/s, loss=5.3e+4] 
620it [00:39, 15.56it/s, loss=5.32e+4]
620it [00:40, 15.21it/s, loss=5.07e+4]
620it [00:40, 15.42it/s, loss=5.17e+4]
620it [00:41, 15.11it/s, loss=5.07e+4]
620it [00:40, 15.26it/s, loss=5.01e+4]
620it [00:41, 14.99it/s, loss=5.12e+4]
620it [00:42, 14.50it/s, loss=5.04e+4]
620it [00:43, 14.36it/s, loss=5.12e+4]


In [26]:
# Get Mu's and Sigma's for each image
mus = []
sigmas = []
for vector,label in dataset:
    vector = torch.nan_to_num(vector)
    with torch.no_grad():
        mu, sigma = model.encode(vector)
        mus.append(mu)
        sigmas.append(torch.exp(sigma))

In [27]:
mus[200]

tensor([ 1.2688, -0.5347,  0.1581,  0.2648, -1.5284,  0.9483,  0.5966, -0.0511,
        -0.8991, -0.0412], dtype=torch.float64)

In [29]:
rows = []
labels = []
for i,(vector,label) in enumerate(dataset):
    #labels.append(pd.Series(np.array(label),name=df.index[i]))
    rows.append(pd.Series(np.array(mus[i]),name=df.index[i]))

embedding = pd.concat(rows,join='inner',axis=1).transpose()
embedding.columns = ['latent'+str(i) for i in embedding.columns]
#info = pd.concat(labels,join='inner',axis=1).transpose()
print(embedding.shape)
#print(info.shape)
res = pd.concat([embedding,df.iloc[:,-info.shape[1]:]],join='inner',axis=1)
print(res)

(49583, 10)
          latent0   latent1   latent2   latent3   latent4   latent5   latent6  \
5763561  0.420493 -0.874674 -0.152970  1.729503 -1.308541  1.292282 -0.460884   
1541419  1.190057  0.778684  0.945594  0.590214 -0.590026  0.680408  0.449014   
2845293 -1.268925 -0.229118  1.525022  0.000524  0.006393  1.686119 -0.798386   
2178814  1.628365 -0.116593 -0.761448  0.424541  0.057755  0.005105 -1.584236   
5084631 -0.116767  0.207145  0.895125  0.999763 -1.271163  0.769380  0.231032   
...           ...       ...       ...       ...       ...       ...       ...   
3310145  0.327458 -0.287447  0.292517  0.146976  0.421845  0.116608 -1.610556   
2829755 -0.499582  0.268123  0.148672  0.182101  0.926376  0.071940  0.067529   
6007204 -0.245262  0.375030  0.078478  0.503889  0.058228 -0.284224 -1.326012   
1077460  0.523441 -0.003972  0.575323  1.066161 -1.166544 -0.506462 -0.067901   
5781248  0.459784  0.190546  0.282029  0.128540 -0.513569 -0.707341  1.740558   

          laten

In [30]:
embedding.iloc[200,:]

latent0    1.268800
latent1   -0.534684
latent2    0.158144
latent3    0.264766
latent4   -1.528416
latent5    0.948251
latent6    0.596591
latent7   -0.051135
latent8   -0.899143
latent9   -0.041226
Name: 2555809, dtype: float64

In [31]:
embedding.to_csv('VAE_embeddings.csv')