In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from torch.utils.data import DataLoader
from itertools import chain
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.functional import normalize

  from .autonotebook import tqdm as notebook_tqdm


#### select 4298 genes (top varaince as well as drug-targeted genes)

In [2]:
tcga_exp = pd.read_csv("/Users/chengqi_xu/Documents/Elemento lab/synergyy/data/cell_line_data/tcga/tcga_exp.csv").T
ccle_exp = pd.read_csv("/Users/chengqi_xu/Documents/Elemento lab/synergyy/data/cell_line_data/tcga/CCLE_exp.csv",index_col=0)
final_table_columns = ccle_exp.index
df = tcga_exp.drop(columns=[col for col in tcga_exp if col not in final_table_columns])

In [3]:
df.shape

(12236, 4298)

In [4]:
dataX = Variable(torch.Tensor(np.array(df)))

#### autoencoder

In [5]:
class AE(nn.Module):

    def __init__(self, input_dim: int, latent_dim: int, hidden_dims: list = None, \
        dop: float = 0.1, noise_flag: bool = False, **kwargs) -> None:
        super(AE, self).__init__()
        self.latent_dim = latent_dim
        self.noise_flag = noise_flag
        self.dop = dop

        if hidden_dims is None:
            hidden_dims = [512]

        # build encoder
        modules = []

        modules.append(
            nn.Sequential(
                nn.Linear(input_dim, hidden_dims[0], bias=True),
                #nn.BatchNorm1d(hidden_dims[0]),
                nn.ReLU(),
                nn.Dropout(self.dop)
            )
        )

        for i in range(len(hidden_dims) - 1):
            modules.append(
                nn.Sequential(
                    nn.Linear(hidden_dims[i], hidden_dims[i + 1], bias=True),
                    #nn.BatchNorm1d(hidden_dims[i + 1]),
                    nn.ReLU(),
                    nn.Dropout(self.dop)
                )
            )
        modules.append(nn.Dropout(self.dop))
        modules.append(nn.Linear(hidden_dims[-1], latent_dim, bias=True))

        self.encoder = nn.Sequential(*modules)

        # build decoder
        modules = []

        modules.append(
            nn.Sequential(
                nn.Linear(latent_dim, hidden_dims[-1], bias=True),
                #nn.BatchNorm1d(hidden_dims[-1]),
                nn.ReLU(),
                nn.Dropout(self.dop)
            )
        )

        hidden_dims.reverse()

        for i in range(len(hidden_dims) - 1):
            modules.append(
                nn.Sequential(
                    nn.Linear(hidden_dims[i], hidden_dims[i + 1], bias=True),
                    #nn.BatchNorm1d(hidden_dims[i + 1]),
                    nn.ReLU(),
                    nn.Dropout(self.dop)
                )
            )
            
        self.decoder = nn.Sequential(*modules)
        self.final_layer = nn.Sequential(
            nn.Linear(hidden_dims[-1], hidden_dims[-1], bias=True),
            #nn.BatchNorm1d(hidden_dims[-1]),
            nn.ReLU(),
            nn.Dropout(self.dop),
            nn.Linear(hidden_dims[-1], input_dim)
        )
    
    def forward(self, input):
        encoded_input = self.encoder(input)
        encoded_input = nn.functional.normalize(encoded_input, p=2, dim=1)
        output = self.final_layer(self.decoder(encoded_input))

        return output

    def encode(self, input):
        return self.encoder(input)

    def decode(self, z):
        return self.decoder(z)

In [6]:
input_size = dataX.shape[1]
latent_dim = 256
model = AE(input_size,latent_dim)
criterion = nn.MSELoss()
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [7]:
dataloader = DataLoader(dataX, batch_size=256,shuffle=True)

num_epochs = 50
# do = nn.Dropout()  # comment out for standard AE
for epoch in range(num_epochs):
    for data in dataloader:
        img = data
        # ===================forward=====================
        output = model(img)  # feed  (for std AE) or  (for denoising AE)
        loss = criterion(output, img.data)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    print(f'epoch [{epoch + 1}/{num_epochs}], loss:{loss.item():.4f}')

epoch [1/50], loss:15.8928
epoch [2/50], loss:3.6415
epoch [3/50], loss:2.1240
epoch [4/50], loss:1.9584
epoch [5/50], loss:1.9074
epoch [6/50], loss:1.8823
epoch [7/50], loss:1.9394
epoch [8/50], loss:1.9135
epoch [9/50], loss:1.8398
epoch [10/50], loss:1.8488
epoch [11/50], loss:1.6355
epoch [12/50], loss:1.6457
epoch [13/50], loss:1.5972
epoch [14/50], loss:1.6502
epoch [15/50], loss:1.5408
epoch [16/50], loss:1.5821
epoch [17/50], loss:1.4096
epoch [18/50], loss:1.3687
epoch [19/50], loss:1.3725
epoch [20/50], loss:1.3456
epoch [21/50], loss:1.3521
epoch [22/50], loss:1.3278
epoch [23/50], loss:1.3181
epoch [24/50], loss:1.2667
epoch [25/50], loss:1.2560
epoch [26/50], loss:1.2458
epoch [27/50], loss:1.2618
epoch [28/50], loss:1.2014
epoch [29/50], loss:1.2224
epoch [30/50], loss:1.1992
epoch [31/50], loss:1.1930
epoch [32/50], loss:1.2392
epoch [33/50], loss:1.1625
epoch [34/50], loss:1.1621
epoch [35/50], loss:1.1419
epoch [36/50], loss:1.1283
epoch [37/50], loss:1.0926
epoch [38

In [8]:
#ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))
#save_path = os.path.join(ROOT_DIR, 'data','cell_line_data','tcga_encoder.pth')
torch.save(model.state_dict(), "/Users/chengqi_xu/Documents/Elemento lab/synergyy/data/cell_line_data/tcga/tcga_encoder.pth")