In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from torch.utils.data import DataLoader
from itertools import chain
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.functional import normalize

  from .autonotebook import tqdm as notebook_tqdm


#### select 4298 genes (top varaince as well as drug-targeted genes)

In [2]:
tcga_exp = pd.read_csv("/Users/chengqi_xu/Documents/Elemento lab/synergyy/data/cell_line_data/tcga/tcga_exp.csv").T
ccle_exp = pd.read_csv("/Users/chengqi_xu/Documents/Elemento lab/synergyy/data/cell_line_data/tcga/CCLE_exp.csv",index_col=0)
final_table_columns = ccle_exp.index
df = tcga_exp.drop(columns=[col for col in tcga_exp if col not in final_table_columns])

In [3]:
df.shape

(12236, 4298)

In [4]:
dataX = Variable(torch.Tensor(np.array(df)))

#### autoencoder

In [5]:
class AE(nn.Module):

    def __init__(self, input_dim: int, latent_dim: int, hidden_dims: list = None, \
        dop: float = 0.1, noise_flag: bool = False, **kwargs) -> None:
        super(AE, self).__init__()
        self.latent_dim = latent_dim
        self.noise_flag = noise_flag
        self.dop = dop

        if hidden_dims is None:
            hidden_dims = [512]

        # build encoder
        modules = []

        modules.append(
            nn.Sequential(
                nn.Linear(input_dim, hidden_dims[0], bias=True),
                #nn.BatchNorm1d(hidden_dims[0]),
                nn.ReLU(),
                nn.Dropout(self.dop)
            )
        )

        for i in range(len(hidden_dims) - 1):
            modules.append(
                nn.Sequential(
                    nn.Linear(hidden_dims[i], hidden_dims[i + 1], bias=True),
                    #nn.BatchNorm1d(hidden_dims[i + 1]),
                    nn.ReLU(),
                    nn.Dropout(self.dop)
                )
            )
        modules.append(nn.Dropout(self.dop))
        modules.append(nn.Linear(hidden_dims[-1], latent_dim, bias=True))

        self.encoder = nn.Sequential(*modules)

        # build decoder
        modules = []

        modules.append(
            nn.Sequential(
                nn.Linear(latent_dim, hidden_dims[-1], bias=True),
                #nn.BatchNorm1d(hidden_dims[-1]),
                nn.ReLU(),
                nn.Dropout(self.dop)
            )
        )

        hidden_dims.reverse()

        for i in range(len(hidden_dims) - 1):
            modules.append(
                nn.Sequential(
                    nn.Linear(hidden_dims[i], hidden_dims[i + 1], bias=True),
                    #nn.BatchNorm1d(hidden_dims[i + 1]),
                    nn.ReLU(),
                    nn.Dropout(self.dop)
                )
            )
            
        self.decoder = nn.Sequential(*modules)
        self.final_layer = nn.Sequential(
            nn.Linear(hidden_dims[-1], hidden_dims[-1], bias=True),
            #nn.BatchNorm1d(hidden_dims[-1]),
            nn.ReLU(),
            nn.Dropout(self.dop),
            nn.Linear(hidden_dims[-1], input_dim)
        )
    
    def forward(self, input):
        encoded_input = self.encoder(input)
        encoded_input = nn.functional.normalize(encoded_input, p=2, dim=1)
        output = self.final_layer(self.decoder(encoded_input))

        return output

    def encode(self, input):
        return self.encoder(input)

    def decode(self, z):
        return self.decoder(z)

In [6]:
input_size = dataX.shape[1]
latent_dim = 256
model = AE(input_size,latent_dim)
criterion = nn.MSELoss()
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [7]:
dataloader = DataLoader(dataX, batch_size=256,shuffle=True)

num_epochs = 50
# do = nn.Dropout()  # comment out for standard AE
for epoch in range(num_epochs):
    for data in dataloader:
        img = data
        # ===================forward=====================
        output = model(img)  # feed  (for std AE) or  (for denoising AE)
        loss = criterion(output, img.data)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    print(f'epoch [{epoch + 1}/{num_epochs}], loss:{loss.item():.4f}')

epoch [1/50], loss:15.6698
epoch [2/50], loss:3.4600
epoch [3/50], loss:2.2080
epoch [4/50], loss:1.9165
epoch [5/50], loss:1.9226
epoch [6/50], loss:2.0124
epoch [7/50], loss:1.9090
epoch [8/50], loss:1.8625
epoch [9/50], loss:1.8386
epoch [10/50], loss:1.7932
epoch [11/50], loss:1.6058
epoch [12/50], loss:1.7646
epoch [13/50], loss:1.5789
epoch [14/50], loss:1.6420
epoch [15/50], loss:1.4677
epoch [16/50], loss:1.4616
epoch [17/50], loss:1.4413
epoch [18/50], loss:1.4825
epoch [19/50], loss:1.4121
epoch [20/50], loss:1.4006
epoch [21/50], loss:1.3408
epoch [22/50], loss:1.3019
epoch [23/50], loss:1.2889
epoch [24/50], loss:1.3389
epoch [25/50], loss:1.2603
epoch [26/50], loss:1.2973
epoch [27/50], loss:1.2434
epoch [28/50], loss:1.2488
epoch [29/50], loss:1.2023
epoch [30/50], loss:1.2122
epoch [31/50], loss:1.2180
epoch [32/50], loss:1.1744
epoch [33/50], loss:1.1335
epoch [34/50], loss:1.1506
epoch [35/50], loss:1.1629
epoch [36/50], loss:1.1535
epoch [37/50], loss:1.1029
epoch [38

In [8]:
#ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))
#save_path = os.path.join(ROOT_DIR, 'data','cell_line_data','tcga_encoder.pth')
torch.save(model.state_dict(), "/Users/chengqi_xu/Documents/Elemento lab/synergyy/data/cell_line_data/tcga/tcga_encoder.pth")