In [1]:
!nvidia-smi

Sun Dec 29 12:35:01 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-16GB           On  | 00000000:21:00.0 Off |                    0 |
| N/A   29C    P0              23W / 250W |      4MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE-16GB           On  | 00000000:81:00.0 Off |  

In [14]:
import MOICVAE.SNF as snf
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics
from torchmetrics import MeanSquaredError
from sklearn.impute import KNNImputer
import scripts
from functools import lru_cache
import optuna
from sklearn.metrics import pairwise_distances
from sklearn.exceptions import DataConversionWarning
import warnings
from torch.utils.data import Dataset
# 경고 무시 설정
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [2]:
class OmicsDataset_dict(Dataset): 
    def __init__(self, omic_dict, drug_dict, data): 
        self.omic_dict = omic_dict
        self.drug_dict = drug_dict
        self.cell_mapped_ids = {key:i for i, key in enumerate(self.omic_dict.keys())}
        # omic_dict의 키를 고유한 인덱스로 매핑
        # enumerate는 키들을 순서대로 열거하여 (인덱스, 키) 형태의 튜플로 반환
        # 딕셔너리 컴프레헨션: 각 키를 key로, 각 키의 인덱스를 i로 사용하여 {key:i}형태로 매핑된 딕셔너리 만듬.
        self.drug_mapped_ids = {key:i for i, key in enumerate(self.drug_dict.keys())}
        self.data = data
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx): # idx = train_data
        instance = self.data.iloc[idx] 
        cell_id = instance.iloc[0]
        drug_id = instance.iloc[1]
        target = instance.iloc[2]
        
        #omics_data = { # usage of dictionary here causes a problem or crash with collate_fn function in Dataloader 
        #    cell_id : {
        #        data_type: self.omic_dict[cell_id][data_type] for data_type in self.omic_dict[cell_id].keys()
        #    }
        #}
        
        return (torch.cat([self.omic_dict[cell_id][modality] for modality in self.omic_dict[cell_id].keys()]), 
                self.drug_dict[drug_id],
                torch.Tensor([target]),
                torch.Tensor([self.cell_mapped_ids[cell_id]]),
                torch.Tensor([self.drug_mapped_ids[drug_id]])) 

In [3]:
@lru_cache(maxsize=None)
def get_data_corr_with_filtering(n_fold = 0, fp_radius = 2, transform_into_corr = True, typ = ["rnaseq", "mutations", "cnvs"]):
    # drug
    smile_dict = pd.read_csv("data/smiles.csv", index_col=0)
    fp = scripts.FingerprintFeaturizer(R = fp_radius)
    drug_dict = fp(smile_dict.iloc[:, 1], smile_dict.iloc[:, 0])
    
    # loading all datasets
    driver_genes = pd.read_csv("data/driver_genes.csv").loc[:, "symbol"].dropna()

    rnaseq = pd.read_csv("data/rnaseq_normcount.csv", index_col=0)
    driver_columns = rnaseq.columns.isin(driver_genes)
    filtered_rna = rnaseq.loc[:, driver_columns]
    
    proteomics = pd.read_csv("data/proteomics.csv", index_col=0)
    
    mutation = pd.read_csv("data/binary_mutations.csv")
    mutation.columns = mutation.iloc[0]
    mutation = mutation.iloc[2:,:].set_index("gene_symbol")
    driver_columns = mutation.columns.isin(driver_genes)
    filtered_mut = mutation.loc[:, driver_columns]
    filtered_mut = filtered_mut.astype(float)

    methylations = pd.read_csv("data/methylations.csv",index_col = 0).sort_index(ascending = True)

    cnvs = pd.read_csv("data/copy_number_variations.csv",index_col= 0)

    # concatenate all dataset 
    # inner join based on index: model_ids with NaN are automatically filtered out 
    data_concat = pd.concat([filtered_rna, proteomics, filtered_mut, methylations, cnvs], axis=1, join='inner')
    
    
    # Filter data by common indices in all modalities
    filtered_rna = filtered_rna[filtered_rna.index.isin(data_concat.index)]
    proteomics = proteomics[proteomics.index.isin(data_concat.index)]
    filtered_mut = filtered_mut[filtered_mut.index.isin(data_concat.index)]
    methylations = methylations[methylations.index.isin(data_concat.index)]
    cnvs = cnvs[cnvs.index.isin(data_concat.index)]
    
    # Initialize cell_dict
    cell_dict = {}

    if not transform_into_corr:
        for cell in data_concat.index:
            # Initialize a sub-dictionary for each cell
            cell_dict[cell] = {}
            
            # Add data for each type specified in typ
            if "rnaseq" in typ:
                cell_dict[cell]["rnaseq"] = torch.Tensor(filtered_rna.loc[cell].to_numpy())
            if "proteomics" in typ:
                cell_dict[cell]["proteomics"] = torch.Tensor(proteomics.loc[cell].to_numpy())
            if "mutations" in typ:
                cell_dict[cell]["mutations"] = torch.Tensor(filtered_mut.loc[cell].to_numpy())
            if "methylations" in typ:
                cell_dict[cell]["methylations"] = torch.Tensor(methylations.loc[cell].to_numpy())
            if "cnvs" in typ:
                cell_dict[cell]["cnvs"] = torch.Tensor(cnvs.loc[cell].to_numpy())

    # GDSC
    GDSC1 = pd.read_csv("data/GDSC1.csv", index_col=0)
    # default, remove data where lines or drugs are missing:
    data = GDSC1.query("SANGER_MODEL_ID in @data_concat.index & DRUG_ID in @drug_dict.keys()")
    unique_cell_lines = data.loc[:, "SANGER_MODEL_ID"].unique()

    np.random.seed(420) # for comparibility, don't change it!
    np.random.shuffle(unique_cell_lines)
    folds = np.array_split(unique_cell_lines, 10)
    test_lines = folds[0] 
    train_idxs = list(range(10))
    train_idxs.remove(n_fold)
    np.random.seed(420)
    validation_idx = np.random.choice(train_idxs)
    train_idxs.remove(validation_idx)
    train_lines = np.concatenate([folds[idx] for idx in train_idxs])
    validation_lines = folds[validation_idx]
    test_lines = folds[n_fold] 
    
    if transform_into_corr:
        # ic50 filtering
        ic50_mat = data.pivot(index = 'SANGER_MODEL_ID', columns = 'DRUG_ID', values = 'LN_IC50')
        drug_nan_ratio = ic50_mat.isna().mean(axis=0) 
        cellline_nan_ratio = ic50_mat.isna().mean(axis=1)
        filtered_ic50 = ic50_mat.loc[cellline_nan_ratio < 0.3, drug_nan_ratio < 0.3]
        imputer = KNNImputer(n_neighbors=5)  # k-NN에서 k=5
        imputed_ic50 = pd.DataFrame(
            imputer.fit_transform(filtered_ic50),
            index=filtered_ic50.index,
            columns=filtered_ic50.columns)      
        t = imputed_ic50.median()
        binarized_ic50 = imputed_ic50.apply(lambda x: x.apply(lambda v: 1 if v <= t[x.name] else 0), axis=0)
        
        # index filtering, here only exp, mutation, cnv data are used
        cell_line_index = binarized_ic50.index.intersection(data_concat.index)
        ic50 = binarized_ic50.loc[cell_line_index]
        
        # train, val, test among filtered data
        # these are valid train_, val_ and test_data index
        train_lines = np.intersect1d(train_lines, ic50.index)
        valid_validation_lines = np.intersect1d(validation_lines, ic50.index)
        valid_test_lines = np.intersect1d(test_lines, ic50.index)
        
        n_train = len(train_lines)  
        n_val = len(valid_validation_lines)      
        n_test = len(valid_test_lines)
        
        # Precompute similarity matrices for each data type
        similarity_matrices = {}
        
        if "rnaseq" in typ:
            exp_com = np.corrcoef(np.vstack([filtered_rna.loc[train_lines], 
                                             filtered_rna.loc[valid_validation_lines], 
                                             filtered_rna.loc[valid_test_lines]]), rowvar=True)
            train = exp_com[:n_train, :n_train]
            val = exp_com[n_train:n_train+n_val, :n_train]
            test = exp_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["rnaseq"] = sim_combined
        
        if "proteomics" in typ:
            prot_com = np.corrcoef(np.vstack([proteomics.loc[train_lines], 
                                              proteomics.loc[valid_validation_lines], 
                                              proteomics.loc[valid_test_lines]]), rowvar=True)
            train = prot_com[:n_train, :n_train]
            val = prot_com[n_train:n_train+n_val, :n_train]
            test = prot_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["proteomics"] = sim_combined
        
        if "mutations" in typ:
            train_snp = filtered_mut.loc[train_lines].astype(bool)
            val_snp = filtered_mut.loc[valid_validation_lines].astype(bool)
            test_snp = filtered_mut.loc[valid_test_lines].astype(bool)
            
            train = 1 - pairwise_distances(train_snp.values, train_snp.values, metric="jaccard")
            val = 1 - pairwise_distances(val_snp.values, train_snp.values, metric="jaccard")
            test = 1 - pairwise_distances(test_snp.values, train_snp.values, metric="jaccard")
    
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["mutations"] = sim_combined
        
        if "methylations" in typ:
            methyl_com = np.corrcoef(np.vstack([methylations.loc[train_lines], 
                                                methylations.loc[valid_validation_lines], 
                                                methylations.loc[valid_test_lines]]), rowvar=True)
            train = methyl_com[:n_train, :n_train]
            val = methyl_com[n_train:n_train+n_val, :n_train]
            test = methyl_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["mathylations"] = sim_combined
        
        if "cnvs" in typ:
            cnv_com = np.corrcoef(np.vstack([cnvs.loc[train_lines], 
                                             cnvs.loc[valid_validation_lines], 
                                             cnvs.loc[valid_test_lines]]), rowvar=True)
            train= cnv_com[:n_train, :n_train]
            val= cnv_com[n_train:n_train+n_val, :n_train]
            test= cnv_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["cnvs"] = sim_combined
            
        cell_dict = {}

        # 
        for cell in cell_line_index:
            cell_dict[cell] = {}
            for data_type in typ:
                sim_matrices = similarity_matrices[data_type]
                sim_tensor = torch.Tensor(sim_matrices)
                cell_idx = cell_line_index.get_loc(cell)
                cell_dict[cell][data_type] = sim_tensor[cell_idx]
                
        train_lines = train_lines
        validation_lines = valid_validation_lines
        test_lines = valid_test_lines

    # no change needed, query works fine with some missing
    train_data = data.query("SANGER_MODEL_ID in @train_lines")
    validation_data = data.query("SANGER_MODEL_ID in @validation_lines")
    test_data = data.query("SANGER_MODEL_ID in @test_lines")
    
    return (OmicsDataset_dict(cell_dict, drug_dict, train_data),
    OmicsDataset_dict(cell_dict, drug_dict, validation_data),
    OmicsDataset_dict(cell_dict, drug_dict, test_data))

In [28]:
@lru_cache(maxsize=None)
def get_data_corr(n_fold = 0, fp_radius = 2, transform_into_corr = True, typ = ["rnaseq", "mutations", "cnvs"]):
    # drug
    smile_dict = pd.read_csv("data/smiles.csv", index_col=0)
    fp = scripts.FingerprintFeaturizer(R = fp_radius)
    drug_dict = fp(smile_dict.iloc[:, 1], smile_dict.iloc[:, 0])
    
    # loading all datasets
    driver_genes = pd.read_csv("data/driver_genes.csv").loc[:, "symbol"].dropna()

    rnaseq = pd.read_csv("data/rnaseq_normcount.csv", index_col=0)
    driver_columns = rnaseq.columns.isin(driver_genes)
    filtered_rna = rnaseq.loc[:, driver_columns]
    
    proteomics = pd.read_csv("data/proteomics.csv", index_col=0)
    
    mutation = pd.read_csv("data/binary_mutations.csv")
    mutation.columns = mutation.iloc[0]
    mutation = mutation.iloc[2:,:].set_index("gene_symbol")
    driver_columns = mutation.columns.isin(driver_genes)
    filtered_mut = mutation.loc[:, driver_columns]
    filtered_mut = filtered_mut.astype(float)

    methylations = pd.read_csv("data/methylations.csv",index_col = 0).sort_index(ascending = True)

    cnvs = pd.read_csv("data/copy_number_variations.csv",index_col= 0)

    # concatenate all dataset 
    # inner join based on index: model_ids with NaN are automatically filtered out 
    data_concat = pd.concat([filtered_rna, proteomics, filtered_mut, methylations, cnvs], axis=1, join='inner')
    
    
    # Filter data by common indices in all modalities
    filtered_rna = filtered_rna[filtered_rna.index.isin(data_concat.index)]
    proteomics = proteomics[proteomics.index.isin(data_concat.index)]
    filtered_mut = filtered_mut[filtered_mut.index.isin(data_concat.index)]
    methylations = methylations[methylations.index.isin(data_concat.index)]
    cnvs = cnvs[cnvs.index.isin(data_concat.index)]
    
    # Initialize cell_dict
    cell_dict = {}

    if not transform_into_corr:
        for cell in data_concat.index:
            # Initialize a sub-dictionary for each cell
            cell_dict[cell] = {}
            
            # Add data for each type specified in typ
            if "rnaseq" in typ:
                cell_dict[cell]["rnaseq"] = torch.Tensor(filtered_rna.loc[cell].to_numpy())
            if "proteomics" in typ:
                cell_dict[cell]["proteomics"] = torch.Tensor(proteomics.loc[cell].to_numpy())
            if "mutations" in typ:
                cell_dict[cell]["mutations"] = torch.Tensor(filtered_mut.loc[cell].to_numpy())
            if "methylations" in typ:
                cell_dict[cell]["methylations"] = torch.Tensor(methylations.loc[cell].to_numpy())
            if "cnvs" in typ:
                cell_dict[cell]["cnvs"] = torch.Tensor(cnvs.loc[cell].to_numpy())

    # GDSC
    GDSC1 = pd.read_csv("data/GDSC1.csv", index_col=0)
    # default, remove data where lines or drugs are missing:
    data = GDSC1.query("SANGER_MODEL_ID in @data_concat.index & DRUG_ID in @drug_dict.keys()")
    unique_cell_lines = data.loc[:, "SANGER_MODEL_ID"].unique()

    np.random.seed(420) # for comparibility, don't change it!
    np.random.shuffle(unique_cell_lines)
    folds = np.array_split(unique_cell_lines, 10)
    test_lines = folds[0] 
    train_idxs = list(range(10))
    train_idxs.remove(n_fold)
    np.random.seed(420)
    validation_idx = np.random.choice(train_idxs)
    train_idxs.remove(validation_idx)
    train_lines = np.concatenate([folds[idx] for idx in train_idxs])
    validation_lines = folds[validation_idx]
    test_lines = folds[n_fold] 

        # no change needed, query works fine with some missing
    train_data = data.query("SANGER_MODEL_ID in @train_lines")
    validation_data = data.query("SANGER_MODEL_ID in @validation_lines")
    test_data = data.query("SANGER_MODEL_ID in @test_lines")
    
    if transform_into_corr:
        # train, val, test among filtered data
        # these are valid train_, val_ and test_data index
        
        
        n_train = len(train_lines)  
        n_val = len(validation_lines)      
        n_test = len(test_lines)
        
        # Precompute similarity matrices for each data type
        similarity_matrices = {}
        
        if "rnaseq" in typ:
            exp_com = np.corrcoef(np.vstack([filtered_rna.loc[train_lines], 
                                             filtered_rna.loc[validation_lines], 
                                             filtered_rna.loc[test_lines]]), rowvar=True)
            train = exp_com[:n_train, :n_train]
            val = exp_com[n_train:n_train+n_val, :n_train]
            test = exp_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["rnaseq"] = sim_combined
        
        if "proteomics" in typ:
            prot_com = np.corrcoef(np.vstack([proteomics.loc[train_lines], 
                                              proteomics.loc[validation_lines], 
                                              proteomics.loc[test_lines]]), rowvar=True)
            train = prot_com[:n_train, :n_train]
            val = prot_com[n_train:n_train+n_val, :n_train]
            test = prot_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["proteomics"] = sim_combined
        
        if "mutations" in typ:
            train_snp = filtered_mut.loc[train_lines].astype(bool)
            val_snp = filtered_mut.loc[validation_lines].astype(bool)
            test_snp = filtered_mut.loc[test_lines].astype(bool)
            
            train = 1 - pairwise_distances(train_snp.values, train_snp.values, metric="jaccard")
            val = 1 - pairwise_distances(val_snp.values, train_snp.values, metric="jaccard")
            test = 1 - pairwise_distances(test_snp.values, train_snp.values, metric="jaccard")
    
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["mutations"] = sim_combined
        
        if "methylations" in typ:
            methyl_com = np.corrcoef(np.vstack([methylations.loc[train_lines], 
                                                methylations.loc[validation_lines], 
                                                methylations.loc[test_lines]]), rowvar=True)
            train = methyl_com[:n_train, :n_train]
            val = methyl_com[n_train:n_train+n_val, :n_train]
            test = methyl_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["mathylations"] = sim_combined
        
        if "cnvs" in typ:
            cnv_com = np.corrcoef(np.vstack([cnvs.loc[train_lines], 
                                             cnvs.loc[validation_lines], 
                                             cnvs.loc[test_lines]]), rowvar=True)
            train= cnv_com[:n_train, :n_train]
            val= cnv_com[n_train:n_train+n_val, :n_train]
            test= cnv_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["cnvs"] = sim_combined
            
        cell_dict = {}

        # 
        for cell in unique_cell_lines:
            cell_dict[cell] = {}
            for data_type in typ:
                sim_matrices = similarity_matrices[data_type]
                sim_tensor = torch.Tensor(sim_matrices)
                cell_idx = np.where(unique_cell_lines == cell)[0][0]
                cell_dict[cell][data_type] = sim_tensor[cell_idx]
    
    return (OmicsDataset_dict(cell_dict, drug_dict, train_data),
    OmicsDataset_dict(cell_dict, drug_dict, validation_data),
    OmicsDataset_dict(cell_dict, drug_dict, test_data))

In [16]:
n_fold = 0
fp_radius = 2
transform_into_corr = True
typ = ["rnaseq", "mutations", "cnvs"]

# drug
smile_dict = pd.read_csv("data/smiles.csv", index_col=0)
fp = scripts.FingerprintFeaturizer(R = fp_radius)
drug_dict = fp(smile_dict.iloc[:, 1], smile_dict.iloc[:, 0])

# loading all datasets
driver_genes = pd.read_csv("data/driver_genes.csv").loc[:, "symbol"].dropna()

rnaseq = pd.read_csv("data/rnaseq_normcount.csv", index_col=0)
driver_columns = rnaseq.columns.isin(driver_genes)
filtered_rna = rnaseq.loc[:, driver_columns]

proteomics = pd.read_csv("data/proteomics.csv", index_col=0)

mutation = pd.read_csv("data/binary_mutations.csv")
mutation.columns = mutation.iloc[0]
mutation = mutation.iloc[2:,:].set_index("gene_symbol")
driver_columns = mutation.columns.isin(driver_genes)
filtered_mut = mutation.loc[:, driver_columns]
filtered_mut = filtered_mut.astype(float)

methylations = pd.read_csv("data/methylations.csv",index_col = 0).sort_index(ascending = True)

cnvs = pd.read_csv("data/copy_number_variations.csv",index_col= 0)

# concatenate all dataset 
# inner join based on index: model_ids with NaN are automatically filtered out 
data_concat = pd.concat([filtered_rna, proteomics, filtered_mut, methylations, cnvs], axis=1, join='inner')


# Filter data by common indices in all modalities
filtered_rna = filtered_rna[filtered_rna.index.isin(data_concat.index)]
proteomics = proteomics[proteomics.index.isin(data_concat.index)]
filtered_mut = filtered_mut[filtered_mut.index.isin(data_concat.index)]
methylations = methylations[methylations.index.isin(data_concat.index)]
cnvs = cnvs[cnvs.index.isin(data_concat.index)]

# Initialize cell_dict
cell_dict = {}

if not transform_into_corr:
    for cell in data_concat.index:
        # Initialize a sub-dictionary for each cell
        cell_dict[cell] = {}
        
        # Add data for each type specified in typ
        if "rnaseq" in typ:
            cell_dict[cell]["rnaseq"] = torch.Tensor(filtered_rna.loc[cell].to_numpy())
        if "proteomics" in typ:
            cell_dict[cell]["proteomics"] = torch.Tensor(proteomics.loc[cell].to_numpy())
        if "mutations" in typ:
            cell_dict[cell]["mutations"] = torch.Tensor(filtered_mut.loc[cell].to_numpy())
        if "methylations" in typ:
            cell_dict[cell]["methylations"] = torch.Tensor(methylations.loc[cell].to_numpy())
        if "cnvs" in typ:
            cell_dict[cell]["cnvs"] = torch.Tensor(cnvs.loc[cell].to_numpy())

# GDSC
GDSC1 = pd.read_csv("data/GDSC1.csv", index_col=0)
# default, remove data where lines or drugs are missing:
data = GDSC1.query("SANGER_MODEL_ID in @data_concat.index & DRUG_ID in @drug_dict.keys()")
unique_cell_lines = data.loc[:, "SANGER_MODEL_ID"].unique()

np.random.seed(420) # for comparibility, don't change it!
np.random.shuffle(unique_cell_lines)
folds = np.array_split(unique_cell_lines, 10)
test_lines = folds[0] 
train_idxs = list(range(10))
train_idxs.remove(n_fold)
np.random.seed(420)
validation_idx = np.random.choice(train_idxs)
train_idxs.remove(validation_idx)
train_lines = np.concatenate([folds[idx] for idx in train_idxs])
validation_lines = folds[validation_idx]
test_lines = folds[n_fold] 

    # no change needed, query works fine with some missing
train_data = data.query("SANGER_MODEL_ID in @train_lines")
validation_data = data.query("SANGER_MODEL_ID in @validation_lines")
test_data = data.query("SANGER_MODEL_ID in @test_lines")





In [27]:
transform_into_corr = True
if transform_into_corr:
    # train, val, test among filtered data
    # these are valid train_, val_ and test_data index
    
    
    n_train = len(train_lines)  
    n_val = len(validation_lines)      
    n_test = len(test_lines)
    
    # Precompute similarity matrices for each data type
    similarity_matrices = {}
    
    if "rnaseq" in typ:
        exp_com = np.corrcoef(np.vstack([filtered_rna.loc[train_lines], 
                                         filtered_rna.loc[validation_lines], 
                                         filtered_rna.loc[test_lines]]), rowvar=True)
        train = exp_com[:n_train, :n_train]
        val = exp_com[n_train:n_train+n_val, :n_train]
        test = exp_com[n_train+n_val:, :n_train]
        sim_combined = np.vstack([train, val, test])
        similarity_matrices["rnaseq"] = sim_combined
    
    if "proteomics" in typ:
        prot_com = np.corrcoef(np.vstack([proteomics.loc[train_lines], 
                                          proteomics.loc[validation_lines], 
                                          proteomics.loc[test_lines]]), rowvar=True)
        train = prot_com[:n_train, :n_train]
        val = prot_com[n_train:n_train+n_val, :n_train]
        test = prot_com[n_train+n_val:, :n_train]
        sim_combined = np.vstack([train, val, test])
        similarity_matrices["proteomics"] = sim_combined
    
    if "mutations" in typ:
        train_snp = filtered_mut.loc[train_lines].astype(bool)
        val_snp = filtered_mut.loc[validation_lines].astype(bool)
        test_snp = filtered_mut.loc[test_lines].astype(bool)
        
        train = 1 - pairwise_distances(train_snp.values, train_snp.values, metric="jaccard")
        val = 1 - pairwise_distances(val_snp.values, train_snp.values, metric="jaccard")
        test = 1 - pairwise_distances(test_snp.values, train_snp.values, metric="jaccard")

        sim_combined = np.vstack([train, val, test])
        similarity_matrices["mutations"] = sim_combined
    
    if "methylations" in typ:
        methyl_com = np.corrcoef(np.vstack([methylations.loc[train_lines], 
                                            methylations.loc[validation_lines], 
                                            methylations.loc[test_lines]]), rowvar=True)
        train = methyl_com[:n_train, :n_train]
        val = methyl_com[n_train:n_train+n_val, :n_train]
        test = methyl_com[n_train+n_val:, :n_train]
        sim_combined = np.vstack([train, val, test])
        similarity_matrices["mathylations"] = sim_combined
    
    if "cnvs" in typ:
        cnv_com = np.corrcoef(np.vstack([cnvs.loc[train_lines], 
                                         cnvs.loc[validation_lines], 
                                         cnvs.loc[test_lines]]), rowvar=True)
        train= cnv_com[:n_train, :n_train]
        val= cnv_com[n_train:n_train+n_val, :n_train]
        test= cnv_com[n_train+n_val:, :n_train]
        sim_combined = np.vstack([train, val, test])
        similarity_matrices["cnvs"] = sim_combined
        
    cell_dict = {}

    # 
    for cell in unique_cell_lines:
            cell_dict[cell] = {}
            for data_type in typ:
                sim_matrices = similarity_matrices[data_type]
                sim_tensor = torch.Tensor(sim_matrices)
                cell_idx = np.where(unique_cell_lines == cell)[0][0]
                cell_dict[cell][data_type] = sim_tensor[cell_idx]

In [29]:
config = {"features" : {"fp_radius":2,
                        "transform_into_corr": True},
          "optimizer": {"batch_size": 220,
                        "clip_norm":19,
                        "learning_rate": 0.0004592646200179472,
                        "stopping_patience":15},
          "model":{"embed_dim":485,
                 "hidden_dim":696, # hidden layer의 차원
                 "dropout":0.48541242824674574, # 40퍼센트의 노드를 랜덤하게 드랍아웃 
                 "n_layers": 4, # 3개의 hidden layer를 사용
                 "norm": "batchnorm"}, # batch normalization을 사용하여 모델이 학습 중 출력 분포를 정규화하여 학습을 안정화
         "env": {"fold": 0, # 0번째 fold를 사용하여 학습. 이는 음 n_fold에 들어갈 값을 의미하는 듯 하다. 
                "device":"cuda:1", # GPU자원을 사용할 장치를 지정한다. 
                 "max_epochs": 100, # 최대 epoch 수 
                 "search_hyperparameters":False}} # hyper parameter 이미 있으니 안쓴다.

In [30]:
train_dataset, val_dataset, test_dataset = get_data_corr(n_fold = config["env"]["fold"],
                                                           fp_radius = config["features"]["fp_radius"], 
                                                           transform_into_corr = config["features"]["transform_into_corr"],
                                                           typ = ("rnaseq", "mutations", "cnvs"))



In [40]:
class MultimodalAutoencoder(nn.Module):
    def __init__(self, input_dim, num_modalities = 1,  hidden_dim = 150, latent_dim = 75, fusion_dim = 150, dropout = 0.2):
        # get input as a dictionary
        super(MultimodalAutoencoder, self).__init__()
        # EEEEEEEEEEncoder
        self.input_dim = input_dim
        self.num_modalities = num_modalities
        self.do = nn.Dropout(dropout)

        ############# 여기서 input_dict를 사용하지 않도록 모델 구성을 바꿔야함.. 이런씨발
        self.omics_encoder = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, hidden_dim), # input 
                nn.ReLU(),
                nn.Linear(hidden_dim, latent_dim) # encoder hidden layer: 150, 75 as the value from the paper. so we start from this 
            )                                 # I dont get why they used 150, 75 for dimension, but we can tune it later
            for _ in range(num_modalities)
        ])
        # fused latent feature 
        self.fusion_layer = nn.Sequential( # I think we need a fusion layer here, to combine the data modalities
            nn.Linear(latent_dim * num_modalities, fusion_dim),
            nn.ReLU(),
            nn.Linear(fusion_dim, latent_dim) # This concatenate latent features of all omics data, and fusion them and make its dim final latent dim
        )                                     # This is the only way I can think of to fuse omics data
        # decoder
        self.omics_decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim * num_modalities)
        )
        # I actually dont understand this step in paper. they said that decoder has symmetric structure as encoder,
        # but the data after MDA they provided, has weird dimension(363x90) which makes no sense. this is the point that i cant understand

    def forward(self, x):
        latent_features = [] # get dictionary as an input 
        for i, encoder in enumerate(self.omics_encoder):
            start_idx = i * self.input_dim
            end_idx = start_idx + self.input_dim
            x_modality = x[:, start_idx:end_idx]
            latent_features.append(encoder(self.do(x_modality)))
            
        latent_fused = torch.cat(latent_features, dim=1)
        latent_final = self.fusion_layer(latent_fused)
        decoded = self.omics_decoder(latent_final)
        return decoded, latent_final

# Model training

In [41]:
dropout_rate = 0.5
learning_rate = 1e-3
batch_size = 256
num_epochs = 1000
num_modality = 3

train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = batch_size,
        shuffle=True,
        drop_last=True
    )

sample_batch = next(iter(train_loader))
sample_omics_data = sample_batch[0]  # 첫 번째 요소는 omics_data (dict)
mod_len = sample_omics_data.shape[1] // num_modality
#input_dict = {f"modality_{i+1}": mod_len for i in range(num_modality)}

model = MultimodalAutoencoder(input_dim = mod_len, num_modalities = 3, dropout = 0.8)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MultimodalAutoencoder(
  (do): Dropout(p=0.8, inplace=False)
  (omics_encoder): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=587, out_features=150, bias=True)
      (1): ReLU()
      (2): Linear(in_features=150, out_features=75, bias=True)
    )
    (1): Sequential(
      (0): Linear(in_features=587, out_features=150, bias=True)
      (1): ReLU()
      (2): Linear(in_features=150, out_features=75, bias=True)
    )
    (2): Sequential(
      (0): Linear(in_features=587, out_features=150, bias=True)
      (1): ReLU()
      (2): Linear(in_features=150, out_features=75, bias=True)
    )
  )
  (fusion_layer): Sequential(
    (0): Linear(in_features=225, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=75, bias=True)
  )
  (omics_decoder): Sequential(
    (0): Linear(in_features=75, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=1761, bias=True)
  )
)

In [43]:
omics_data_batch

tensor([[ 0.2081,  0.2020, -0.0809,  ...,  0.0844,  0.0484, -0.0141],
        [-0.3419, -0.3384,  0.2526,  ...,  0.1527, -0.0380,  0.1270],
        [-0.3253, -0.4495,  0.2317,  ...,  0.0783,  0.2118,  0.2633],
        ...,
        [-0.0866, -0.1850,  0.0264,  ...,  0.1203,  0.1833,  0.2244],
        [ 0.1804,  0.3409, -0.0882,  ...,  0.0277,  0.2924,  0.0820],
        [-0.0522, -0.1777,  0.1209,  ..., -0.0610,  0.2157,  0.1006]],
       device='cuda:0')

In [None]:
# model training
x = 0
for epoch in range(num_epochs):
    total_loss = []
    model.train()
    
    # train step
    for batch in train_loader:
        inputs = batch[0].to(device)

        optimizer.zero_grad()
        reconstructed, latent = model(inputs)

        #target_data = torch.cat([inputs[modality] for modality in inputs], dim=1)
        # I think, here inappropriate loss is used. 
        # From what I know, Autoencoder reconstructs input feature, and calculate loss through comparing input feature and reconstructed feature.
        # So, calculating MSE comparing with target_data, which is drug sensitivity, is nonsense. 
        #loss = criterion(reconstructed, target_data)
        
        # here is a new loss function, but it still outputs nan value as a loss...
        loss = criterion(reconstructed, inputs)
        if x < 3:
            
            print(f"Inputs: {inputs}")
            print(f"Reconstructed: {reconstructed}")
        x += 1
        
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config["optimizer"]["clip_norm"])
        optimizer.step()

        total_loss.append(loss.item())
        
    train_loss = np.mean(total_loss)
    scheduler.step(train_loss)

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss:.5f}")
    
print("Training complete!")


Inputs: tensor([[-0.0871, -0.0700,  0.2474,  ...,  0.0938,  0.5390,  0.0820],
        [ 0.1755,  0.2270, -0.3000,  ..., -0.0374,  0.4547,  0.0854],
        [ 0.3202,  0.3223, -0.2270,  ...,  0.1257,  0.2778,  0.1543],
        ...,
        [-0.0774,  0.0447, -0.1058,  ..., -0.0303,  0.3275,  0.1131],
        [-0.1593, -0.0764, -0.0935,  ..., -0.0150,  0.2229,  0.2515],
        [-0.1343, -0.2704,  0.1957,  ...,  0.0759,  0.0411, -0.0151]],
       device='cuda:0')
Reconstructed: tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
Inputs: tensor([[-0.0124,  0.0450, -0.1593,  ...,  0.0401,  0.3951,  0.0974],
        [-0.1035, -0.1015,  0.3489,  ...,  0.1084,  0.4336,  0.0931],
        [ 0.0218, -0.0903,  0.2

In [39]:
# checking structure of the variables 

x = 0
for batch in train_loader:
    #omics_data = {modality: data.to(device) for modality, data in batch.items()}
    #print(type(batch))
#    print(batch)
    
    x +=1
    if x > 0:
        break

#input_dict.items()
print(sample_omics_data.shape[1])
#print(n_train*3)
print(len(batch[0][0]))

1638
1638
