# Package & Definition

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics
from torchmetrics import MeanSquaredError
from sklearn.impute import KNNImputer
import scripts
from functools import lru_cache
import optuna
from sklearn.metrics import pairwise_distances
from sklearn.exceptions import DataConversionWarning
import warnings
from torch.utils.data import Dataset
# Suppress selected warning categories
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# dataset for sim_matrix
class OmicsDataset_dict(Dataset):
    """Map-style dataset pairing per-cell modality features with per-drug features.

    Args:
        omic_dict: mapping ``cell_id -> {modality_name: 1-D tensor}``.
        drug_dict: mapping ``drug_id -> tensor``.
        data: table accessed positionally; column 0 is the cell id, column 1
            the drug id and column 2 the regression target.
    """

    def __init__(self, omic_dict, drug_dict, data):
        self.omic_dict = omic_dict
        self.drug_dict = drug_dict
        # Assign each cell line / drug a running integer id that follows the
        # iteration order of the corresponding dictionary's keys.
        self.cell_mapped_ids = dict(zip(self.omic_dict.keys(), range(len(self.omic_dict))))
        self.drug_mapped_ids = dict(zip(self.drug_dict.keys(), range(len(self.drug_dict))))
        self.data = data

    def __len__(self):
        """Number of (cell, drug, target) rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(omics, drug, target, cell_index, drug_index)`` for row ``idx``."""
        row = self.data.iloc[idx]
        cell_id, drug_id, target = row.iloc[0], row.iloc[1], row.iloc[2]

        # The modalities are flattened into one tensor instead of being
        # returned as a nested dict: a dict here caused problems with the
        # DataLoader's collate_fn.
        modalities = self.omic_dict[cell_id]
        omics_vector = torch.cat(list(modalities.values()))

        cell_index = torch.Tensor([self.cell_mapped_ids[cell_id]])
        drug_index = torch.Tensor([self.drug_mapped_ids[drug_id]])
        return (omics_vector,
                self.drug_dict[drug_id],
                torch.Tensor([target]),
                cell_index,
                drug_index)

In [5]:
# MDA model
class MultimodalAutoencoder(nn.Module):
    """Autoencoder with one encoder per modality, a fusion MLP and one shared decoder.

    The input is a single 2-D tensor in which the modalities are laid out
    side by side along dim 1, in the order given by ``input_dims``.

    Args:
        input_dims: sequence with the feature width of each modality.
        hidden_dim_encoders: hidden width of every encoder and of the decoder.
        embed_dim: width of each per-modality embedding and of the fused latent.
        fusion_dim: hidden width of the fusion MLP.
        dropout_encoders: dropout probability applied to each modality slice
            before it enters its encoder.
    """

    def __init__(self, input_dims,  hidden_dim_encoders = 150, embed_dim = 75, fusion_dim = 150, dropout_encoders = 0.2):
        super().__init__()
        self.input_dims = input_dims
        self.num_modalities = len(input_dims)
        self.do = nn.Dropout(dropout_encoders)

        def build_encoder(in_features):
            # Defaults 150 -> 75 follow the values reported in the reference
            # paper; they are a starting point and can be tuned.
            return nn.Sequential(
                nn.Linear(in_features, hidden_dim_encoders),
                nn.ReLU(),
                nn.Linear(hidden_dim_encoders, embed_dim),
            )

        # One independent encoder per modality.
        self.omics_encoder = nn.ModuleList([build_encoder(width) for width in self.input_dims])

        # Fusion: the concatenated per-modality embeddings are mixed and
        # projected down to the final latent width.
        fusion_layers = [
            nn.Linear(embed_dim * self.num_modalities, fusion_dim),
            nn.ReLU(),
            nn.Linear(fusion_dim, embed_dim),
        ]
        self.fusion_layer = nn.Sequential(*fusion_layers)

        # Decoder: reconstructs all modalities at once from the fused latent.
        # NOTE(review): the paper describes a decoder symmetric to the encoder,
        # and the processed data shipped with it has shape 363x90, which the
        # original author could not reconcile with that description.
        decoder_layers = [
            nn.Linear(embed_dim, hidden_dim_encoders),
            nn.ReLU(),
            nn.Linear(hidden_dim_encoders, sum(input_dims)),
        ]
        self.omics_decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, fused_latent)`` for a batch ``x``."""
        offset = 0
        embeddings = []
        for width, encoder in zip(self.input_dims, self.omics_encoder):
            modality_slice = x[:, offset:offset + width]
            embeddings.append(encoder(self.do(modality_slice)))
            offset += width

        latent_final = self.fusion_layer(torch.cat(embeddings, dim=1))
        return self.omics_decoder(latent_final), latent_final

In [6]:
# Main resnet model
class ResNet(nn.Module):
    """Stack of residual MLP blocks followed by a linear head with one output.

    Args:
        embed_dim: width of the block input/output and of the head input.
        hidden_dim: inner width of every block.
        dropout: dropout probability inside every block.
        n_layers: number of residual blocks.
        norm: ``"layernorm"`` or ``"batchnorm"``; any other value disables
            normalisation (``nn.Identity``).
    """

    def __init__(self, embed_dim=256, hidden_dim=1024, dropout=0.1, n_layers=6, norm="layernorm"):
        super().__init__()

        # Pick the normalisation class applied after the first linear layer.
        norm_layer = (
            nn.LayerNorm if norm == "layernorm"
            else nn.BatchNorm1d if norm == "batchnorm"
            else nn.Identity
        )

        def build_block():
            return nn.Sequential(
                nn.Linear(embed_dim, hidden_dim),
                norm_layer(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, embed_dim),
            )

        self.mlps = nn.ModuleList([build_block() for _ in range(n_layers)])
        self.lin = nn.Linear(embed_dim, 1)  # regression head

    def forward(self, x):
        """Apply every block as an averaged residual update, then the head."""
        hidden = x
        for block in self.mlps:
            # Residual connection, averaged with the block output.
            hidden = (block(hidden) + hidden) / 2
        return self.lin(hidden)


class Main_model(nn.Module):
    """Predictor that embeds cell and drug features, sums them and applies ``ResNet``.

    Args:
        embed_dim: shared embedding width (output of both embedders, input of
            the backbone).
        hidden_dim: hidden width of both embedders and of the backbone blocks.
        dropout: dropout probability in the drug embedder and the backbone.
        n_layers: number of residual blocks in the backbone.
        norm: normalisation selector forwarded to ``ResNet``.
        dropout_omics: dropout after the ReLU of the cell embedder.
        dropout_omics_finetuning: dropout applied to the raw cell input,
            before the first linear layer of the cell embedder.
    """

    def __init__(self, embed_dim=256, hidden_dim=1024, dropout=0.1, n_layers=6, norm="layernorm", 
                 dropout_omics=0.4, dropout_omics_finetuning=0.4):
        super().__init__()

        # Backbone operating on the summed embeddings.
        self.resnet = ResNet(embed_dim=embed_dim, hidden_dim=hidden_dim, dropout=dropout,
                             n_layers=n_layers, norm=norm)

        # Drug embedder: two-layer MLP, dropout after the ReLU. The input
        # width is inferred on the first forward pass (LazyLinear).
        drug_layers = [
            nn.LazyLinear(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
        ]
        self.embed_d = nn.Sequential(*drug_layers)

        # Cell embedder: same shape, with an additional dropout on the input.
        cell_layers = [
            nn.Dropout(dropout_omics_finetuning),
            nn.LazyLinear(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_omics),
            nn.Linear(hidden_dim, embed_dim),
        ]
        self.embed_c = nn.Sequential(*cell_layers)

    def forward(self, c, d):
        """Return the backbone output for cell features ``c`` and drug features ``d``."""
        # The drug branch runs first, as in the original expression, so
        # dropout draws happen in the same order.
        drug_embedding = self.embed_d(d)
        cell_embedding = self.embed_c(c)
        return self.resnet(drug_embedding + cell_embedding)


In [7]:
# get Data function
def _corr_similarity_to_train(frame, train_lines, validation_lines, test_lines, fill_nan=False):
    """Pearson correlation of all train/validation/test rows against the training rows.

    Rows of the result follow the order train -> validation -> test; columns
    are the training cell lines. With ``fill_nan=True`` the correlation matrix
    is passed through ``np.nan_to_num`` before slicing.
    """
    stacked = np.vstack([frame.loc[train_lines],
                         frame.loc[validation_lines],
                         frame.loc[test_lines]])
    corr = np.corrcoef(stacked, rowvar=True)
    if fill_nan:
        corr = np.nan_to_num(corr)
    return corr[:, :len(train_lines)].copy()


@lru_cache(maxsize=None)
def get_data_corr(n_fold = 0, fp_radius = 2, transform_into_corr = True, typ = ("rnaseq", "mutations", "cnvs")):
    """Load the omics / drug / response tables and build train, validation and test datasets.

    Args:
        n_fold: index (0-9) of the cell-line fold used as the test set.
        fp_radius: radius passed to ``scripts.FingerprintFeaturizer``.
        transform_into_corr: if True, every cell line is represented, per
            modality, by its similarity to each *training* cell line (Pearson
            correlation, or 1 - Jaccard distance for mutations). If False the
            raw feature vectors of the selected modalities are concatenated.
        typ: modalities to use, any of ``"rnaseq"``, ``"proteomics"``,
            ``"mutations"``, ``"methylations"``, ``"cnvs"``. Because the
            function is wrapped in ``lru_cache``, an explicitly passed value
            must be hashable (pass a tuple, not a list).

    Returns:
        ``(train_dataset, validation_dataset, test_dataset, dims)`` where
        ``dims`` lists the feature width of each selected modality.
    """
    # --- drugs -------------------------------------------------------------
    smile_dict = pd.read_csv("data/smiles.csv", index_col=0)
    fp = scripts.FingerprintFeaturizer(R = fp_radius)
    # Used below as a mapping keyed by drug id (its keys are matched to DRUG_ID).
    drug_dict = fp(smile_dict.iloc[:, 1], smile_dict.iloc[:, 0])

    # --- omics tables ------------------------------------------------------
    driver_genes = pd.read_csv("data/driver_genes.csv").loc[:, "symbol"].dropna()

    rnaseq = pd.read_csv("data/rnaseq_normcount.csv", index_col=0)
    filtered_rna = rnaseq.loc[:, rnaseq.columns.isin(driver_genes)]

    proteomics = pd.read_csv("data/proteomics.csv", index_col=0)

    # The first data row of the mutation file carries the column names and the
    # rows from position 2 onwards are kept, indexed by "gene_symbol".
    mutation = pd.read_csv("data/binary_mutations.csv")
    mutation.columns = mutation.iloc[0]
    mutation = mutation.iloc[2:,:].set_index("gene_symbol")
    filtered_mut = mutation.loc[:, mutation.columns.isin(driver_genes)].astype(float)

    methylations = pd.read_csv("data/methylations.csv",index_col = 0).sort_index(ascending = True)

    cnvs = pd.read_csv("data/copy_number_variations.csv",index_col= 0)

    # Inner join on the index: only cell lines present in every table survive.
    data_concat = pd.concat([filtered_rna, proteomics, filtered_mut, methylations, cnvs], axis=1, join='inner')

    # Restrict every modality to those common cell lines.
    common = data_concat.index
    filtered_rna = filtered_rna[filtered_rna.index.isin(common)]
    proteomics = proteomics[proteomics.index.isin(common)]
    filtered_mut = filtered_mut[filtered_mut.index.isin(common)]
    methylations = methylations[methylations.index.isin(common)]
    cnvs = cnvs[cnvs.index.isin(common)]

    cell_dict = {}

    if not transform_into_corr:
        # Raw representation: concatenate the selected modalities in a fixed
        # canonical order (independent of the order inside `typ`).
        canonical = [("rnaseq", filtered_rna),
                     ("proteomics", proteomics),
                     ("mutations", filtered_mut),
                     ("methylations", methylations),
                     ("cnvs", cnvs)]
        selected = [frame for name, frame in canonical if name in typ]
        dims = [frame.shape[1] for frame in selected]

        for cell in data_concat.index:
            pieces = [frame.loc[cell].to_numpy() for frame in selected]
            cell_dict[cell] = torch.Tensor(np.concatenate(pieces))

    # --- drug response (GDSC) ---------------------------------------------
    GDSC1 = pd.read_csv("data/GDSC1.csv", index_col=0)
    # Drop rows whose cell line or drug has no features.
    data = GDSC1.query("SANGER_MODEL_ID in @data_concat.index & DRUG_ID in @drug_dict.keys()")
    unique_cell_lines = data.loc[:, "SANGER_MODEL_ID"].unique()

    # --- cell-line-wise 10-fold split ---------------------------------------
    np.random.seed(420) # for comparability, don't change it!
    np.random.shuffle(unique_cell_lines)
    folds = np.array_split(unique_cell_lines, 10)
    train_idxs = list(range(10))
    train_idxs.remove(n_fold)
    np.random.seed(420)
    validation_idx = np.random.choice(train_idxs)
    train_idxs.remove(validation_idx)
    train_lines = np.concatenate([folds[idx] for idx in train_idxs])
    validation_lines = folds[validation_idx]
    test_lines = folds[n_fold]

    train_data = data.query("SANGER_MODEL_ID in @train_lines")
    validation_data = data.query("SANGER_MODEL_ID in @validation_lines")
    test_data = data.query("SANGER_MODEL_ID in @test_lines")

    if transform_into_corr:
        # One similarity matrix per modality: rows are ordered
        # train -> validation -> test, columns are the training cell lines.
        similarity_matrices = {}

        if "rnaseq" in typ:
            similarity_matrices["rnaseq"] = _corr_similarity_to_train(
                filtered_rna, train_lines, validation_lines, test_lines)

        if "proteomics" in typ:
            similarity_matrices["proteomics"] = _corr_similarity_to_train(
                proteomics, train_lines, validation_lines, test_lines)

        if "mutations" in typ:
            train_snp = filtered_mut.loc[train_lines].astype(bool)
            val_snp = filtered_mut.loc[validation_lines].astype(bool)
            test_snp = filtered_mut.loc[test_lines].astype(bool)

            # Jaccard similarity = 1 - Jaccard distance, always against train.
            train = 1 - pairwise_distances(train_snp.values, train_snp.values, metric="jaccard")
            val = 1 - pairwise_distances(val_snp.values, train_snp.values, metric="jaccard")
            test = 1 - pairwise_distances(test_snp.values, train_snp.values, metric="jaccard")
            similarity_matrices["mutations"] = np.vstack([train, val, test])

        if "methylations" in typ:
            similarity_matrices["methylations"] = _corr_similarity_to_train(
                methylations, train_lines, validation_lines, test_lines, fill_nan=True)

        if "cnvs" in typ:
            similarity_matrices["cnvs"] = _corr_similarity_to_train(
                cnvs, train_lines, validation_lines, test_lines, fill_nan=True)

        dims = [matrix.shape[1] for matrix in similarity_matrices.values()]

        # Row lookup must follow the stacking order used above. The shuffled
        # `unique_cell_lines` is ordered fold 0..9, which generally differs
        # from train -> validation -> test, so indexing rows by the position in
        # `unique_cell_lines` would hand a cell another cell's features.
        ordered_lines = np.concatenate([train_lines, validation_lines, test_lines])
        row_of_cell = {cell: row for row, cell in enumerate(ordered_lines)}

        # Convert each matrix to a tensor once instead of once per cell.
        sim_tensors = {data_type: torch.Tensor(similarity_matrices[data_type]) for data_type in typ}

        # Iterate `unique_cell_lines` so the key order of cell_dict (and with
        # it the integer ids assigned by OmicsDataset_dict) is unchanged.
        cell_dict = {
            cell: {data_type: sim_tensors[data_type][row_of_cell[cell]] for data_type in typ}
            for cell in unique_cell_lines
        }

        return (OmicsDataset_dict(cell_dict, drug_dict, train_data),
                OmicsDataset_dict(cell_dict, drug_dict, validation_data),
                OmicsDataset_dict(cell_dict, drug_dict, test_data),
                dims)

    return (scripts.OmicsDataset(cell_dict, drug_dict, train_data),
            scripts.OmicsDataset(cell_dict, drug_dict, validation_data),
            scripts.OmicsDataset(cell_dict, drug_dict, test_data),
            dims)

In [8]:
def chain_train_step(autoencoder, resnet, optimizer, loader, config, device):
    """
    Run one end-to-end training pass in which the autoencoder and the
    predictor are optimised jointly.

    Args:
        autoencoder: module returning ``(reconstruction, latent)`` for a batch.
        resnet: module called as ``resnet(latent, drug_features)``.
        optimizer: optimizer holding the parameters of both modules.
        loader: iterable of batches; element 0 is the omics input, element 1
            the drug input and element 2 the target.
        config: dict providing ``config["optimizer"]["clip_norm"]``.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean prediction loss, mean reconstruction loss)`` over all batches.
    """
    reconstruction_criterion = nn.MSELoss()  # autoencoder reconstruction loss
    prediction_criterion = nn.MSELoss()      # response prediction loss
    prediction_losses = []
    reconstruction_losses = []

    autoencoder.train()
    resnet.train()

    for batch in loader:
        # Move the batch to the target device once and reuse the tensors.
        omics = batch[0].to(device)
        drugs = batch[1].to(device)
        targets = batch[2].to(device)

        optimizer.zero_grad()

        # Step 1: autoencoder forward pass produces the latent features.
        reconstructed, latent_features = autoencoder(omics)

        # Step 2: predictor forward pass on latent features + drug features.
        out = resnet(latent_features, drugs)

        # Step 3: the two loss terms.
        loss_recon = reconstruction_criterion(reconstructed, omics)
        loss_pred = prediction_criterion(out.squeeze(), targets.squeeze())

        # Steps 4-5: back-propagate the unweighted sum of both losses, clip
        # the joint gradient norm and update.
        (loss_recon + loss_pred).backward()
        torch.nn.utils.clip_grad_norm_(
            [*autoencoder.parameters(), *resnet.parameters()],
            config["optimizer"]["clip_norm"],
        )
        optimizer.step()

        # Step 6: record the per-batch losses.
        prediction_losses.append(loss_pred.item())
        reconstruction_losses.append(loss_recon.item())

    return np.mean(prediction_losses), np.mean(reconstruction_losses)


def _infer_input_dims(dataset):
    """Best-effort per-modality widths from a dataset exposing ``omic_dict``.

    Works for datasets whose ``omic_dict`` maps each cell to a dict of
    ``{modality: 1-D tensor}`` (the layout used by ``OmicsDataset_dict``).
    Returns ``None`` when the layout is different or the dataset is empty.
    """
    omic_dict = getattr(dataset, "omic_dict", None)
    if not omic_dict:
        return None
    first_entry = next(iter(omic_dict.values()))
    if not isinstance(first_entry, dict):
        return None
    return [len(features) for features in first_entry.values()]


def train_auto_resnet_chain(config, train_dataset, validation_dataset=None, use_momentum=True, callback_epoch=None,
                            input_dims=None):
    """Jointly train a ``MultimodalAutoencoder`` and a ``Main_model`` predictor.

    Args:
        config: nested dict with the ``"optimizer"``, ``"model"`` and ``"env"``
            sections read below.
        train_dataset: dataset used for training.
        validation_dataset: optional dataset evaluated after every epoch.
        use_momentum: if True, the reported validation score is smoothed as
            ``0.2 * previous + 0.8 * current`` from the second epoch on.
        callback_epoch: optional ``callback(epoch, val_target)`` called instead
            of printing the per-epoch log line.
        input_dims: per-modality input widths for the autoencoder. When
            omitted they are inferred from ``train_dataset`` if it exposes an
            ``omic_dict`` of per-modality tensors; otherwise a module-level
            ``input_dims`` variable is used (the previous implicit behaviour).

    Returns:
        ``(val_target, autoencoder, resnet)``; ``val_target`` is ``None`` when
        no validation dataset is given or no epoch was run.

    Raises:
        ValueError: if ``input_dims`` cannot be determined.
    """
    # Resolve the autoencoder input layout without depending on notebook
    # globals: explicit argument > dataset > legacy global variable.
    if input_dims is None:
        input_dims = _infer_input_dims(train_dataset)
    if input_dims is None:
        input_dims = globals().get("input_dims")
    if input_dims is None:
        raise ValueError(
            "input_dims could not be determined; pass input_dims explicitly "
            "(the per-modality widths returned by get_data_corr)."
        )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config["optimizer"]["batch_size"],
        drop_last=True,
        shuffle=True
    )

    if validation_dataset is not None:
        val_loader = torch.utils.data.DataLoader(
            validation_dataset,
            batch_size=config["optimizer"]["batch_size"],
            drop_last=False,
            shuffle=False
        )

    autoencoder = MultimodalAutoencoder(
        input_dims=input_dims,
        hidden_dim_encoders=config["model"]["hidden_dim_encoders"],
        embed_dim=config["model"]["embed_dim"],
        fusion_dim=config["model"]["fusion_dim"],
        dropout_encoders=config["model"]["dropout_encoders"]
    )

    resnet = Main_model(
        embed_dim=config["model"]["embed_dim"],
        hidden_dim=config["model"]["hidden_dim"], 
        dropout=config["model"]["dropout"], 
        n_layers=config["model"]["n_layers"],  
        dropout_omics=config["model"]["dropout_omics"], 
        dropout_omics_finetuning=config["model"]["dropout_omics_finetuning"],
        norm=config["model"]["norm"]
    )

    # A single optimizer updates both networks.
    optimizer = torch.optim.Adam(list(autoencoder.parameters()) + list(resnet.parameters()), config["optimizer"]["learning_rate"])
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
    early_stop = scripts.EarlyStop(config["optimizer"]["stopping_patience"])
    device = torch.device(config["env"]["device"])
    autoencoder.to(device)
    resnet.to(device)
    autoencoder.train()
    resnet.train()

    metrics = torchmetrics.MetricTracker(torchmetrics.MetricCollection({
        "R_cellwise_residuals": scripts.GroupwiseMetric(
            metric=torchmetrics.functional.pearson_corrcoef,
            grouping="drugs",
            average="macro",
            residualize=True
        ),
        "R_cellwise": scripts.GroupwiseMetric(
            metric=torchmetrics.functional.pearson_corrcoef,
            grouping="cell_lines",
            average="macro",
            residualize=False
        ),
        "MSE": torchmetrics.MeanSquaredError()
    }))
    metrics.to(device)

    num_epochs = config["env"]["max_epochs"]

    # Defined up front so the return value exists even if no epoch runs.
    val_target = None

    for epoch in range(num_epochs):
        # --- training step ---
        train_loss_pred, train_loss_recon = chain_train_step(autoencoder, resnet, optimizer, train_loader, config, device)
        total_train_loss = train_loss_pred + train_loss_recon

        # NOTE(review): the scheduler (and early stopping below) are driven by
        # the training loss, not by a validation metric.
        lr_scheduler.step(total_train_loss)

        # --- validation step ---
        if validation_dataset is not None:
            with torch.no_grad():
                validation_metrics = evaluate_step(autoencoder, resnet, val_loader, metrics, device)

                # Exponentially smoothed validation score.
                if epoch > 0 and use_momentum:
                    val_target = 0.2 * val_target + 0.8 * validation_metrics['R_cellwise_residuals']
                else:
                    val_target = validation_metrics['R_cellwise_residuals']
        else:
            val_target = None

        # --- logging ---
        if callback_epoch is None:
            print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {total_train_loss} | Validation R: {val_target}")
        else:
            callback_epoch(epoch, val_target)

        # --- early stopping ---
        if early_stop(total_train_loss):
            print("⏹ Early Stopping Triggered. Stopping Training.")
            break

    print("✅ Training Complete!")

    return val_target, autoencoder, resnet
    

    

In [9]:
# evaluation step
def evaluate_step(autoencoder, resnet, loader, metrics, device):
    """Evaluate the autoencoder + predictor pair on ``loader``.

    Args:
        autoencoder: module returning ``(reconstruction, latent)`` for a batch.
        resnet: module called as ``resnet(latent, drug_features)``.
        loader: iterable of batches; elements 0-4 are omics input, drug input,
            target, cell-line index and drug index.
        metrics: tracker exposing ``increment()``, ``update(preds, target,
            cell_lines=..., drugs=...)`` and ``compute()`` returning a mapping
            of name -> scalar tensor.
        device: device the batch tensors are moved to.

    Returns:
        dict mapping each metric name to a Python float.
    """
    metrics.increment()  # start a new tracking step
    autoencoder.eval()
    resnet.eval()

    with torch.no_grad():
        for batch in loader:
            _, latent_features = autoencoder(batch[0].to(device))
            out = resnet(latent_features, batch[1].to(device))

            # reshape(-1) keeps the batch dimension even for a batch of one
            # sample; squeeze() would collapse it to a 0-dim tensor (the last
            # batch can have size 1 because the loader keeps partial batches).
            metrics.update(out.reshape(-1),
                           batch[2].to(device).reshape(-1),
                           cell_lines=batch[3].to(device).reshape(-1),
                           drugs=batch[4].to(device).reshape(-1))

    return {name: value.item() for name, value in metrics.compute().items()}


In [9]:
# config
config = {
    # Input featurisation
    "features": {
        "fp_radius": 2,
        "use_correlation_representation": True,
        "num_modalities": 4,
    },
    # Optimisation settings
    "optimizer": {
        "batch_size": 220,
        "clip_norm": 19,
        "learning_rate": 0.0004592646200179472,
        "stopping_patience": 15,
        "pre_batch_size": 200,
        "lr_pretraining": 0.0004592646200179472,
    },
    # Architecture
    "model": {
        "embed_dim": 485,                    # shared by the autoencoder and the predictor
        "hidden_dim": 696,
        "dropout": 0.48541242824674574,
        "n_layers": 4,
        "norm": "batchnorm",
        "hidden_dim_encoders": 256,          # autoencoder
        "fusion_dim": 700,                   # autoencoder
        "dropout_encoders": 0.2,
        "dropout_omics": 0.4,                # second dropout of the cell embedder
        "dropout_omics_finetuning": 0.4,     # first dropout of the cell embedder
        "pre_training_epochs": 100,
    },
    # Runtime environment
    "env": {
        "fold": 0,
        "device": "cuda:0",
        "max_epochs": 100,
        "search_hyperparameters": False,
    },
}

In [2]:
#Best Hyperparameters: {'embed_dim': 493, 'hidden_dim_encoders': 868, 'fusion_dim': 592, 'dropout_encoders': 0.2670033232985919, 'pre_training_epochs': 160, 'hidden_dim': 1982, 'dropout': 0.2965716546354523, 'n_layers': 5, 'dropout_omics': 0.8626326149215306, 'dropout_omics_finetuning': 0.5489897453017749, 'batch_size': 261, 'clip_norm': 10, 'learning_rate': 3.3761769665931243e-06, 'lr_pretraining': 1.435539966576822e-06, 'pre_batch_size': 81}
# new best config
# config
# NOTE: this assignment rebinds `config`, replacing any value set by an earlier cell.
config = {
    # Input featurisation
    "features": {
        "fp_radius": 2,
        "use_correlation_representation": True,
        "num_modalities": 4,
    },
    # Optimisation settings
    "optimizer": {
        "batch_size": 261,
        "clip_norm": 10,
        "learning_rate": 3.3761769665931243e-06,
        "stopping_patience": 15,
        "pre_batch_size": 81,
        "lr_pretraining": 1.435539966576822e-06,
    },
    # Architecture
    "model": {
        "embed_dim": 493,                                  # shared by the autoencoder and the predictor
        "hidden_dim": 1982,
        "dropout": 0.2965716546354523,
        "n_layers": 5,
        "norm": "batchnorm",
        "hidden_dim_encoders": 868,                        # autoencoder
        "fusion_dim": 592,                                 # autoencoder
        "dropout_encoders": 0.2670033232985919,
        "dropout_omics": 0.8626326149215306,               # second dropout of the cell embedder
        "dropout_omics_finetuning": 0.5489897453017749,    # first dropout of the cell embedder
        "pre_training_epochs": 160,
    },
    # Runtime environment
    "env": {
        "fold": 0,
        "device": "cuda:0",
        "max_epochs": 100,
        "search_hyperparameters": False,
    },
}

# Hyperparameter Optimization

In [10]:
import optuna

def train_model_optuna(trial, config):
    """
    Optuna objective that tunes the Autoencoder and the ResNet together.

    Samples one set of hyperparameters from ``trial``, loads the data, runs
    ``train_auto_resnet_chain`` and returns its validation score.

    Args:
        trial: Optuna trial; ``suggest_int``, ``suggest_float`` and ``number``
            are used.
        config: Nested configuration dict with "model", "optimizer",
            "features" and "env" sections. It is not modified: the sampled
            values are written to a private deep copy, so a failed or
            interrupted trial cannot leave stray values in the caller's
            config.

    Returns:
        The first value returned by ``train_auto_resnet_chain`` (reported as
        the validation R_cellwise_residuals, to be maximized), or ``-inf``
        if training raised an exception.
    """
    # Function-scope imports keep this block self-contained.
    import copy
    import traceback

    trial_config = copy.deepcopy(config)
    model_cfg = trial_config["model"]
    optim_cfg = trial_config["optimizer"]

    ### 1️⃣ Hyperparameter sampling ###
    # Autoencoder
    model_cfg["embed_dim"] = trial.suggest_int("embed_dim", 64, 512)
    model_cfg["hidden_dim_encoders"] = trial.suggest_int("hidden_dim_encoders", 64, 1024)
    model_cfg["fusion_dim"] = trial.suggest_int("fusion_dim", 64, 1024)
    model_cfg["dropout_encoders"] = trial.suggest_float("dropout_encoders", 0.1, 0.5)
    model_cfg["pre_training_epochs"] = trial.suggest_int("pre_training_epochs", 10, 200)

    # ResNet
    model_cfg["hidden_dim"] = trial.suggest_int("hidden_dim", 256, 2048)
    model_cfg["dropout"] = trial.suggest_float("dropout", 0.1, 0.5)
    model_cfg["n_layers"] = trial.suggest_int("n_layers", 2, 10)
    model_cfg["dropout_omics"] = trial.suggest_float("dropout_omics", 0.1, 0.9)
    model_cfg["dropout_omics_finetuning"] = trial.suggest_float("dropout_omics_finetuning", 0.1, 0.9)

    # Optimizer
    optim_cfg["batch_size"] = trial.suggest_int("batch_size", 64, 512)
    optim_cfg["clip_norm"] = trial.suggest_int("clip_norm", 5, 20)
    optim_cfg["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True)
    optim_cfg["lr_pretraining"] = trial.suggest_float("lr_pretraining", 1e-6, 1e-2, log=True)
    optim_cfg["pre_batch_size"] = trial.suggest_int("pre_batch_size", 64, 512)

    ### 2️⃣ Data loading ###
    # Kept outside the try block on purpose: a data-loading failure does not
    # depend on the sampled values, so it should stop the study rather than be
    # recorded as a failed trial.
    train_dataset, val_dataset, test_dataset, input_dims = get_data_corr(
        n_fold=trial_config["env"]["fold"],
        fp_radius=trial_config["features"]["fp_radius"],
        transform_into_corr=trial_config["features"]["use_correlation_representation"],
        typ=("rnaseq", "proteomics", "mutations", "methylations")
    )

    ### 3️⃣ Model training (Autoencoder + ResNet trained together) ###
    try:
        val_target, autoencoder, resnet = train_auto_resnet_chain(
            trial_config, train_dataset, val_dataset, use_momentum=True
        )
        print(f"Trial {trial.number}: Validation R_cellwise_residuals = {val_target:.5f}")

        return val_target  # optimization target (maximize the validation R value)

    except Exception as e:
        # Best effort: keep the study running, but show the full traceback so a
        # trial that ends in -inf can actually be diagnosed from the log.
        print(f"Error in trial {trial.number}: {e}")
        traceback.print_exc()
        return -float("inf")  # worst possible value for a failed trial


'\ndef train_model_optuna(trial, config):\n\n    ### 1️⃣ Pre-training 하이퍼파라미터 샘플링 ###\n    config["model"]["embed_dim"] = trial.suggest_int("embed_dim", 64, 512)\n    config["model"]["hidden_dim_encoders"] = trial.suggest_int("hidden_dim_encoders", 64, 2048)\n    config["model"]["fusion_dim"] = trial.suggest_int("fusion_dim", 64, 2048)\n    config["model"]["dropout_encoders"] = trial.suggest_float("dropout_encoders", 0.0, 0.5)\n    config["model"]["pre_training_epochs"] = trial.suggest_int("pre_training_epochs", 1, 500)\n    config["optimizer"]["pre_batch_size"] = trial.suggest_int("pre_batch_size", 128, 512)\n    config["optimizer"]["lr_pretraining"] = trial.suggest_float("lr_pretraining", 1e-6, 1e-1, log=True)\n\n    ### 2️⃣ Pre-training 실행 ###\n    try:\n        #pre_raw_dataset = torch.utils.data.ConcatDataset([pre_raw_train_dataset, pre_raw_val_dataset])\n        pre_trained_model, pre_training_losses = pre_train_step(config, pre_raw_train_dataset, input_dims=pre_raw_dims)\n\n    

In [11]:
# 1️⃣ Optional hyperparameter search with Optuna
if config["env"]["search_hyperparameters"]:
    import os  # only needed on the search path

    study_name = "auto_resnet_chain_opt"
    # SQLite cannot create missing parent directories, so make sure "studies/"
    # exists before the storage is opened (e.g. on a fresh checkout).
    os.makedirs("studies", exist_ok=True)
    storage_name = f"sqlite:///studies/{study_name}.db"

    study = optuna.create_study(
        study_name=study_name,
        storage=storage_name,
        direction='maximize',  # maximize the validation R value
        load_if_exists=True,
        # NOTE(review): train_model_optuna, as written in this notebook, never
        # calls trial.report()/trial.should_prune(), so this pruner presumably
        # has no effect — confirm.
        pruner=optuna.pruners.MedianPruner(n_startup_trials=30, n_warmup_steps=5, interval_steps=5)
    )

    study.optimize(lambda trial: train_model_optuna(trial, config), n_trials=40)

    best_config = study.best_params
    print("Best Hyperparameters:", best_config)

    # Write the best hyperparameters back into config. Each tuned name lives in
    # exactly one config section; best_config is flat (name -> value).
    tuned_params = {
        "model": (
            "embed_dim",
            "hidden_dim_encoders",
            "fusion_dim",
            "dropout_encoders",
            "pre_training_epochs",
            "hidden_dim",
            "dropout",
            "n_layers",
            "dropout_omics",
            "dropout_omics_finetuning",
        ),
        "optimizer": (
            "batch_size",
            "clip_norm",
            "learning_rate",
            "lr_pretraining",
            "pre_batch_size",
        ),
    }
    for section_name, param_names in tuned_params.items():
        for param_name in param_names:
            config[section_name][param_name] = best_config[param_name]

# 2️⃣ Run training with the (possibly updated) hyperparameters
train_dataset, val_dataset, test_dataset, input_dims = get_data_corr(
    n_fold=config["env"]["fold"],
    fp_radius=config["features"]["fp_radius"],
    transform_into_corr=config["features"]["use_correlation_representation"],
    typ=("rnaseq", "proteomics", "mutations", "methylations")
)

val_target, autoencoder, resnet = train_auto_resnet_chain(config, train_dataset, val_dataset, use_momentum=True)

print(f"Final Training Complete! Final R_cellwise_residuals: {val_target:.5f}")

'\n# 1️⃣ Optuna를 사용한 Hyperparameter Optimization (Validation 데이터 필요)\nif config["env"]["search_hyperparameters"]:\n    study_name = "raw_pretrained"\n    storage_name = f"sqlite:///studies/{study_name}.db"\n\n    study = optuna.create_study(\n        study_name=study_name,\n        storage=storage_name,\n        direction=\'maximize\',  # Fine-tuning 성능을 최적화 (R_cellwise_residuals 최대화)\n        load_if_exists=True,\n        pruner=optuna.pruners.MedianPruner(n_startup_trials=30, n_warmup_steps=5, interval_steps=5)\n    )\n\n    study.optimize(lambda trial: train_model_optuna(trial, config), n_trials=40)\n\n    best_config = study.best_params\n    print("Best Hyperparameters:", best_config)\n\n    # 최적의 하이퍼파라미터를 config에 반영\n    config["model"]["embed_dim"] = best_config["embed_dim"]\n    config["model"]["hidden_dim_encoders"] = best_config["hidden_dim_encoders"]\n    config["model"]["fusion_dim"] = best_config["fusion_dim"]\n    config["model"]["dropout_encoders"] = best_config["dropout_

# Model training 
## Get data

In [None]:
# Load the datasets from raw data: the correlation transform is switched off
# here regardless of config["features"]["use_correlation_representation"].
train_dataset, val_dataset, test_dataset, input_dims = get_data_corr(
    n_fold=config["env"]["fold"],
    fp_radius=config["features"]["fp_radius"],
    transform_into_corr=False,
    typ=("rnaseq", "proteomics", "mutations", "methylations"),
)



여기서 Autoencoder의 evaluation은 수행하지 않는다. Autoencoder를 단독으로 평가하는 것이 Test distribution과 맞지 않을 수 있기 때문이다. 여기서 이걸 먼저 평가하게 되면 Test set의 정보가 노출되어 편향된 결과가 나타날 수 있다. 

또한, 단순 Reconstruction Loss로 이를 중간에 평가하는 것이 애초에 의미가 없을 가능성이 높다. 

그렇다면, 두 모델을 따로 트레이닝 할 것인지, 한번에 트레이닝 할 것인지 정해야겠다. 

1. Autoencoder pretraining -> encoder frozen, used as feature extractor -> Resnet training
   - Autoencoder가 단순 Reconstruction Loss로 학습되므로, ResNet에 필요한 Feature를 충분히 학습하지 못할 가능성이 있음.
2. Autoencoder -> latent feature -> Resnet
   - 설계가 복잡해짐.
   - 두 모델을 한번에 트레이닝하므로, 학습이 불안정할 수 있음. 


둘 다 해볼 건데, 여기서는 2번을 할 것이다. 간략한 플로우는 다음과 같다.

- autoencoder와 resnet을 동시에 학습.
- 이 과정에서 reconstruction loss와 prediction loss를 합산하여 backpropagation한다. 

## Autoencoder training

여기서는 Autoencoder와 Resnet을 한 시스템으로 묶어버리도록 하겠다. 

In [31]:

# Final fit on train + validation combined. No validation set is passed, so
# the first return value is discarded.
_, autoencoder, resnet = train_auto_resnet_chain(
    config,
    torch.utils.data.ConcatDataset([train_dataset, val_dataset]),
    validation_dataset=None,
    use_momentum=True,
    callback_epoch=None,
)

# Metrics for the test evaluation: two group-wise Pearson correlations and MSE.
metrics = torchmetrics.MetricTracker(
    torchmetrics.MetricCollection(
        {
            "R_cellwise_residuals": scripts.GroupwiseMetric(
                metric=torchmetrics.functional.pearson_corrcoef,
                grouping="drugs",
                average="macro",
                residualize=True,
            ),
            "R_cellwise": scripts.GroupwiseMetric(
                metric=torchmetrics.functional.pearson_corrcoef,
                grouping="cell_lines",
                average="macro",
                residualize=False,
            ),
            "MSE": torchmetrics.MeanSquaredError(),
        }
    )
)
device = torch.device(config["env"]["device"])
metrics.to(device)

# Test loader keeps every sample, in its original order.
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=config["optimizer"]["batch_size"],
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

Epoch 1/100 | Train Loss: 4.755145411885786 | Validation R: None
Epoch 2/100 | Train Loss: 2.0263939628552516 | Validation R: None
Epoch 3/100 | Train Loss: 1.9226895056460565 | Validation R: None
Epoch 4/100 | Train Loss: 1.8374938751545333 | Validation R: None
Epoch 5/100 | Train Loss: 1.7872781598335092 | Validation R: None
Epoch 6/100 | Train Loss: 1.7407941228384192 | Validation R: None
Epoch 7/100 | Train Loss: 1.7052333491698761 | Validation R: None
Epoch 8/100 | Train Loss: 1.681087858386567 | Validation R: None
Epoch 9/100 | Train Loss: 1.6516902321269884 | Validation R: None
Epoch 10/100 | Train Loss: 1.6366012364824116 | Validation R: None
Epoch 11/100 | Train Loss: 1.6168349851991173 | Validation R: None
Epoch 12/100 | Train Loss: 1.60428890237834 | Validation R: None
Epoch 13/100 | Train Loss: 1.5756864525681222 | Validation R: None
Epoch 14/100 | Train Loss: 1.5626385465933201 | Validation R: None
Epoch 15/100 | Train Loss: 1.5503592393976495 | Validation R: None
Epoch 16

# Evaluation

트레이닝된 Autoencoder의 encoder와, Resnet을 하나의 시스템으로 사용한다. 

In [32]:
# Evaluate the trained autoencoder + ResNet pair on the test loader.
# Despite its name, `autoencoder_resnet_chain` holds the value returned by
# evaluate_step (the recorded output shows a dict of metric values).
autoencoder_resnet_chain = evaluate_step(
    autoencoder,
    resnet,
    test_dataloader,
    metrics,
    device,
)
print(f"main model final metrics: {autoencoder_resnet_chain}")

  return torch.linalg.solve(A, Xy).T


main model final metrics: {'MSE': 1.5947002172470093, 'R_cellwise': 0.8947927355766296, 'R_cellwise_residuals': 0.3000485599040985}


# 디버깅

In [16]:
# Trial number -> full log line reporting that trial's result.
trial_configs = {}

# Scan the slurm log for Optuna "Trial N finished with value ..." lines.
# NOTE(review): every finished trial is kept, not only failed ones, although
# the heading printed below calls them trials with errors — confirm intent.
with open("slurm-984849.out", "r") as log_file:
    for raw_line in log_file:
        entry = raw_line.strip()

        if not ("Trial" in entry and "finished with value" in entry):
            continue

        try:
            # The token right after the first "Trial" is the trial number.
            trial_id = int(entry.split("Trial")[1].split()[0])
        except (IndexError, ValueError) as e:
            print(f"⚠️ Trial ID 추출 오류 발생: {entry} | 오류: {e}")
        else:
            trial_configs[trial_id] = entry

# Print the collected lines in ascending trial order.
print("\n📌 오류 발생한 Trial 목록:\n")
for trial_id in sorted(trial_configs):
    print(f"🔹 Trial {trial_id}:")
    print(f"   {trial_configs[trial_id]}\n")




📌 오류 발생한 Trial 목록:

🔹 Trial 0:
   [I 2025-03-05 03:04:33,360] Trial 0 finished with value: -inf and parameters: {'embed_dim': 160, 'hidden_dim_encoders': 383, 'fusion_dim': 536, 'dropout_encoders': 0.3968670519414298, 'pre_training_epochs': 57, 'hidden_dim': 1972, 'dropout': 0.2785211502629435, 'n_layers': 8, 'dropout_omics': 0.13501467796110936, 'dropout_omics_finetuning': 0.520701483169283, 'batch_size': 448, 'clip_norm': 11, 'learning_rate': 0.00012791306882120737, 'lr_pretraining': 0.0005278582460096422, 'pre_batch_size': 212}. Best is trial 0 with value: -inf.

🔹 Trial 1:
   [I 2025-03-05 03:04:48,224] Trial 1 finished with value: -inf and parameters: {'embed_dim': 277, 'hidden_dim_encoders': 813, 'fusion_dim': 546, 'dropout_encoders': 0.29253026920656744, 'pre_training_epochs': 135, 'hidden_dim': 1684, 'dropout': 0.417402013600224, 'n_layers': 3, 'dropout_omics': 0.5554952178297965, 'dropout_omics_finetuning': 0.7135591469446555, 'batch_size': 233, 'clip_norm': 5, 'learning_rate

In [19]:
# Display the config currently held by the kernel.
# NOTE(review): the recorded output differs from the literal defined in the
# config cell (e.g. batch_size 136 vs 261) — presumably left over from a
# hyperparameter-search run that modified config in place; confirm.
config

{'features': {'fp_radius': 2,
  'use_correlation_representation': True,
  'num_modalities': 4},
 'optimizer': {'batch_size': 136,
  'clip_norm': 19,
  'learning_rate': 0.00011257996059322072,
  'stopping_patience': 15,
  'pre_batch_size': 492,
  'lr_pretraining': 8.066275067561858e-06},
 'model': {'embed_dim': 131,
  'hidden_dim': 1825,
  'dropout': 0.49993161850928136,
  'n_layers': 9,
  'norm': 'batchnorm',
  'hidden_dim_encoders': 759,
  'fusion_dim': 428,
  'dropout_encoders': 0.32533743849934327,
  'dropout_omics': 0.5255490177787787,
  'dropout_omics_finetuning': 0.6911248618812798,
  'pre_training_epochs': 73},
 'env': {'fold': 0,
  'device': 'cuda:0',
  'max_epochs': 100,
  'search_hyperparameters': False}}