# Package & Definition

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics
from torchmetrics import MeanSquaredError
from sklearn.impute import KNNImputer
import scripts
from functools import lru_cache
import optuna
from sklearn.metrics import pairwise_distances
from sklearn.exceptions import DataConversionWarning
import warnings
from torch.utils.data import Dataset
from datetime import datetime
import uuid 
# 경고 무시 설정
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset for sim_matrix
class OmicsDataset_dict(Dataset): 
    def __init__(self, omic_dict, drug_dict, data): 
        self.omic_dict = omic_dict
        self.drug_dict = drug_dict
        self.cell_mapped_ids = {key:i for i, key in enumerate(self.omic_dict.keys())}
        # omic_dict의 키를 고유한 인덱스로 매핑
        # enumerate는 키들을 순서대로 열거하여 (인덱스, 키) 형태의 튜플로 반환
        # 딕셔너리 컴프레헨션: 각 키를 key로, 각 키의 인덱스를 i로 사용하여 {key:i}형태로 매핑된 딕셔너리 만듬.
        self.drug_mapped_ids = {key:i for i, key in enumerate(self.drug_dict.keys())}
        self.data = data
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx): # idx = train_data
        instance = self.data.iloc[idx] 
        cell_id = instance.iloc[0]
        drug_id = instance.iloc[1]
        target = instance.iloc[2]
        
        #omics_data = { # usage of dictionary here causes a problem or crash with collate_fn function in Dataloader 
        #    cell_id : {
        #        data_type: self.omic_dict[cell_id][data_type] for data_type in self.omic_dict[cell_id].keys()
        #    }
        #}
        
        return (torch.cat([self.omic_dict[cell_id][modality] for modality in self.omic_dict[cell_id].keys()]), 
                self.drug_dict[drug_id],
                torch.Tensor([target]),
                torch.Tensor([self.cell_mapped_ids[cell_id]]),
                torch.Tensor([self.drug_mapped_ids[drug_id]])) 

In [3]:
# MDA model
class MultimodalAutoencoder(nn.Module):
    def __init__(self, input_dims,  hidden_dim_encoders = 150, embed_dim = 75, fusion_dim = 150, dropout_encoders = 0.2):
        # get input as a dictionary
        super(MultimodalAutoencoder, self).__init__()
        # EEEEEEEEEEncoder
        self.input_dims = input_dims
        self.num_modalities = len(input_dims)
        self.do = nn.Dropout(dropout_encoders)

        self.omics_encoder = nn.ModuleList([
            nn.Sequential(
                nn.Linear(mod_input_dim, hidden_dim_encoders), # input 
                nn.ReLU(),
                nn.Linear(hidden_dim_encoders, embed_dim) # encoder hidden layer: 150, 75 as the value from the paper. so we start from this 
            )                                 # I dont get why they used 150, 75 for dimension, but we can tune it later
            for mod_input_dim in self.input_dims
        ])
        # fused latent feature 
        self.fusion_layer = nn.Sequential( # I think we need a fusion layer here, to combine the data modalities
            nn.Linear(embed_dim * self.num_modalities, fusion_dim),
            nn.ReLU(),
            nn.Linear(fusion_dim, embed_dim) # This concatenate latent features of all omics data, and fusion them and make its dim final latent dim
        )                                     # This is the only way I can think of to fuse omics data
        # decoder
        self.omics_decoder = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim_encoders),
            nn.ReLU(),
            nn.Linear(hidden_dim_encoders, sum(input_dims))
        )
        # I actually dont understand this step in paper. they said that decoder has symmetric structure as encoder,
        # but the data after MDA they provided, has weird dimension(363x90) which makes no sense. this is the point that i cant understand

    def forward(self, x):
        latent_features = [] # get dictionary as an input 
        start_idx = 0
        for i, encoder in enumerate(self.omics_encoder):
            mod_input_dim = self.input_dims[i]
            x_modality = x[:, start_idx:start_idx + mod_input_dim]
            latent_features.append(encoder(self.do(x_modality)))
            start_idx += mod_input_dim
            
        latent_fused = torch.cat(latent_features, dim=1)
        latent_final = self.fusion_layer(latent_fused)
        decoded = self.omics_decoder(latent_final)
        return decoded, latent_final

In [4]:
# Main resnet model
class ResNet(nn.Module):
    def __init__(self, embed_dim=256, hidden_dim=1024, dropout=0.1, n_layers=6, norm="layernorm"):
        super().__init__()
        self.mlps = nn.ModuleList()
        
        # Determine normalization layer
        if norm == "layernorm":
            norm_layer = nn.LayerNorm
        elif norm == "batchnorm":
            norm_layer = nn.BatchNorm1d
        else:
            norm_layer = nn.Identity
        
        # Create MLP layers
        for _ in range(n_layers):
            self.mlps.append(
                nn.Sequential(
                    nn.Linear(embed_dim, hidden_dim),
                    norm_layer(hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(hidden_dim, embed_dim)
                )
            )
        
        self.lin = nn.Linear(embed_dim, 1)  # Output layer

    def forward(self, x):
        for layer in self.mlps:
            
            x = (layer(x) + x) / 2  # Residual connection
        return self.lin(x)


class Main_model(nn.Module):
    def __init__(self, embed_dim=256, hidden_dim=1024, dropout=0.1, n_layers=6, norm="layernorm", 
                 dropout_omics=0.4, dropout_omics_finetuning=0.4):
        super().__init__()
        
        # ResNet as backbone
        self.resnet = ResNet(embed_dim, hidden_dim, dropout, n_layers, norm)
        
        # Modified embed_d: Two-layer MLP with dropout after ReLU
        self.embed_d = nn.Sequential(
            nn.LazyLinear(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim)
        )
        
        # Modified embed_c: Two-layer MLP with dropout after ReLU and before first Linear Layer
        self.embed_c = nn.Sequential(
            nn.Dropout(dropout_omics_finetuning),
            nn.LazyLinear(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_omics),
            nn.Linear(hidden_dim, embed_dim)
        )

    def forward(self, c, d):
        # Combine embedded inputs and pass through ResNet
        return self.resnet(self.embed_d(d) + self.embed_c(c))


In [5]:
# get Data function
@lru_cache(maxsize=None)
def get_data_corr(n_fold = 0, fp_radius = 2, transform_into_corr = True, typ = ["rnaseq", "mutations", "cnvs"],
                  #reconstructed = None
                 ):
    # drug
    smile_dict = pd.read_csv("data/smiles.csv", index_col=0)
    fp = scripts.FingerprintFeaturizer(R = fp_radius)
    drug_dict = fp(smile_dict.iloc[:, 1], smile_dict.iloc[:, 0])
    
    # loading all datasets
    driver_genes = pd.read_csv("data/driver_genes.csv").loc[:, "symbol"].dropna()

    rnaseq = pd.read_csv("data/rnaseq_normcount.csv", index_col=0)
    driver_columns = rnaseq.columns.isin(driver_genes)
    filtered_rna = rnaseq.loc[:, driver_columns]
    
    proteomics = pd.read_csv("data/proteomics.csv", index_col=0)
    
    mutation = pd.read_csv("data/binary_mutations.csv")
    mutation.columns = mutation.iloc[0]
    mutation = mutation.iloc[2:,:].set_index("gene_symbol")
    driver_columns = mutation.columns.isin(driver_genes)
    filtered_mut = mutation.loc[:, driver_columns]
    filtered_mut = filtered_mut.astype(float)

    methylations = pd.read_csv("data/methylations.csv",index_col = 0).sort_index(ascending = True)

    cnvs = pd.read_csv("data/copy_number_variations.csv",index_col= 0)

    # concatenate all dataset 
    # inner join based on index: model_ids with NaN are automatically filtered out 
    data_concat = pd.concat([filtered_rna, proteomics, filtered_mut, methylations, cnvs], axis=1, join='inner')
    
    
    # Filter data by common indices in all modalities
    filtered_rna = filtered_rna[filtered_rna.index.isin(data_concat.index)]
    proteomics = proteomics[proteomics.index.isin(data_concat.index)]
    filtered_mut = filtered_mut[filtered_mut.index.isin(data_concat.index)]
    methylations = methylations[methylations.index.isin(data_concat.index)]
    cnvs = cnvs[cnvs.index.isin(data_concat.index)]
    
    # Initialize cell_dict
    cell_dict = {}

    if not transform_into_corr : #and reconstructed is None:

        dims = []
        if "rnaseq" in typ:
            dims.append(filtered_rna.shape[1])
        if "proteomics" in typ:
            dims.append(proteomics.shape[1])
        if "mutations" in typ:
            dims.append(filtered_mut.shape[1])
        if "methylations" in typ:
            dims.append(methylations.shape[1])
        if "cnvs" in typ:
            dims.append(cnvs.shape[1])
        
        for cell in data_concat.index:
            # Initialize a sub-dictionary for each cell
            concatenated_data = []
            
            # Add data for each type specified in typ
            if "rnaseq" in typ:
                concatenated_data.append(filtered_rna.loc[cell].to_numpy())
            if "proteomics" in typ:
                concatenated_data.append(proteomics.loc[cell].to_numpy())
            if "mutations" in typ:
                concatenated_data.append(filtered_mut.loc[cell].to_numpy())
            if "methylations" in typ:
                concatenated_data.append(methylations.loc[cell].to_numpy())
            if "cnvs" in typ:
                concatenated_data.append(cnvs.loc[cell].to_numpy())

            cell_dict[cell] = torch.Tensor(np.concatenate(concatenated_data))
            
#    if reconstructed is not None:
#        for cell_idx, cell in enumerate(data_concat.index):
#            # cell_dict에 reconstructed 텐서의 각 행(cell 데이터) 저장
#            cell_dict[cell] = reconstructed[cell_idx]
        

    # GDSC
    GDSC1 = pd.read_csv("data/GDSC1.csv", index_col=0)
    # default, remove data where lines or drugs are missing:
    data = GDSC1.query("SANGER_MODEL_ID in @data_concat.index & DRUG_ID in @drug_dict.keys()")
    unique_cell_lines = data.loc[:, "SANGER_MODEL_ID"].unique()

    np.random.seed(420) # for comparibility, don't change it!
    np.random.shuffle(unique_cell_lines)
    folds = np.array_split(unique_cell_lines, 10)
    test_lines = folds[0] 
    train_idxs = list(range(10))
    train_idxs.remove(n_fold)
    np.random.seed(420)
    validation_idx = np.random.choice(train_idxs)
    train_idxs.remove(validation_idx)
    train_lines = np.concatenate([folds[idx] for idx in train_idxs])
    validation_lines = folds[validation_idx]
    test_lines = folds[n_fold] 

        # no change needed, query works fine with some missing
    train_data = data.query("SANGER_MODEL_ID in @train_lines")
    validation_data = data.query("SANGER_MODEL_ID in @validation_lines")
    test_data = data.query("SANGER_MODEL_ID in @test_lines")
    
    if transform_into_corr: #and reconstructed is None:
        # train, val, test among filtered data
        # these are valid train_, val_ and test_data index
        
        
        n_train = len(train_lines)  
        n_val = len(validation_lines)      
        n_test = len(test_lines)
        
        # Precompute similarity matrices for each data type
        similarity_matrices = {}
        dims = []
        
        if "rnaseq" in typ:
            exp_com = np.corrcoef(np.vstack([filtered_rna.loc[train_lines], 
                                             filtered_rna.loc[validation_lines], 
                                             filtered_rna.loc[test_lines]]), rowvar=True)
            train = exp_com[:n_train, :n_train]
            val = exp_com[n_train:n_train+n_val, :n_train]
            test = exp_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["rnaseq"] = sim_combined
            dims.append(len(similarity_matrices["rnaseq"][0]))
        
        if "proteomics" in typ:
            prot_com = np.corrcoef(np.vstack([proteomics.loc[train_lines], 
                                              proteomics.loc[validation_lines], 
                                              proteomics.loc[test_lines]]), rowvar=True)
            train = prot_com[:n_train, :n_train]
            val = prot_com[n_train:n_train+n_val, :n_train]
            test = prot_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["proteomics"] = sim_combined
            dims.append(len(similarity_matrices["proteomics"][0]))
        
        if "mutations" in typ:
            train_snp = filtered_mut.loc[train_lines].astype(bool)
            val_snp = filtered_mut.loc[validation_lines].astype(bool)
            test_snp = filtered_mut.loc[test_lines].astype(bool)
            
            train = 1 - pairwise_distances(train_snp.values, train_snp.values, metric="jaccard")
            val = 1 - pairwise_distances(val_snp.values, train_snp.values, metric="jaccard")
            test = 1 - pairwise_distances(test_snp.values, train_snp.values, metric="jaccard")
    
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["mutations"] = sim_combined
            dims.append(len(similarity_matrices["mutations"][0]))
        
        if "methylations" in typ:
            methyl_com = np.nan_to_num(np.corrcoef(np.vstack([methylations.loc[train_lines], 
                                                methylations.loc[validation_lines], 
                                                methylations.loc[test_lines]]), rowvar=True))
            train = methyl_com[:n_train, :n_train]
            val = methyl_com[n_train:n_train+n_val, :n_train]
            test = methyl_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["methylations"] = sim_combined
            dims.append(len(similarity_matrices["methylations"][0]))
        
        if "cnvs" in typ:
            cnv_com = np.nan_to_num(np.corrcoef(np.vstack([cnvs.loc[train_lines], # nan-generation problem fixed 
                                             cnvs.loc[validation_lines], 
                                             cnvs.loc[test_lines]]), rowvar=True))
            train= cnv_com[:n_train, :n_train]
            val= cnv_com[n_train:n_train+n_val, :n_train]
            test= cnv_com[n_train+n_val:, :n_train]
            sim_combined = np.vstack([train, val, test])
            similarity_matrices["cnvs"] = sim_combined
            dims.append(len(similarity_matrices["cnvs"][0]))
            
        cell_dict = {}

        # 
        for cell in unique_cell_lines:
            cell_dict[cell] = {}
            for data_type in typ:
                sim_matrices = similarity_matrices[data_type]
                sim_tensor = torch.Tensor(sim_matrices)
                cell_idx = np.where(unique_cell_lines == cell)[0][0]
                cell_dict[cell][data_type] = sim_tensor[cell_idx]
    
        return (OmicsDataset_dict(cell_dict, drug_dict, train_data),
        OmicsDataset_dict(cell_dict, drug_dict, validation_data),
        OmicsDataset_dict(cell_dict, drug_dict, test_data),
        dims)

    return (scripts.OmicsDataset(cell_dict, drug_dict, train_data),
    scripts.OmicsDataset(cell_dict, drug_dict, validation_data),
    scripts.OmicsDataset(cell_dict, drug_dict, test_data),
    dims)

In [6]:
# Autoencoder training step

def autoencoder_train_step(config, train_dataset, input_dims):
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config["optimizer"]["pre_batch_size"],
        shuffle=True,
        drop_last=True
    )

    # Autoencoder 모델 초기화
    autoencoder = MultimodalAutoencoder(
        input_dims=input_dims,
        hidden_dim_encoders=config["model"]["hidden_dim_encoders"],
        embed_dim=config["model"]["embed_dim"],
        fusion_dim=config["model"]["fusion_dim"],
        dropout_encoders=config["model"]["dropout_encoders"]
    )

    optimizer = torch.optim.Adam(autoencoder.parameters(), lr=config["optimizer"]["lr_pretraining"])
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)
    device = torch.device(config["env"]["device"])
    
    autoencoder.to(device)
    autoencoder.train()

    pre_training_epochs = config["model"]["pre_training_epochs"]
    total_losses = []

    for epoch in range(pre_training_epochs):
        total_loss = []
        
        for batch in train_loader:
            inputs = batch[0].to(device)

            optimizer.zero_grad()
            reconstructed, latent_features = autoencoder(inputs)  # Latent Feature 생성

            loss = criterion(reconstructed, inputs)  # MSE Loss 계산
            
            loss.backward()
            optimizer.step()

            total_loss.append(loss.item())

        train_loss = np.mean(total_loss)
        total_losses.append(train_loss)
        scheduler.step(train_loss)

        print(f'Epoch {epoch + 1}/{pre_training_epochs}, Train Loss: {train_loss:.5f}')

    print("Pre-training complete!")

    return autoencoder, total_losses



In [7]:
# model training function
def train_step(model, autoencoder, optimizer, loader, config, device):
    loss = nn.MSELoss()
    ls = []
    model.train()
    for batch in loader:
        optimizer.zero_grad()
        with torch.no_grad():  # Autoencoder는 업데이트되지 않도록 설정
            _, latent_features = autoencoder(batch[0].to(device))  # (batch_size, latent_dim)
        out = model(latent_features, batch[1].to(device))
        l = loss(out.squeeze(), batch[2].to(device).squeeze())
        l.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config["optimizer"]["clip_norm"])
        ls += [l.item()]
        optimizer.step()
    return np.mean(ls)


# 여기서, autoencoder는 pre-trained 되어있고, frozen 상태이다. 즉, 어떤 데이터를 어떤 조합으로 넣던간에 항상 동일한 latent feature를 생성한다.
# 따라서, 학습 전에, 모든 데이터에 대해 한번만 autoencoder를 사용하더라도 문제가 없을..줄 알았는데!
# autoencoder 과정이 dropout을 포함하고 있기 때문에, 데이터의 분포에 따라 latent feature가 달라질 수 있다. 
# 따라서, 매 train_step마다 데이터를 autoencoder를 거치도록 하겠다. 
def latent_resnet_training(config, train_dataset, autoencoder, validation_dataset=None, use_momentum=True, callback_epoch=None):
    device = torch.device(config["env"]["device"])
    autoencoder.eval()
    
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config["optimizer"]["batch_size"],
        drop_last=True,
        shuffle=True
    )

    if validation_dataset is not None:
        val_loader = torch.utils.data.DataLoader(
            validation_dataset,
            batch_size=config["optimizer"]["batch_size"],
            drop_last=False,
            shuffle=False
        )

    ### Fine-tuning 모델 생성 ###
    model = Main_model(
        embed_dim=config["model"]["embed_dim"],
        hidden_dim=config["model"]["hidden_dim"], 
        dropout=config["model"]["dropout"], 
        n_layers=config["model"]["n_layers"],  
        dropout_omics=config["model"]["dropout_omics"], 
        dropout_omics_finetuning=config["model"]["dropout_omics_finetuning"],
        norm=config["model"]["norm"]
    )
    #----
    
    #with torch.no_grad():
    #    ae_fusion_weight = autoencoder.omics_encoder[-1][-1].weight  # Autoencoder Encoder 마지막 Linear Layer 가중치
    #    model.embed_c[-1].weight.copy_(ae_fusion_weight)  # ResNet의 embed_c 마지막 Linear Layer 가중치로 복사
    
    #----
    optimizer = torch.optim.Adam(model.parameters(), config["optimizer"]["learning_rate"])
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
    early_stop = scripts.EarlyStop(config["optimizer"]["stopping_patience"])
    model.to(device)

    metrics = torchmetrics.MetricTracker(torchmetrics.MetricCollection({
        "R_cellwise_residuals": scripts.GroupwiseMetric(
            metric=torchmetrics.functional.pearson_corrcoef,
            grouping="drugs",
            average="macro",
            residualize=True
        ),
        "R_cellwise": scripts.GroupwiseMetric(
            metric=torchmetrics.functional.pearson_corrcoef,
            grouping="cell_lines",
            average="macro",
            residualize=False
        ),
        "MSE": torchmetrics.MeanSquaredError()
    }))
    metrics.to(device)

    ### Fine-tuning Loop ###
    for epoch in range(config["env"]["max_epochs"]):
        train_loss = train_step(model, autoencoder, optimizer, train_loader, config, device)

        if epoch == 0:  
            optimizer = torch.optim.Adam(model.parameters(), lr=config["optimizer"]["learning_rate"])
        
        lr_scheduler.step(train_loss)
        
        if validation_dataset is not None:
            validation_metrics = evaluate_step(model, autoencoder, val_loader, metrics, device)
            if epoch > 0 and use_momentum:
                val_target = 0.2 * val_target + 0.8 * validation_metrics['R_cellwise_residuals']
            else:
                val_target = validation_metrics['R_cellwise_residuals']
        else:
            val_target = None

        if callback_epoch is None:
            print(f"epoch : {epoch}: train loss: {train_loss} Smoothed R interaction (validation) {val_target}")
        else:
            callback_epoch(epoch, val_target)

        if early_stop(train_loss):
            break

    return val_target, model

In [8]:
# evaluation step

def evaluate_step(model, autoencoder, loader, metrics, device, save_predictions=False, model_name = "model", dataset_name = "dataset"):
    metrics.increment()
    autoencoder.to(device)
    model.to(device) # ensure model is on the correct device
    autoencoder.eval()
    model.eval()
    

    # Storage for predictions if saving is enabled
    predictions = {"cell_line": [], "drug_id": [], "prediction": [], "target": []}

    for x in loader:
        with torch.no_grad():
            _, latent_features = autoencoder(x[0].to(device))
            out = model(latent_features, x[1].to(device))
            metrics.update(out.squeeze(),
                           x[2].to(device).squeeze(),
                           cell_lines=x[3].to(device).squeeze(),
                           drugs=x[4].to(device).squeeze())
            
            # Save predictions if required
            if save_predictions:
                predictions["cell_line"].extend(x[3].squeeze().tolist())  
                predictions["drug_id"].extend(x[4].squeeze().cpu().tolist())    
                predictions["prediction"].extend(out.squeeze().tolist()) 
                predictions["target"].extend(x[2].squeeze().cpu() .tolist())    

    # Compute and return metrics
    metrics_dict = {it[0]: it[1].item() for it in metrics.compute().items()}

    # Save predictions to a CSV file if required
    if save_predictions:
        df = pd.DataFrame(predictions)
        filename = generate_filename(model_name, dataset_name, extension="csv")
        df.to_csv("results/" + filename, index=False)
        print(f"Predictions saved to: results/{filename}")

    return metrics_dict
        
def generate_filename(model_name="model1", dataset_name="dataset", extension="csv"):
    time = datetime.now().strftime("%Y%m%d_%H:%M:%S")
    unique_id = uuid.uuid4()
    filename = f"pred_{model_name}_{dataset_name}_{time}_{unique_id}.{extension}"
    return filename

In [14]:
# config
config = {"features" : {"fp_radius":2,
                        "use_correlation_representation": True,
                        "num_modalities": 4},
          "optimizer": {"batch_size": 220,
                        "clip_norm":19,
                        "learning_rate": 0.0004592646200179472,
                        "stopping_patience":15,
                        "pre_batch_size": 200,
                        "lr_pretraining": 0.0004592646200179472},
          "model":{"embed_dim":485, # shared
                 "hidden_dim":696, 
                 "dropout":0.48541242824674574, 
                 "n_layers": 4, 
                 "norm": "batchnorm", 
                 "hidden_dim_encoders": 256, # ENCODER, 원래 256
                 "fusion_dim": 700, # ENCODER
                 "dropout_encoders": 0.2,
                 "dropout_omics": 0.4, # second
                 "dropout_omics_finetuning": 0.4, # first
                 "pre_training_epochs": 100}, 
         "env": {"fold": 0,  
                "device":"cuda:0", 
                 "max_epochs": 100, 
                 "search_hyperparameters":False}} 



In [16]:
# new best hyperparameter
config = {"features" : {"fp_radius":2,
                        "use_correlation_representation": True,
                        "num_modalities": 4},
          "optimizer": {"batch_size": 469,
                        "clip_norm":19,
                        "learning_rate": 0.0006230634247016341,
                        "stopping_patience":15,
                        "pre_batch_size": 474,
                        "lr_pretraining": 3.9761097548681355e-06},
          "model":{"embed_dim":215, # shared
                 "hidden_dim":696, 
                 "dropout":0.48541242824674574, 
                 "n_layers": 4, 
                 "norm": "batchnorm", 
                 "hidden_dim_encoders": 696, # ENCODER, 원래 256
                 "fusion_dim": 505, # ENCODER
                 "dropout_encoders": 0.3626415306459327,
                 "dropout_omics": 0.5398956377457799, # second
                 "dropout_omics_finetuning": 0.42563415722955866, # first
                 "pre_training_epochs": 184}, 
         "env": {"fold": 0,  
                "device":"cuda:0", 
                 "max_epochs": 100, 
                 "search_hyperparameters":False}} 

# Hyperparameter Optimization

In [41]:
def train_model_optuna(trial, config):
    """
    Optuna를 활용한 하이퍼파라미터 최적화 함수
    - Pre-training (Autoencoder)
    - Fine-tuning (ResNet)
    """

    ### 1️⃣ Pre-training 하이퍼파라미터 샘플링 ###
    config["model"]["embed_dim"] = trial.suggest_int("embed_dim", 64, 512)
    config["model"]["hidden_dim_encoders"] = trial.suggest_int("hidden_dim_encoders", 64, 2048)
    config["model"]["fusion_dim"] = trial.suggest_int("fusion_dim", 64, 2048)
    config["model"]["dropout_encoders"] = trial.suggest_float("dropout_encoders", 0.0, 0.5)
    config["model"]["pre_training_epochs"] = trial.suggest_int("pre_training_epochs", 1, 500)
    config["optimizer"]["pre_batch_size"] = trial.suggest_int("pre_batch_size", 128, 512)
    config["optimizer"]["lr_pretraining"] = trial.suggest_float("lr_pretraining", 1e-6, 1e-1, log=True)

    ### 2️⃣ Pre-training 실행 ###
    try:
        # Train + Validation 데이터 합쳐서 Pre-training 진행
        pre_trained_autoencoder, pre_training_losses = autoencoder_train_step(
            config, train_dataset, input_dims=input_dims
        )

        # Pre-training 동안의 최소 loss 반환 (낮을수록 좋음)
        pre_training_score = min(pre_training_losses)
        print(f"Trial {trial.number}: Pre-training Loss = {pre_training_score:.5f}")

    except Exception as e:
        print(f"Pre-training Error: {e}")
        return float("inf")  # Pre-training 실패 시, 최대 loss 반환

    ### 3️⃣ Fine-tuning 하이퍼파라미터 샘플링 ###
    config["model"]["dropout_omics"] = trial.suggest_float("dropout_omics", 0.0, 0.9)
    config["model"]["dropout_omics_finetuning"] = trial.suggest_float("dropout_omics_finetuning", 0.0, 0.9)
    config["optimizer"]["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-1, log=True)
    config["optimizer"]["batch_size"] = trial.suggest_int("batch_size", 64, 512)

    ### 4️⃣ Fine-tuning 실행 (Latent Representation 생성 포함) ###
    try:
        val_target, fine_tuned_resnet = latent_resnet_training(
            config, train_dataset, pre_trained_autoencoder, val_dataset
        )

        print(f"Trial {trial.number}: Fine-tuning R_cellwise_residuals = {val_target:.5f}")
        return val_target  # Fine-tuning 성능 반환 (높을수록 좋음)

    except Exception as e:
        print(f"Fine-tuning Error: {e}")
        return -float("inf")  # Fine-tuning 실패 시, 최소 성능 반환


In [None]:
if config["env"]["search_hyperparameters"]:
    study_name = "optimized_model"
    storage_name = f"sqlite:///studies/{study_name}.db"

    study = optuna.create_study(
        study_name=study_name,
        storage=storage_name,
        direction='maximize',  # Fine-tuning 성능을 최적화 (R_cellwise_residuals 최대화)
        load_if_exists=True,
        pruner=optuna.pruners.MedianPruner(n_startup_trials=30, n_warmup_steps=5, interval_steps=5)
    )

    study.optimize(lambda trial: train_model_optuna(trial, config), n_trials=40)

    best_config = study.best_params
    print("Best Hyperparameters:", best_config)

    # 최적의 하이퍼파라미터를 config에 반영
    config["model"]["embed_dim"] = best_config["embed_dim"]
    config["model"]["hidden_dim_encoders"] = best_config["hidden_dim_encoders"]
    config["model"]["fusion_dim"] = best_config["fusion_dim"]
    config["model"]["dropout_encoders"] = best_config["dropout_encoders"]
    config["model"]["pre_training_epochs"] = best_config["pre_training_epochs"]
    config["optimizer"]["pre_batch_size"] = best_config["pre_batch_size"]
    config["optimizer"]["lr_pretraining"] = best_config["lr_pretraining"]
    config["model"]["dropout_omics"] = best_config["dropout_omics"]
    config["model"]["dropout_omics_finetuning"] = best_config["dropout_omics_finetuning"]
    config["optimizer"]["learning_rate"] = best_config["learning_rate"]
    config["optimizer"]["batch_size"] = best_config["batch_size"]

# 2️⃣ 최적 하이퍼파라미터를 적용한 후, Training (Train + Validation 합쳐서 사용)
train_val_dataset = torch.utils.data.ConcatDataset([train_dataset, val_dataset])

# Pre-training 실행
pre_trained_autoencoder, pre_training_losses = autoencoder_train_step(config, train_val_dataset, input_dims=input_dims)

# Fine-tuning 실행 (이제 Train + Validation을 합쳐서 사용)
fine_tuned_resnet, final_target = latent_resnet_training(
    config, train_val_dataset, autoencoder, pre_trained_autoencoder, validation_dataset=None  # ✅ Validation 없이 학습
)

print(f"Final Training Complete! Final R_cellwise_residuals: {final_target:.5f}")

# Model training 
## Get data

In [10]:
# data loading with raw data
train_dataset, val_dataset, test_dataset, input_dims= get_data_corr(n_fold = config["env"]["fold"],
                                                           fp_radius = config["features"]["fp_radius"], 
                                                           transform_into_corr = True,
                                                           typ = ("rnaseq", "proteomics", "mutations", "methylations"))



여기서 Autoencoder의 evaluation은 수행하지 않는다. Autoencoder를 단독으로 평가하는 것이 Test distribution과 맞지 않을 수 있기 때문이다. 여기서 이걸 먼저 평가하게 되면 Test set의 정보가 노출되어 편향된 결과가 나타날 수 있다. 

또한, 단순 Reconstruction Loss로 이를 중간에 평가하는 것이 애초에 의미가 없을 가능성이 높다. 

그렇다면, 두 모델을 따로 트레이닝 할 것인지, 한번에 트레이닝 할 것인지 정해야겠다. 

1. Autoencoder pretraining -> encoder frozen, used as feature extractor -> Resnet training
   - Autoencoder가 단순 Reconstruction Loss로 학습되므로, ResNet에 필요한 Feature를 충분히 학습하지 못할 가능성이 있음.
3. Autoencoder -> latent feature -> Resnet
   - 설계가 복잡해짐.
   - 두 모델을 한번에 트레이닝하므로, 학습이 불안정할 수 있음. 


둘다 해볼건데, 여기서는 1번을 할거다. 간략적인 플로우는 다음과 같다.

- Train + Val 데이터로 Autoencoder 학습
- Decoder는 사용하지만, 나중에는 Encoder만 저장
- MSE Loss로 Reconstruction 학습
- Autoencoder를 저장해서 이후 ResNet에서 사용 가능하도록 함. latent feature 생성 목적
- Autoencoder를 따로 평가하지 않음.

## Autoencoder training

여기서는 Autoencoder와 Resnet을 한 시스템으로 묶어버리도록 하겠다. 

In [15]:
# pre training with raw data! final version. 
autoencoder, autoencoder_losses = autoencoder_train_step(config, torch.utils.data.ConcatDataset([train_dataset, val_dataset]), input_dims = input_dims)

Epoch 1/100, Train Loss: 0.00676
Epoch 2/100, Train Loss: 0.00160
Epoch 3/100, Train Loss: 0.00123
Epoch 4/100, Train Loss: 0.00102
Epoch 5/100, Train Loss: 0.00090
Epoch 6/100, Train Loss: 0.00082
Epoch 7/100, Train Loss: 0.00075
Epoch 8/100, Train Loss: 0.00070
Epoch 9/100, Train Loss: 0.00065
Epoch 10/100, Train Loss: 0.00061
Epoch 11/100, Train Loss: 0.00057
Epoch 12/100, Train Loss: 0.00054
Epoch 13/100, Train Loss: 0.00051
Epoch 14/100, Train Loss: 0.00049
Epoch 15/100, Train Loss: 0.00047
Epoch 16/100, Train Loss: 0.00045
Epoch 17/100, Train Loss: 0.00043
Epoch 18/100, Train Loss: 0.00042
Epoch 19/100, Train Loss: 0.00041
Epoch 20/100, Train Loss: 0.00039
Epoch 21/100, Train Loss: 0.00039
Epoch 22/100, Train Loss: 0.00038
Epoch 23/100, Train Loss: 0.00037
Epoch 24/100, Train Loss: 0.00037
Epoch 25/100, Train Loss: 0.00036
Epoch 26/100, Train Loss: 0.00036
Epoch 27/100, Train Loss: 0.00035
Epoch 28/100, Train Loss: 0.00035
Epoch 29/100, Train Loss: 0.00035
Epoch 30/100, Train Los

## Resnet training

In [16]:
#weight_path_raw = "trained_models/pretrained_raw_omics.pth"
_, Resnet = latent_resnet_training(
                             config,
                             torch.utils.data.ConcatDataset([train_dataset, val_dataset]), 
                             autoencoder = autoencoder,
                             use_momentum=False)
metrics = torchmetrics.MetricTracker(torchmetrics.MetricCollection(
    {"R_cellwise_residuals":scripts.GroupwiseMetric(metric=torchmetrics.functional.pearson_corrcoef,
                          grouping="drugs",
                          average="macro",
                          residualize=True),
    "R_cellwise":scripts.GroupwiseMetric(metric=torchmetrics.functional.pearson_corrcoef,
                          grouping="cell_lines",
                          average="macro",
                          residualize=False),
    "MSE":torchmetrics.MeanSquaredError()}))
device = torch.device(config["env"]["device"])
metrics.to(device)
test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                       batch_size=config["optimizer"]["batch_size"],
                                       drop_last=False,
                                      shuffle=False,
                                      pin_memory=True)

lesgo
epoch : 0: train loss: 2.183289214471362 Smoothed R interaction (validation) None
epoch : 1: train loss: 1.8301144769076807 Smoothed R interaction (validation) None
epoch : 2: train loss: 1.7184799908118136 Smoothed R interaction (validation) None
epoch : 3: train loss: 1.6683552845741527 Smoothed R interaction (validation) None
epoch : 4: train loss: 1.6316558051905026 Smoothed R interaction (validation) None
epoch : 5: train loss: 1.6059475608160823 Smoothed R interaction (validation) None
epoch : 6: train loss: 1.5791467571140663 Smoothed R interaction (validation) None
epoch : 7: train loss: 1.5496696347948646 Smoothed R interaction (validation) None
epoch : 8: train loss: 1.523333319406427 Smoothed R interaction (validation) None
epoch : 9: train loss: 1.5053276323122795 Smoothed R interaction (validation) None
epoch : 10: train loss: 1.4783500055005436 Smoothed R interaction (validation) None
epoch : 11: train loss: 1.466653089588151 Smoothed R interaction (validation) None

# Evaluation

트레이닝된 Autoencoder의 encoder와, Resnet을 하나의 시스템으로 사용한다. 

In [17]:
autoencoder_resnet_septrain = evaluate_step(Resnet, autoencoder, test_dataloader, metrics, device, save_predictions = True, model_name = "autoencoder_resnet_septrain", dataset_name = "raw_data")
print(f"main model final metrics: {autoencoder_resnet_septrain}")

Predictions saved to: results/pred_autoencoder_resnet_septrain_raw_data_20250310_23:02:57_3b363c2f-e5bc-44cb-94b1-52d32211b7ce.csv
main model final metrics: {'MSE': 2.2382700443267822, 'R_cellwise': 0.8708129525184631, 'R_cellwise_residuals': -0.02552889473736286}
