In [2]:
# %% [1] ───────────────────────────────────────────────────────────────
# 공통 import & 설정
import warnings, random, argparse
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score

# ---- Hyper-parameters / run settings --------------------------------
SEED       = 100
N_VAR      = 100
LR         = 1e-3
PATIENCE   = 10
MAX_EPOCHS = 500          # keep runs short in the notebook and tune as needed

# First CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# ---- Input / output locations ---------------------------------------
DATA_ROOT = Path("./data/b4_g4")          # ★ double-check this path
RES_ROOT  = Path("./results/b4_g4/")
GMT_FILE  = Path("../biological_knowledge/simulation/SimulationPathways_DeepHisCoM_L1.gmt")

# ---- Seed every random number generator used by the notebook --------
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    # Also seed all CUDA devices when running on GPU.
    torch.cuda.manual_seed_all(SEED)

print("DEVICE:", DEVICE)


DEVICE: cpu


In [5]:
# %% [2] ───────────────────────────────────────────────────────────────
# Load the GMT file and print a quick sanity check
PATHS = []        # pathway IDs in file order
PATH2GENE = {}    # pathway ID -> list of gene symbols

with open(GMT_FILE) as gmt_handle:
    for raw_line in gmt_handle:
        fields = raw_line.rstrip("\n").split("\t")
        # Rows with fewer than three tab-separated fields are ignored.
        if len(fields) < 3:
            continue
        # Field 0 is the pathway ID; gene symbols are read from field 3 onward.
        pathway_id, gene_symbols = fields[0], fields[3:]
        PATHS.append(pathway_id)
        PATH2GENE[pathway_id] = gene_symbols

print("총 pathway 수:", len(PATHS))
print("예시 5개:", PATHS[:5])
first_pathway = PATHS[0]
print("첫 pathway 유전자 10개:", PATH2GENE[first_pathway][:10])


총 pathway 수: 7
예시 5개: ['True Positive Signal (TGF-beta receptor signaling in EMT)', 'R-HSA-170834-1', 'R-HSA-9006936-1', 'R-HSA-170834-2', 'R-HSA-170834-3']
첫 pathway 유전자 10개: ['ARHGEF18', 'CGN', 'F11R', 'FKBP1A', 'PARD3', 'PARD6A', 'PRKCZ', 'RHOA', 'RPS27A', 'SMURF1']


In [6]:
# %% [3] ───────────────────────────────────────────────────────────────
# Pick the simulation / variant to analyse by hand (edit these and re-run the notebook)
sim      = 1
variant  = 'gene_permutation'      # 'label_permutation' is also possible
vid      = 0                       # 0 = original, 1–100 = permutation ID

# -------- resolve and verify the data folder --------
# The original data (vid == 0) lives directly under DATA_ROOT/<sim>;
# a permutation lives under DATA_ROOT/<sim>/<variant with '_' -> '-'>/<vid>.
dpath = DATA_ROOT / str(sim)
if vid != 0:
    dpath = dpath / variant.replace("_", "-") / str(vid)

print("데이터 폴더:", dpath)
assert dpath.exists(), "❌ 해당 폴더가 없습니다!"


데이터 폴더: data/b4_g4/1


In [7]:
# %% [4] ───────────────────────────────────────────────────────────────
# (1) Load the somatic-mutation and CNV tables (first CSV column becomes the index)
som_df = pd.read_csv(dpath / "somatic_mutation_paper.csv", index_col=0)
cnv_df = pd.read_csv(dpath / "P1000_data_CNA_paper.csv", index_col=0)

print("somatic shape:", som_df.shape)
print("cnv shape    :", cnv_df.shape)
display(som_df.iloc[:3, :6])

# (2) Split CNV into two 0/1 indicator tables:
#     cells equal to -2 -> deletion, cells equal to 2 -> amplification
delm_df = (cnv_df == -2).astype(int)
ampm_df = (cnv_df ==  2).astype(int)

# (3) Sorted union of the gene columns of both tables
genes = sorted(set(som_df.columns).union(cnv_df.columns))
print("총 유전자 수:", len(genes))

# (4) Convert to float32 arrays on the shared gene axis (missing -> 0).
#     `genes` is a UNION, so a gene may exist in only one table. Plain
#     `df[genes]` raises KeyError for such a gene; reindex() instead adds
#     the absent column as NaN, which fillna(0) then turns into 0.
som_arr = som_df.reindex(columns=genes).fillna(0).values.astype(np.float32)
del_arr = delm_df.reindex(columns=genes).fillna(0).values.astype(np.float32)
amp_arr = ampm_df.reindex(columns=genes).fillna(0).values.astype(np.float32)

# (5) Stack the three layers on a new last axis -> X
# NOTE(review): rows are combined by position, not by sample ID — this assumes
# som_df and cnv_df list the same samples in the same order; confirm.
X_all = np.stack([som_arr, del_arr, amp_arr], axis=2)  # [n, g, 3]
print("X_all shape:", X_all.shape)

# (6) y — the "response" column as float32
# NOTE(review): also taken positionally; assumes response.csv rows follow the
# same sample order as som_df — confirm.
y_all = (
    pd.read_csv(dpath / "response.csv", index_col=0)["response"]
    .values.astype(np.float32)
)
print("y_all shape:", y_all.shape)


somatic shape: (1011, 101)
cnv shape    : (1011, 101)


Unnamed: 0_level_0,ACVR1B,ACVR1C,ACVR2A,ACVR2B,ACVRL1,AMH
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00-029N9_LN,0.0,0.0,0.0,0,0.0,0.0
01-087MM_BONE,0.0,0.0,0.0,0,0.0,0.0
01-095N1_LN,0.0,0.0,0.0,0,0.0,0.0


총 유전자 수: 101
X_all shape: (1011, 101, 3)
y_all shape: (1011,)


In [8]:
# %% [5] ───────────────────────────────────────────────────────────────
# Split IDs (train / val+test)
splits = dpath / "splits"

# Each split file provides its sample IDs in an "id" column.
tr_ids = pd.read_csv(splits / "training_set_0.csv")["id"].tolist()
va_ids = pd.read_csv(splits / "validation_set.csv")["id"].tolist()
te_ids = pd.read_csv(splits / "test_set.csv")["id"].tolist()

print(f"train {len(tr_ids)}  |  val {len(va_ids)}  |  test {len(te_ids)}")

# Map every sample ID to its row position in som_df
samples = som_df.index.tolist()
sample2idx = dict(zip(samples, range(len(samples))))

tr_idx = list(map(sample2idx.__getitem__, tr_ids))
# Validation and test IDs are pooled into a single evaluation set.
va_idx = list(map(sample2idx.__getitem__, va_ids + te_ids))

# Slice the feature / label arrays by row position
tr_X = X_all[tr_idx]
tr_y = y_all[tr_idx]
va_X = X_all[va_idx]
va_y = y_all[va_idx]

print("train X", tr_X.shape, "val X", va_X.shape)


train 808  |  val 101  |  test 102
train X (808, 101, 3) val X (203, 101, 3)


In [9]:
# %% [6] ───────────────────────────────────────────────────────────────
# Pathway -> gene-index mapping (dict: pid -> [idx, idx, ...])
# Position of each gene along the gene axis of X_all.
g2i = dict(zip(genes, range(len(genes))))

# Keep only the pathway genes that are present in `genes`, in GMT order.
pid2idx = {
    pid: list(map(g2i.__getitem__, filter(g2i.__contains__, PATH2GENE[pid])))
    for pid in PATHS
}
num_pathways = len(pid2idx)

print("예시 pathway 3개 인덱스 길이:",
      dict((k, len(v)) for k, v in list(pid2idx.items())[:3]))


예시 pathway 3개 인덱스 길이: {'True Positive Signal (TGF-beta receptor signaling in EMT)': 16, 'R-HSA-170834-1': 57, 'R-HSA-9006936-1': 28}


In [10]:
# %% [7] ───────────────────────────────────────────────────────────────
# Dataset wrapper and DataLoader setup
class MultiOmicsDataset(Dataset):
    """In-memory dataset pairing a feature array with per-sample labels.

    Args:
        X: NumPy array whose first axis indexes samples
           (in this notebook: [n_samples, n_genes, 3]).
        y: 1-D NumPy array of labels, one per sample.

    Attributes:
        X: tensor view of ``X`` created with ``torch.from_numpy``.
        y: float tensor of shape ``(n_samples, 1)``.
    """

    def __init__(self, X, y):
        features = torch.from_numpy(X)
        labels = torch.from_numpy(y).float()
        self.X = features
        # Add a trailing axis so each label is a length-1 vector.
        self.y = labels.unsqueeze(1)

    def __len__(self):
        """Return the number of samples."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample_features = self.X[idx]
        sample_label = self.y[idx]
        return sample_features, sample_label

# Full-batch loaders: each split is served as one batch (batch size = split size).
tr_loader = DataLoader(
    dataset=MultiOmicsDataset(tr_X, tr_y),
    batch_size=len(tr_y),
    shuffle=True,
)
va_loader = DataLoader(
    dataset=MultiOmicsDataset(va_X, va_y),
    batch_size=len(va_y),
    shuffle=False,
)


In [15]:
# Echo the pathway -> gene-index mapping for inspection (the whole dict is printed below).
pid2idx

{'True Positive Signal (TGF-beta receptor signaling in EMT)': [7,
  24,
  28,
  29,
  49,
  50,
  58,
  60,
  62,
  74,
  82,
  83,
  84,
  88,
  89,
  90],
 'R-HSA-170834-1': [8,
  9,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  26,
  27,
  34,
  37,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  59,
  61,
  63,
  64,
  65,
  67,
  68,
  69,
  72,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  85,
  86,
  87,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  100],
 'R-HSA-9006936-1': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  10,
  11,
  12,
  13,
  14,
  23,
  25,
  30,
  31,
  32,
  33,
  35,
  36,
  38,
  39,
  48,
  66,
  70,
  71,
  73,
  99],
 'R-HSA-170834-2': [8,
  9,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  26,
  27,
  34,
  59,
  61,
  63,
  64,
  65,
  67,
  68,
  69,
  72,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  85,
  86,
  87,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  100],
 'R-HSA-170834-3': [40,
  41,
  42,
  

In [None]:


# %% [9] ───────────────────────────────────────────────────────────────
# Build the model from the gene-axis length (tr_X.shape[1]) and the
# pathway -> gene-index mapping, then move it to DEVICE.
# NOTE(review): HisCoM_MultiOmics_WS is not defined in any visible cell
# (no cell [8] is shown) — confirm it is defined before this cell runs on a fresh kernel.
net = HisCoM_MultiOmics_WS(tr_X.shape[1], pid2idx).to(DEVICE)# e.g. Linear(in_features=8, out_features=1, bias=False)


In [19]:
# Show the model's path_linear attribute (the repr below lists one bias-free Linear per pathway).
net.path_linear

ModuleDict(
  (True Positive Signal (TGF-beta receptor signaling in EMT)): Linear(in_features=16, out_features=1, bias=False)
  (R-HSA-170834-1): Linear(in_features=57, out_features=1, bias=False)
  (R-HSA-9006936-1): Linear(in_features=28, out_features=1, bias=False)
  (R-HSA-170834-2): Linear(in_features=41, out_features=1, bias=False)
  (R-HSA-170834-3): Linear(in_features=15, out_features=1, bias=False)
  (R-HSA-9006936-2): Linear(in_features=13, out_features=1, bias=False)
  (R-HSA-9006936-3): Linear(in_features=15, out_features=1, bias=False)
)