### 모델 학습용 코드 구현 및 실행

- 학습별 코드 분리 (구분선 사용 및 해당 모델 이름 작성)
- 학습된 파라미터는 ./parameters 에 .pth 형식으로 저장하여 사용

In [1]:
input_file_path = ['./data/2022Data_part1.csv', './data/2022Data_part2.csv']

### Colab 사용시 주석 제거

# !rm -rf SKN19_2ND_5TEAM
# !git clone https://github.com/SKNetworks-AI19-250818/SKN19_2ND_5TEAM.git
# %cd SKN19_2ND_5TEAM

# import sys
# sys.path.append('/content/SKN19_2ND_5TEAM')
# input_file_path = ['/content/SKN19_2ND_5TEAM/data/encoded_dataset.csv']

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch.utils.data import random_split, DataLoader, ConcatDataset
import torch.nn as nn
import torch.optim as optim

import modules.DataAnalysis as DataAnalysis
import modules.ModelAnalysis as ModelAnalysis
import modules.DataModify as DataModify
from modules.DataSelect import DataPreprocessing

import modules.Models as Models

In [3]:

print(torch.__version__)
print(torch.version.cuda)


2.8.0+cu126
12.6


In [4]:
# 랜덤 시드 고정 : 결과 비교용
Models.set_seed(42)

dp = DataPreprocessing()

# device 설정 (cuda 사용 가능 시 cuda 사용)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Dataset 로드
dataset = DataModify.CancerDataset(
    target_column='target_label',              # target column
    time_column='Survival months_bin_3m',      # Survival months
    file_paths=input_file_path,
    transform=dp.run                           # 기존에 정제가 완료된 데이터를 사용할 경우 None
)

sui_input_file_path = ['./data/Suicide.csv']
sui_dataset = DataModify.CancerDataset(
    target_column='target_label',              # target column
    time_column='Survival months_bin_3m',      # Survival months
    file_paths=sui_input_file_path,
    transform=dp.run                           # 기존에 정제가 완료된 데이터를 사용할 경우 None
)

Using device: cuda


  df = pd.read_csv(path)


In [None]:
# train set size 설정 및 분리
# 전체 길이
n = len(dataset)

# 비율 설정
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# 각 세트 크기 계산
train_size = int(n * train_ratio)
val_size = int(n * val_ratio)
test_size = n - train_size - val_size  # 합이 정확히 맞도록 조정

# 분리 수행
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
train_dataset = ConcatDataset([train_dataset, sui_dataset])

batch_size = 64

# 데이터를 로드
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


# 모델 초기화
input_dim = dataset.data.shape[1]   # input dimension : data의 feature의 개수
hidden_size = (128, 64)             # 1번째, 2번째 hidden layer의 size
time_bins = 91                      # 3개월 단위로 time을 split하여 각 구간으로 삼음 -> 270개월+ 는 하나로 취급
num_events = 4                      # 사건의 개수

# 모델 선언
model = Models.DeepHitSurvWithSEBlockConcat(input_dim, hidden_size, time_bins, num_events, dropout=.2).to(device)

# 손실함수 및 optimizer 선언
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
# 모델 학습
def train_epoch(model, loader, optimizer, device=device):
    # 모델을 train 모드로 설정
    model.train()
    # loss 변수 선언
    total_loss, total_lik, total_rank = 0, 0, 0

    # loader에서 불러온 데이터를 기반으로 학습
    for x, times, events in loader:
        x, times, events = x.to(device), times.to(device), events.to(device)


        optimizer.zero_grad()
        logits, pmf, cif = model(x)
        loss, L_lik, L_rank = Models.deephit_loss(pmf, cif, times, events)

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * x.size(0)
        total_lik += L_lik.item() * x.size(0)
        total_rank += L_rank.item() * x.size(0)

    n = len(loader.dataset)
    return total_loss/n, total_lik/n, total_rank/n

# 모델 평가
def evaluate(model, loader, device=device):
    # 모델을 평가 모드로 설정
    model.eval()
    total_loss, total_lik, total_rank = 0, 0, 0
    
    with torch.no_grad():
        for x, times, events in loader:
            x, times, events = x.to(device), times.to(device), events.to(device)

            logits, pmf, cif = model(x)
            loss, L_lik, L_rank = Models.deephit_loss(pmf, cif, times, events)

            total_loss += loss.item() * x.size(0)
            total_lik += L_lik.item() * x.size(0)
            total_rank += L_rank.item() * x.size(0)

    n = len(loader.dataset)
    return total_loss/n, total_lik/n, total_rank/n

def get_cif_from_model(model, loader, device=device):
    model.eval()
    all_cif = []
    all_times = []
    all_events = []
    with torch.no_grad():
        for x, times, events in loader:
            x = x.to(device)
            logits, pmf, cif = model(x)
            all_cif.append(cif.cpu())
            all_times.append(times)
            all_events.append(events)
    all_cif = torch.cat(all_cif, dim=0)  # (num_samples, num_events, time_bins)
    all_times = torch.cat(all_times, dim=0)
    all_events = torch.cat(all_events, dim=0)
    return all_cif, all_times, all_events

In [None]:
n_epochs = 20
for epoch in range(1, n_epochs+1):
    train_loss, train_lik, train_rank = train_epoch(model, train_loader, optimizer)
    val_loss, val_lik, val_rank = evaluate(model, val_loader)

    print(f"[{epoch:03d}] "
          f"Train Loss={train_loss:.4f} (L={train_lik:.4f}, R={train_rank:.4f}) | "
          f"Val Loss={val_loss:.4f} (L={val_lik:.4f}, R={val_rank:.4f})")


[001] Train Loss=0.9030 (L=0.8940, R=0.0182) | Val Loss=0.7239 (L=0.7179, R=0.0119)
[002] Train Loss=0.7527 (L=0.7450, R=0.0153) | Val Loss=0.6945 (L=0.6876, R=0.0139)
[003] Train Loss=0.7387 (L=0.7312, R=0.0148) | Val Loss=0.6916 (L=0.6848, R=0.0135)
[004] Train Loss=0.7323 (L=0.7250, R=0.0147) | Val Loss=0.6860 (L=0.6800, R=0.0121)
[005] Train Loss=0.7294 (L=0.7221, R=0.0146) | Val Loss=0.6871 (L=0.6814, R=0.0113)
[006] Train Loss=0.7269 (L=0.7196, R=0.0145) | Val Loss=0.6844 (L=0.6783, R=0.0123)
[007] Train Loss=0.7256 (L=0.7183, R=0.0145) | Val Loss=0.6790 (L=0.6727, R=0.0126)
[008] Train Loss=0.7248 (L=0.7175, R=0.0146) | Val Loss=0.6817 (L=0.6756, R=0.0122)
[009] Train Loss=0.7233 (L=0.7161, R=0.0145) | Val Loss=0.6863 (L=0.6797, R=0.0132)
[010] Train Loss=0.7225 (L=0.7153, R=0.0145) | Val Loss=0.6854 (L=0.6783, R=0.0143)
[011] Train Loss=0.7221 (L=0.7149, R=0.0144) | Val Loss=0.6819 (L=0.6755, R=0.0128)
[012] Train Loss=0.7220 (L=0.7147, R=0.0145) | Val Loss=0.6744 (L=0.6685, R=

In [None]:
torch.save(model.state_dict(), "./data/parameters/deephit_model_feature_concat.pth")

In [None]:
# train set CIF 추출
cif_train, times_train, events_train = get_cif_from_model(model, train_loader)

# 사건별 마지막 CIF를 입력으로 사용
X_risk = cif_train[:, :, -2].numpy()  # (num_samples, num_events)
weights = [0.2, 1, 0.3, 5]

risk_target = np.zeros(X_risk.shape[0])
for i in range(len(events_train)):
    t_i = min(times_train[i], cif_train.shape[2]-2)  # 최대값 제한
    if events_train[i] >= 0:
        risk_target[i] = cif_train[i, events_train[i], t_i].item()
    else:
        risk_target[i] = cif_train[i, :, t_i].sum().item()  # 검열 처리

risk_model = Models.WeightedCoxRiskEstimator(num_events=X_risk.shape[1], weights=weights, device=device)
risk_model.fit(X_risk, times_train, events_train)

torch.save(risk_model.event_linears.state_dict(), "./data/parameters/risk_model_event_linears.pth")

In [None]:
risk_scores = risk_model.predict(X_risk)

print("최대값:", np.max(risk_scores))
print("최소값:", np.min(risk_scores))
print("평균값:", np.mean(risk_scores))
print("앞 10개 값:", risk_scores[:10])

최대값: 99.712364
최소값: 18.784138
평균값: 77.81959
앞 10개 값: [99.17208  67.95394  76.25765  98.17578  44.27909  58.175556 98.68389
 98.74225  99.566154 96.214615]


In [None]:

# tensor → numpy 변환
events_np = events_train.numpy()

# 사건 라벨 종류 (-1은 검열)
unique_events = np.unique(events_np)

print("=== 라벨별 Risk Score 통계 ===")
for e in unique_events:
    mask = (events_np == e)
    scores_e = risk_scores[mask]

    if len(scores_e) == 0:
        continue

    print(f"\nEvent {e}:")
    print(f"  개수: {len(scores_e)}")
    print(f"  최대값: {np.max(scores_e):.4f}")
    print(f"  최소값: {np.min(scores_e):.4f}")
    print(f"  평균값: {np.mean(scores_e):.4f}")



=== 라벨별 Risk Score 통계 ===

Event -1:
  개수: 366123
  최대값: 99.7118
  최소값: 18.7841
  평균값: 75.8396

Event 0:
  개수: 41868
  최대값: 99.7124
  최소값: 18.9264
  평균값: 93.9902

Event 1:
  개수: 5857
  최대값: 99.7050
  최소값: 19.3771
  평균값: 83.4820

Event 2:
  개수: 6164
  최대값: 99.7020
  최소값: 19.8268
  평균값: 81.9528

Event 3:
  개수: 2441
  최대값: 99.6347
  최소값: 19.5633
  평균값: 73.4187
