### 모델 학습용 코드 구현 및 실행

- 학습별 코드 분리 (구분선 사용 및 해당 모델 이름 작성)
- 학습된 파라미터는 ./parameters 에 .pth 형식으로 저장하여 사용

In [14]:
input_file_path = ['./data/encoded_dataset.csv']

### Colab 사용시 주석 제거

# !rm -rf SKN19_2ND_5TEAM
# !git clone https://github.com/SKNetworks-AI19-250818/SKN19_2ND_5TEAM.git
# %cd SKN19_2ND_5TEAM

# import sys
# sys.path.append('/content/SKN19_2ND_5TEAM')
# input_file_path = ['/content/SKN19_2ND_5TEAM/data/encoded_dataset.csv']

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch.utils.data import random_split, DataLoader
import torch.nn as nn
import torch.optim as optim
import pickle

import modules.DataAnalysis as DataAnalysis
import modules.ModelAnalysis as ModelAnalysis
import modules.DataModify as DataModify
from modules.DataModify import DataPreprocessing

import modules.Models as Models

In [16]:

print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())


2.8.0+cu126
12.6
91002


In [17]:
df = pd.read_csv("./data/Suicide2010-2021.csv")
dm = DataModify.DataPreprocessing(df)
df_sui,_,_ = dm.run()
df_sui.to_csv("./data/Suicide_encode.csv")

In [18]:
# 랜덤 시드 고정 : 결과 비교용
Models.set_seed(42)

# device 설정 (cuda 사용 가능 시 cuda 사용)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Dataset 로드
dataset = DataModify.CancerDataset(
    target_column='target_label',              # target column
    time_column='Survival months_bin_3m',      # Survival months
    file_paths=input_file_path,
    transform=None          # 기존에 정제가 완료된 데이터를 사용할 경우 None
)


# train set size 설정 및 분리
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 64

# 데이터를 로드
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


# 모델 초기화
input_dim = dataset.data.shape[1]   # input dimension : data의 feature의 개수
hidden_size = (128, 64)             # 1번째, 2번째 hidden layer의 size
time_bins = 91                      # 3개월 단위로 time을 split하여 각 구간으로 삼음 -> 270개월+ 는 하나로 취급
num_events = 4                      # 사건의 개수

# 모델 선언
model = Models.DeepHitSurvWithSEBlock(input_dim, hidden_size, time_bins, num_events, dropout=.2).to(device)

# 손실함수 및 optimizer 선언
optimizer = optim.Adam(model.parameters(), lr=1e-3)


Using device: cuda


In [19]:
# 모델 학습
def train_epoch(model, loader, optimizer, device=device):
    # 모델을 train 모드로 설정
    model.train()
    # loss 변수 선언
    total_loss, total_lik, total_rank = 0, 0, 0

    # loader에서 불러온 데이터를 기반으로 학습
    for x, times, events in loader:
        x, times, events = x.to(device), times.to(device), events.to(device)


        optimizer.zero_grad()
        logits, pmf, cif = model(x)
        loss, L_lik, L_rank = Models.deephit_loss(pmf, cif, times, events)

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * x.size(0)
        total_lik += L_lik.item() * x.size(0)
        total_rank += L_rank.item() * x.size(0)

    n = len(loader.dataset)
    return total_loss/n, total_lik/n, total_rank/n

# 모델 평가
def evaluate(model, loader, device=device):
    # 모델을 평가 모드로 설정
    model.eval()
    total_loss, total_lik, total_rank = 0, 0, 0
    
    with torch.no_grad():
        for x, times, events in loader:
            x, times, events = x.to(device), times.to(device), events.to(device)

            logits, pmf, cif = model(x)
            loss, L_lik, L_rank = Models.deephit_loss(pmf, cif, times, events)

            total_loss += loss.item() * x.size(0)
            total_lik += L_lik.item() * x.size(0)
            total_rank += L_rank.item() * x.size(0)

    n = len(loader.dataset)
    return total_loss/n, total_lik/n, total_rank/n

def get_cif_from_model(model, loader, device=device):
    model.eval()
    all_cif = []
    all_times = []
    all_events = []
    with torch.no_grad():
        for x, times, events in loader:
            x = x.to(device)
            logits, pmf, cif = model(x)
            all_cif.append(cif.cpu())
            all_times.append(times)
            all_events.append(events)
    all_cif = torch.cat(all_cif, dim=0)  # (num_samples, num_events, time_bins)
    all_times = torch.cat(all_times, dim=0)
    all_events = torch.cat(all_events, dim=0)
    return all_cif, all_times, all_events

In [20]:
n_epochs = 20
for epoch in range(1, n_epochs+1):
    train_loss, train_lik, train_rank = train_epoch(model, train_loader, optimizer)
    val_loss, val_lik, val_rank = evaluate(model, val_loader)

    print(f"[{epoch:03d}] "
          f"Train Loss={train_loss:.4f} (L={train_lik:.4f}, R={train_rank:.4f}) | "
          f"Val Loss={val_loss:.4f} (L={val_lik:.4f}, R={val_rank:.4f})")


[001] Train Loss=1.1529 (L=1.1394, R=0.0271) | Val Loss=0.8056 (L=0.7959, R=0.0193)
[002] Train Loss=0.7883 (L=0.7791, R=0.0184) | Val Loss=0.7620 (L=0.7519, R=0.0202)
[003] Train Loss=0.7551 (L=0.7463, R=0.0176) | Val Loss=0.7274 (L=0.7194, R=0.0160)
[004] Train Loss=0.7470 (L=0.7384, R=0.0172) | Val Loss=0.7271 (L=0.7193, R=0.0156)
[005] Train Loss=0.7436 (L=0.7351, R=0.0169) | Val Loss=0.7256 (L=0.7173, R=0.0167)
[006] Train Loss=0.7411 (L=0.7326, R=0.0169) | Val Loss=0.7394 (L=0.7295, R=0.0198)
[007] Train Loss=0.7394 (L=0.7310, R=0.0168) | Val Loss=0.7228 (L=0.7156, R=0.0143)
[008] Train Loss=0.7386 (L=0.7302, R=0.0169) | Val Loss=0.7216 (L=0.7146, R=0.0140)
[009] Train Loss=0.7370 (L=0.7286, R=0.0168) | Val Loss=0.7229 (L=0.7141, R=0.0176)
[010] Train Loss=0.7367 (L=0.7283, R=0.0168) | Val Loss=0.7183 (L=0.7110, R=0.0145)
[011] Train Loss=0.7353 (L=0.7270, R=0.0168) | Val Loss=0.7257 (L=0.7174, R=0.0165)
[012] Train Loss=0.7351 (L=0.7268, R=0.0167) | Val Loss=0.7188 (L=0.7121, R=

In [21]:
torch.save(model.state_dict(), "./data/parameters/deephit_model_without_drop_cols.pth")

In [49]:
from sklearn.base import BaseEstimator, RegressorMixin

class WeightedCoxRiskEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, num_events=4, lr=1e-2, epochs=100, weights=None, verbose=False, device='cpu'):
        self.num_events = num_events
        self.lr = lr
        self.epochs = epochs
        self.verbose = verbose
        self.device = device
        
        # 가중치는 학습에 사용되지 않음
        if weights is None:
            self.weights = torch.ones(num_events, device=device) / num_events
        else:
            self.weights = torch.tensor(weights, dtype=torch.float32, device=device)

    def _cox_ph_loss(self, risk_score, times, events):
        risk_score = risk_score.squeeze()
        loss = 0.0
        uncensored_idx = (events >= 0).nonzero()[0]
        if len(uncensored_idx) == 0:
            return torch.tensor(0.0, device=self.device)
        for i in uncensored_idx:
            t_i = times[i]
            mask = times >= t_i
            loss += - (risk_score[i] - torch.log(torch.exp(risk_score[mask]).sum()))
        return loss / len(uncensored_idx)

    def fit(self, X, times, events):
        X = torch.tensor(X, dtype=torch.float32, device=self.device)
        times = torch.tensor(times, dtype=torch.float32, device=self.device)
        events = torch.tensor(events, dtype=torch.float32, device=self.device)

        B, K = X.shape
        assert K == self.num_events

        # 사건별 단층 선형 회귀(SLP)
        self.event_linears = nn.ModuleList([nn.Linear(1, 1) for _ in range(K)]).to(self.device)

        optimizer = optim.Adam(self.event_linears.parameters(), lr=self.lr)

        for epoch in range(self.epochs):
            optimizer.zero_grad()

            # 사건별 위험점수
            r_list = [self.event_linears[k](X[:, k:k+1]) for k in range(K)]
            r_stack = torch.cat(r_list, dim=1)  # (B, K)

            # 학습 시에는 단순 평균 (weights 미적용)
            risk_score = r_stack.mean(dim=1)

            # Cox loss 계산
            loss = self._cox_ph_loss(risk_score, times, events)
            loss.backward()
            optimizer.step()

            if self.verbose and (epoch % 10 == 0 or epoch == self.epochs - 1):
                print(f"Epoch {epoch+1}/{self.epochs}, Loss: {loss.item():.6f}")

        return self

    def predict(self, X):
        X = torch.tensor(X, dtype=torch.float32, device=self.device)
        r_list = [self.event_linears[k](X[:, k:k+1]) for k in range(self.num_events)]
        r_stack = torch.cat(r_list, dim=1)

        # 예측 시에만 가중치 적용
        risk_score = (r_stack * self.weights).sum(dim=1)

        # Sigmoid 적용 후 0~100 스케일
        risk_score_scaled = torch.sigmoid(risk_score) * 100

        return risk_score_scaled.detach().cpu().numpy()
   

In [None]:
# train set CIF 추출
cif_train, times_train, events_train = get_cif_from_model(model, train_loader)

# 사건별 마지막 CIF를 입력으로 사용
X_risk = cif_train[:, :, -2].numpy()  # (num_samples, num_events)
weights = [1.5, 3, 0.5, 3]

risk_target = np.zeros(X_risk.shape[0])
for i in range(len(events_train)):
    t_i = min(times_train[i], cif_train.shape[2]-2)  # 최대값 제한
    if events_train[i] >= 0:
        risk_target[i] = cif_train[i, events_train[i], t_i].item()
    else:
        risk_target[i] = cif_train[i, :, t_i].sum().item()  # 검열 처리

risk_model = WeightedCoxRiskEstimator(num_events=X_risk.shape[1], weights=weights, device=device)
risk_model.fit(X_risk, times_train, events_train)

torch.save(risk_model.event_linears.state_dict(), "./data/parameters/risk_model_event_linears.pth")

  times = torch.tensor(times, dtype=torch.float32, device=self.device)
  events = torch.tensor(events, dtype=torch.float32, device=self.device)


In [70]:
risk_scores = risk_model.predict(X_risk)

print("최대값:", np.max(risk_scores))
print("최소값:", np.min(risk_scores))
print("평균값:", np.mean(risk_scores))
print("앞 10개 값:", risk_scores[:10])

최대값: 91.94423
최소값: 5.954586
평균값: 68.19512
앞 10개 값: [ 7.1735663 18.823639  24.198622  12.140998  54.68644    7.11794
 15.090263  52.445255  90.72409   91.4985   ]


In [71]:

# tensor → numpy 변환
events_np = events_train.numpy()

# 사건 라벨 종류 (-1은 검열)
unique_events = np.unique(events_np)

print("=== 라벨별 Risk Score 통계 ===")
for e in unique_events:
    mask = (events_np == e)
    scores_e = risk_scores[mask]

    if len(scores_e) == 0:
        continue

    print(f"\nEvent {e}:")
    print(f"  개수: {len(scores_e)}")
    print(f"  최대값: {np.max(scores_e):.4f}")
    print(f"  최소값: {np.min(scores_e):.4f}")
    print(f"  평균값: {np.mean(scores_e):.4f}")



=== 라벨별 Risk Score 통계 ===

Event -1:
  개수: 366514
  최대값: 91.9442
  최소값: 5.9546
  평균값: 66.4709

Event 0:
  개수: 42018
  최대값: 91.8404
  최소값: 6.5019
  평균값: 84.3570

Event 1:
  개수: 5800
  최대값: 91.8333
  최소값: 6.4398
  평균값: 66.2527

Event 2:
  개수: 6140
  최대값: 91.8419
  최소값: 6.4507
  평균값: 62.3290

Event 3:
  개수: 95
  최대값: 91.7119
  최소값: 6.8284
  평균값: 69.6589
