### 모델 학습용 코드 구현 및 실행

- 학습별 코드 분리 (구분선 사용 및 해당 모델 이름 작성)
- 학습된 파라미터는 ./parameters 에 .pth 형식으로 저장하여 사용

In [6]:
input_file_path = ['./data/encoded_dataset.csv']

### Colab 사용시 주석 제거

# !rm -rf SKN19_2ND_5TEAM
# !git clone https://github.com/SKNetworks-AI19-250818/SKN19_2ND_5TEAM.git
# %cd SKN19_2ND_5TEAM

# import sys
# sys.path.append('/content/SKN19_2ND_5TEAM')
# input_file_path = ['/content/SKN19_2ND_5TEAM/data/encoded_dataset.csv']

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch.utils.data import random_split, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

import modules.DataAnalysis as DataAnalysis
import modules.ModelAnalysis as ModelAnalysis
import modules.DataModify as DataModify
from modules.DataModify import DataPreprocessing

import modules.Models as Models

In [8]:

print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())


2.8.0+cu126
12.6
91002


In [9]:
df = pd.read_csv("./data/Suicide2010-2021.csv")
dm = DataModify.DataPreprocessing(df)
df_sui,_,_ = dm.run()
df_sui.to_csv("./data/Suicide_encode.csv")

In [None]:
# 랜덤 시드 고정 : 결과 비교용
Models.set_seed(42)

# device 설정 (cuda 사용 가능 시 cuda 사용)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Dataset 로드
dataset = DataModify.CancerDataset(
    target_column='target_label',              # target column
    time_column='Survival months_bin_3m',      # Survival months
    file_paths=input_file_path,
    transform=None          # 기존에 정제가 완료된 데이터를 사용할 경우 None
)


# train set size 설정 및 분리
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 64

# 데이터를 로드
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


# 모델 초기화
input_dim = dataset.data.shape[1]   # input dimension : data의 feature의 개수
hidden_size = (128, 64)             # 1번째, 2번째 hidden layer의 size
time_bins = 91                      # 3개월 단위로 time을 split하여 각 구간으로 삼음 -> 270개월+ 는 하나로 취급
num_events = 4                      # 사건의 개수

# 모델 선언
model = Models.DeepHitSurvWithSEBlock(input_dim, hidden_size, time_bins, num_events, dropout=.2).to(device)

# 손실함수 및 optimizer 선언
optimizer = optim.Adam(model.parameters(), lr=1e-3)


Using device: cuda


In [11]:
# 모델 학습
def train_epoch(model, loader, optimizer, device=device):
    # 모델을 train 모드로 설정
    model.train()
    # loss 변수 선언
    total_loss, total_lik, total_rank = 0, 0, 0

    # loader에서 불러온 데이터를 기반으로 학습
    for x, times, events in loader:
        x, times, events = x.to(device), times.to(device), events.to(device)


        optimizer.zero_grad()
        logits, pmf, cif = model(x)
        loss, L_lik, L_rank = Models.deephit_loss(pmf, cif, times, events)

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * x.size(0)
        total_lik += L_lik.item() * x.size(0)
        total_rank += L_rank.item() * x.size(0)

    n = len(loader.dataset)
    return total_loss/n, total_lik/n, total_rank/n

# 모델 평가
def evaluate(model, loader, device=device):
    # 모델을 평가 모드로 설정
    model.eval()
    total_loss, total_lik, total_rank = 0, 0, 0
    
    with torch.no_grad():
        for x, times, events in loader:
            x, times, events = x.to(device), times.to(device), events.to(device)

            logits, pmf, cif = model(x)
            loss, L_lik, L_rank = Models.deephit_loss(pmf, cif, times, events)

            total_loss += loss.item() * x.size(0)
            total_lik += L_lik.item() * x.size(0)
            total_rank += L_rank.item() * x.size(0)

    n = len(loader.dataset)
    return total_loss/n, total_lik/n, total_rank/n


In [None]:
n_epochs = 20
for epoch in range(1, n_epochs+1):
    train_loss, train_lik, train_rank = train_epoch(model, train_loader, optimizer)
    val_loss, val_lik, val_rank = evaluate(model, val_loader)

    print(f"[{epoch:03d}] "
          f"Train Loss={train_loss:.4f} (L={train_lik:.4f}, R={train_rank:.4f}) | "
          f"Val Loss={val_loss:.4f} (L={val_lik:.4f}, R={val_rank:.4f})")


[001] Train Loss=1.0986 (L=1.0852, R=0.0268) | Val Loss=0.8470 (L=0.8387, R=0.0166)
[002] Train Loss=0.8156 (L=0.8063, R=0.0186) | Val Loss=0.7857 (L=0.7778, R=0.0159)
[003] Train Loss=0.7974 (L=0.7885, R=0.0177) | Val Loss=0.7784 (L=0.7709, R=0.0149)
[004] Train Loss=0.7890 (L=0.7803, R=0.0174) | Val Loss=0.7664 (L=0.7596, R=0.0136)
[005] Train Loss=0.7842 (L=0.7756, R=0.0171) | Val Loss=0.7760 (L=0.7694, R=0.0132)
[006] Train Loss=0.7815 (L=0.7730, R=0.0171) | Val Loss=0.7683 (L=0.7605, R=0.0156)
[007] Train Loss=0.7761 (L=0.7678, R=0.0166) | Val Loss=0.7569 (L=0.7492, R=0.0153)
[008] Train Loss=0.7682 (L=0.7603, R=0.0159) | Val Loss=0.7554 (L=0.7481, R=0.0147)
[009] Train Loss=0.7663 (L=0.7583, R=0.0159) | Val Loss=0.7629 (L=0.7549, R=0.0161)
[010] Train Loss=0.7649 (L=0.7570, R=0.0158) | Val Loss=0.7543 (L=0.7473, R=0.0140)
[011] Train Loss=0.7629 (L=0.7551, R=0.0157) | Val Loss=0.7604 (L=0.7531, R=0.0146)
[012] Train Loss=0.7621 (L=0.7542, R=0.0157) | Val Loss=0.7553 (L=0.7476, R=

In [13]:
torch.save(model.state_dict(), "./data/parameters/deephit_model_without_drop_cols.pth")