In [24]:
import os
import gc
import json
import torch
import logging
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm import tqdm
import torch.nn as nn
from pathlib import Path
from torch.cuda import Event
from typing import List, Dict, Tuple
from datetime import datetime
import torch.distributed as dist
from xgboost import XGBClassifier
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from torch.nn.parallel import DistributedDataParallel
from transformers import AdamW, get_linear_schedule_with_warmup, get_scheduler, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split, StratifiedKFold
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
import re

In [25]:
df = pd.read_csv("/content/drive/MyDrive/University/4-2/정보기술학회/data/medical_data.csv", encoding = 'utf-8')
df.shape

(48915, 11)

In [26]:
import re

# 데이터 전처리 및 준비
def preprocess_data(data):
    data.dropna(subset=['증상', '진료과목코드', '주상병코드'], inplace=True)
    return data

def clean_text(text):
    text = text.lower()  # 소문자 변환
    text = re.sub(r"[^가-힣a-zA-Z0-9\\s]", "", text)  # 특수문자 제거
    text = re.sub(r"\\s+", " ", text).strip()  # 공백 정리
    return text

# Custom Dataset 정의
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length
        )
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# KM-BERT 임베딩 함수 (Batch 처리)
def get_embeddings_with_dataset(dataset, model, batch_size=64, num_workers=4):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    embeddings = []

    for batch in tqdm(dataloader, desc="Generating embeddings"):
        input_ids = batch["input_ids"].squeeze(1).to(device)
        attention_mask = batch["attention_mask"].squeeze(1).to(device)
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(output)

    return np.vstack(embeddings)

# 모델 학습 및 평가 클래스 정의
class ModelTrainer:
    def __init__(self, model, train_loader, val_loader, test_loader, device, num_classes, num_epochs=10):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.device = device
        self.num_classes = num_classes
        self.num_epochs = num_epochs

        # Optimizer and Scheduler
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        num_training_steps = len(train_loader) * self.num_epochs
        num_warmup_steps = num_training_steps // 10
        self.scheduler = get_scheduler(
            "linear",
            optimizer=self.optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        # Loss function
        self.criterion = nn.CrossEntropyLoss()

    def train_epoch(self):
        """한 에폭의 학습을 수행하는 메서드"""
        self.model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        progress_bar = tqdm(self.train_loader, desc="Training")
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['label'].to(self.device)

            self.optimizer.zero_grad()
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = self.criterion(logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        epoch_loss = total_loss / len(self.train_loader)
        epoch_accuracy = accuracy_score(all_labels, all_preds)
        epoch_f1 = f1_score(all_labels, all_preds, average='weighted')

        return {
            'loss': epoch_loss,
            'accuracy': epoch_accuracy,
            'f1': epoch_f1
        }

    def evaluate(self, dataloader, mode='val'):
        """Validation 또는 Test 평가 메서드"""
        self.model.eval()
        total_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f"Evaluating ({mode})"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = self.criterion(logits, labels)

                total_loss += loss.item()
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(dataloader)
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds, average='weighted')

        metrics = {
            f'{mode}_loss': avg_loss,
            f'{mode}_accuracy': accuracy,
            f'{mode}_f1': f1
        }

        return metrics

    def train(self):
        """전체 학습 수행"""
        for epoch in range(self.num_epochs):
            print(f"\nEpoch {epoch + 1}/{self.num_epochs}")

            # Training
            train_metrics = self.train_epoch()
            print(f"Training metrics: {train_metrics}")

            # Validation
            val_metrics = self.evaluate(self.val_loader, mode='val')
            print(f"Validation metrics: {val_metrics}")

        # 최종 Test 평가
        test_metrics = self.evaluate(self.test_loader, mode='test')
        print("\nFinal Test Results:", test_metrics)

        return test_metrics

In [27]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

data = preprocess_data(df)

# Train-Test Split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 라벨 인코딩
label_encoder_diagnosis = LabelEncoder()
label_encoder_code = LabelEncoder()
train_data['진료과목코드'] = label_encoder_diagnosis.fit_transform(train_data['진료과목코드'])
test_data['진료과목코드'] = label_encoder_diagnosis.transform(test_data['진료과목코드'])
train_data['주상병코드'] = label_encoder_code.fit_transform(train_data['주상병코드'])
test_data['주상병코드'] = label_encoder_code.transform(test_data['주상병코드'])

In [28]:
#gpu 초기화
gc.collect()
torch.cuda.empty_cache()

In [29]:
# KM-BERT 모델 및 토크나이저 준비
tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-BERT-char16424")
model = AutoModelForSequenceClassification.from_pretrained("madatnlp/km-bert", num_labels=len(label_encoder_diagnosis.classes_))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 텍스트 데이터셋 및 데이터로더
train_dataset = TextDataset(train_data['증상'].tolist(), train_data['진료과목코드'].tolist(), tokenizer)
test_dataset = TextDataset(test_data['증상'].tolist(), test_data['진료과목코드'].tolist(), tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 데이터 확인
for batch in train_dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['label']
    print(f"KM-BERT Input IDs shape: {input_ids.shape}")
    print(f"KM-BERT Attention Mask shape: {attention_mask.shape}")
    print(f"KM-BERT Labels shape: {labels.shape}")
    break

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at madatnlp/km-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KM-BERT Input IDs shape: torch.Size([64, 512])
KM-BERT Attention Mask shape: torch.Size([64, 512])
KM-BERT Labels shape: torch.Size([64])


In [30]:
# ModelTrainer 초기화 및 학습
trainer = ModelTrainer(
    model=model,
    train_loader=train_dataloader,
    val_loader=test_dataloader,
    test_loader=test_dataloader,
    device=device,
    num_classes=len(label_encoder_diagnosis.classes_),
    num_epochs=1
)

trainer.train()


Epoch 1/1


Training: 100%|██████████| 612/612 [12:00<00:00,  1.18s/it, loss=2.2825]


Training metrics: {'loss': 2.2933085967902263, 'accuracy': 0.26773484616170906, 'f1': 0.22950174206191673}


Evaluating (val): 100%|██████████| 153/153 [01:01<00:00,  2.50it/s]


Validation metrics: {'val_loss': 2.1887322812298544, 'val_accuracy': 0.29479709700500867, 'val_f1': 0.23644439492580255}


Evaluating (test): 100%|██████████| 153/153 [01:01<00:00,  2.50it/s]


Final Test Results: {'test_loss': 2.1887322812298544, 'test_accuracy': 0.29479709700500867, 'test_f1': 0.23644439492580255}





{'test_loss': 2.1887322812298544,
 'test_accuracy': 0.29479709700500867,
 'test_f1': 0.23644439492580255}

In [31]:
# KM-BERT 전체 데이터셋 예측
def predict_kmbert(model, dataloader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting with KM-BERT"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            predictions.append(probs)
    return np.vstack(predictions)

kmbert_train_probs = predict_kmbert(model, train_dataloader, device)
kmbert_test_probs = predict_kmbert(model, test_dataloader, device)

# KM-BERT 출력 크기 확인
print(f"KM-BERT Train Probs Shape: {kmbert_train_probs.shape}")
print(f"KM-BERT Test Probs Shape: {kmbert_test_probs.shape}")

Predicting with KM-BERT: 100%|██████████| 612/612 [04:05<00:00,  2.49it/s]
Predicting with KM-BERT: 100%|██████████| 153/153 [01:01<00:00,  2.50it/s]

KM-BERT Train Probs Shape: (39132, 18)
KM-BERT Test Probs Shape: (9783, 18)





In [32]:
# 1차 분류: XGBoost 학습
# 정형 데이터 준비
X_train_tabular = train_data[['성별코드', '연령대코드', '요양일수', '입내원일수', '총처방일수']]
X_test_tabular = test_data[['성별코드', '연령대코드', '요양일수', '입내원일수', '총처방일수']]

one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
scaler = StandardScaler()

X_train_tabular = scaler.fit_transform(one_hot_encoder.fit_transform(X_train_tabular))
X_test_tabular = scaler.transform(one_hot_encoder.transform(X_test_tabular))

# 1차 분류: XGBoost
xgb_model = XGBClassifier()
xgb_model.fit(X_train_tabular, train_data['진료과목코드'])
xgb_train_probs = xgb_model.predict_proba(X_train_tabular)
xgb_test_probs = xgb_model.predict_proba(X_test_tabular)

# XGBoost 출력 크기 확인
print(f"XGBoost Train Probs Shape: {xgb_train_probs.shape}")
print(f"XGBoost Test Probs Shape: {xgb_test_probs.shape}")

XGBoost Train Probs Shape: (39132, 18)
XGBoost Test Probs Shape: (9783, 18)


In [33]:
# 1차 분류: Stack Ensemble
stack_train_input = np.hstack([kmbert_train_probs, xgb_train_probs])
stack_test_input = np.hstack([kmbert_test_probs, xgb_test_probs])

# Stack Ensemble 입력 크기 확인
print(f"Stack Train Input Shape: {stack_train_input.shape}")
print(f"Stack Test Input Shape: {stack_test_input.shape}")

stack_model = XGBClassifier()
stack_model.fit(stack_train_input, train_data['진료과목코드'])
stack_preds = stack_model.predict(stack_test_input)

# 1차 분류 성능 평가
print(f"1차 분류 Accuracy: {accuracy_score(test_data['진료과목코드'], stack_preds):.4f}")
# print("Classification Report for 1차 분류:")
# print(classification_report(test_data['진료과목코드'], stack_preds, target_names=label_encoder_diagnosis.classes_))

Stack Train Input Shape: (39132, 36)
Stack Test Input Shape: (9783, 36)
1차 분류 Accuracy: 0.2540


In [34]:
# 2차 분류: 주상병코드 예측
stack_input_2_train = np.hstack([stack_train_input, train_data['진료과목코드'].values.reshape(-1, 1)])
stack_input_2_test = np.hstack([stack_test_input, test_data['진료과목코드'].values.reshape(-1, 1)])

second_model = XGBClassifier()
second_model.fit(stack_input_2_train, train_data['주상병코드'])
y_second_pred = second_model.predict(stack_input_2_test)

print(f"2차 분류 Accuracy: {accuracy_score(test_data['주상병코드'], y_second_pred):.4f}")
# print("Classification Report for 2차 분류:")
# print(classification_report(test_data['주상병코드'], y_second_pred, target_names=label_encoder_code.classes_))

2차 분류 Accuracy: 0.3609


In [35]:
torch.save(model.state_dict(), "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filekmbert_finetuned_model.pt")

# 스택 모델 및 2차 모델 저장
joblib.dump(stack_model, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filestack_model_1.pkl")
joblib.dump(second_model, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filestack_model_2.pkl")

# XGB 모델 저장 (1차 분류용)
joblib.dump(xgb_model, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filexgb_model_for_1st_stage.pkl")

# 인코더 및 스케일러 저장
joblib.dump(label_encoder_diagnosis, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filelabel_encoder_diagnosis.pkl")
joblib.dump(label_encoder_code, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filelabel_encoder_code.pkl")
joblib.dump(one_hot_encoder, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_fileonehot_encoder.pkl")
joblib.dump(scaler, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filescaler.pkl")

print("모델 및 인코더 저장 완료.")

모델 및 인코더 저장 완료.
