In [1]:
# 필요한 라이브러리
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from torch.optim.lr_scheduler import MultiStepLR
from tqdm import tqdm
import copy

# ✅ GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ✅ 데이터 불러오기
df_train = pd.read_csv("/kaggle/input/poiuyt/train.csv")  # 경로 조정 필요

# ✅ label 0과 1에서 각각 250개씩 샘플링
sampled_0 = df_train[df_train['label'] == 0].sample(n=250, random_state=42)
sampled_1 = df_train[df_train['label'] == 1].sample(n=250, random_state=42)
df_sampled = pd.concat([sampled_0, sampled_1]).reset_index(drop=True)

# ✅ 하이퍼파라미터
MAX_LEN = 512
BATCH_SIZE = 32
EPOCHS = 30
NUM_FOLDS = 5

# ✅ 토크나이저
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

# ✅ Dataset 클래스
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# ✅ 모델 정의 (LSTM + BERT)
class LSTMClassifier(nn.Module):
    def __init__(self, hidden_dim=256, num_layers=1, freeze_bert=True):
        super(LSTMClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("klue/bert-base")
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        self.lstm = nn.LSTM(input_size=768, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        lstm_out, _ = self.lstm(outputs.last_hidden_state)
        final_hidden = lstm_out[:, -1, :]
        logits = self.fc(final_hidden)
        return self.sigmoid(logits).squeeze()

# ✅ 평가 함수
def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    preds, labels_all = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds += (outputs > 0.5).cpu().tolist()
            labels_all += labels.cpu().tolist()
    return total_loss / len(loader), accuracy_score(labels_all, preds)

# ✅ Stratified K-Fold 훈련
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(skf.split(df_sampled['text'], df_sampled['label'])):
    print(f"\n🔁 Fold {fold + 1}/{NUM_FOLDS}")

    train_df = df_sampled.iloc[train_idx]
    val_df = df_sampled.iloc[val_idx]

    train_dataset = TextDataset(train_df['text'].tolist(), train_df['label'].tolist(), tokenizer, MAX_LEN)
    val_dataset = TextDataset(val_df['text'].tolist(), val_df['label'].tolist(), tokenizer, MAX_LEN)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    model = LSTMClassifier(freeze_bert=False).to(device)
    criterion = nn.BCELoss()
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5, weight_decay=0.01)
    scheduler = MultiStepLR(optimizer, milestones=[10, 20], gamma=0.5)

    best_val_acc = 0
    early_stop_counter = 0
    early_stop_patience = 3

    for epoch in range(EPOCHS):
        model.train()
        total_loss, correct, total = 0, 0, 0
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} (Fold {fold+1})")

        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = (outputs > 0.5).float()
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            loop.set_postfix(loss=loss.item())

        train_acc = correct / total
        val_loss, val_acc = evaluate(model, val_loader, criterion)
        scheduler.step()

        print(f"Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            early_stop_counter = 0
            torch.save(model.state_dict(), f"best_model_fold_{fold+1}.pt")
            print("✅ Best model saved.")
        else:
            early_stop_counter += 1
            if early_stop_counter >= early_stop_patience:
                print("🛑 Early stopping")
                break

    fold_results.append({'fold': fold + 1, 'val_acc': best_val_acc})

# ✅ 결과 출력
results_df = pd.DataFrame(fold_results)
print("\n📊 Fold 결과:\n", results_df)
print("📈 평균 정확도:", results_df['val_acc'].mean())


2025-05-23 08:37:15.722281: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747989435.907975      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747989435.962091      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device: cuda


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]


🔁 Fold 1/5


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Epoch 1/30 (Fold 1): 100%|██████████| 13/13 [00:41<00:00,  3.16s/it, loss=0.616]


Train Acc: 0.5775 | Val Acc: 0.5600
✅ Best model saved.


Epoch 2/30 (Fold 1): 100%|██████████| 13/13 [00:44<00:00,  3.41s/it, loss=0.598]


Train Acc: 0.7225 | Val Acc: 0.8000
✅ Best model saved.


Epoch 3/30 (Fold 1): 100%|██████████| 13/13 [00:44<00:00,  3.39s/it, loss=0.397]


Train Acc: 0.9050 | Val Acc: 0.9300
✅ Best model saved.


Epoch 4/30 (Fold 1): 100%|██████████| 13/13 [00:44<00:00,  3.45s/it, loss=0.0832]


Train Acc: 0.9650 | Val Acc: 0.7800


Epoch 5/30 (Fold 1): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.0367]


Train Acc: 0.9800 | Val Acc: 0.9900
✅ Best model saved.


Epoch 6/30 (Fold 1): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0439]


Train Acc: 0.9900 | Val Acc: 0.9100


Epoch 7/30 (Fold 1): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0188]


Train Acc: 1.0000 | Val Acc: 0.9900


Epoch 8/30 (Fold 1): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0156]


Train Acc: 1.0000 | Val Acc: 0.9300
🛑 Early stopping

🔁 Fold 2/5


Epoch 1/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.689]


Train Acc: 0.6125 | Val Acc: 0.7200
✅ Best model saved.


Epoch 2/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.513]


Train Acc: 0.8575 | Val Acc: 0.8600
✅ Best model saved.


Epoch 3/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.42s/it, loss=0.139]


Train Acc: 0.9250 | Val Acc: 0.9700
✅ Best model saved.


Epoch 4/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.107] 


Train Acc: 0.9950 | Val Acc: 0.9600


Epoch 5/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.45s/it, loss=0.154] 


Train Acc: 0.9900 | Val Acc: 0.9800
✅ Best model saved.


Epoch 6/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0209]


Train Acc: 1.0000 | Val Acc: 0.9600


Epoch 7/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0166]


Train Acc: 1.0000 | Val Acc: 0.9900
✅ Best model saved.


Epoch 8/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0151]


Train Acc: 1.0000 | Val Acc: 0.9600


Epoch 9/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0106]


Train Acc: 1.0000 | Val Acc: 0.9900


Epoch 10/30 (Fold 2): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.00964]


Train Acc: 1.0000 | Val Acc: 0.9900
🛑 Early stopping

🔁 Fold 3/5


Epoch 1/30 (Fold 3): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.657]


Train Acc: 0.5550 | Val Acc: 0.6200
✅ Best model saved.


Epoch 2/30 (Fold 3): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.581]


Train Acc: 0.7200 | Val Acc: 0.8900
✅ Best model saved.


Epoch 3/30 (Fold 3): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.316]


Train Acc: 0.9100 | Val Acc: 0.8500


Epoch 4/30 (Fold 3): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.105] 


Train Acc: 0.9875 | Val Acc: 0.9800
✅ Best model saved.


Epoch 5/30 (Fold 3): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.033] 


Train Acc: 1.0000 | Val Acc: 0.9700


Epoch 6/30 (Fold 3): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0212]


Train Acc: 1.0000 | Val Acc: 0.9300


Epoch 7/30 (Fold 3): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.016] 


Train Acc: 1.0000 | Val Acc: 0.9700
🛑 Early stopping

🔁 Fold 4/5


Epoch 1/30 (Fold 4): 100%|██████████| 13/13 [00:44<00:00,  3.42s/it, loss=0.645]


Train Acc: 0.5975 | Val Acc: 0.6100
✅ Best model saved.


Epoch 2/30 (Fold 4): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.58] 


Train Acc: 0.7200 | Val Acc: 0.7700
✅ Best model saved.


Epoch 3/30 (Fold 4): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.289]


Train Acc: 0.9350 | Val Acc: 0.9600
✅ Best model saved.


Epoch 4/30 (Fold 4): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.089] 


Train Acc: 0.9900 | Val Acc: 0.9500


Epoch 5/30 (Fold 4): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.0522]


Train Acc: 0.9975 | Val Acc: 0.9100


Epoch 6/30 (Fold 4): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0236]


Train Acc: 1.0000 | Val Acc: 1.0000
✅ Best model saved.


Epoch 7/30 (Fold 4): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.0233]


Train Acc: 1.0000 | Val Acc: 0.9300


Epoch 8/30 (Fold 4): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.0149]


Train Acc: 1.0000 | Val Acc: 1.0000


Epoch 9/30 (Fold 4): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.0165]


Train Acc: 1.0000 | Val Acc: 0.8800
🛑 Early stopping

🔁 Fold 5/5


Epoch 1/30 (Fold 5): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.616]


Train Acc: 0.5150 | Val Acc: 0.5600
✅ Best model saved.


Epoch 2/30 (Fold 5): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.623]


Train Acc: 0.6850 | Val Acc: 0.7000
✅ Best model saved.


Epoch 3/30 (Fold 5): 100%|██████████| 13/13 [00:44<00:00,  3.42s/it, loss=0.408]


Train Acc: 0.8675 | Val Acc: 0.8800
✅ Best model saved.


Epoch 4/30 (Fold 5): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.0804]


Train Acc: 0.9550 | Val Acc: 0.9600
✅ Best model saved.


Epoch 5/30 (Fold 5): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0324]


Train Acc: 1.0000 | Val Acc: 0.9900
✅ Best model saved.


Epoch 6/30 (Fold 5): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.133] 


Train Acc: 0.9975 | Val Acc: 0.9900


Epoch 7/30 (Fold 5): 100%|██████████| 13/13 [00:44<00:00,  3.44s/it, loss=0.0217]


Train Acc: 0.9975 | Val Acc: 0.9500


Epoch 8/30 (Fold 5): 100%|██████████| 13/13 [00:44<00:00,  3.43s/it, loss=0.0112]


Train Acc: 1.0000 | Val Acc: 0.9700
🛑 Early stopping

📊 Fold 결과:
    fold  val_acc
0     1     0.99
1     2     0.99
2     3     0.98
3     4     1.00
4     5     0.99
📈 평균 정확도: 0.99


In [2]:
# ✅ 테스트 데이터 불러오기
df_test = pd.read_csv("/kaggle/input/poiuyt/test.csv")  # 경로 확인 필요
test_dataset = TextDataset(
    texts=df_test['text'].tolist(),
    labels=[0]*len(df_test),  # 더미 라벨
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# ✅ 모델 체크포인트를 저장해둔 경우 불러오기 예시
fold_model_paths = [f"best_model_fold_{i}.pt" for i in range(5)]

# ✅ 5개 모델로 예측 수행
all_probs = []

for fold in range(5):
    print(f"🔍 Fold {fold+1} 모델 로딩 및 예측")
    model = LSTMClassifier(freeze_bert=False).to(device)
    
    # 훈련 당시 저장된 모델 불러오기
    model.load_state_dict(torch.load(f"best_model_fold_{fold+1}.pt", map_location=device))  # 저장 경로 확인
    model.eval()
    
    probs = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            probs.extend(outputs.cpu().numpy())

    all_probs.append(np.array(probs))

# ✅ 소프트보팅: 평균 확률 → 0.5 기준 예측
avg_probs = np.mean(all_probs, axis=0)
final_preds = (avg_probs > 0.5).astype(int)

# ✅ 결과 저장
df_submission = df_test.copy()
df_submission['soft_voted_label'] = final_preds
df_submission.to_csv("submission_soft_voting.csv", index=False)
print("✅ submission_soft_voting.csv 저장 완료!")


🔍 Fold 1 모델 로딩 및 예측
🔍 Fold 2 모델 로딩 및 예측
🔍 Fold 3 모델 로딩 및 예측
🔍 Fold 4 모델 로딩 및 예측
🔍 Fold 5 모델 로딩 및 예측
✅ submission_soft_voting.csv 저장 완료!


In [3]:
# ✅ 소프트보팅: 평균 확률 → 0.5 기준 예측
avg_probs = np.mean(all_probs, axis=0)
final_preds = (avg_probs > 0.5).astype(int)

# ✅ 결과 저장 (id, label 형식)
df_submission = pd.DataFrame({
    'id': df_test['id'],
    'label': final_preds
})
df_submission.to_csv("submission.csv", index=False)
print("✅ submission.csv 저장 완료!")


✅ submission.csv 저장 완료!
