# 🧠 KoBERT Intent Classification - 실험용

In [1]:
import torch
import torch.nn as nn
from torch.nn.functional import softmax
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer, logging as transformers_logging
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle

transformers_logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class IntentDataset(Dataset):
    """Map-style dataset pairing tokenized questions with encoded intent ids.

    Sentences are read from the ``question`` column of ``df`` and labels from
    its ``intent`` column, converted with ``label_encoder.transform``.
    Tokenization happens lazily, one sentence per ``__getitem__`` call.

    Args:
        df: Frame providing the ``question`` and ``intent`` columns.
        tokenizer: Callable tokenizer invoked with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors='pt'``; its result must expose
            ``input_ids`` and ``attention_mask`` entries.
        label_encoder: Object whose ``transform`` maps intent names to ids.
        max_len: Length every sentence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, label_encoder, max_len=64):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sentences = df['question'].tolist()
        self.labels = label_encoder.transform(df['intent'].tolist())

    def __len__(self):
        """Return the number of sentences held by the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` tensors for ``idx``."""
        encoded = self.tokenizer(
            self.sentences[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) drops the leading dimension the tokenizer output carries
        # when called on a single sentence with return_tensors='pt'.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

class KoBERTClassifier(nn.Module):
    """Intent classifier: KoBERT encoder, then dropout, then a linear head.

    The encoder weights are loaded from the ``monologg/kobert`` checkpoint.
    ``forward`` returns raw, un-normalized logits with one column per label.

    Args:
        num_labels: Number of output classes of the linear head.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained("monologg/kobert")
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and project the pooled output to class logits."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.classifier(regularized)

In [4]:
# Load the cleaned intent corpus; the 'question' and 'intent' columns are used.
df = pd.read_csv("intent_dataset_cleaned.csv")

# Fit on the complete frame so the train and validation splits share a single
# intent-name -> id mapping.
label_encoder = LabelEncoder().fit(df['intent'])

tokenizer = BertTokenizer.from_pretrained("monologg/kobert")

# 80/20 split, stratified on the intent column, with a fixed seed.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['intent'],
)

train_dataset, val_dataset = (
    IntentDataset(part, tokenizer, label_encoder) for part in (train_df, val_df)
)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)



In [5]:
# Use the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One output logit per class known to the label encoder.
model = KoBERTClassifier(num_labels=len(label_encoder.classes_))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)



In [None]:
# Per-epoch metric history (read later by the plotting cell).
train_losses = []
val_accuracies = []
val_confidences = []

best_acc = 0
EPOCHS = 10

# Save the label encoder BEFORE training starts. The checkpoint written inside
# the loop can only be decoded together with this mapping, so persisting it up
# front keeps both artifacts available even if the run is interrupted
# part-way through (previously it was written only after the final epoch).
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

for epoch in range(EPOCHS):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"📚 Epoch {epoch+1} Loss: {avg_train_loss:.4f}")

    # ---- Validation pass ----
    model.eval()
    preds, targets, confs_all = [], [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            probs = softmax(outputs, dim=1)
            # Highest class probability and its index for every sample.
            confs, predicted = torch.max(probs, 1)

            # tolist() yields plain Python numbers, so the confidence average
            # below is accumulated in double precision instead of float32.
            preds.extend(predicted.tolist())
            targets.extend(labels.tolist())
            confs_all.extend(confs.tolist())

    acc = accuracy_score(targets, preds)
    avg_conf = sum(confs_all) / len(confs_all)
    val_accuracies.append(acc)
    val_confidences.append(avg_conf)
    print(f"🎯 Val Acc: {acc:.4f} | 🔍 Avg Confidence: {avg_conf:.4f}")

    # Checkpoint only when validation accuracy strictly improves.
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "best_kobert_model.pt")
        print(f"✅ Best model saved! Acc: {acc:.4f}")

Epoch 1: 100%|██████████| 245/245 [12:15<00:00,  3.00s/it]


📚 Epoch 1 Loss: 2.4408
🎯 Val Acc: 0.4035 | 🔍 Avg Confidence: 0.2698
✅ Best model saved! Acc: 0.4035


Epoch 2: 100%|██████████| 245/245 [12:22<00:00,  3.03s/it]


📚 Epoch 2 Loss: 1.9067
🎯 Val Acc: 0.4413 | 🔍 Avg Confidence: 0.3944
✅ Best model saved! Acc: 0.4413


Epoch 3: 100%|██████████| 245/245 [12:31<00:00,  3.07s/it]


📚 Epoch 3 Loss: 1.6664
🎯 Val Acc: 0.4729 | 🔍 Avg Confidence: 0.4296
✅ Best model saved! Acc: 0.4729


Epoch 4:   0%|          | 1/245 [00:02<11:39,  2.87s/it]

In [None]:
# Training curves: one line per tracked metric, indexed by epoch (0-based).
plt.figure(figsize=(10, 5))
for _series, _legend in (
    (train_losses, 'Train Loss'),
    (val_accuracies, 'Val Accuracy'),
    (val_confidences, 'Avg Confidence'),
):
    plt.plot(_series, label=_legend)
plt.title("Training Progress")
plt.xlabel("Epoch")
plt.grid(True)
plt.legend()
plt.show()