In [1]:
# ==========================================
# [AI vs Human(자기소개서) 분류 프로젝트]
# - KoBERT(backbone) freeze + Linear head 학습
# - Train/Valid로 best 저장 후, best로 Test 1회 평가
# ==========================================

In [2]:
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# =========================
# [0] Train / Valid / Test Split 
# =========================
from sklearn.model_selection import train_test_split
import os

ALL_PATH   = "./Data/all.csv"           ## 전체 데이터셋 경로
TEST_PATH  = "./Data/test.csv"          ## test 데이터 저장 경로

RANDOM_STATE = 42   ## 랜덤 상수 지정               
TEST_SIZE  = 0.1


df = pd.read_csv(ALL_PATH, encoding="utf-8-sig")

trainval_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["label"]
)

test_df.to_csv("./Data/test.csv", index=False, encoding="utf-8-sig")

print("[TRAINVAL]", len(trainval_df))
print("[TEST]", len(test_df))

[TRAINVAL] 144
[TEST] 17


In [4]:
# =========================
# 1) Utils: Seed 고정 (재현성)
# =========================
## 재현성을 위한 시드 고정
def seed_everything(seed: int = 42) :
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [5]:
# =============================================================
# 2) Dataset: CSV -> 토크나이즈 -> 텐서 반환
# =============================================================
class ResumeAIDataset(Dataset):
    """
    KoBERT 분류용 Dataset
    - 입력 CSV: text, label 컬럼 필요
    - 출력: input_ids, attention_mask, labels
    """

    def __init__(self, csv_path, tokenizer, max_len = 256):
        
        
        ## 클래스 맴버 df 초기화
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df = pd.read_csv(csv_path, encoding="utf-8-sig")
        
        # 필수 컬럼 체크 -> 없을 시 value에러 일으킴
        if "text" not in self.df.columns or "label" not in self.df.columns:
            raise ValueError(f"{csv_path}에는 'text', 'label' 컬럼이 필요합니다.")

        # 텍스트 최소 정리
        self.df["text"] = (
            self.df["text"]
            .astype(str)
            .str.replace("\ufeff", "") ## 엑셀 -> CSV 변환시 생기는 마크(BOM) 제거
            .str.strip() ## 앞뒤 공백 제거
        )

        # 빈 텍스트 제거
        self.df = self.df[self.df["text"] != ""].reset_index(drop=True)

        # label 정수화
        self.df["label"] = self.df["label"].astype(int)


    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        text = self.df.loc[idx, "text"]
        label = int(self.df.loc[idx, "label"])

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),            # (L,)
            "attention_mask": enc["attention_mask"].squeeze(0),  # (L,)
            "labels": torch.tensor(label, dtype=torch.long)      # ()
        }




In [6]:
# =========================
# 3) Model: KoBERT freeze + Linear head
# =========================
class KoBERTLinearHeadClassifier(nn.Module):
    """
    KoBERT : 한국어 문장을 "의미 좌표(768차원 벡터)"로 바꿔주는 거대한 뇌 언어
    BERT : 문장의 의미를 벡터로 압축하는 모델
    --> 이미 학습된 모델
    
    freeze의 의미 : KoBERT는 이미 학습되어 있으니 더이상 학습 X ==> 더이상 가중치 업데이터 X
    --> 학습 속도 증가, 과적합 저하, 적은 데이터에 적합
    
    KoBERT는 고정(freeze)하고 head(Linear)만 학습하는 분류기
    - forward 출력: logits (B, 2)
    """

    def __init__(self, pretrained_name = "skt/kobert-base-v1", num_labels = 2, dropout = 0.2):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        # backbone freeze : 학습 안하겠다. 
        for p in self.bert.parameters():
            p.requires_grad = False ## 가중치 업데이트 끄기

        hidden = self.bert.config.hidden_size   # 768
        self.dropout = nn.Dropout(dropout)      # 랜덤으로 20%를 0으로 만듦 -> 특정 신호의 의존 방어, train에만 적용, eval에는 off
        # self.classifier = nn.Linear(hidden, num_labels)   ## 선형 모델로
        self.classifier = nn.Sequential(    ## 비선형 모델로
            nn.Linear(768, 512),
            nn.GELU(),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.GELU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.GELU(),
            nn.Dropout(0.3),

            nn.Linear(128, 2)
        )


    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0, :]  # (B, 768) : [CLS]
        x = self.dropout(cls)
        logits = self.classifier(x)           # (B, 2)
        return logits




In [7]:
# =========================
# 4) Train / Evaluate
# =========================
def train_one_epoch(model: nn.Module,
                    loader: DataLoader,
                    optimizer: torch.optim.Optimizer,
                    criterion: nn.Module,
                    device: str) -> float:
    """1 epoch 학습"""
    model.train()
    total_loss = 0.0

    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / max(1, len(loader))


@torch.no_grad()
def evaluate(model: nn.Module,
             loader: DataLoader,
             device: str):
    """
    검증/테스트 평가
    - accuracy
    - macro F1 (데이터 적을 때/클래스 불균형에 비교적 안정)
    - confusion matrix
    """
    model.eval()
    y_true, y_pred = [], []

    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].cpu().numpy()

        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        y_true.extend(labels.tolist())
        y_pred.extend(preds.tolist())

    acc = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average="macro")
    cm = confusion_matrix(y_true, y_pred)
    return acc, f1_macro, cm




In [9]:

def main():
    seed_everything(42)

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    PRETRAINED = "skt/kobert-base-v1"

    MAX_LEN = 512
    BATCH_SIZE = 4
    EPOCHS = 15
    LR = 1e-3

    tokenizer = BertTokenizer.from_pretrained(PRETRAINED)

    # --- K-Fold 설정 ---
    skf = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=42
    )

    df = trainval_df.reset_index(drop=True)
    X = df["text"].values
    y = df["label"].values


    fold_f1_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"\n===== Fold {fold} =====")

        train_df = df.iloc[train_idx].reset_index(drop=True)
        val_df   = df.iloc[val_idx].reset_index(drop=True)

        train_df.to_csv("train_fold.csv", index=False, encoding="utf-8-sig")
        val_df.to_csv("val_fold.csv", index=False, encoding="utf-8-sig")

        train_ds = ResumeAIDataset("train_fold.csv", tokenizer, MAX_LEN)
        val_ds   = ResumeAIDataset("val_fold.csv", tokenizer, MAX_LEN)

        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
        val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

        # fold마다 새 모델
        model = KoBERTLinearHeadClassifier(pretrained_name=PRETRAINED).to(DEVICE)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=LR)

        # --- 학습 ---
        for epoch in range(1, EPOCHS + 1):
            train_loss = train_one_epoch(
                model, train_loader, optimizer, criterion, DEVICE
            )

        # --- 평가 ---
        _, val_f1, _ = evaluate(model, val_loader, DEVICE)
        print(f"[Fold {fold}] F1_macro = {val_f1:.4f}")

        fold_f1_scores.append(val_f1)

    print("\n===== K-Fold Result =====")
    print("F1 per fold:", fold_f1_scores)
    print("Mean F1 :", np.mean(fold_f1_scores))
    print("Std  F1 :", np.std(fold_f1_scores))

if __name__ == "__main__":
    main()




===== Fold 1 =====


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 981.27it/s, Materializing param=pooler.dense.weight]                               


[Fold 1] F1_macro = 0.3256

===== Fold 2 =====


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1002.40it/s, Materializing param=pooler.dense.weight]                              


[Fold 2] F1_macro = 0.3256

===== Fold 3 =====


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1009.14it/s, Materializing param=pooler.dense.weight]                              


[Fold 3] F1_macro = 0.3256

===== Fold 4 =====


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1021.11it/s, Materializing param=pooler.dense.weight]                              


[Fold 4] F1_macro = 0.3256

===== Fold 5 =====


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 922.23it/s, Materializing param=pooler.dense.weight]                               


[Fold 5] F1_macro = 0.3333

===== K-Fold Result =====
F1 per fold: [0.32558139534883723, 0.32558139534883723, 0.32558139534883723, 0.32558139534883723, 0.3333333333333333]
Mean F1 : 0.3271317829457364
Std  F1 : 0.0031007751937984327
