In [1]:
# =========================
# DKTC 5-클래스 분류 (GPT 기반, KoGPT2) - GPU 안정 학습 버전
# - 학습: $HOME/work/DL thon/data/train.csv
# - 예측: $HOME/work/DL thon/data/test.csv
# - 제출 템플릿(읽기 전용): $HOME/work/DL thon/data/train.csv  ← 요청대로 템플릿 사용
# - 결과 저장: $HOME/work/DL thon/data/GPT_submission.csv
# =========================

# 0) 필수 라이브러리 설치(없으면 설치)
import sys, subprocess, importlib, os
def ensure(pkg):
    try:
        importlib.import_module(pkg)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
for p in ["transformers", "datasets", "accelerate", "scikit-learn", "pandas", "numpy", "torch"]:
    ensure(p)

# 1) 임포트
import re
import numpy as np
import pandas as pd
from typing import List
import inspect
import gc
import warnings

import torch
from torch import nn
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

# EarlyStoppingCallback은 구버전에 없을 수 있음(있을 때만 사용)
try:
    from transformers import EarlyStoppingCallback
    HAS_EARLY_STOP = True
except Exception:
    EarlyStoppingCallback = None
    HAS_EARLY_STOP = False

# 2) 고정 경로(요청사항 반영)
TRAIN_PATH = os.getenv("HOME") + "/work/DL thon/data/train.csv"
TEST_PATH = os.getenv("HOME") + "/work/DL thon/data/test.csv"
SUBMISSION_TEMPLATE_PATH = os.getenv("HOME") + "/work/DL thon/data/train.csv"     # 읽기 전용 템플릿(요청대로)
SUBMISSION_SAVE_PATH = os.getenv("HOME") + "/work/DL thon/data/GPT_submission.csv"  # 새 파일로 저장

# 3) 기본 설정
SEED = 42
set_seed(SEED)

MODEL_NAME = "skt/kogpt2-base-v2"  # 한국어 GPT-2 (Decoder-only)
NUM_LABELS = 5
NUM_FOLDS  = 5         # 데이터가 적으면 3 권장
EPOCHS     = 3
BATCH_SIZE = 8
LR         = 2e-5
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01
MAX_LENGTH = 256

# 4) CSV 안전 읽기(인코딩 자동 감지)
def read_csv_smart(path: str) -> pd.DataFrame:
    encodings = ["utf-8-sig", "utf-8", "cp949", "euc-kr", "ISO-8859-1"]
    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception as e:
            last_err = e
    raise last_err

# 5) 라벨 정규화 설정
CANONICAL_LABELS = ["협박","갈취","직장 내 괴롭힘","기타 괴롭힘","일상 대화"]
CODE_MAP = {"협박":"00","갈취":"01","직장 내 괴롭힘":"02","기타 괴롭힘":"03","일상 대화":"04"}
ALIASES = {
    "협박": {"협박","협박대화","협박 대화"},
    "갈취": {"갈취","갈취대화","갈취 대화"},
    "직장 내 괴롭힘": {"직장내괴롭힘","직장 내 괴롭힘","직장 내괴롭힘","직장내 괴롭힘","직장 내 괴롭힘대화","직장 내 괴롭힘 대화","직장내괴롭힘대화"},
    "기타 괴롭힘": {"기타 괴롭힘","기타괴롭힘","기타-괴롭힘","기타 괴롭힘대화","기타 괴롭힘 대화"},
    "일상 대화": {"일상 대화","일상대화","일반","일반 대화","일반대화"},
}
def normalize_label_name(x: str) -> str:
    s = str(x).strip()
    s = re.sub(r"(대화|유형|클래스|라벨)$", "", s).strip()
    s_cmp = re.sub(r"[\s\-\_]", "", s)
    for canon, variants in ALIASES.items():
        if s in variants or s_cmp in {re.sub(r'[\s\-\_]', '', v) for v in variants}:
            return canon
    if "협박" in s: return "협박"
    if "갈취" in s: return "갈취"
    if ("직장" in s) and ("괴롭힘" in s): return "직장 내 괴롭힘"
    if "괴롭힘" in s: return "기타 괴롭힘"
    if ("일상" in s) or ("일반" in s): return "일상 대화"
    if s in CANONICAL_LABELS: return s
    return s

# 6) 컬럼 자동 탐지
def infer_text_column(df: pd.DataFrame) -> str:
    candidates = ["text","sentence","utterance","dialogue","dialog","conversation","content","data","message","문장","대화","텍스트","내용"]
    for c in df.columns:
        if df[c].dtype == object:
            sample = df[c].dropna().astype(str).head(20).tolist()
            if any(len(str(t).strip()) > 0 for t in sample):
                if c.lower() in candidates or c in candidates:
                    return c
    obj_cols = [c for c in df.columns if df[c].dtype == object]
    if not obj_cols:
        raise ValueError("텍스트 컬럼(object dtype)을 찾을 수 없습니다.")
    best_col = max(obj_cols, key=lambda c: df[c].dropna().astype(str).str.len().mean())
    return best_col

def infer_label_column(df: pd.DataFrame) -> str:
    candidates = ["label","labels","class","category","target","y","라벨","클래스","카테고리"]
    for c in df.columns:
        if c.lower() in candidates or c in candidates:
            return c
    obj_cols = [c for c in df.columns if df[c].dtype == object]
    for c in obj_cols:
        uniq = set(df[c].dropna().astype(str).unique())
        if any(normalize_label_name(u) in CANONICAL_LABELS for u in uniq):
            return c
    raise ValueError("라벨 컬럼을 자동 탐지할 수 없습니다. label/class 등으로 지정해 주세요.")

# 7) 데이터 로드 (원본 불변)
train_df = read_csv_smart(TRAIN_PATH)
test_df  = read_csv_smart(TEST_PATH)
sub_tmpl = read_csv_smart(SUBMISSION_TEMPLATE_PATH)  # 요청대로 train.csv를 템플릿으로 사용

# 8) 텍스트/라벨 컬럼 파악 및 정규화
TEXT_COL = infer_text_column(train_df)
LABEL_COL = infer_label_column(train_df)

y_raw = train_df[LABEL_COL].astype(str).apply(normalize_label_name)
unknown = sorted(set(y_raw.unique()) - set(CANONICAL_LABELS))
if unknown:
    raise ValueError(f"다음 라벨 매핑 불가: {unknown} → normalize_label_name()에 패턴을 추가하세요.")
name2id = {name: i for i, name in enumerate(CANONICAL_LABELS)}
id2name = {i: name for name, i in name2id.items()}
y = y_raw.map(name2id)

def clean_text(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    return s

train_texts = train_df[TEXT_COL].astype(str).apply(clean_text).tolist()
test_texts  = test_df[infer_text_column(test_df)].astype(str).apply(clean_text).tolist()

# 8-1) FOLD 안전 설정
class_counts = pd.Series(y).value_counts()
min_class_count = int(class_counts.min())
if NUM_FOLDS > min_class_count:
    old = NUM_FOLDS
    NUM_FOLDS = max(2, min_class_count)
    print(f"[안내] 클래스 최소 개수({min_class_count})보다 폴드 수({old})가 큽니다 → {NUM_FOLDS}로 조정")

# 9) 토크나이저/모델
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    pad_token_id=tokenizer.pad_token_id,
    problem_type="single_label_classification",
    id2label={i: id2name[i] for i in range(NUM_LABELS)},
    label2id={id2name[i]: i for i in range(NUM_LABELS)},
)

def tokenize_batch(texts: List[str]):
    return tokenizer(
        texts,
        padding=False,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors=None,
    )

# 10) 데이터셋
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.encodings["input_ids"])
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

# 클래스 가중치 직접 계산(특정 fold에서 일부 클래스가 없어도 안전)
def make_class_weight(labels: List[int], num_labels: int) -> torch.Tensor:
    labels = np.array(labels, dtype=int)
    N = len(labels); K = num_labels
    counts = np.bincount(labels, minlength=K)
    weights = np.zeros(K, dtype=np.float32)
    for c in range(K):
        weights[c] = N / (K * counts[c]) if counts[c] > 0 else 0.0
    return torch.tensor(weights, dtype=torch.float)

# 11) Trainer 서브클래스(가중치 적용) - **kwargs로 추가 인자 흡수
class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights  # device 이동은 compute_loss에서
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits
        if self.class_weights is not None:
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        else:
            loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# 12) 평가 지표
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1_macro = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1_macro": f1_macro}

# == TrainingArguments를 버전에 맞춰 구성 (GPU 우선, 자동 안전 재시도 지원) ==
def build_training_args(output_dir: str, fold_seed: int, fp16_try: bool = True) -> TrainingArguments:
    params = set(inspect.signature(TrainingArguments.__init__).parameters.keys())
    desired = dict(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        lr_scheduler_type="linear",
        warmup_ratio=WARMUP_RATIO,
        evaluation_strategy="epoch",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        logging_steps=50,
        save_total_limit=1,
        seed=fold_seed,
        dataloader_num_workers=2,
        fp16=bool(fp16_try and torch.cuda.is_available()),
        bf16=False,
        gradient_accumulation_steps=1,
        report_to=[],
        # GPU 사용: no_cuda/use_cpu는 지정하지 않음
    )
    kw = {k: v for k, v in desired.items() if k in params}
    eval_key = "evaluation_strategy" if "evaluation_strategy" in params else ("eval_strategy" if "eval_strategy" in params else None)
    save_ok = "save_strategy" in params
    if "load_best_model_at_end" in kw and kw["load_best_model_at_end"]:
        if (eval_key is None) or (not save_ok):
            kw["load_best_model_at_end"] = False
        else:
            if eval_key not in kw:
                kw[eval_key] = kw.get("save_strategy", "epoch")
            else:
                if kw[eval_key] != kw.get("save_strategy", None):
                    kw[eval_key] = kw.get("save_strategy", "epoch")
    return TrainingArguments(**kw)

# 13) Stratified K-Fold 분할 + 학습/예측 (GPU 시도 → 실패 시 안전 모드 재시도)
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
test_logits_accum = None

for fold, (trn_idx, val_idx) in enumerate(skf.split(train_texts, y), start=1):
    print(f"\n===== FOLD {fold}/{NUM_FOLDS} =====")
    trn_texts = [train_texts[i] for i in trn_idx]
    val_texts = [train_texts[i] for i in val_idx]
    trn_labels = pd.Series(y).iloc[trn_idx].tolist()
    val_labels = pd.Series(y).iloc[val_idx].tolist()

    trn_enc = tokenize_batch(trn_texts)
    val_enc = tokenize_batch(val_texts)
    tst_enc = tokenize_batch(test_texts)

    # 사전 검증: 라벨 범위
    uniq_trn = sorted(set(trn_labels))
    assert all(0 <= int(lb) < NUM_LABELS for lb in uniq_trn), f"라벨이 0~{NUM_LABELS-1} 범위를 벗어났습니다: {uniq_trn}"

    # 데이터셋
    train_ds = SimpleDataset(trn_enc, trn_labels)
    val_ds   = SimpleDataset(val_enc, val_labels)
    test_ds  = SimpleDataset(tst_enc, labels=None)

    # 클래스 가중치
    cls_weights = make_class_weight(trn_labels, NUM_LABELS)

    # 모델
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.resize_token_embeddings(len(tokenizer))  # 임베딩 크기 동기화

    # 토큰 id 범위 점검(훈련/검증/테스트 전체)
    def max_token_id(enc):
        return max(int(np.max(ids)) for ids in enc["input_ids"])
    max_ids = [max_token_id(trn_enc), max_token_id(val_enc), max_token_id(tst_enc)]
    emb_n = model.get_input_embeddings().num_embeddings
    assert max(max_ids) < emb_n, f"토큰 id({max(max_ids)})가 임베딩 크기({emb_n})를 초과합니다."

    collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
    out_dir = f"./gpt2_dktc_fold{fold}"

    # 1차: GPU + (가능하면) fp16로 시도
    args = build_training_args(out_dir, SEED + fold, fp16_try=True)
    callbacks = [EarlyStoppingCallback(early_stopping_patience=2)] if HAS_EARLY_STOP else []

    def make_trainer(train_args: TrainingArguments):
        return WeightedTrainer(
            model=model,
            args=train_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            data_collator=collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            class_weights=cls_weights,
            callbacks=callbacks,
        )

    trainer = make_trainer(args)

    def train_with_auto_retry(trainer_obj, first_args):
        try:
            return trainer_obj.train(), trainer_obj, first_args
        except RuntimeError as e:
            msg = str(e)
            if "CUDA" in msg or "device-side assert" in msg or "cublas" in msg or "illegal memory access" in msg:
                warnings.warn(f"[경고] CUDA 관련 예외 감지: {msg[:120]}... → 안전 모드 재시도(fp16=False)")
                # 메모리/상태 정리
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                    torch.cuda.empty_cache()
                gc.collect()
                # 안전 모드: fp16 끄고 동일 설정 재생성
                safe_args = build_training_args(first_args.output_dir, first_args.seed, fp16_try=False)
                safe_trainer = make_trainer(safe_args)
                return safe_trainer.train(), safe_trainer, safe_args
            else:
                raise

    _, trainer, used_args = train_with_auto_retry(trainer, args)

    # 검증 성능 리포트(가능하면)
    try:
        val_pred_logits = trainer.predict(val_ds).predictions
        val_preds = np.argmax(val_pred_logits, axis=-1)
        print(classification_report(
            val_labels, val_preds,
            target_names=[id2name[i] for i in range(NUM_LABELS)],
            digits=4
        ))
    except Exception as e:
        print(f"[경고] classification_report 출력 중 예외: {e}")

    # 테스트 로짓 앙상블(평균)
    test_pred_logits = trainer.predict(test_ds).predictions  # [N_test, num_labels]
    if test_logits_accum is None:
        test_logits_accum = test_pred_logits
    else:
        test_logits_accum += test_pred_logits

# 14) 테스트 최종 예측
test_logits_mean = test_logits_accum / NUM_FOLDS
test_pred_ids = np.argmax(test_logits_mean, axis=-1)  # 0~4
id2name_local = {i: n for i, n in enumerate(CANONICAL_LABELS)}
test_pred_names = [id2name_local[i] for i in test_pred_ids]
test_pred_codes = [CODE_MAP[name] for name in test_pred_names]

# 15) 제출 파일 생성(원본 템플릿은 읽기 전용, 새 파일에 저장)
sub_df = sub_tmpl.copy()
candidate_cols = ["label","class","target","pred","prediction"]
label_col_to_use = None
for c in sub_df.columns:
    if c.lower() in candidate_cols:
        label_col_to_use = c
        break
if label_col_to_use is None:
    label_col_to_use = "label"
    if "label" not in sub_df.columns:
        sub_df["label"] = None

# 템플릿과 test 길이 불일치 시 test 기준으로 생성
if len(sub_df) != len(test_pred_codes):
    print(f"[안내] 템플릿 행수({len(sub_df)}) != test 샘플 수({len(test_pred_codes)}). test 기준 새 프레임 생성.")
    sub_df = pd.DataFrame({label_col_to_use: test_pred_codes})
else:
    sub_df[label_col_to_use] = test_pred_codes

# 저장(원본 파일 미변경)
os.makedirs(os.path.dirname(SUBMISSION_SAVE_PATH), exist_ok=True)
sub_df.to_csv(SUBMISSION_SAVE_PATH, index=False, encoding="utf-8-sig")
print(f"\n제출 파일이 저장되었습니다: {SUBMISSION_SAVE_PATH}")

# 참고: 'Some weights ... score.weight' 경고는 분류 헤드 초기화 알림으로 정상입니다(학습으로 최적화됩니다).



===== FOLD 1/5 =====


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  super().__init__(*args, **kwargs)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid us

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.4496,0.459217,0.862025,0.860784
2,0.2235,0.562917,0.877215,0.877815
3,0.1096,0.780887,0.877215,0.877647


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[경고] classification_report 출력 중 예외: Number of classes, 4, does not match size of target_names, 5. Try specifying the labels parameter


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



===== FOLD 2/5 =====


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.5306,0.394522,0.877215,0.876206
2,0.2506,0.530662,0.889873,0.890191
3,0.0921,0.708104,0.894937,0.894686


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[경고] classification_report 출력 중 예외: Number of classes, 4, does not match size of target_names, 5. Try specifying the labels parameter


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



===== FOLD 3/5 =====


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.4814,0.369532,0.893671,0.89206
2,0.3357,0.473257,0.889873,0.887987
3,0.1377,0.621584,0.889873,0.889185


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/to

[경고] classification_report 출력 중 예외: Number of classes, 4, does not match size of target_names, 5. Try specifying the labels parameter


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



===== FOLD 4/5 =====


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.4329,0.465721,0.873418,0.873552
2,0.2485,0.473327,0.886076,0.886353
3,0.1159,0.604017,0.894937,0.895452


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[경고] classification_report 출력 중 예외: Number of classes, 4, does not match size of target_names, 5. Try specifying the labels parameter


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



===== FOLD 5/5 =====


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.4902,0.524042,0.813924,0.816262
2,0.2815,0.448257,0.865823,0.865987
3,0.0359,0.492572,0.902532,0.902646


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[경고] classification_report 출력 중 예외: Number of classes, 4, does not match size of target_names, 5. Try specifying the labels parameter


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[안내] 템플릿 행수(3950) != test 샘플 수(500). test 기준 새 프레임 생성.

제출 파일이 저장되었습니다: /home/jovyan/work/DL thon/data/GPT_submission.csv
