## 첫 학습 코드

In [None]:
# STEP 1: 라이브러리 임포트
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

# STEP 2: device selection — "cuda" when torch reports an available GPU, otherwise "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("사용 디바이스:", device)

# STEP 3: load the training CSV (it is split into train/validation further down)
# NOTE(review): presumably contains sentence_0..3 and answer_0..3 columns, since
# build_input/build_label read those keys — confirm against the actual file.
df = pd.read_csv("/kaggle/input/train-csv/train.csv")

def build_input(row):
    """Build the model prompt for one row: a task prefix followed by the four sentences joined with " [SEP] "."""
    sentence_texts = [str(row[f"sentence_{position}"]) for position in range(4)]
    return "문장 순서 맞추기: " + " [SEP] ".join(sentence_texts)

def build_label(row):
    """Return the training target: the four answer_* values of the row, space-separated."""
    answer_keys = ("answer_0", "answer_1", "answer_2", "answer_3")
    return " ".join(str(row[key]) for key in answer_keys)

# Derive the prompt ("inputs") and target ("labels") text columns row by row.
df["inputs"] = df.apply(build_input, axis=1)
df["labels"] = df.apply(build_label, axis=1)

# 80/20 train/validation split; the fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Only the two text columns are carried over; preserve_index=False keeps the pandas index out of the Dataset.
train_dataset = Dataset.from_pandas(train_df[["inputs", "labels"]], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[["inputs", "labels"]], preserve_index=False)

# STEP 4: load the pretrained tokenizer and seq2seq model, and move the model to the selected device
model_name = "paust/pko-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# STEP 5: preprocessing function
def tokenize_function(example):
    """Tokenize one (unbatched) example into encoder inputs and loss-ready labels.

    Args:
        example: Mapping with an "inputs" prompt string and a "labels" target string.

    Returns:
        The tokenizer output for the prompt (padded/truncated to 256 tokens) with an
        added "labels" list of length 10 in which padding positions are -100.
    """
    model_inputs = tokenizer(example["inputs"], padding="max_length", truncation=True, max_length=256)
    labels = tokenizer(text_target=example["labels"], padding="max_length", truncation=True, max_length=10)
    # The loss ignores positions labelled -100. Leaving the pad id in place would
    # train (and score) the model on predicting padding after the real answer.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [-100 if token_id == pad_id else token_id for token_id in labels["input_ids"]]
    return model_inputs

# STEP 6: tokenize both splits and expose the columns as torch tensors
tokenized_train = train_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_val = val_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_train.set_format(type="torch")
tokenized_val.set_format(type="torch")

# STEP 7: data loaders and optimizer
data_collator = DataCollatorForSeq2Seq(tokenizer, model)
train_loader = DataLoader(tokenized_train, batch_size=4, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(tokenized_val, batch_size=4, collate_fn=data_collator)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# STEP 8: evaluation function
def evaluate(model, loader):
    """Compute the mean per-batch loss and exact-match accuracy over `loader`.

    Args:
        model: Seq2seq model; it is switched to eval mode here.
        loader: Iterable of batches with "input_ids", "attention_mask" and "labels".

    Returns:
        Tuple (mean loss per batch, fraction of examples whose decoded greedy
        prediction equals the decoded label after stripping whitespace).
    """
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    print(">> Evaluation start")
    with torch.no_grad():
        for idx, batch in enumerate(loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            preds = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10, num_beams=1)
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            # -100 is a loss-masking marker, not a vocabulary id: restore the pad id before decoding.
            decodable_labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

            for pred, label in zip(decoded_preds, decoded_labels):
                if pred.strip() == label.strip():
                    correct += 1
                total += 1

            if idx % 100 == 0:
                print(f"Evaluation progress: {idx}/{len(loader)}")

    print(">> Evaluation done")
    return val_loss / len(loader), correct / total

# STEP 9: training loop
train_losses, val_losses, val_accuracies = [], [], []
epochs = 5
# -inf guarantees the first epoch always writes a checkpoint, even at 0 accuracy.
best_acc = float("-inf")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss, val_acc = evaluate(model, val_loader)

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1:02d} | Train Loss {avg_train_loss:.4f} | Val Loss {avg_val_loss:.4f} | Val Acc {val_acc:.4f}")

    # STEP 11: save the model. Saving only on improvement makes "./best_model"
    # hold the best-validation weights instead of whatever the last epoch produced.
    if val_acc > best_acc:
        best_acc = val_acc
        model.save_pretrained("./best_model")
        tokenizer.save_pretrained("./best_model")

# STEP 10: plot the curves
plt.figure(figsize=(6, 4))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Val Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss over Epochs")
plt.legend()
plt.grid(True)
plt.show()

plt.figure(figsize=(6, 4))
plt.plot(val_accuracies, label="Val Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Accuracy over Epochs")
plt.legend()
plt.grid(True)
plt.show()

# STEP 12: run prediction over the whole train.csv
df_full = pd.read_csv("/kaggle/input/train-csv/train.csv")
df_full["inputs"] = df_full.apply(build_input, axis=1)
# Only the prompt goes into the Dataset; IDs are re-attached from df_full after
# decoding (the loader is not shuffled, so row order is preserved).
full_dataset = Dataset.from_pandas(df_full[["inputs"]], preserve_index=False)

def tokenize_full(example):
    """Tokenize one prompt, padded/truncated to 256 tokens (no labels: inference only)."""
    return tokenizer(example["inputs"], padding="max_length", truncation=True, max_length=256)

# Drop the raw text column: the collator pads via the tokenizer, which cannot
# turn string columns into tensors.
tokenized_full = full_dataset.map(tokenize_full, remove_columns=["inputs"])
tokenized_full.set_format(type="torch")
full_loader = DataLoader(tokenized_full, batch_size=4, collate_fn=data_collator)

# STEP 13: predict and save the results
model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(full_loader, desc="Predicting train.csv"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10, num_beams=1)
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        for pred in decoded:
            try:
                order = [int(x) for x in pred.strip().split()]
            except ValueError:
                order = []
            # The frame below needs exactly four values per row; flag anything else with -1s.
            predictions.append(order if len(order) == 4 else [-1, -1, -1, -1])

submission_df = pd.DataFrame(predictions, columns=["answer_0", "answer_1", "answer_2", "answer_3"])
submission_df.insert(0, "ID", df_full["ID"].tolist())
submission_df.to_csv("train_predictions.csv", index=False)
print("train_predictions.csv 저장 완료")


## train.csv 확인 코드

In [None]:
# STEP 12: prepare prediction over the whole train.csv
df_full = pd.read_csv("/kaggle/input/train-csv/train.csv")

def build_input(row):
    """Build the model prompt for one row: task prefix plus the four sentences separated by " [SEP] "."""
    return f"문장 순서 맞추기: {row['sentence_0']} [SEP] {row['sentence_1']} [SEP] {row['sentence_2']} [SEP] {row['sentence_3']}"

df_full["inputs"] = df_full.apply(build_input, axis=1)
df_full["inputs"] = df_full["inputs"].astype(str)  # guard against the list error

# Keep the IDs separately (they are not put into the Dataset)
full_ids = df_full["ID"].tolist()

# Build the Dataset
from datasets import Dataset
full_dataset = Dataset.from_pandas(df_full[["inputs"]].copy(), preserve_index=False)

# Apply the tokenizer (no return_tensors; suited to inference)
def tokenize_full(example):
    """Tokenize one prompt, padded/truncated to 256 tokens."""
    return tokenizer(example["inputs"], padding="max_length", truncation=True, max_length=256)

tokenized_full = full_dataset.map(tokenize_full)
tokenized_full.set_format(type="torch")

# DataLoader (deliberately used without a collator!)
from torch.utils.data import DataLoader

full_loader = DataLoader(tokenized_full, batch_size=4)

# STEP 13: run prediction
model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(full_loader, desc="Predicting train.csv"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=10,
            num_beams=1
        )
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        for pred in decoded:
            try:
                order = [int(x) for x in pred.strip().split()]
            except ValueError:
                # Non-numeric token in the generated text.
                order = []
            # Outputs that parse but do not have exactly four values would make the
            # rows ragged, so they get the same -1 marker as unparseable ones.
            predictions.append(order if len(order) == 4 else [-1, -1, -1, -1])

# Save
import pandas as pd

submission_df = pd.DataFrame(predictions, columns=["answer_0", "answer_1", "answer_2", "answer_3"])
submission_df.insert(0, "ID", full_ids)
submission_df.to_csv("train_predictions.csv", index=False)
print("✅ train_predictions.csv 저장 완료")


## 테스트 코드

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# Device selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_dir = "./best_model"  # path of the trained model
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir).to(device)
model.eval()

# Load the test data
test_df = pd.read_csv("/kaggle/input/test-csv/test.csv")  # adjust the path if needed

# Build the input text
def build_input(row):
    """Build the model prompt for one row: task prefix plus the four sentences separated by " [SEP] "."""
    return f"문장 순서 맞추기: {row['sentence_0']} [SEP] {row['sentence_1']} [SEP] {row['sentence_2']} [SEP] {row['sentence_3']}"

test_df["inputs"] = test_df.apply(build_input, axis=1)

# Post-processing: keep only 0-3, drop duplicates, fill in whatever is missing
def post_process(pred_str):
    """Turn generated text into a valid ordering of the indices 0-3.

    Whitespace-separated tokens that are decimal integers in 0..3 are kept in
    first-seen order; everything else (other numbers, non-numeric tokens,
    repeats) is dropped. Indices that never appeared are appended in ascending
    order, so the result is always a permutation of [0, 1, 2, 3].

    Args:
        pred_str: Decoded model output. Non-string input yields the identity order.

    Returns:
        List of four distinct ints from 0..3.
    """
    if not isinstance(pred_str, str):
        return [0, 1, 2, 3]

    nums = []
    for token in pred_str.split():
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse,
        # so one odd token no longer discards the whole prediction.
        if not token.isdecimal():
            continue
        value = int(token)
        if 0 <= value <= 3 and value not in nums:
            nums.append(value)

    nums += [index for index in range(4) if index not in nums]
    return nums

# Inference: generate an ordering for every test row, batch by batch
results = []
batch_size = 16

for start in tqdm(range(0, len(test_df), batch_size)):
    stop = start + batch_size
    batch_inputs = test_df["inputs"][start:stop].tolist()
    encoded = tokenizer(
        batch_inputs,
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=10,
            num_beams=5,
            early_stopping=True,
        )

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    # Repair every decoded string into a four-element order before collecting it.
    results.extend(post_process(text) for text in decoded)

# Save the results: one answer_* column per position, next to the test IDs
answer_columns = {f"answer_{pos}": [order[pos] for order in results] for pos in range(4)}
submission = pd.DataFrame({"ID": test_df["ID"], **answer_columns})

submission.to_csv("submission.csv", index=False)
print("✅ submission.csv 저장 완료")


사용 디바이스: cpu


Exception: data did not match any variant of untagged enum ModelWrapper at line 251273 column 3

## 튜닝 후 코드

| 항목 | 설정 값 |
| --- | --- |
| Learning Rate | 2e-5 |
| Epochs | 10 |
| Batch Size | 8 |
| Weight Decay | 0.01 |
| Gradient Clip | 1.0 |

In [None]:
# STEP 1: 라이브러리 임포트
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

# STEP 2: device selection — "cuda" when torch reports an available GPU, otherwise "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ 사용 디바이스:", device)

# STEP 3: load the training CSV (it is split into train/validation further down)
# NOTE(review): presumably contains sentence_0..3 and answer_0..3 columns, since
# build_input/build_label read those keys — confirm against the actual file.
df = pd.read_csv("/kaggle/input/train-csv/train.csv")

def build_input(row):
    """Compose the prompt for one row: the task prefix, then sentence_0..3 separated by " [SEP] "."""
    parts = []
    for key in ("sentence_0", "sentence_1", "sentence_2", "sentence_3"):
        parts.append(str(row[key]))
    return "문장 순서 맞추기: " + " [SEP] ".join(parts)

def build_label(row):
    """Join the row's answer_0..answer_3 values into one space-separated target string."""
    answers = map(str, (row[f"answer_{slot}"] for slot in range(4)))
    return " ".join(answers)

# Derive the prompt ("inputs") and target ("labels") text columns row by row.
df["inputs"] = df.apply(build_input, axis=1)
df["labels"] = df.apply(build_label, axis=1)

# 80/20 train/validation split; the fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Only the two text columns are carried over; preserve_index=False keeps the pandas index out of the Dataset.
train_dataset = Dataset.from_pandas(train_df[["inputs", "labels"]], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[["inputs", "labels"]], preserve_index=False)

# STEP 4: load the pretrained tokenizer and seq2seq model, and move the model to the selected device
model_name = "paust/pko-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# STEP 5: preprocessing function
def tokenize_function(example):
    """Tokenize one (unbatched) example into encoder inputs and loss-ready labels.

    Args:
        example: Mapping with an "inputs" prompt string and a "labels" target string.

    Returns:
        The tokenizer output for the prompt (padded/truncated to 256 tokens) with an
        added "labels" list of length 10 in which padding positions are -100.
    """
    model_inputs = tokenizer(example["inputs"], padding="max_length", truncation=True, max_length=256)
    labels = tokenizer(text_target=example["labels"], padding="max_length", truncation=True, max_length=10)
    # The loss ignores positions labelled -100. Leaving the pad id in place would
    # train (and score) the model on predicting padding after the real answer.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [-100 if token_id == pad_id else token_id for token_id in labels["input_ids"]]
    return model_inputs

# STEP 6: tokenize both splits and expose the columns as torch tensors
tokenized_train = train_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_val = val_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_train.set_format(type="torch")
tokenized_val.set_format(type="torch")

# STEP 7: data loaders and optimizer
data_collator = DataCollatorForSeq2Seq(tokenizer, model)
train_loader = DataLoader(tokenized_train, batch_size=8, shuffle=True, collate_fn=data_collator)  # larger batch size
val_loader = DataLoader(tokenized_val, batch_size=8, collate_fn=data_collator)

# Optimizer (tuned hyperparameters)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# STEP 8: evaluation function
def evaluate(model, loader):
    """Compute the mean per-batch loss and exact-match accuracy over `loader`.

    Args:
        model: Seq2seq model; it is switched to eval mode here.
        loader: Iterable of batches with "input_ids", "attention_mask" and "labels".

    Returns:
        Tuple (mean loss per batch, fraction of examples whose decoded greedy
        prediction equals the decoded label after stripping whitespace).
    """
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            preds = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10, num_beams=1)
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            # -100 is a loss-masking marker, not a vocabulary id: restore the pad id before decoding.
            decodable_labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

            for pred, label in zip(decoded_preds, decoded_labels):
                if pred.strip() == label.strip():
                    correct += 1
                total += 1

    return val_loss / len(loader), correct / total

# STEP 9: training loop
train_losses, val_losses, val_accuracies = [], [], []
epochs = 10
# -inf guarantees the first epoch always writes a checkpoint, even at 0 accuracy.
best_acc = float("-inf")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # gradient clipping
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss, val_acc = evaluate(model, val_loader)

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # STEP 11: save the model. Saving only on improvement makes "./best_model"
    # hold the best-validation weights instead of whatever the last epoch produced.
    if val_acc > best_acc:
        best_acc = val_acc
        model.save_pretrained("./best_model")
        tokenizer.save_pretrained("./best_model")

# STEP 10: plot the curves
plt.figure(figsize=(6, 4))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Val Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss over Epochs")
plt.legend()
plt.grid(True)
plt.show()

plt.figure(figsize=(6, 4))
plt.plot(val_accuracies, label="Val Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Validation Accuracy over Epochs")
plt.legend()
plt.grid(True)
plt.show()

print("✅ 모델 저장 완료: ./best_model")


## 모델 개선 한번 더

Beam Search (num_beams=5)

입력 Prompt 개선 (자연어 지시어 형태)

Post-processing: 잘못된 순열 보정

In [None]:
# STEP 1: 라이브러리 임포트
import pandas as pd
import torch
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    get_scheduler,
)
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from itertools import permutations
import matplotlib.pyplot as plt
from tqdm import tqdm

# STEP 2: device selection — "cuda" when torch reports an available GPU, otherwise "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ 사용 디바이스:", device)

# STEP 3: load and preprocess the data
# NOTE(review): presumably contains sentence_0..3 and answer_0..3 columns, since
# build_input/build_label read those keys — confirm against the actual file.
df = pd.read_csv("/kaggle/input/train-csv/train.csv")

def build_input(row):
    """Build the instruction-style prompt: a header line, the four sentences numbered 1-4, then an answer cue."""
    prompt_lines = ["다음 네 문장을 읽고 가장 자연스러운 순서를 예측하세요:"]
    sentence_keys = ("sentence_0", "sentence_1", "sentence_2", "sentence_3")
    for number, key in enumerate(sentence_keys, start=1):
        prompt_lines.append(f"{number}. {row[key]}")
    prompt_lines.append("정답:")
    return "\n".join(prompt_lines)

def build_label(row):
    """Return the target text: answer_0 through answer_3, each stringified, separated by spaces."""
    pieces = []
    for position in range(4):
        pieces.append(str(row[f"answer_{position}"]))
    return " ".join(pieces)

# Derive the prompt ("inputs") and target ("labels") text columns row by row.
df["inputs"] = df.apply(build_input, axis=1)
df["labels"] = df.apply(build_label, axis=1)

# 80/20 train/validation split with a fixed seed; only the two text columns go into the Datasets.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df[["inputs", "labels"]], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[["inputs", "labels"]], preserve_index=False)

# STEP 4: load the pretrained tokenizer and seq2seq model, and move the model to the selected device
model_name = "paust/pko-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# STEP 5: tokenization function
def tokenize_function(example):
    """Tokenize one (unbatched) example into encoder inputs and loss-ready labels.

    Args:
        example: Mapping with an "inputs" prompt string and a "labels" target string.

    Returns:
        The tokenizer output for the prompt (padded/truncated to 256 tokens) with an
        added "labels" list of length 10 in which padding positions are -100.
    """
    model_inputs = tokenizer(example["inputs"], padding="max_length", truncation=True, max_length=256)
    labels = tokenizer(text_target=example["labels"], padding="max_length", truncation=True, max_length=10)
    # The loss ignores positions labelled -100. Leaving the pad id in place would
    # train (and score) the model on predicting padding after the real answer.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [-100 if token_id == pad_id else token_id for token_id in labels["input_ids"]]
    return model_inputs

# STEP 6: tokenize both splits and expose the columns as torch tensors
tokenized_train = train_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_val = val_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_train.set_format(type="torch")
tokenized_val.set_format(type="torch")

# STEP 7: data loaders, optimizer, scheduler
data_collator = DataCollatorForSeq2Seq(tokenizer, model)
train_loader = DataLoader(tokenized_train, batch_size=8, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(tokenized_val, batch_size=8, collate_fn=data_collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
# The factor 10 is the epoch count; it must match `epochs` in the training loop
# or the linear decay will end too early or too late.
num_training_steps = len(train_loader) * 10
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# STEP 8: correction helpers
def correct_order(pred_str):
    """Parse a predicted order, falling back to the identity order when it is invalid.

    Args:
        pred_str: Decoded model output, expected to be four space-separated integers.

    Returns:
        The parsed list when it is exactly a permutation of 0..3, otherwise [0, 1, 2, 3].
    """
    try:
        pred = [int(x) for x in pred_str.strip().split()]
    except (ValueError, AttributeError):
        # Non-numeric token, or a non-string input.
        return [0, 1, 2, 3]
    if len(pred) == 4 and set(pred) == {0, 1, 2, 3}:
        return pred
    return [0, 1, 2, 3]

def safe_parse(label_str):
    """Parse a decoded label into four ints, falling back to [0, 1, 2, 3].

    Args:
        label_str: Decoded label text, expected to be four space-separated integers.

    Returns:
        The parsed list when it has exactly four entries, otherwise [0, 1, 2, 3].
    """
    try:
        nums = [int(x) for x in label_str.strip().split()]
    except (ValueError, AttributeError):
        return [0, 1, 2, 3]
    return nums if len(nums) == 4 else [0, 1, 2, 3]

# STEP 9: evaluation function
def evaluate(model, loader):
    """Compute the mean per-batch loss and exact-match accuracy over `loader`.

    Predictions are generated with beam search (5 beams) and passed through
    correct_order; labels are decoded and passed through safe_parse.

    Args:
        model: Seq2seq model; it is switched to eval mode here.
        loader: Iterable of batches with "input_ids", "attention_mask" and "labels".

    Returns:
        Tuple (mean loss per batch, fraction of examples whose corrected
        prediction equals the parsed label).
    """
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            preds = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=10,
                num_beams=5,
                early_stopping=False
            )
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            decoded_preds = [correct_order(p) for p in decoded_preds]
            # -100 is a loss-masking marker, not a vocabulary id: restore the pad id before decoding.
            decodable_labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)
            decoded_labels = [safe_parse(label) for label in decoded_labels]

            for pred, label in zip(decoded_preds, decoded_labels):
                if pred == label:
                    correct += 1
                total += 1

    return val_loss / len(loader), correct / total

# STEP 10: training loop (with a one-line tqdm progress bar per epoch)
train_losses, val_losses, val_accuracies = [], [], []
epochs = 10
# -inf (not 0.0) so the first epoch always writes a checkpoint; with 0.0 a run
# whose accuracy stays at 0 would never create ./best_model.
best_acc = float("-inf")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1:02d}", leave=True)

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # The scheduler advances once per optimizer step.
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss, val_acc = evaluate(model, val_loader)

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # Keep only the checkpoint with the best validation accuracy so far.
    if val_acc > best_acc:
        best_acc = val_acc
        model.save_pretrained("./best_model")
        tokenizer.save_pretrained("./best_model")
        print(f"📦 모델 저장 완료 (Val Acc: {val_acc:.4f})")

# STEP 11: plot the curves
plt.figure(figsize=(6, 4))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Val Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss over Epochs")
plt.legend()
plt.grid(True)
plt.show()

plt.figure(figsize=(6, 4))
plt.plot(val_accuracies, label="Val Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Validation Accuracy over Epochs")
plt.legend()
plt.grid(True)
plt.show()


## 같은 모델 튜닝 후 마지막 결과

In [None]:
# STEP 1: 라이브러리
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

# STEP 2: device selection — "cuda" when torch reports an available GPU, otherwise "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ 디바이스:", device)

# STEP 3: load the data and gather the four answers / four sentences of each row into lists
df = pd.read_csv("/kaggle/input/dataset2/train.csv")
df["label"] = df.apply(lambda row: [row[f"answer_{i}"] for i in range(4)], axis=1)
df["sentences"] = df.apply(lambda row: [row[f"sentence_{i}"] for i in range(4)], axis=1)

# STEP 4: build the prompt and target strings
# Each sentence is passed through str() so a non-string cell (e.g. NaN from an
# empty CSV field) is stringified instead of making str.join raise TypeError.
df["inputs"] = df["sentences"].apply(lambda sents: f"문장 순서 맞추기: {' [SEP] '.join(map(str, sents))}")
df["labels"] = df["label"].apply(lambda order: " ".join(map(str, order)))

# 80/20 train/validation split; the fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df[["inputs", "labels"]], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[["inputs", "labels"]], preserve_index=False)


# STEP 5: load the pretrained tokenizer and seq2seq model, and move the model to the selected device
model_name = "paust/pko-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# STEP 6: preprocessing
def tokenize_function(example):
    """Tokenize one (unbatched) example into encoder inputs and loss-ready labels.

    Args:
        example: Mapping with an "inputs" prompt string and a "labels" target string.

    Returns:
        The tokenizer output for the prompt (padded/truncated to 256 tokens) with an
        added "labels" list of length 10 in which padding positions are -100.
    """
    model_inputs = tokenizer(example["inputs"], padding="max_length", truncation=True, max_length=256)
    labels = tokenizer(text_target=example["labels"], padding="max_length", truncation=True, max_length=10)
    # The loss ignores positions labelled -100. Leaving the pad id in place would
    # train (and score) the model on predicting padding after the real answer.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [-100 if token_id == pad_id else token_id for token_id in labels["input_ids"]]
    return model_inputs

tokenized_train = train_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_val = val_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_train.set_format(type="torch")
tokenized_val.set_format(type="torch")

# STEP 7: training setup (collator, loaders, optimizer)
data_collator = DataCollatorForSeq2Seq(tokenizer, model)
train_loader = DataLoader(tokenized_train, batch_size=16, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(tokenized_val, batch_size=16, collate_fn=data_collator)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

# STEP 8: evaluation function
def evaluate(model, loader):
    """Compute the mean per-batch loss and exact-match accuracy over `loader`.

    Args:
        model: Seq2seq model; it is switched to eval mode here.
        loader: Iterable of batches with "input_ids", "attention_mask" and "labels".

    Returns:
        Tuple (mean loss per batch, fraction of examples whose decoded 5-beam
        prediction equals the decoded label after stripping whitespace).
    """
    model.eval()
    val_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            preds = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                   max_length=10, num_beams=5)
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            # -100 is a loss-masking marker, not a vocabulary id: restore the pad id before decoding.
            decodable_labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

            for pred, label in zip(decoded_preds, decoded_labels):
                if pred.strip() == label.strip():
                    correct += 1
                total += 1
    return val_loss / len(loader), correct / total

# STEP 9: training loop
train_losses, val_losses, val_accuracies = [], [], []
epochs = 10
# -inf (not 0.0) so the first epoch always writes a checkpoint; with 0.0 a run
# whose accuracy stays at 0 would never create ./best_model.
best_acc = float("-inf")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss, val_acc = evaluate(model, val_loader)
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # Keep only the checkpoint with the best validation accuracy so far.
    if val_acc > best_acc:
        best_acc = val_acc
        model.save_pretrained("./best_model")
        tokenizer.save_pretrained("./best_model")
        print("💾 모델 저장 완료 (성능 개선)")

# STEP 10: plot the curves
plt.figure(figsize=(6, 4))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Val Loss")
plt.legend()
plt.title("Loss Curve")
plt.grid()
plt.show()

plt.figure(figsize=(6, 4))
plt.plot(val_accuracies, label="Validation Accuracy")
plt.legend()
plt.title("Accuracy Curve")
plt.grid()
plt.show()


## 진짜 마지막 튜닝 

학습률(Learning Rate): 1e-4 → 5e-5

In [None]:
# STEP 1: 라이브러리
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

# STEP 2: device selection — "cuda" when torch reports an available GPU, otherwise "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ 디바이스:", device)

# STEP 3: load the data and gather the four answers / four sentences of each row into lists
df = pd.read_csv("/kaggle/input/dataset2/train.csv")
df["label"] = df.apply(lambda row: [row[f"answer_{i}"] for i in range(4)], axis=1)
df["sentences"] = df.apply(lambda row: [row[f"sentence_{i}"] for i in range(4)], axis=1)

# STEP 4: build the prompt and target strings
# Each sentence is passed through str() so a non-string cell (e.g. NaN from an
# empty CSV field) is stringified instead of making str.join raise TypeError.
df["inputs"] = df["sentences"].apply(lambda sents: f"문장 순서 맞추기: {' [SEP] '.join(map(str, sents))}")
df["labels"] = df["label"].apply(lambda order: " ".join(map(str, order)))

# 80/20 train/validation split; the fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df[["inputs", "labels"]], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[["inputs", "labels"]], preserve_index=False)


# STEP 5: load the pretrained tokenizer and seq2seq model, and move the model to the selected device
model_name = "paust/pko-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# STEP 6: preprocessing
def tokenize_function(example):
    """Tokenize one (unbatched) example into encoder inputs and loss-ready labels.

    Args:
        example: Mapping with an "inputs" prompt string and a "labels" target string.

    Returns:
        The tokenizer output for the prompt (padded/truncated to 256 tokens) with an
        added "labels" list of length 10 in which padding positions are -100.
    """
    model_inputs = tokenizer(example["inputs"], padding="max_length", truncation=True, max_length=256)
    labels = tokenizer(text_target=example["labels"], padding="max_length", truncation=True, max_length=10)
    # The loss ignores positions labelled -100. Leaving the pad id in place would
    # train (and score) the model on predicting padding after the real answer.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [-100 if token_id == pad_id else token_id for token_id in labels["input_ids"]]
    return model_inputs

tokenized_train = train_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_val = val_dataset.map(tokenize_function, remove_columns=["inputs", "labels"])
tokenized_train.set_format(type="torch")
tokenized_val.set_format(type="torch")

# STEP 7: training setup (collator, loaders, optimizer)
data_collator = DataCollatorForSeq2Seq(tokenizer, model)
train_loader = DataLoader(tokenized_train, batch_size=16, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(tokenized_val, batch_size=16, collate_fn=data_collator)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

# STEP 8: evaluation function
def evaluate(model, loader):
    """Compute the mean per-batch loss and exact-match accuracy over `loader`.

    Args:
        model: Seq2seq model; it is switched to eval mode here.
        loader: Iterable of batches with "input_ids", "attention_mask" and "labels".

    Returns:
        Tuple (mean loss per batch, fraction of examples whose decoded 5-beam
        prediction equals the decoded label after stripping whitespace).
    """
    model.eval()
    val_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            preds = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                   max_length=10, num_beams=5)
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            # -100 is a loss-masking marker, not a vocabulary id: restore the pad id before decoding.
            decodable_labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

            for pred, label in zip(decoded_preds, decoded_labels):
                if pred.strip() == label.strip():
                    correct += 1
                total += 1
    return val_loss / len(loader), correct / total

# STEP 9: training loop
train_losses, val_losses, val_accuracies = [], [], []
epochs = 10
# -inf (not 0.0) so the first epoch always writes a checkpoint; with 0.0 a run
# whose accuracy stays at 0 would never create ./best_model.
best_acc = float("-inf")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss, val_acc = evaluate(model, val_loader)
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # Keep only the checkpoint with the best validation accuracy so far.
    if val_acc > best_acc:
        best_acc = val_acc
        model.save_pretrained("./best_model")
        tokenizer.save_pretrained("./best_model")
        print("💾 모델 저장 완료 (성능 개선)")

# STEP 10: plot the curves
plt.figure(figsize=(6, 4))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Val Loss")
plt.legend()
plt.title("Loss Curve")
plt.grid()
plt.show()

plt.figure(figsize=(6, 4))
plt.plot(val_accuracies, label="Validation Accuracy")
plt.legend()
plt.title("Accuracy Curve")
plt.grid()
plt.show()
