In [None]:
# 1. 라이브러리 불러오기
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from tqdm import tqdm

# 2. 데이터 로드 및 전처리
train = pd.read_csv('./train.csv')

def make_input(row):
    sents = [row[f"sentence_{i}"] for i in range(4)]
    input_text = "문장을 순서대로 정렬하세요: " + " </s> ".join(sents)
    answer = [row[f"answer_{i}"] for i in range(4)]
    target_text = " ".join(map(str, answer))  # "0 2 3 1"
    return {"input": input_text, "target": target_text}

inputs = train.apply(make_input, axis=1).tolist()
train_data, valid_data = train_test_split(inputs, test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
valid_dataset = Dataset.from_pandas(pd.DataFrame(valid_data))

# 3. 토크나이저 및 모델 로딩
model_name = "KETI-AIR/ke-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 4. 토크나이징
def tokenize(example):
    model_inputs = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(example["target"], max_length=10, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_valid = valid_dataset.map(tokenize, batched=True)

# 5. 학습 설정
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# 6. Trainer 정의 및 학습
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid
)

trainer.train()

# 7. 저장
model.save_pretrained("./results")
tokenizer.save_pretrained("./results")


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 6615/6615 [00:03<00:00, 2121.41 examples/s]
Map: 100%|██████████| 736/736 [00:00<00:00, 2218.60 examples/s]
  0%|          | 5/8270 [04:50<145:26:56, 63.35s/it]

## 테스트 코드

In [4]:
# 8. 테스트셋 로딩
test = pd.read_csv('./test.csv')
sentences = test[[f"sentence_{i}" for i in range(4)]].values.tolist()

# 모델 재로딩
tokenizer = T5Tokenizer.from_pretrained("./results")
model = T5ForConditionalGeneration.from_pretrained("./results").to(device)
model.eval()

# 9. 추론 함수
def predict_order(sent_list):
    input_text = "문장을 순서대로 정렬하세요: " + " </s> ".join(sent_list)
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=512,
        padding="longest",
        truncation=True
    ).to(device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=10,
            do_sample=False,
            num_beams=4,
            pad_token_id=tokenizer.pad_token_id
        )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    try:
        return list(map(int, decoded.strip().split()))
    except:
        return [0, 1, 2, 3]

# 10. 전체 예측
predictions = [predict_order(s) for s in tqdm(sentences)]

# 11. 제출 파일 저장
submission = pd.read_csv("./sample_submission.csv")
for i in range(4):
    submission[f"answer_{i}"] = [pred[i] if len(pred) == 4 else i for pred in predictions]

submission.to_csv("final_submission.csv", index=False)
print("✅ 제출 파일 저장 완료: final_submission.csv")


OSError: Incorrect path_or_model_id: './results'. Please provide either the path to a local folder or the repo_id of a model on the Hub.