# Import 

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Data load & Pre-processing

In [2]:
train = pd.read_csv('./train.csv')

In [3]:
# 입력 텍스트와 타겟 텍스트를 구성하는 함수
def make_input(row):
    sentences = [row[f"sentence_{i}"] for i in range(4)]
    input_text = "문장을 순서대로 정렬하세요: " + " </s> ".join(sentences)
    answer = [row[f"answer_{i}"] for i in range(4)]
    target_text = " ".join(map(str, answer))  # 예: "0 3 1 2"
    return {"input": input_text, "target": target_text}

In [4]:
# 데이터셋 가공 및 분할
inputs = train.apply(make_input, axis=1).tolist()
train_data, valid_data = train_test_split(inputs, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
valid_dataset = Dataset.from_pandas(pd.DataFrame(valid_data))

# Model Load

In [5]:
# 토크나이저 및 모델 로딩
model_name = "t5-small"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = T5Tokenizer.from_pretrained(model_name,cache_dir='C:/huggingface_cache')
model = T5ForConditionalGeneration.from_pretrained(model_name,cache_dir='C:/huggingface_cache')
model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

# Train

In [6]:
# 토크나이징 함수 정의
def tokenize(example):
    model_inputs = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(example["target"], max_length=16, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 데이터셋 토크나이징
tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_valid = valid_dataset.map(tokenize, batched=True)

Map: 100%|██████████| 5880/5880 [00:03<00:00, 1655.83 examples/s]
Map: 100%|██████████| 1471/1471 [00:00<00:00, 1679.19 examples/s]


In [7]:
# 학습 설정
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=30,
    
    # Mixed Precision & 속도 최적화
    fp16=True,                             # 16비트 연산
    dataloader_pin_memory=True,            # 메모리 전송 최적화
    dataloader_num_workers=4,              # 병렬 데이터 로딩
)

# Trainer 정의 및 학습
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,1.3852
1000,0.2462
1500,0.2171
2000,0.2087
2500,0.2031
3000,0.1993
3500,0.1961
4000,0.1944
4500,0.1903
5000,0.1852


TrainOutput(global_step=11040, training_loss=0.2415983040695605, metrics={'train_runtime': 3226.3806, 'train_samples_per_second': 54.674, 'train_steps_per_second': 3.422, 'total_flos': 2.38742937796608e+16, 'train_loss': 0.2415983040695605, 'epoch': 30.0})

In [8]:
tokenizer.save_pretrained("./results")
model.save_pretrained("./results")

# Inference

In [9]:
# 모델 로드
model_dir = "./results"
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [10]:
# 테스트 데이터
test = pd.read_csv("./test.csv")
sentences = test[[f"sentence_{i}" for i in range(4)]].values.tolist()

# 추론 함수
def predict_order(sent_list):
    input_text = "문장을 순서대로 정렬하세요: " + " </s> ".join(sent_list)
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=512
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=16,
            do_sample=True,      
            temperature=0.2,     
            top_p=0.9,            
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    
    try:
        order = list(map(int, decoded.strip().split()))
        return order
    except:
        return [0, 1, 2, 3]

# 예측
predictions = []
for sent_group in tqdm(sentences, desc="Predicting"):
    pred = predict_order(sent_group)
    predictions.append(pred)

Predicting: 100%|██████████| 1780/1780 [06:43<00:00,  4.41it/s]


# Submission

In [11]:
# sample_submission 불러오기
sample_submission = pd.read_csv("./sample_submission.csv")

# 예측 결과 적용
for i in range(4):
    sample_submission[f"answer_{i}"] = [
        pred[i] if len(pred) == 4 else i for pred in predictions
    ]

# 저장
sample_submission.to_csv("baseline_submission.csv", index=False)