In [2]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np

In [3]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, Dataset
import json
import re
from hanspell import spell_checker
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# ===== 1. Load data =====
# Parse the whole JSON file into `raw_data`.
with open("prompt_dataset_input_label22.json", mode="r", encoding="utf-8") as json_file:
    raw_data = json.loads(json_file.read())

# ===== 1.5 Cleaning function definitions =====
def clean_text(text, use_spellcheck=False):
    """Normalize a single text string.

    Removes every character that is not a Hangul syllable (U+AC00-U+D7A3),
    an ASCII letter or digit, whitespace, or one of ``. , ! ?``; then
    collapses each whitespace run to one space and strips both ends.

    Args:
        text: The string to clean.
        use_spellcheck: When True, the cleaned text is passed through
            ``spell_checker.check`` and its ``checked`` attribute is used.

    Returns:
        The cleaned string. If the spell check raises an ordinary exception,
        the cleaned-but-unchecked text is returned and a ``RuntimeWarning``
        is emitted.
    """
    import warnings  # local import keeps this block self-contained

    text = re.sub(r"[^\uAC00-\uD7A3a-zA-Z0-9\s.,!?]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    if use_spellcheck:
        try:
            text = spell_checker.check(text).checked
        except Exception as exc:
            # Spell checking is best-effort: keep the unchecked text, but do
            # not hide the failure. Catching Exception (not a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            warnings.warn(
                f"spell check failed; keeping unchecked text: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
    return text

def clean_raw_data(data, min_len=5, max_len=300, use_spellcheck=False):
    """Clean a list of ``{"input", "label"}`` records and filter them.

    Both fields of every record are passed through ``clean_text``. A record
    is kept only when its cleaned input has a length between ``min_len`` and
    ``max_len`` (inclusive) and that cleaned input has not already been kept.
    The label is cleaned but takes no part in filtering.

    Args:
        data: Iterable of mappings with "input" and "label" keys.
        min_len: Minimum allowed length of the cleaned input.
        max_len: Maximum allowed length of the cleaned input.
        use_spellcheck: Forwarded to ``clean_text``.

    Returns:
        A list of ``{"input": ..., "label": ...}`` dicts in original order.
    """
    kept_inputs = set()
    records = []
    for entry in data:
        source_text = clean_text(entry["input"], use_spellcheck)
        target_text = clean_text(entry["label"], use_spellcheck)
        # Skip repeats of an input that was already accepted.
        if source_text in kept_inputs:
            continue
        # Skip inputs outside the inclusive length window.
        if not (min_len <= len(source_text) <= max_len):
            continue
        kept_inputs.add(source_text)
        records.append({"input": source_text, "label": target_text})
    return records

# ===== 1.6 Apply data cleaning =====
# Spell checking is left off here; turning it on makes cleaning slow.
cleaned_data = clean_raw_data(raw_data, use_spellcheck=False)

# ===== 2. Convert to a Dataset and split =====
raw_dataset = Dataset.from_list(cleaned_data)
# Hold out 10% of the records as the evaluation split, with a fixed seed.
dataset = raw_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = dataset["train"], dataset["test"]


In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_dir = "./t5_finetuned_result"  # must match the directory name the model was saved under

tokenizer = T5Tokenizer.from_pretrained(model_dir)
# Original author's note: this assignment must be set.
# NOTE(review): with pad aliased to EOS, any code that detects padding by
# comparing token ids against `tokenizer.pad_token_id` will also match the
# real end-of-sequence token - confirm this override is actually required.
tokenizer.pad_token = tokenizer.eos_token

# Move the model to the `device` chosen earlier in the notebook instead of a
# hard-coded "cuda", so this cell also runs on machines without a GPU.
model = T5ForConditionalGeneration.from_pretrained(model_dir).to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def preprocess_batch(examples, max_input_length=128, max_target_length=128):
    """Tokenize a batch of input/label texts for seq2seq training.

    Args:
        examples: Mapping with "input" and "label" entries, each a list of
            strings (the batched form passed by ``Dataset.map``).
        max_input_length: Length the inputs are padded/truncated to.
        max_target_length: Length the labels are padded/truncated to.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry:
        the label token ids, where every padding position is replaced by -100.
    """
    inputs = tokenizer(
        examples["input"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples["label"],
            padding="max_length",
            truncation=True,
            max_length=max_target_length,
        )
    # Mask padding using the attention mask rather than the pad token id.
    # When the pad token is set to the EOS token, an id comparison would also
    # replace the genuine end-of-sequence token with -100.
    labels = [
        [token if keep else -100 for token, keep in zip(token_ids, mask)]
        for token_ids, mask in zip(
            target_encoding["input_ids"], target_encoding["attention_mask"]
        )
    ]
    inputs["labels"] = labels
    return inputs

# Tokenize both splits in batches, removing the columns each split had before mapping.
train_dataset = train_dataset.map(
    preprocess_batch,
    batched=True,
    remove_columns=train_dataset.column_names,
)
eval_dataset = eval_dataset.map(
    preprocess_batch,
    batched=True,
    remove_columns=eval_dataset.column_names,
)

# Return the tokenized columns in "torch" format for both splits.
for tokenized_split in (train_dataset, eval_dataset):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2475/2475 [00:00<00:00, 4858.97 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 276/276 [00:00<00:00, 4919.08 examples/s]


In [None]:
training_args = TrainingArguments(
    # Output location and reporting
    output_dir="./ke-t5-base_finetuned",
    report_to="none",

    # Evaluate, log and save once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,

    # Optimizer and schedule
    learning_rate=5e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,

    # Batch sizes and run length
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=20,

    # Best-model selection: lower eval_loss is better
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


In [None]:
# 6. Define the Trainer and train (with EarlyStopping added)
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping],
)

trainer.train()

In [25]:
import re

# Output post-processing function
def clean_output(text):
    """Post-process decoded model output and return its first sentence.

    Steps, in order: remove literal "<pad>" and "</s>" markers, collapse a
    word repeated back-to-back into one occurrence, replace runs of two or
    more whitespace characters with one space, then split after ``.``, ``?``
    or ``!`` followed by whitespace (or on newlines) and return the first
    piece, stripped.
    """
    for special_token in ("<pad>", "</s>"):
        text = text.replace(special_token, "")
    # Remove repeated words
    deduped = re.sub(r"\b(\w+)(\s+\1)+\b", r"\1", text)
    # Remove duplicate whitespace
    squeezed = re.sub(r"\s{2,}", " ", deduped)
    # re.split always yields at least one element, so indexing [0] is safe.
    first_sentence = re.split(r'(?<=[.?!])\s+|\n+', squeezed.strip())[0]
    return first_sentence.strip()

# Prompt-rewriting function (beam-search based)
def refine_prompt(bad_prompt, max_new_tokens=64):
    """Rewrite a prompt with the fine-tuned model and return the cleaned text.

    Builds a few-shot instruction (three worked examples followed by
    ``bad_prompt``), tokenizes it, generates with the module-level ``model``,
    decodes the first returned sequence, and passes it through
    ``clean_output``.

    Sampling is enabled (``do_sample=True``) together with ``num_beams=4``,
    so repeated calls with the same input may return different text.

    Args:
        bad_prompt: The prompt text to rewrite.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.

    Returns:
        The post-processed generated string.
    """
    prompt = (
        "Îã§ÏùåÏùÄ ÏÇ¨Ïö©ÏûêÏùò Î™®Ìò∏ÌïòÍ±∞ÎÇò ÏßßÏùÄ ÌîÑÎ°¨ÌîÑÌä∏ÏûÖÎãàÎã§.\n"
        "Ïù¥ ÌîÑÎ°¨ÌîÑÌä∏Î•º Îçî Íµ¨Ï≤¥Ï†ÅÏù¥Í≥† Î™ÖÌôïÌïòÍ≤å Î∞îÍøîÏ£ºÏÑ∏Ïöî.\n\n"
        "ÏòàÏãú1:\n"
        "ÌîÑÎ°¨ÌîÑÌä∏: ÏöîÏïΩÌï¥Ï§ò\n"
        "Í∞úÏÑ†Îêú ÌîÑÎ°¨ÌîÑÌä∏: ÏïÑÎûò Í∏∞ÏÇ¨ ÎÇ¥Ïö©ÏùÑ 3Ï§Ñ Ïù¥ÎÇ¥Î°ú ÌïµÏã¨Îßå ÏöîÏïΩÌï¥Ï§ò.\n\n"
        "ÏòàÏãú2:\n"
        "ÌîÑÎ°¨ÌîÑÌä∏: Ï¢ãÏùÄ ÏÑ†Î¨º Ï∂îÏ≤úÌï¥Ï§ò\n"
        "Í∞úÏÑ†Îêú ÌîÑÎ°¨ÌîÑÌä∏: 20ÎåÄ ÏßÅÏû•Ïù∏ ÎÇ®Ïûê ÏπúÍµ¨ÏóêÍ≤å Ïñ¥Ïö∏Î¶¨Îäî ÏÉùÏùº ÏÑ†Î¨º 3Í∞ÄÏßÄÎ•º Ï∂îÏ≤úÌï¥Ï§ò.\n\n"
        "ÏòàÏãú3:\n"
        "ÌîÑÎ°¨ÌîÑÌä∏: Ïä§ÎßàÌä∏Ìè∞ Ï∂îÏ≤ú\n"
        "Í∞úÏÑ†Îêú ÌîÑÎ°¨ÌîÑÌä∏: 2024ÎÖÑ ÏÉÅÎ∞òÍ∏∞ Í∏∞Ï§Ä, 30ÎåÄ Ïó¨ÏÑ± ÏßÅÏû•Ïù∏ÏóêÍ≤å Ï†ÅÌï©Ìïú Í∞ÄÏÑ±ÎπÑ Ïä§ÎßàÌä∏Ìè∞ 3Í∞ÄÏßÄÎ•º Ï∂îÏ≤úÌï¥Ï§ò.\n\n"
        f"ÌîÑÎ°¨ÌîÑÌä∏: {bad_prompt}\n"
        "Í∞úÏÑ†Îêú ÌîÑÎ°¨ÌîÑÌä∏:"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    # Drop token_type_ids when the tokenizer produced them; they are not
    # forwarded to generate().
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]

    outputs = model.generate(
        **inputs,
        # Honor the caller's limit (previously a hard-coded 64 shadowed it).
        max_new_tokens=max_new_tokens,
        do_sample=True,
        num_beams=4,
        top_p=0.8,
        temperature=0.9,
        repetition_penalty=1.3,
    )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clean_output(result)


In [26]:
# Sample prompts: run each through refine_prompt and print input and output.
prompts = [
    "AIÍ∞Ä Î≠êÏïº?",
    "ÏöîÏïΩÌï¥Ï§ò",
    "ÏÑ§Î™ÖÌï¥Ï§ò",
    "Ï¢ãÏùÄ ÏÑ†Î¨ºÎ¨º Ï∂îÏ≤ú",
    "Ï†ÑÏãúÌöå",
]

separator = "=" * 60
for p in prompts:
    result = refine_prompt(p)
    # One print call with newline separators emits the same three lines.
    print(f"üìù ÏûÖÎ†•: {p}", f"‚úÖ Ï∂úÎ†•: {result}", separator, sep="\n")


üìù ÏûÖÎ†•: AIÍ∞Ä Î≠êÏïº?
‚úÖ Ï∂úÎ†•: ÏÜåÎπÑÏûê Î¶¨Î∑∞ÏóêÏÑú Î∞∞ÏÜ°, ÌíàÏßà, CS Ìï≠Î™© Î™®Îëê Ï¢ãÏùÄ ÌèâÍ∞ÄÎ•º Î∞õÏùÄ ÏáºÌïëÎ™∞ 3Í≥≥ÏùÑ ÏÜåÍ∞úÌï¥Ï§ò.
üìù ÏûÖÎ†•: ÏöîÏïΩÌï¥Ï§ò
‚úÖ Ï∂úÎ†•: Ïã§Ï†ú Íµ¨Îß§ Îç∞Ïù¥ÌÑ∞Î•º Í∏∞Î∞òÏúºÎ°ú ÌåêÎß§Í∞Ä ÎÜíÏùÄ ÏÉÅÌíà 3Í∞ÄÏßÄÎ•º Ï∂îÏ≤úÌï¥Ï§ò.
üìù ÏûÖÎ†•: ÏÑ§Î™ÖÌï¥Ï§ò
‚úÖ Ï∂úÎ†•: ÏûêÏ∑®ÏÉùÏù¥ Í∞ÄÏû• ÎßéÏù¥ Ï∞æÎäî Íµ≠ÎÇ¥ Ïú†Î™Ö Î∏åÎûúÎìú 3Í≥≥ÏùÑ ÏÜåÍ∞úÌïòÍ≥† Ïù∏Í∏∞ Ïù¥Ïú†Î•º ÏïåÎ†§Ï§ò.
üìù ÏûÖÎ†•: Ï¢ãÏùÄ ÏÑ†Î¨ºÎ¨º Ï∂îÏ≤ú
‚úÖ Ï∂úÎ†•: Ïã§Ï†ú Íµ¨Îß§ Îç∞Ïù¥ÌÑ∞Î•º Í∏∞Î∞òÏúºÎ°ú ÌåêÎß§Í∞Ä ÎÜíÏùÄ Ï†úÌíà 3Í∞ÄÏßÄÎ•º Ï∂îÏ≤úÌï¥Ï§ò.
üìù ÏûÖÎ†•: Ï†ÑÏãúÌöå
‚úÖ Ï∂úÎ†•: Î™®Î∞îÏùº ÏáºÌïë Ïï± ÎÇ¥ Î≤†Ïä§Ìä∏ÏÖÄÎü¨ ÏÉÅÌíà 3Í∞ÄÏßÄÎ•º ÏÜåÍ∞úÌï¥Ï§ò.


In [27]:
# Print every cleaned label that contains the keyword below.
keyword = "ÌÉÄÏù¥Î∞ç"
for d in cleaned_data:
    label_text = d["label"]
    if keyword in label_text:
        print(label_text)

Ïù¥ÏßÅ Ïãú Ïó∞Î¥â ÌòëÏÉÅ Ï†ÑÎûµÍ≥º ÎßêÌï¥Ïïº Ìï† ÌÉÄÏù¥Î∞çÏùÑ ÏïåÎ†§Ï§ò.
Í≤ΩÎ†•ÏßÅ Ïó∞Î¥â ÌòëÏÉÅÏóêÏÑú Ïú†Î¶¨ÌïòÍ≤å ÌòëÏÉÅÌïòÎäî Î≤ïÍ≥º ÌÉÄÏù¥Î∞çÏùÑ ÏïåÎ†§Ï§ò.
Ïù¥ÏßÅ Ïãú Ïó∞Î¥â ÌòëÏÉÅ Ï†ÑÎûµÍ≥º ÎßêÌï¥Ïïº Ìï† ÌÉÄÏù¥Î∞çÏùÑ ÏïåÎ†§Ï§ò.
Í≤ΩÎ†•ÏßÅ Ïó∞Î¥â ÌòëÏÉÅÏóêÏÑú Ïú†Î¶¨ÌïòÍ≤å ÌòëÏÉÅÌïòÎäî Î≤ïÍ≥º ÌÉÄÏù¥Î∞çÏùÑ ÏïåÎ†§Ï§ò.
Ïù¥ÏßÅ Ïãú Ïó∞Î¥â ÌòëÏÉÅ Ï†ÑÎûµÍ≥º ÎßêÌï¥Ïïº Ìï† ÌÉÄÏù¥Î∞çÏùÑ ÏûêÏÑ∏Ìûà ÏïåÎ†§Ï§ò.
Í≤ΩÎ†•ÏßÅ Ïó∞Î¥â ÌòëÏÉÅÏóêÏÑú Ïú†Î¶¨ÌïòÍ≤å ÌòëÏÉÅÌïòÎäî Î≤ïÍ≥º ÌÉÄÏù¥Î∞çÏùÑ ÏûêÏÑ∏Ìûà ÏïåÎ†§Ï§ò.
