In [25]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so the dataset under MyDrive is reachable from the Colab VM.
drive.mount('/content/drive')

# CSV of sentence pairs; later cells read its "input" and "target" columns.
path = "/content/drive/MyDrive/slang_pairs_proper_v2.csv"
df = pd.read_csv(path)

# Quick sanity check of the first few rows.
print(df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                      input                     target
0    카메라을 헬스장에 두고 온 것 같습니다.      카메라을 헬스장에 두고 온 것 같노ㅎㅎ
1  설명을 조금 더 자세히 해 주실 수 있나요?  설명을 쪼끔 더 자세히 해 주실 수 있노?ㅎㅎ
2          카페에 저녁에 도착하겠습니다.              카페에 저녁에 도착하겠노
3     카메라을 버정에 두고 온 것 같습니다.         카메라을 버정에 두고 온 것 같누
4      카메라 배터리가 너무 빨리 닳습니다.        카메라 배터리가 너무 빨리 닳노ㄷㄷ


In [35]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the pairs for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

# Task prefix prepended to every source sentence (training and inference alike).
PREFIX = "slang: "

# Plain Python lists of strings: prefixed sources (X_*) and raw targets (y_*).
X_train = [PREFIX + text for text in train_df["input"].astype(str)]
y_train = list(train_df["target"].astype(str))
X_val = [PREFIX + text for text in val_df["input"].astype(str)]
y_val = list(val_df["target"].astype(str))

모델/토크나이저 준비

In [36]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Checkpoint shared by the tokenizer and the seq2seq model.
model_name = "google/mt5-small"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# from_pt=True: build the TensorFlow model from the checkpoint's PyTorch weights.
model = TFAutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    from_pt=True,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFMT5ForConditionalGeneration: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
- This IS expected if you are initializing TFMT5ForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFMT5ForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFMT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMT5ForConditionalGeneration for predictions without further training.


In [37]:
import tensorflow as tf

# Token-length caps: encoder inputs are cut/padded to max_in, decoder targets to max_out.
max_in, max_out = 128, 64

# Encoder inputs (input_ids + attention_mask), padded to a fixed length so they
# stack into dense tensors.
enc_train = tokenizer(
    X_train, max_length=max_in, truncation=True, padding="max_length", return_tensors="tf"
)
enc_val = tokenizer(
    X_val,   max_length=max_in, truncation=True, padding="max_length", return_tensors="tf"
)

# Decoder targets: only the token ids are needed as labels.
labels_train = tokenizer(
    text_target=y_train, max_length=max_out, truncation=True, padding="max_length", return_tensors="tf"
)["input_ids"]
labels_val = tokenizer(
    text_target=y_val,   max_length=max_out, truncation=True, padding="max_length", return_tensors="tf"
)["input_ids"]

# Replace pad ids with -100 so padded positions are ignored by the loss.
labels_train = tf.where(labels_train == tokenizer.pad_token_id, -100, labels_train)
labels_val   = tf.where(labels_val   == tokenizer.pad_token_id, -100, labels_val)

# Feature dicts consumed by Keras: encoder tensors plus the "labels" key.
train_features = dict(enc_train); train_features["labels"] = labels_train
val_features   = dict(enc_val);    val_features["labels"]   = labels_val

BATCH = 16  # 8-16 recommended on a T4; lower it further on OOM

# The shuffle buffer spans the whole training set: a fixed 10000-element buffer
# only partially shuffles once the set grows past that size. The tensors are
# already fully in memory, so the larger buffer costs nothing extra in practice.
train_ds = (
    tf.data.Dataset.from_tensor_slices(train_features)
    .shuffle(len(X_train))
    .batch(BATCH)
    .prefetch(tf.data.AUTOTUNE)
)
# Validation order does not matter, so no shuffling there.
val_ds = (
    tf.data.Dataset.from_tensor_slices(val_features)
    .batch(BATCH)
    .prefetch(tf.data.AUTOTUNE)
)


모델 학습

In [38]:
from transformers import create_optimizer
import math

EPOCHS = 5

# Schedule length: one optimizer step per batch (the last partial batch counts),
# with the first 10% of all steps used for learning-rate warmup.
steps_per_epoch = math.ceil(len(train_df) / BATCH)
num_train_steps = EPOCHS * steps_per_epoch
num_warmup_steps = int(0.1 * num_train_steps)

# create_optimizer returns (optimizer, lr_schedule); only the optimizer is used.
optimizer, _ = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# The TF seq2seq model computes its loss internally from the "labels" feature,
# so no loss is passed to compile().
model.compile(optimizer=optimizer)
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### 모델 저장

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
# NOTE(review): the directory is named "bert-base" although the checkpoint loaded
# in this notebook is google/mt5-small — consider renaming once any code that
# reads from this path is updated too.
for artifact in (model, tokenizer):
    artifact.save_pretrained('seq2seq_model/bert-base')

### 추론

In [43]:
import re
from transformers import pipeline, GenerationConfig

# Ban the 100 sentinel tokens (<extra_id_0> ... <extra_id_99>) from generated text.
# NOTE(review): this relies on each sentinel string encoding to the same id
# sequence the model would emit for it — TODO confirm against the tokenizer.
bad_words_ids = []
for i in range(100):
    bad_words_ids.append(tokenizer.encode(f"<extra_id_{i}>", add_special_tokens=False))

# Deterministic decoding preset: beam search with sampling switched off.
gen_cfg = GenerationConfig(
    do_sample=False,              # stability: no sampling
    num_beams=4,
    max_new_tokens=48,
    length_penalty=0.9,
    repetition_penalty=1.15,
    no_repeat_ngram_size=3,
    bad_words_ids=bad_words_ids,
    eos_token_id=tokenizer.eos_token_id,
)

# Text-to-text pipeline wrapping the in-memory fine-tuned TF model.
gen = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    framework="tf",
)

def enforce_no_nu(text, is_q=False):
    """Make sure a generated sentence ends in the slang ending 노/누.

    A sentence already counts as finished when it ends in 노 or 누, optionally
    followed by "?" and/or a run of bare jamo such as ㅎㅎ or ㄷㄷ (the shape of
    the training targets, e.g. "…같노ㅎㅎ", "…있노?ㅎㅎ"). Such text is returned
    untouched, so a generated "…노?" keeps its question mark and "…노ㅎㅎ" does
    not get a second 노 appended.

    Otherwise trailing ".", "!" and "?" are removed and the ending is appended.

    Args:
        text: Raw generated sentence.
        is_q: True when the source sentence was a question; the appended
            ending is then "노?" instead of "노".

    Returns:
        The stripped sentence, guaranteed to carry a 노/누 ending.
    """
    # 노/누 + optional "?" + optional run of compatibility jamo (ㄱ-ㅎ, ㅏ-ㅣ) at the end.
    slang_ending = r"[노누]\??[ㄱ-ㅎㅏ-ㅣ]*$"

    s = text.strip()
    # Check BEFORE removing punctuation so an existing "노?" is preserved.
    if re.search(slang_ending, s):
        return s

    s = re.sub(r"[.!?]+$", "", s)
    # e.g. "하노." -> "하노": already ends correctly once punctuation is gone.
    if re.search(slang_ending, s):
        return s
    return s + ("노?" if is_q else "노")

def slangify(sentence):
    """Convert one sentence to slang with the fine-tuned model.

    The sentence is prefixed with PREFIX, generated with the gen_cfg preset,
    and the first candidate is post-processed by enforce_no_nu.

    Args:
        sentence: Source sentence; a trailing "?" marks it as a question.

    Returns:
        The post-processed generated sentence.
    """
    question = sentence.strip().endswith("?")
    # return_full_text is deliberately not passed to the pipeline call.
    candidates = gen(PREFIX + sentence, generation_config=gen_cfg)
    generated = candidates[0]["generated_text"]
    return enforce_no_nu(generated, question)

# Smoke test: one question, one statement and one request.
for demo_sentence in (
    "회의 자료를 공유해 주시겠어요?",
    "오늘 날씨가 정말 좋습니다.",
    "내일까지 결과를 보내 주십시오.",
):
    print(slangify(demo_sentence))

Device set to use 0


...! 회의 자료를 공유해주시겠어요노?
....”오늘 날씨가 좋습니다노
ddi 결과를 내일 내일까지을 결과로 보내 주십시오노
