# 모델학습

In [1]:
import os
import pandas as pd
import numpy as np

df = pd.read_csv("jeju_train_similarity.csv")
df.head()

Unnamed: 0,standard_form,dialect_form,similarity
0,그러고 디스플레이 부족하댄허난하다고 하니까 사람을 구해준 것 아닙니까,겅허고 디스플레이 부족하댄허난하다고 하니까 사람을 구해준 것 아니꽈,0.855086
1,너 종합병원이야 갑자기 다리꼬지마,너 종합병원 갑자기 다리꼬지마,0.835181
2,그러니까 요즘에 오는 관광객들,게난 요즘에 오는 관광객들,0.734492
3,그렇게 하니까 한번 해볼까 해서 그냥 호기심으로 했지,경 하니까 한번 해보카 해연에 그냥 호기심으로 했주게,0.725456
4,오들락오들락 잘 나 정말 곱게 난다 잡초 하나 없이,오들락오들락 잘 나메 정말 곱게 난다 검질 호나 어시,0.880031


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 893111 entries, 0 to 893110
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   standard_form  893107 non-null  object 
 1   dialect_form   893110 non-null  object 
 2   similarity     893111 non-null  float64
dtypes: float64(1), object(2)
memory usage: 20.4+ MB


In [3]:
df.isnull().sum()

standard_form    4
dialect_form     1
similarity       0
dtype: int64

In [4]:
df = df.dropna(axis=0)
df.head()

Unnamed: 0,standard_form,dialect_form,similarity
0,그러고 디스플레이 부족하댄허난하다고 하니까 사람을 구해준 것 아닙니까,겅허고 디스플레이 부족하댄허난하다고 하니까 사람을 구해준 것 아니꽈,0.855086
1,너 종합병원이야 갑자기 다리꼬지마,너 종합병원 갑자기 다리꼬지마,0.835181
2,그러니까 요즘에 오는 관광객들,게난 요즘에 오는 관광객들,0.734492
3,그렇게 하니까 한번 해볼까 해서 그냥 호기심으로 했지,경 하니까 한번 해보카 해연에 그냥 호기심으로 했주게,0.725456
4,오들락오들락 잘 나 정말 곱게 난다 잡초 하나 없이,오들락오들락 잘 나메 정말 곱게 난다 검질 호나 어시,0.880031


In [5]:
import sentencepiece as spm
from tokenizers import Tokenizer, normalizers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing
from datasets import Dataset
from transformers import PreTrainedTokenizerFast, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments




In [6]:
from transformers import PreTrainedTokenizerFast, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, BertTokenizer, AutoTokenizer
# Define custom tokenizer class
class CustomSPTokenizer(PreTrainedTokenizerFast):
    def __init__(self, tokenizer_path):
        super().__init__(tokenizer_file=tokenizer_path)
        self._tokenizer = Tokenizer.from_file(tokenizer_path)

    def _tokenize(self, text):
        return self._tokenizer.encode(text).tokens

    def _convert_token_to_id(self, token):
        return self._tokenizer.token_to_id(token)

    def _convert_id_to_token(self, index):
        return self._tokenizer.id_to_token(index)

    def convert_tokens_to_string(self, tokens):
        return self._tokenizer.decode(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep + [self.sep_token_id]

    @property
    def cls_token_id(self):
        return self._tokenizer.token_to_id("<cls>")

    @property
    def sep_token_id(self):
        return self._tokenizer.token_to_id("<sep>")

    @property
    def pad_token_id(self):
        return self._tokenizer.token_to_id("<pad>")

    @property
    def unk_token_id(self):
        return self._tokenizer.token_to_id("<unk>")

# Initialize the custom tokenizer
tokenizer_path = "bpe_tokenizer.json"
custom_tokenizer = CustomSPTokenizer(tokenizer_path)
custom_tokenizer.add_special_tokens({'pad_token': '<pad>'})

0

In [7]:
model = AutoModelForSeq2SeqLM.from_pretrained("gogamza/kobart-base-v2")

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [8]:
model.resize_token_embeddings(len(custom_tokenizer))

Embedding(16000, 768)

In [9]:
from sklearn.model_selection import train_test_split

# Preprocess the data
def preprocess_function(examples):
    inputs = [ex for ex in examples['standard_form']]
    targets = [ex for ex in examples['dialect_form']]
    # inputs = [ex for ex in examples['dialect_form']]
    # targets = [ex for ex in examples['standard_form']]
    model_inputs = custom_tokenizer(inputs, max_length=100, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with custom_tokenizer.as_target_tokenizer():
        labels = custom_tokenizer(targets, max_length=100, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)   
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.map(preprocess_function,batched=True)

val_dataset = Dataset.from_pandas(val_df)
val_dataset = val_dataset.map(preprocess_function,batched=True)

# Filter out unwanted columns
train_dataset = train_dataset.remove_columns(['standard_form', 'dialect_form', '__index_level_0__', 'token_type_ids','similarity'])
val_dataset = val_dataset.remove_columns(['standard_form', 'dialect_form', '__index_level_0__', 'token_type_ids','similarity'])
print("train_dataset의 column : ",train_dataset.column_names)
print("val_dataset의 column : ",val_dataset.column_names)

Map:   0%|          | 0/714484 [00:00<?, ? examples/s]



Map:   0%|          | 0/178622 [00:00<?, ? examples/s]

train_dataset의 column :  ['input_ids', 'attention_mask', 'labels']
val_dataset의 column :  ['input_ids', 'attention_mask', 'labels']


In [10]:
import torch 
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [11]:
# Load the BART model
# model = AutoModelForSeq2SeqLM.from_pretrained("gogamza/kobart-base-v2")
model.to(device)

# 로그 디렉토리 경로 설정
log_dir = r"C:\Users\user\Desktop\logs"
os.makedirs(log_dir, exist_ok=True)


training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # 'epoch' 대신 'steps'로 설정하여 주기적으로 평가
    eval_steps=500,  # 평가 주기를 500 스텝으로 설정 BART는 5000주기
    save_steps=1000,  # 체크포인트 저장 주기를 1000 스텝으로 설정
    learning_rate=2e-5,  # 일반적으로 2e-5와 같은 낮은 학습률이 안정적
    per_device_train_batch_size=32,  # GPU 메모리와 성능에 따라 조정  Bart = 16. KoBART = 32
    per_device_eval_batch_size=32,  # 평가 시에도 배치 크기를 맞춤  Bart = 16. KoBART = 32
    num_train_epochs=3,  # 기본 에포크 수
    weight_decay=0.01,  # 가중치 감쇠
    logging_dir=log_dir,  # 로그 디렉토리
    logging_steps=100,  # 로그 주기를 100 스텝으로 설정  Bart = 500. KoBART = 100
    save_total_limit=2,  # 저장할 체크포인트 수 제한
    load_best_model_at_end=True,  # 학습이 끝난 후 가장 좋은 모델을 로드
    gradient_accumulation_steps=2,  # 그래디언트 누적을 통해 배치 크기 증가 효과
)


# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=custom_tokenizer,
    eval_dataset=val_dataset
)


In [None]:
trainer.train()

# 모델 저장
model.save_pretrained("./kobart_train")
# model.save_pretrained("./translated_model_bart100_sort_dialect_to_standard")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss
500,0.2604,0.233933
1000,0.2032,0.198892
1500,0.1807,0.176541
2000,0.1648,0.160974
2500,0.1497,0.152923
3000,0.1456,0.144607
3500,0.1402,0.136957
4000,0.1357,0.131497
4500,0.1301,0.127183
5000,0.1283,0.122712


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-defa

In [12]:
# 학습 시작 또는 재개
trainer.train(resume_from_checkpoint=r'C:\Users\user\Desktop\jeju\results\checkpoint-23000')

# 모델 저장
model.save_pretrained("./kobart_train")

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss
23500,0.0819,0.082444
24000,0.0824,0.081749
24500,0.0828,0.081902
25000,0.0801,0.081596
25500,0.0808,0.081022
26000,0.0814,0.081346
26500,0.0789,0.081147
27000,0.0804,0.080592
27500,0.0817,0.080414
28000,0.0804,0.08036


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
Non-default generation parameters: {'forced_eos_token_id': 1}
