In [1]:
! pip install -U accelerate
! pip install -U transformers

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m307.2/309.4 kB[0m [31m11.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! pip install kss
! pip install datasets

Collecting kss
  Downloading kss-6.0.4.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji==1.2.0 (from kss)
  Downloading emoji-1.2.0-py3-none-any.whl (131 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.3/131.3 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pecab (from kss)
  Downloading pecab-1.0.8.tar.gz (26.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.4/26.4 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jamo (from kss)
  Downloading jamo-0.4.1-py3-none-any.whl (9.5 kB)
Collecting hangul-jamo (from kss)
  Downloading hangul_jamo-1.0.1-py3-none-any.whl (4.4 kB)
Collecting tossi (from kss)
  Downloading tossi-0.3.1.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecti

In [22]:
import math
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, PreTrainedTokenizerFast
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from kss import split_sentences
import random

# 토크나이저와 모델 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
  bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")

# 데이터 읽기 함수
def read_tales(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        tales = f.read().strip().split('///')

    return [tale.strip() for tale in tales if tale.strip()]

# 데이터셋 클래스
class TaleDataset(Dataset):
    def __init__(self, tales, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.tales = tales
        self.max_length = max_length

    def __len__(self):
        return len(self.tales)

    def __getitem__(self, idx):
        tale = self.tales[idx]
        encoding = self.tokenizer(tale, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# 데이터 콜레이터
def data_collator(features):
    batch = tokenizer.pad(features, padding=True, return_tensors="pt")
    batch['labels'] = batch['input_ids'].clone()
    return batch

# Perplexity 계산 함수
def calculate_perplexity(model, tokenizer, tales, max_samples=100, batch_size=16, max_length=512):
    model.eval()
    total_loss = 0.0
    total_length = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    random.shuffle(tales)
    tales = tales[:max_samples]

    with torch.no_grad():
        for i in range(0, len(tales), batch_size):
            batch = tales[i:i + batch_size]
            encodings = tokenizer(batch, return_tensors='pt', truncation=True, max_length=max_length, padding=True)
            input_ids = encodings.input_ids.to(device)
            attention_mask = encodings.attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            total_loss += loss.item() * torch.sum(attention_mask).item()
            total_length += torch.sum(attention_mask).item()

    if total_length == 0:
        print("Warning: No valid inputs found. Cannot calculate perplexity.")
        return float('inf')

    perplexity = math.exp(total_loss / total_length)
    return perplexity

# 동화 데이터 로드
tales = read_tales('/content/drive/MyDrive/Tale/processed_final.txt')
train_tales, val_tales = train_test_split(tales, test_size=0.2)

# 데이터셋 생성
train_dataset = TaleDataset(train_tales, tokenizer)
val_dataset = TaleDataset(val_tales, tokenizer)

# 초기 모델의 perplexity 계산
initial_model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
initial_model.config.pad_token_id = tokenizer.pad_token_id
initial_perplexity = calculate_perplexity(initial_model, tokenizer, val_tales)
print(f"Initial model perplexity: {initial_perplexity:.2f}")

# 하이퍼파라미터 설정
learning_rate = 3e-4
batch_size = 16
num_epochs = 10
weight_decay = 0.01

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Initial model perplexity: 28133.82


In [23]:
# Early Stopping 콜백 생성
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

# 학습 인자 설정
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Tale/single_model/results",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=weight_decay,
    learning_rate=learning_rate,
    logging_dir='/content/drive/MyDrive/Tale/single_model/logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=True,
)

# 모델 초기화
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
model.config.pad_token_id = tokenizer.pad_token_id

# Trainer 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    # callbacks=[early_stopping_callback]
)

# 학습 실행
trainer.train()

# 모델 저장
trainer.save_model("/content/drive/MyDrive/Tale/single_model/final_model")

# 최종 모델 Perplexity 측정
final_model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/Tale/single_model/final_model")
final_model.config.pad_token_id = tokenizer.pad_token_id
final_perplexity = calculate_perplexity(final_model, tokenizer, val_tales)
print(f"Final model perplexity: {final_perplexity:.2f}")


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.0731,1.917115
2,1.8427,1.86947
3,1.7597,1.863343
4,1.6812,1.881293
5,1.5837,1.899186
6,1.4262,1.938931
7,1.185,1.989734
8,1.0974,2.032285
9,0.8617,2.082347
10,0.5198,2.013543


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 6.51


In [25]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
  bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')

def generate_text(model_path, prompt, max_new_tokens=100):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.config.pad_token_id = tokenizer.pad_token_id

    input_ids = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False)

    output = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=None,
    )

    generated_text = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)

    return generated_text.strip()

prompt = "옛날 옛날에 백경이라는 고래가 살고 있었어요. 백경이는 물고기, 돌고래라는 친구들과 함께 살고 있었답니다. 그러던"


model_path = f"/content/drive/MyDrive/Tale/single_model/final_model"
generated_text = generate_text(model_path, prompt)
print(generated_text)
print("-" * 50)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


어느 날 고래는 자신의 운명을 한탄하며 자신의 운명에 대한 회의를 시작했어요!  뻐꾸기 한 마리가 물 위에 떠다니기 시작하더니 굴러 떨어지고 말았어요, 하지만 굴 속 굴 속에는 가시가 가득했답니다!! 고래의 운명은 너무도 강했죠. 苦 맙소사, 괜찮겠지요?  눌려 있던 굴 안에 가시가 하나 남은 것 같기도 했고, 그냥 너무 아파서 너무 슬프기도 했죠
--------------------------------------------------


In [27]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
  bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')

def generate_text(model_path, prompt, max_length=200, max_new_tokens=100):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.config.pad_token_id = tokenizer.pad_token_id

    input_ids = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False)

    output = model.generate(
        input_ids,
        max_length=max_length + len(input_ids[0]),
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=None,
    )

    generated_text = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)

    return generated_text.strip()

# Initial prompt
prompt = "옛날 옛날에 백경이라는 고래가 살고 있었어요. 백경이는 물고기, 돌고래라는 친구들과 함께 살고 있었답니다. 그러던"

model_path = "/content/drive/MyDrive/Tale/single_model/final_model"

# Generate initial text
generated_texts = []
generated_text = generate_text(model_path, prompt, max_length=500)
generated_texts.append(generated_text)

# Number of continuations you want to generate
num_continuations = 5

# Generate further text by using the previous output as input
for _ in range(num_continuations):
    prompt = prompt + generated_text
    generated_text = generate_text(model_path, prompt, max_length=500)
    generated_texts.append(generated_text)

# Print all generated texts
final_text = "\n".join(generated_texts)
print(final_text)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Both `max_new_tokens` (=100) and `max_length`(=526) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=626) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=724) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/tran

어느 날 돌고래는 울음을 터뜨렸어요!  퍽 소리가 들렸지요. 괜찮아 보였어요, 그런데 어느 날, 갑자기 백경이 큰 소리로 울어보았어!" 쿵, 쾅, 울컥하는 소리와 함께 불퉁한 바다 속으로 가라앉았지요, 끙, 으아아악! 고래는 너무 겁이 나서 그 불쌍한 돌고래를 일으켜 세우며 말했지요 控 윙! "오, 이런!
저는 정말 무서운 고래입니다.  뽈뽈, 나는 너를 너무도 사랑해서 저도 정말 울고 싶어요.. 넌 너무 아파서요."하고 생각했어요
하지만 펭귄은 씩씩하게 말하더니 뽀드득 고개를 끄덕이며 고개를 내저었지요
"그럼 난 정말 너에게 아무것도 해줄 수 없으니까. 뭔가 해줘 봐, 친구. 난 너의 목숨을 구할 수 있는 유일한 방법은 너와 함께
하는 것뿐이라고."  꾹 참을 수 없는 불쾌감 때문에 꽥꽥거리는 고래를 뻐꾸기며 백경은 잽싸게 돌고래의 어깨를 두드렸답어요,
, 웃고 싶었지만  쩝. "뭐, 뭐, 그런! 내 생명을 구할 수만 있다면 난 절대 너희들을 구할 수가 없어."라며 고래도 동의했답지요,
 뻘! 나는 정말 착한 고래야!! 너는 정말 고마워
! 내가 정말 사랑하는 친구야, 친구! 친구야. 하지만 뭘 해줬더라? , 너 정말 미안해! 친애하는 뼛속까지 사랑하니까! 그게 다야? 내가 널 얼마나 사랑하고 아끼는지 몰라! 고맙고 고만고만하단 말이야."하고 말끝에 킥킥거리는 백경의 말에 뚱뚱한 녀석들이 달려들어 쫄쫄 굶고 말았답죠.,
불끈! 불지르고 캑캑거리며 불 속으로 사라졌지. ;  삐끼오 킁, 이건 정말 짜증이 나네요? 왠지 뻑뻑한 느낌은 싫지만 낄낄거리는 찡그린 표정으로 쌩 뛸 수 밖에 없거든요., 참 좋기도 하다! 으, 좋은 생각이네! 너도 참 재미있으니까, 너도 다치지 않도록 조심해." 하고 말하곤 껄
껄 웃으며 쫑긋 웃어넘었답답니다,  톡! 하고 말하는 꿍꿍! 하는 땡전어이 울먹이는 고래의 목소리와 낑낑거리는 울음소리가  텄어도, 쌘뽈 쩌렁거리는 소리에 다정하게 손을 흔들며 웃는 고래.!하고 웃으시는 캥거루 울음소리도 끄러워지는 씰룩거리지 마세요.- 씽! 하면