In [1]:
from google.colab import drive
drive.mount('/content/drive')
! pip install -U accelerate
! pip install -U transformers
! pip install kss
! pip install datasets

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvid

In [10]:
import math
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, PreTrainedTokenizerFast
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from kss import split_sentences
import itertools
import random

# 토크나이저와 모델 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
  bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")

# 데이터 읽기 함수
def read_tales(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        tales = f.read().strip().split('///')

    return [tale.strip() for tale in tales if tale.strip()]

# 데이터셋 클래스
class TaleDataset(Dataset):
    def __init__(self, tales, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.tales = tales
        self.max_length = max_length

    def __len__(self):
        return len(self.tales)

    def __getitem__(self, idx):
        tale = self.tales[idx]
        encoding = self.tokenizer(tale, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# 데이터 콜레이터
def data_collator(features):
    batch = tokenizer.pad(features, padding=True, return_tensors="pt")
    batch['labels'] = batch['input_ids'].clone()
    return batch

# Perplexity 계산 함수
def calculate_perplexity(model, tokenizer, tales, max_samples=100, batch_size=8, max_length=512):
    model.eval()
    total_loss = 0.0
    total_length = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    random.shuffle(tales)
    tales = tales[:max_samples]

    with torch.no_grad():
        for i in range(0, len(tales), batch_size):
            batch = tales[i:i + batch_size]
            encodings = tokenizer(batch, return_tensors='pt', truncation=True, max_length=max_length, padding=True)
            input_ids = encodings.input_ids.to(device)
            attention_mask = encodings.attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            total_loss += loss.item() * torch.sum(attention_mask).item()
            total_length += torch.sum(attention_mask).item()

    if total_length == 0:
        print("Warning: No valid inputs found. Cannot calculate perplexity.")
        return float('inf')

    perplexity = math.exp(total_loss / total_length)
    return perplexity



# 동화 데이터 로드
tales = read_tales('/content/drive/MyDrive/Tale/processed_final.txt')
train_tales, val_tales = train_test_split(tales, test_size=0.1)

# 데이터셋 생성
train_dataset = TaleDataset(train_tales, tokenizer)
val_dataset = TaleDataset(val_tales, tokenizer)

# 초기 모델의 perplexity 계산
initial_model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
initial_model.config.pad_token_id = tokenizer.pad_token_id
initial_perplexity = calculate_perplexity(initial_model, tokenizer, val_tales)
print(f"Initial model perplexity: {initial_perplexity:.2f}")


# 하이퍼파라미터 그리드 정의
param_grid = {
    'learning_rate': [1e-4, 3e-4, 1e-5],
    'per_device_train_batch_size': [8],
    'num_train_epochs': [5, 10],
    'weight_decay': [0.01, 0.1]
}

# 모든 하이퍼파라미터 조합 생성
param_combinations = list(itertools.product(*param_grid.values()))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Initial model perplexity: 7564.67


In [14]:
import csv
import torch
import os

best_perplexity = float('inf')
best_params = None

output_dir = '/content/drive/MyDrive/Tale/max_length512'

os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, 'results.csv'), 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["LR", "Batch Size", "Epochs", "Weight Decay", "Perplexity", "Improvement"])

    # 각 하이퍼파라미터 조합에 대해 학습 및 평가
    for i, params in enumerate(param_combinations):
        try:
            print(f"Training model {i+1}/{len(param_combinations)}")

            # GPU 메모리 정리
            torch.cuda.empty_cache()

            lr, batch_size, epochs, wd = params

            # Early Stopping 콜백 생성
            early_stopping_callback = EarlyStoppingCallback(
                early_stopping_patience=3,
                early_stopping_threshold=0.01
            )

            # 학습 인자 설정
            training_args = TrainingArguments(
                output_dir=f"/content/drive/MyDrive/Tale/max_length512/results_{i}",
                num_train_epochs=epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                warmup_steps=500,
                weight_decay=wd,
                learning_rate=lr,
                logging_dir=f'/content/drive/MyDrive/Tale/max_length512/logs_{i}',
                logging_steps=10,
                eval_strategy="epoch",
                save_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
            )

            # 모델 초기화
            model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
            model.config.pad_token_id = tokenizer.pad_token_id

            # Trainer 생성
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                data_collator=data_collator,
                callbacks=[early_stopping_callback]
            )

            # 학습 실행
            trainer.train()

            # 모델 저장
            trainer.save_model(f"/content/drive/MyDrive/Tale/max_length512/results_{i}/final_model")

            # 최종 모델 Perplexity 측정
            final_model = GPT2LMHeadModel.from_pretrained(f"/content/drive/MyDrive/Tale/max_length512/results_{i}/final_model")
            final_model.to('cpu')  # CPU로 모델 이동
            final_model.config.pad_token_id = tokenizer.pad_token_id
            final_perplexity = calculate_perplexity(final_model, tokenizer, val_tales)
            print(f"Final model perplexity: {final_perplexity:.2f}")

            # Perplexity 개선율 계산
            improvement = (initial_perplexity - final_perplexity) / initial_perplexity * 100
            print(f"Perplexity improvement: {improvement:.2f}%")

            # CSV에 결과 쓰기
            writer.writerow([lr, batch_size, epochs, wd, final_perplexity, improvement])
            file.flush()  # 즉시 파일에 쓰기

            # 중간 결과 저장
            if final_perplexity < best_perplexity:
                best_perplexity = final_perplexity
                best_params = params
                torch.save({
                    'best_params': best_params,
                    'best_perplexity': best_perplexity,
                    'model_state_dict': final_model.state_dict()
                }, '/content/drive/MyDrive/Tale/max_length512/best_model_checkpoint.pth')

            print(f"Parameters: LR={lr}, Batch Size={batch_size}, Epochs={epochs}, Weight Decay={wd}")
            print("--------------------")

        except Exception as e:
            print(f"Error occurred during training: {str(e)}")
            continue  # 다음 실험으로 계속 진행

print(f"Best parameters: LR={best_params[0]}, Batch Size={best_params[1]}, Epochs={best_params[2]}, Weight Decay={best_params[3]}")
print(f"Best perplexity: {best_perplexity:.2f}")

Training model 1/12


Epoch,Training Loss,Validation Loss
1,1.8657,1.93795
2,1.8844,1.896484
3,1.8641,1.897948
4,1.6205,1.89667
5,1.4366,1.84276


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 7.21
Perplexity improvement: 99.90%
Parameters: LR=0.0001, Batch Size=8, Epochs=5, Weight Decay=0.01
--------------------
Training model 2/12


Epoch,Training Loss,Validation Loss
1,1.8657,1.937918
2,1.8905,1.896007
3,1.8626,1.894209
4,1.6198,1.897114
5,1.4338,1.84029


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 6.56
Perplexity improvement: 99.91%
Parameters: LR=0.0001, Batch Size=8, Epochs=5, Weight Decay=0.1
--------------------
Training model 3/12


Epoch,Training Loss,Validation Loss
1,1.8657,1.93795
2,1.8844,1.896484
3,1.8641,1.897948
4,1.6205,1.89667
5,1.5609,1.889005


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 7.12
Perplexity improvement: 99.91%
Parameters: LR=0.0001, Batch Size=8, Epochs=10, Weight Decay=0.01
--------------------
Training model 4/12


Epoch,Training Loss,Validation Loss
1,1.8657,1.937918
2,1.8905,1.896006
3,1.8626,1.894209
4,1.6198,1.897114
5,1.5651,1.882553
6,1.2021,1.867802
7,0.8698,1.871969
8,0.7732,1.872391
9,0.6402,1.872451


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 6.47
Perplexity improvement: 99.91%
Parameters: LR=0.0001, Batch Size=8, Epochs=10, Weight Decay=0.1
--------------------
Training model 5/12


Epoch,Training Loss,Validation Loss
1,1.885,1.959723
2,1.9235,1.98285
3,1.9138,2.036334
4,1.6601,2.090192


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 7.56
Perplexity improvement: 99.90%
Parameters: LR=0.0003, Batch Size=8, Epochs=5, Weight Decay=0.01
--------------------
Training model 6/12


Epoch,Training Loss,Validation Loss
1,1.8847,1.961082
2,1.9176,1.982085
3,1.9144,2.044383
4,1.6669,2.09259


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 7.09
Perplexity improvement: 99.91%
Parameters: LR=0.0003, Batch Size=8, Epochs=5, Weight Decay=0.1
--------------------
Training model 7/12


Epoch,Training Loss,Validation Loss
1,1.885,1.959724
2,1.9235,1.98285
3,1.9138,2.036334
4,1.6601,2.090192


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 7.64
Perplexity improvement: 99.90%
Parameters: LR=0.0003, Batch Size=8, Epochs=10, Weight Decay=0.01
--------------------
Training model 8/12


Epoch,Training Loss,Validation Loss
1,1.8847,1.961082
2,1.9176,1.982085
3,1.9144,2.044383
4,1.6669,2.09259


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 7.99
Perplexity improvement: 99.89%
Parameters: LR=0.0003, Batch Size=8, Epochs=10, Weight Decay=0.1
--------------------
Training model 9/12


Epoch,Training Loss,Validation Loss
1,2.0217,2.039129
2,1.9998,1.939631
3,2.0198,1.905551
4,1.8705,1.885407
5,1.9789,1.866644


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 6.89
Perplexity improvement: 99.91%
Parameters: LR=1e-05, Batch Size=8, Epochs=5, Weight Decay=0.01
--------------------
Training model 10/12


Epoch,Training Loss,Validation Loss
1,2.0217,2.039127
2,1.9998,1.93963
3,2.0198,1.905546
4,1.8704,1.885396
5,1.9789,1.866627


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 6.50
Perplexity improvement: 99.91%
Parameters: LR=1e-05, Batch Size=8, Epochs=5, Weight Decay=0.1
--------------------
Training model 11/12


Epoch,Training Loss,Validation Loss
1,2.0217,2.039129
2,1.9998,1.939631
3,2.0198,1.905551
4,1.8705,1.885407
5,1.9875,1.862615
6,1.7841,1.855447
7,1.5965,1.845306
8,1.679,1.844709
9,1.6783,1.84145
10,1.7623,1.841015


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 6.91
Perplexity improvement: 99.91%
Parameters: LR=1e-05, Batch Size=8, Epochs=10, Weight Decay=0.01
--------------------
Training model 12/12


Epoch,Training Loss,Validation Loss
1,2.0217,2.039127
2,1.9998,1.939629
3,2.0198,1.905546
4,1.8704,1.885397
5,1.9875,1.8626
6,1.7841,1.855421
7,1.5965,1.845273
8,1.679,1.844665
9,1.6783,1.8414
10,1.7622,1.84096


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 6.27
Perplexity improvement: 99.92%
Parameters: LR=1e-05, Batch Size=8, Epochs=10, Weight Decay=0.1
--------------------
Best parameters: LR=1e-05, Batch Size=8, Epochs=10, Weight Decay=0.1
Best perplexity: 6.27


In [None]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
  bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')

def generate_text(model_path, prompt, max_new_tokens=50):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.config.pad_token_id = tokenizer.pad_token_id

    input_ids = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False)

    output = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=None,
    )

    generated_text = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)

    return generated_text.strip()

prompt = "고래 백경이는 "

for i in range(1, 37):
    model_path = f"/content/drive/MyDrive/Tale/batch_learning4/results_{i}/final_model"
    generated_text = generate_text(model_path, prompt)
    print(generated_text)
    print("-" * 50)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


덫에 걸렸다  가까스로 탈출했어요. 는 이 이야기를 하며 자신을 뽐내고 있었죠.  의지할 수 있는 녀석에게 호의를 베풀어준 친구가 바로 공주님이셨답니다.
--------------------------------------------------
펭귄이 자신의 친구 냥 짹짹거리며 톡 웃고 있었어요.  "펭! 뭔가 쿵, 윙,"라며 친구가 말했어요, "아!"  "나는
--------------------------------------------------
펭귄의 아름다운 깃털을 자랑했다.  그 모습을 보고 한 녀석들은 엄청 놀랐다.  뭔가 멋진 봤다. 
그래서 넌 왜 그래,? 괜찮은
--------------------------------------------------
펭귄의 아름다운 그림자에 반해 삐딱삐걱 딱 톡 웃지 않는 녀석.  어느 날 땡볕을 받으며 뽀얀 꼬리를 가진 깡충이는 자신의 얌전히 꼬리를 밟고
--------------------------------------------------
펭귄에 의해 꼼짝없이 덫에 걸려버리고 말았어요.  그 바람에 땡쥐와  톡이 날아갔어요! "!" ck 이 말은
--------------------------------------------------
덫에 걸려 가까스로 탈출했습니다.  하지만 그 과정에서 꼬리를 잃고 말았습니다..  황소들이 무거운 짐을 짊어지고 있음에도 탈출구가 막힌 게 백 경장의 발걸음을 멈추게 하곤 빈손으로 돌아가
--------------------------------------------------
덫에 걸린 상태였어요.  그래서 나뭇가지 아래에 구멍을 파서 매일 같이 보려고 했어요   그래서 백 경이는 물었어요, "왜 거기서 잤어?" 
"응, 네
--------------------------------------------------
펭귄이 좋아해서, 자신이 좋아하는 나무를 이용해 자신의 몸을 보호할 수 있는 친구로 자랐어요.  친구는 그런 친구의 은혜를 모르는지 종종 자신의 운을 한

OSError: Incorrect path_or_model_id: '/content/drive/MyDrive/Tale/batch_learning/results_36/final_model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.