In [None]:
from google.colab import drive
drive.mount('/content/drive')
! pip install -U accelerate
! pip install -U transformers
! pip install kss
! pip install datasets

Mounted at /content/drive
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py

In [None]:
import math
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, PreTrainedTokenizerFast
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from kss import split_sentences
import itertools
import random

# 토크나이저와 모델 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
  bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")

# 데이터 읽기 함수
def read_tales(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        tales = f.read().strip().split('///')

    return [tale.strip() for tale in tales if tale.strip()]

# 데이터셋 클래스
class TaleDataset(Dataset):
    def __init__(self, tales, tokenizer, max_length=1024):
        self.tokenizer = tokenizer
        self.tales = tales
        self.max_length = max_length

    def __len__(self):
        return len(self.tales)

    def __getitem__(self, idx):
        tale = self.tales[idx]
        encoding = self.tokenizer(tale, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# 데이터 콜레이터
def data_collator(features):
    batch = tokenizer.pad(features, padding=True, return_tensors="pt")
    batch['labels'] = batch['input_ids'].clone()
    return batch

# Perplexity 계산 함수
def calculate_perplexity(model, tokenizer, tales, max_samples=100, batch_size=8, max_length=1024):
    model.eval()
    total_loss = 0.0
    total_length = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    random.shuffle(tales)
    tales = tales[:max_samples]

    with torch.no_grad():
        for i in range(0, len(tales), batch_size):
            batch = tales[i:i + batch_size]
            encodings = tokenizer(batch, return_tensors='pt', truncation=True, max_length=max_length, padding=True)
            input_ids = encodings.input_ids.to(device)
            attention_mask = encodings.attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            total_loss += loss.item() * torch.sum(attention_mask).item()
            total_length += torch.sum(attention_mask).item()

    if total_length == 0:
        print("Warning: No valid inputs found. Cannot calculate perplexity.")
        return float('inf')

    perplexity = math.exp(total_loss / total_length)
    return perplexity



# 동화 데이터 로드
tales = read_tales('/content/drive/MyDrive/Tale/processed_final.txt')
train_tales, val_tales = train_test_split(tales, test_size=0.1)

# 데이터셋 생성
train_dataset = TaleDataset(train_tales, tokenizer)
val_dataset = TaleDataset(val_tales, tokenizer)

# 초기 모델의 perplexity 계산
initial_model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
initial_model.config.pad_token_id = tokenizer.pad_token_id
initial_perplexity = calculate_perplexity(initial_model, tokenizer, val_tales)
print(f"Initial model perplexity: {initial_perplexity:.2f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

In [None]:
import math
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import csv
import random
import itertools
import os

# 토크나이저와 모델 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
  bos_token='</s>', eos_token='</s>', unk_token='<unk>',
  pad_token='<pad>', mask_token='<mask>')
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")

# 데이터 읽기 함수
def read_tales(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        tales = f.read().strip().split('///')

    return [tale.strip() for tale in tales if tale.strip()]

# 데이터셋 클래스
class TaleDataset(Dataset):
    def __init__(self, tales, tokenizer, max_length=1024):
        self.tokenizer = tokenizer
        self.tales = tales
        self.max_length = max_length

    def __len__(self):
        return len(self.tales)

    def __getitem__(self, idx):
        tale = self.tales[idx]
        encoding = self.tokenizer(tale, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# 데이터 콜레이터
def data_collator(features):
    batch = tokenizer.pad(features, padding=True, return_tensors="pt")
    batch['labels'] = batch['input_ids'].clone()
    return batch

# Perplexity 계산 함수
def calculate_perplexity(model, tokenizer, tales, max_samples=100, batch_size=8, max_length=1024):
    model.eval()
    total_loss = 0.0
    total_length = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    random.shuffle(tales)
    tales = tales[:max_samples]

    with torch.no_grad():
        for i in range(0, len(tales), batch_size):
            batch = tales[i:i + batch_size]
            encodings = tokenizer(batch, return_tensors='pt', truncation=True, max_length=max_length, padding=True)
            input_ids = encodings.input_ids.to(device)
            attention_mask = encodings.attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            total_loss += loss.item() * torch.sum(attention_mask).item()
            total_length += torch.sum(attention_mask).item()

    if total_length == 0:
        print("Warning: No valid inputs found. Cannot calculate perplexity.")
        return float('inf')

    perplexity = math.exp(total_loss / total_length)
    return perplexity

# 함수: 디렉토리가 존재하지 않을 경우 생성
def create_directory_if_not_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

# 동화 데이터 로드
tales = read_tales('/content/drive/MyDrive/Tale/processed_final.txt')
train_tales, val_tales = train_test_split(tales, test_size=0.1)

# 데이터셋 생성
train_dataset = TaleDataset(train_tales, tokenizer)
val_dataset = TaleDataset(val_tales, tokenizer)

# CSV 파일 경로
csv_file = '/content/drive/MyDrive/Tale/batch_learning3/hyperparameter_results.csv'

# CSV 파일에 헤더 추가
header = ['LR', 'Batch Size', 'Epochs', 'Weight Decay', 'Final Perplexity']
create_directory_if_not_exists(os.path.dirname(csv_file))
with open(csv_file, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header)

# 하이퍼파라미터 그리드 정의
param_grid = {
    'learning_rate': [1e-4, 3e-4, 5e-4],
    'per_device_train_batch_size': [4, 8, 16],
    'num_train_epochs': [5, 8, 10],
    'weight_decay': [0.01, 0.1]
}

# 각 하이퍼파라미터 조합에 대해 학습 및 평가
param_combinations = list(itertools.product(*param_grid.values()))

for i, params in enumerate(param_combinations):
    print(f"Training model {i+1}/{len(param_combinations)}")

    lr, batch_size, epochs, wd = params

    # Early Stopping 콜백 생성
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.01
    )

    training_args = TrainingArguments(
        output_dir=f"/content/drive/MyDrive/Tale/batch_learning3/results_{i}",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=500,
        weight_decay=wd,
        learning_rate=lr,
        logging_dir=f'/content/drive/MyDrive/Tale/batch_learning3/logs_{i}',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    # 모델 초기화
    model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
    model.config.pad_token_id = tokenizer.pad_token_id

    # Trainer 생성
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        callbacks=[early_stopping_callback]
    )

    # 학습 실행
    trainer.train()

    # 모델 저장
    trainer.save_model(f"/content/drive/MyDrive/Tale/batch_learning3/results_{i}/final_model")

    # 최종 모델 Perplexity 측정
    final_model = GPT2LMHeadModel.from_pretrained(f"/content/drive/MyDrive/Tale/batch_learning3/results_{i}/final_model")
    final_model.config.pad_token_id = tokenizer.pad_token_id
    final_perplexity = calculate_perplexity(final_model, tokenizer, val_tales)
    print(f"Final model perplexity: {final_perplexity:.2f}")

    # CSV 파일에 결과 추가
    row = [lr, batch_size, epochs, wd, final_perplexity]
    with open(csv_file, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(row)

    # 출력
    print(f"Parameters: LR={lr}, Batch Size={batch_size}, Epochs={epochs}, Weight Decay={wd}")
    print("--------------------")

# 최종 결과 출력
print(f"Results saved to {csv_file}")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Training model 1/54


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.5668,1.33004
2,1.5847,1.339452
3,1.3317,1.300502
4,1.1668,1.270458
5,0.9981,1.265322


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.01
Parameters: LR=0.0001, Batch Size=4, Epochs=5, Weight Decay=0.01
--------------------
Training model 2/54


Epoch,Training Loss,Validation Loss
1,1.5679,1.329557
2,1.5833,1.342974
3,1.3392,1.299825
4,1.166,1.269965
5,0.9977,1.264683


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.26
Parameters: LR=0.0001, Batch Size=4, Epochs=5, Weight Decay=0.1
--------------------
Training model 3/54


Epoch,Training Loss,Validation Loss
1,1.5668,1.33004
2,1.5847,1.339452
3,1.3561,1.311289
4,1.2273,1.294528
5,1.0614,1.289849
6,0.8028,1.291635
7,0.8682,1.2891


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.54
Parameters: LR=0.0001, Batch Size=4, Epochs=8, Weight Decay=0.01
--------------------
Training model 4/54


Epoch,Training Loss,Validation Loss
1,1.5679,1.329557
2,1.5833,1.342974
3,1.3653,1.312719
4,1.2288,1.298723
5,1.0681,1.289546
6,0.8057,1.289919
7,0.877,1.288056


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.31
Parameters: LR=0.0001, Batch Size=4, Epochs=8, Weight Decay=0.1
--------------------
Training model 5/54


Epoch,Training Loss,Validation Loss
1,1.5668,1.33004
2,1.5847,1.339452
3,1.3636,1.313945
4,1.242,1.302894
5,1.09,1.297428
6,0.8299,1.306193
7,0.894,1.304052


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.28
Parameters: LR=0.0001, Batch Size=4, Epochs=10, Weight Decay=0.01
--------------------
Training model 6/54


Epoch,Training Loss,Validation Loss
1,1.5679,1.329556
2,1.5833,1.342974
3,1.3705,1.315474
4,1.2461,1.306536
5,1.0891,1.298986
6,0.828,1.303101


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 3.94
Parameters: LR=0.0001, Batch Size=4, Epochs=10, Weight Decay=0.1
--------------------
Training model 7/54


Epoch,Training Loss,Validation Loss
1,1.3462,1.313619
2,1.6006,1.296941
3,1.4958,1.293639
4,1.3803,1.294921
5,1.0999,1.25489


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 3.86
Parameters: LR=0.0001, Batch Size=8, Epochs=5, Weight Decay=0.01
--------------------
Training model 8/54


Epoch,Training Loss,Validation Loss
1,1.3462,1.313618
2,1.6006,1.297901
3,1.4951,1.293494
4,1.3809,1.298967
5,1.1017,1.253799


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.08
Parameters: LR=0.0001, Batch Size=8, Epochs=5, Weight Decay=0.1
--------------------
Training model 9/54


Epoch,Training Loss,Validation Loss
1,1.3462,1.313619
2,1.6006,1.29694
3,1.4958,1.293639
4,1.3803,1.294921
5,1.1735,1.28834


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.28
Parameters: LR=0.0001, Batch Size=8, Epochs=8, Weight Decay=0.01
--------------------
Training model 10/54


Epoch,Training Loss,Validation Loss
1,1.3462,1.313618
2,1.6006,1.2979
3,1.4951,1.293494
4,1.3809,1.298967
5,1.1762,1.283507


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.25
Parameters: LR=0.0001, Batch Size=8, Epochs=8, Weight Decay=0.1
--------------------
Training model 11/54


Epoch,Training Loss,Validation Loss
1,1.3462,1.313618
2,1.6006,1.296941
3,1.4958,1.293639
4,1.3803,1.294921
5,1.1837,1.29328


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.30
Parameters: LR=0.0001, Batch Size=8, Epochs=10, Weight Decay=0.01
--------------------
Training model 12/54


Epoch,Training Loss,Validation Loss
1,1.3462,1.313619
2,1.6006,1.2979
3,1.4951,1.293494
4,1.3809,1.298966
5,1.1854,1.288423


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final model perplexity: 4.13
Parameters: LR=0.0001, Batch Size=8, Epochs=10, Weight Decay=0.1
--------------------
Training model 13/54


OutOfMemoryError: CUDA out of memory. Tried to allocate 144.00 MiB. GPU 

In [None]:
initial_model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
initial_model.config.pad_token_id = tokenizer.pad_token_id
initial_perplexity = calculate_perplexity(initial_model, tokenizer, val_tales)
print(f"Initial model perplexity: {initial_perplexity:.2f}")