In [1]:
!pip uninstall pyarrow datasets
!pip install --no-cache-dir pyarrow datasets

Found existing installation: pyarrow 17.0.0
Uninstalling pyarrow-17.0.0:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/benchmarks/*
    /usr/local/lib/python3.10/dist-packages/cmake_modules/AWSSDKVariables.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/BuildUtils.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/DefineOptions.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindAWSSDKAlt.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindAzure.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindBrotliAlt.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindClangTools.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindGTestAlt.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindInferTools.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindLLVMAlt.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindOpenSSLAlt.cmake
 

In [2]:
from google.colab import drive

# Mount Google Drive so the project folder becomes reachable from the Colab VM.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd drive/MyDrive/KUBIG\ Contest
!pwd

/content/drive/MyDrive/KUBIG Contest
/content/drive/MyDrive/KUBIG Contest


In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import utils
from utils.dataloader import load_and_preprocess_df
import yaml
from transformers.optimization import AdamW
import datetime

def lets_train(first_train=False, checkpoint=None, tokenizer_dir=None):
    """Train the GPT-2 language model, or resume training from a checkpoint, then save it.

    Hyperparameters and output locations are read from config.yaml. After training,
    the model and tokenizer are written to a new timestamped directory under
    "/content/drive/MyDrive/KUBIG Contest/final".

    Args:
        first_train: Truthy for the very first run. The tokenizer and model are then
            initialized from config['base_model'] and the special tokens are added.
            Falsy to resume from ``checkpoint``.
        checkpoint: Path of the checkpoint directory to resume from. Required when
            ``first_train`` is falsy; ignored otherwise.
        tokenizer_dir: Directory of a previously saved tokenizer, used only when
            resuming. Defaults to the directory this notebook has used so far.

    Raises:
        ValueError: If ``first_train`` is falsy and no ``checkpoint`` is given.
    """
    # Fail fast with a clear message instead of an obscure error from
    # from_pretrained(None) after the config has already been loaded.
    if not first_train and not checkpoint:
        raise ValueError(
            "A checkpoint path is required when first_train is False "
            "(pass checkpoint=... or set first_train=True)."
        )

    if tokenizer_dir is None:
        tokenizer_dir = "/content/drive/MyDrive/KUBIG Contest/final/final_20240819_161802"

    # Load config.yaml into `config`.
    config_path = "/content/drive/MyDrive/KUBIG Contest/config.yaml"
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)

    if first_train:  # First run: initialize tokenizer and model from the base model.
        print('This is the first training of your model...')
        tokenizer = PreTrainedTokenizerFast.from_pretrained(config['base_model'])
        tokenizer.add_special_tokens({'pad_token': '[PAD]',
                                    'sep_token': '</s>',
                                    'eos_token': '<|endoftext|>'})
        model = GPT2LMHeadModel.from_pretrained(config['base_model'])
        # Resize the embedding matrix to match the tokenizer's (possibly grown) vocabulary.
        model.resize_token_embeddings(len(tokenizer))

    else:  # Later runs: load the saved model and tokenizer.
        print("You've already trained your model. Saved model and tokenizer will be loaded...")
        # The model weights come from the checkpoint directory.
        model = GPT2LMHeadModel.from_pretrained(checkpoint)
        # The tokenizer comes from a previously saved "final" directory.
        tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)

    # Truncate from the left in BOTH paths so resumed runs preprocess the data the
    # same way as the first run. NOTE(review): this attribute does not appear to be
    # reliably restored from a saved tokenizer, hence it is set explicitly here.
    tokenizer.truncation_side = 'left'

    # Build the TrainingArguments from the config file.
    training_args = TrainingArguments(
        output_dir = config['output_dir'],
        overwrite_output_dir = config['overwrite_output_dir'],
        logging_dir = config['logging_dir'],
        logging_strategy = config['logging_strategy'],
        logging_steps = config['logging_steps'],
        fp16 = config['fp16'],
        save_strategy = config['save_strategy'],
        save_steps = config['save_steps'],
        save_total_limit = config['save_total_limit'],
        num_train_epochs = config['num_train_epochs'],
        per_device_train_batch_size = config['per_device_train_batch_size'],
        per_device_eval_batch_size = config['per_device_eval_batch_size'],
        learning_rate = float(config['learning_rate']),
        weight_decay = float(config['weight_decay']),
        warmup_ratio = float(config['warmup_ratio']),
        seed = config['seed'],
    )

    # Load the tokenized train/validation datasets.
    tokenized_train_dataset, tokenized_valid_dataset = load_and_preprocess_df(tokenizer)

    # Batch builder for plain (causal) language modeling: mlm=False means no tokens are masked.
    data_collator = DataCollatorForLanguageModeling(
                        tokenizer=tokenizer,
                        mlm=False)

    # NOTE(review): the optimizer is built by hand with only `lr`, so
    # config['weight_decay'] is not passed to it — confirm whether that is intended.
    # It is left unchanged here so existing checkpoints keep resuming the same way.
    optimizer = AdamW(model.parameters(), lr=float(config['learning_rate']))

    trainer = Trainer(
            model = model,
            args = training_args,
            train_dataset = tokenized_train_dataset,
            eval_dataset = tokenized_valid_dataset,
            data_collator = data_collator,
            optimizers=(optimizer, None)  # None: let Trainer create the LR scheduler
        )

    # Start training, resuming from the checkpoint on later runs.
    if not first_train:
        trainer.train(resume_from_checkpoint=checkpoint)
    else:
        trainer.train()

    # Save the model and tokenizer into a fresh timestamped directory.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    save_directory = f"/content/drive/MyDrive/KUBIG Contest/final/final_{timestamp}"
    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)

## Initial training (through epoch 1)

In [None]:
# First run: initialize the tokenizer and model from the base model named in config.yaml and train.
lets_train(first_train=True)

This is the first training of your model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

  train_df = train_df.applymap(remove_quotes)
  valid_df = valid_df.applymap(remove_quotes)


Map:   0%|          | 0/338148 [00:00<?, ? examples/s]

Map:   0%|          | 0/41926 [00:00<?, ? examples/s]



Step,Training Loss
200,3.997
400,3.5985
600,3.5302
800,3.4852
1000,3.4554
1200,3.4256
1400,3.399
1600,3.3544
1800,3.3311
2000,3.3181


## Resume training (through epoch 3)

In [None]:
# Resume training from the checkpoint saved in results/checkpoint-10568.
lets_train(first_train=False, checkpoint="/content/drive/MyDrive/KUBIG Contest/results/checkpoint-10568")

You've already trained your model. Saved model and tokenizer will be loaded...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  train_df = train_df.applymap(remove_quotes)
  valid_df = valid_df.applymap(remove_quotes)


Map:   0%|          | 0/338148 [00:00<?, ? examples/s]

Map:   0%|          | 0/41926 [00:00<?, ? examples/s]

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Step,Training Loss
10600,2.924
10800,2.9823
11000,2.9898
11200,2.9908
11400,2.9892
11600,2.9895
11800,2.9881
12000,2.9903
12200,2.9814
12400,2.9811


## Resume training (through epoch 6)

In [None]:
# Resume training from the checkpoint saved in results/checkpoint-50000.
lets_train(first_train=False, checkpoint="/content/drive/MyDrive/KUBIG Contest/results/checkpoint-50000")

You've already trained your model. Saved model and tokenizer will be loaded...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  train_df = train_df.applymap(remove_quotes)
  valid_df = valid_df.applymap(remove_quotes)


Map:   0%|          | 0/338148 [00:00<?, ? examples/s]

Map:   0%|          | 0/41926 [00:00<?, ? examples/s]

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Step,Training Loss
50200,2.57
50400,2.569
50600,2.5708
50800,2.5706
51000,2.5717
51200,2.5694
51400,2.5682
51600,2.5673
51800,2.5738
52000,2.5663


Step,Training Loss
50200,2.57
50400,2.569
50600,2.5708
50800,2.5706
51000,2.5717
51200,2.5694
51400,2.5682
51600,2.5673
51800,2.5738
52000,2.5663


## Resume training (through epoch 8)

In [None]:
# Resume training from the checkpoint saved in results/checkpoint-63408.
lets_train(first_train=False, checkpoint="/content/drive/MyDrive/KUBIG Contest/results/checkpoint-63408")

You've already trained your model. Saved model and tokenizer will be loaded...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  train_df = train_df.applymap(remove_quotes)
  valid_df = valid_df.applymap(remove_quotes)


Map:   0%|          | 0/338148 [00:00<?, ? examples/s]

Map:   0%|          | 0/41926 [00:00<?, ? examples/s]

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Step,Training Loss
63600,2.4785
63800,2.4766
64000,2.4784
64200,2.4891
64400,2.4898
64600,2.4827
64800,2.4886
65000,2.4848
65200,2.4836
65400,2.4884


## Resume training (through epoch 10)

In [None]:
# Resume training from the checkpoint saved in results/checkpoint-84544.
lets_train(first_train=False, checkpoint="/content/drive/MyDrive/KUBIG Contest/results/checkpoint-84544")

You've already trained your model. Saved model and tokenizer will be loaded...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  train_df = train_df.applymap(remove_quotes)
  valid_df = valid_df.applymap(remove_quotes)


Map:   0%|          | 0/338148 [00:00<?, ? examples/s]

Map:   0%|          | 0/41926 [00:00<?, ? examples/s]

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Step,Training Loss
84600,2.3953
84800,2.3954
85000,2.4178
85200,2.4153
85400,2.4157
85600,2.4196
85800,2.4112
86000,2.4196
86200,2.4153
86400,2.4222


## Resume training (through epoch 13)

In [5]:
# Resume training from the checkpoint saved in results/checkpoint-105680.
lets_train(first_train=False, checkpoint="/content/drive/MyDrive/KUBIG Contest/results/checkpoint-105680")

You've already trained your model. Saved model and tokenizer will be loaded...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  train_df = train_df.applymap(remove_quotes)
  valid_df = valid_df.applymap(remove_quotes)


Map:   0%|          | 0/338148 [00:00<?, ? examples/s]

Map:   0%|          | 0/41926 [00:00<?, ? examples/s]

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Step,Training Loss
105800,2.3419
106000,2.3564
106200,2.3542
106400,2.3667
106600,2.3717
106800,2.3617
107000,2.3705
107200,2.3625
107400,2.3664
107600,2.3698
