In [1]:
import os
import numpy as np
import pandas as pd

import transformers
from transformers import Trainer, TrainingArguments
import tensorflow as tf
import tensorflow_datasets as tfds
from datasets import Dataset, DatasetDict, load_dataset, load_metric

# STEP 1. NSMC 데이터 분석 및 Huggingface dataset 구성

In [2]:
ds = DatasetDict({
    'train' : load_dataset("e9t/nsmc", split='train[:15000]'),
    'validation' : load_dataset("e9t/nsmc", split='train[15000:16000]'),
    'test' : load_dataset("e9t/nsmc", split='test[:1000]')
})

Using custom data configuration default
Reusing dataset nsmc (/aiffel/.cache/huggingface/datasets/e9t___nsmc)/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)
Using custom data configuration default
Reusing dataset nsmc (/aiffel/.cache/huggingface/datasets/e9t___nsmc)/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)
Using custom data configuration default
Reusing dataset nsmc (/aiffel/.cache/huggingface/datasets/e9t___nsmc)/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 1000
    })
})

In [4]:
cols = ds.column_names

In [5]:
for i in range(3):
    for col in cols:
        print(col, ":", ds[col][i])
    print('\n')

train : {'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0, 'id': '9976970'}
validation : {'document': '어쨌든 비디오 대여 1순위였다..', 'label': 1, 'id': '1737741'}
test : {'document': '굳 ㅋ', 'label': 1, 'id': '6270596'}


train : {'document': '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'label': 1, 'id': '3819312'}
validation : {'document': '극장 가서 봐라 꼭.끝난다.', 'label': 1, 'id': '5668037'}
test : {'document': 'GDNTOPCLASSINTHECLUB', 'label': 0, 'id': '9274899'}


train : {'document': '너무재밓었다그래서보는것을추천한다', 'label': 0, 'id': '10265843'}
validation : {'document': '굳굳굳....성룡은 최고였어', 'label': 1, 'id': '9706716'}
test : {'document': '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', 'label': 0, 'id': '8544678'}




# STEP 2. klue/bert-base model 및 tokenizer 불러오기

In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels = 2)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

# STEP 3. 위에서 불러온 tokenizer으로 데이터셋을 전처리하고, model 학습 진행해 보기

In [7]:
def transform(data):
    return tokenizer(
        data['document'],
        truncation = True,
        padding = 'max_length',
        return_token_type_ids = False,
        )

In [8]:
dataset = ds.map(transform, batched=True)

Loading cached processed dataset at /aiffel/.cache/huggingface/datasets/e9t___nsmc)/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-78de48e20dc2255e.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /aiffel/.cache/huggingface/datasets/e9t___nsmc)/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-2dfbb343908a8250.arrow


In [9]:
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

In [10]:
import os
import numpy as np
from transformers import Trainer, TrainingArguments

output_dir = os.getenv('HOME')+'/aiffel/transformers'

training_arguments = TrainingArguments(
    output_dir,                                         # output이 저장될 경로
    evaluation_strategy="epoch",           #evaluation하는 빈도
    learning_rate = 2e-5,                         #learning_rate
    per_device_train_batch_size = 4,   # 각 device 당 batch size
    per_device_eval_batch_size = 4,    # evaluation 시에 batch size
    num_train_epochs = 3,                     # train 시킬 총 epochs
    weight_decay = 0.01,                        # weight decay
)

In [11]:
from datasets import load_metric
metric = load_metric('accuracy')

def compute_metrics(eval_pred):    
    predictions,labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references = labels)

In [None]:
trainer = Trainer(
    model=model,                    # 학습시킬 model
    args=training_arguments,        # TrainingArguments을 통해 설정한 arguments
    train_dataset=train_dataset,    # training dataset
    eval_dataset=val_dataset,       # evaluation dataset
    compute_metrics=compute_metrics,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 15000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 11250
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mjanghyeon06[0m ([33mjanghyeon[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4645,0.490182,0.862
2,0.3265,0.581736,0.873


Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-500
Configuration saved in /aiffel/aiffel/transformers/checkpoint-500/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-1000
Configuration saved in /aiffel/aiffel/transformers/checkpoint-1000/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-1500
Configuration saved in /aiffel/aiffel/transformers/checkpoint-1500/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-2000
Configuration saved in /aiffel/aiffel/transformers/checkpoint-2000/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoi

In [None]:
trainer.evaluate(test_dataset)

# STEP 4. Fine-tuning을 통하여 모델 성능(accuarcy) 향상시키기
* 데이터 전처리, TrainingArguments 등을 조정하여 모델의 정확도를 90% 이상으로 끌어올려봅시다.

# STEP 5. Bucketing을 적용하여 학습시키고, STEP 4의 결과와의 비교

* 아래 링크를 바탕으로 bucketing과 dynamic padding이 무엇인지 알아보고, 이들을 적용하여 model을 학습시킵니다.

 * Data Collator

 * Trainer.TrainingArguments 의 group_by_length

* STEP 4에 학습한 결과와 bucketing을 적용하여 학습시킨 결과를 비교해보고, 모델 성능 향상과 훈련 시간 두 가지 측면에서 각각 어떤 이점이 있는지 비교해봅시다.

# 회고

**배운 점**
 * huggingface를 이용해서 데이터셋을 쉽게 로딩하고 모델을 훈련시키는 방법을 배웠다.
   
**아쉬운 점**
 * 학습 시간이 오래걸려서 끝까지 마치지 못하였다.
 * 데이터셋도 10분의 1로 줄였지만 1 epoch 당 ETA가 1시간이었음.
   
**느낀 점**
 * huggingface가 간단하긴 하지만 라이브러리에 대해 잘 알아야 능숙하게 효율적으로 사용할 수 있을 것 같다.
  
**어려웠던 점**
 * 중간에 GPU 부족 문제가 발생했었는데 batch size를 줄이고 training_args에 'fp16=True' 옵션으로 해결했었다. 
   * 최종 코드에서는 GPU 부족 문제가 없었음.