# 머신 러닝 교과서 - 파이토치편

<table align="left"><tr><td>
<a href="https://colab.research.google.com/github/rickiepark/ml-with-pytorch/blob/main/ch16/ch16-part3-bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="코랩에서 실행하기"/></a>
</td></tr></table>

## 패키지 버전 체크

python_environment_check.py 스크립트에서 check_packages 함수를 로드하기 위해 폴더를 추가합니다:

In [1]:
import sys

# On Colab, download python_environment_check.py from the GitHub repository
# (IPython shell escape); otherwise put the parent directory first on the
# import path.
if 'google.colab' in sys.modules:
    !wget https://raw.githubusercontent.com/rickiepark/ml-with-pytorch/main/python_environment_check.py
else:
    sys.path.insert(0, '..')

--2023-09-06 02:04:00--  https://raw.githubusercontent.com/rickiepark/ml-with-pytorch/main/python_environment_check.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1629 (1.6K) [text/plain]
Saving to: ‘python_environment_check.py’


2023-09-06 02:04:00 (38.7 MB/s) - ‘python_environment_check.py’ saved [1629/1629]



In [2]:
# Install transformers, datasets and accelerate with pip (IPython shell escape).
!pip install transformers datasets accelerate

Collecting transformers
  Downloading transformers-4.33.0-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux201

권장 패키지 버전을 확인하세요:

In [3]:
from python_environment_check import check_packages


# Package name -> version string handed to check_packages for verification.
d = dict(
    pandas='1.3.2',
    torch='1.9.0',
    torchtext='0.11.0',
    datasets='1.11.0',
    transformers='4.9.1',
)
check_packages(d)

[OK] Your Python version is 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
[OK] pandas 1.5.3
[OK] torch 2.0.1+cu118
[OK] torchtext 0.15.2+cpu
[OK] datasets 2.14.4
[OK] transformers 4.33.0


# 16장 트랜스포머 - 어텐션 메커니즘을 통한 자연어 처리 성능 향상 (파트 3/3)

**목차**

- 파이토치에서 BERT 모델 미세 튜닝하기
  - IMDb 영화 리뷰 데이터셋 로드
  - 데이터셋 토큰화
  - 사전 훈련된 BERT 모델 로드 및 미세 튜닝하기
  - 트레이너 API를 사용하여 트랜스포머를 간편하게 미세 튜닝하기
- 요약

---

https://huggingface.co/transformers/custom_datasets.html 에서 발췌:

> DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark.

---

In [4]:
from IPython.display import Image

## 파이토치에서 BERT 모델 미세 튜닝하기

### IMDb 영화 리뷰 데이터셋 로드

In [5]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

**일반 설정**

In [6]:
# Reproducibility: seed the PyTorch RNG and request deterministic cuDNN kernels.
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Number of epochs for the manual training loop.
NUM_EPOCHS = 3

**데이터셋 다운로드**

다음 셀은 긍정-부정 감성 분류를 위해 IMDB 영화 리뷰 데이터셋(http://ai.stanford.edu/~amaas/data/sentiment/)을 CSV 형식의 파일로 다운로드합니다:

In [7]:
# Download the gzipped movie-review CSV and unpack it into movie_data.csv.
url = "https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz"
filename = url.split("/")[-1]

# Fetch before opening the output file and fail loudly on an HTTP error, so
# that neither an empty file nor an error page is left on disk and later
# mistaken for the gzip archive. The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
r = requests.get(url, timeout=60)
r.raise_for_status()

with open(filename, "wb") as f:
    f.write(r.content)

# Decompress the file that was just written (reusing `filename` instead of
# repeating the literal archive name).
with gzip.open(filename, 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

데이터셋을 확인합니다:

In [8]:
# Load the decompressed CSV into a DataFrame and preview its first rows.
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [9]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

(50000, 2)

**데이터셋을 훈련/검증/테스트로 분할합니다**

In [10]:
# Positional split of the rows: the first 35,000 for training, the next
# 5,000 for validation, and everything from row 40,000 on for testing.
train_texts = df['review'].iloc[:35000].values
train_labels = df['sentiment'].iloc[:35000].values

valid_texts = df['review'].iloc[35000:40000].values
valid_labels = df['sentiment'].iloc[35000:40000].values

test_texts = df['review'].iloc[40000:].values
test_labels = df['sentiment'].iloc[40000:].values

### 데이터셋 토큰화

In [11]:
# Load the pretrained fast tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [12]:
# Tokenize each split with truncation and padding enabled; the splits are
# processed in the order train, validation, test.
train_encodings, valid_encodings, test_encodings = (
    tokenizer(list(texts), truncation=True, padding=True)
    for texts in (train_texts, valid_texts, test_texts)
)

In [13]:
# Inspect the encoding produced for the first training review.
train_encodings[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

**데이터셋 클래스와 로더**

In [14]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from field name to a
            sequence indexable by example position.
        labels: Sequence of labels, indexable by example position; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field at position `idx`, converted
        # to a tensor, plus the matching label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# One dataset per split, pairing each split's encodings with its labels.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = IMDbDataset(encodings=valid_encodings, labels=valid_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

In [15]:
# Batches of 16 examples; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=16, shuffle=True)
valid_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=16, shuffle=False)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=16, shuffle=False)

### 사전 훈련된 BERT 모델 로드 및 미세 튜닝하기

In [16]:
# Load the pretrained checkpoint, move it to DEVICE and put it in training mode.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
).to(DEVICE)
model.train()

# Adam over all model parameters with a learning rate of 5e-5.
optim = torch.optim.Adam(
    model.parameters(),
    lr=5e-5,
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**모델 훈련 -- 수동 훈련 루프**

In [17]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader``, in percent.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes the class scores under the ``'logits'`` key.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor with the accuracy in percent. When
        ``data_loader`` yields no examples the accuracy is undefined and a
        NaN tensor is returned instead of raising.

    Note:
        Gradients are disabled here, but the model's train/eval mode is not
        changed; the caller chooses the mode before calling.
    """
    # Start from a tensor (not the int 0) so the final ``.float()`` is valid
    # even when the loop body never runs.
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0

    with torch.no_grad():
        for batch in data_loader:
            # Prepare the batch on the target device.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']

            # Predicted class = index of the largest logit along dim 1.
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

    if num_examples == 0:
        return torch.tensor(float('nan'), device=device)

    return correct_pred.float() / num_examples * 100

In [18]:
# Wall-clock reference for the elapsed-time reports printed below.
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    # Training mode for the optimisation pass of this epoch.
    model.train()

    for batch_idx, batch in enumerate(train_loader):

        # Prepare the batch on the target device.
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # Forward pass; loss and logits are read from the model outputs.
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss, logits = outputs['loss'], outputs['logits']

        # Backward pass and parameter update.
        optim.zero_grad()
        loss.backward()
        optim.step()

        # Log the current batch loss on every 250th batch (including batch 0).
        if batch_idx % 250 == 0:
            print(f'에포크: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                  f'배치 {batch_idx:04d}/{len(train_loader):04d} | '
                  f'손실: {loss:.4f}')

    # Evaluation mode for the per-epoch accuracy report.
    model.eval()

    with torch.no_grad():
        print(f'훈련 정확도: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\n검증 정확도: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    # Cumulative time since the start of training, in minutes.
    print(f'소요 시간: {(time.time() - start_time)/60:.2f} min')

# Final report: total training time and accuracy on the held-out test split.
print(f'총 훈련 시간: {(time.time() - start_time)/60:.2f} min')
print(f'테스트 정확도: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

에포크: 0001/0003 | 배치 0000/2188 | 손실: 0.6670
에포크: 0001/0003 | 배치 0250/2188 | 손실: 0.2868
에포크: 0001/0003 | 배치 0500/2188 | 손실: 0.4108
에포크: 0001/0003 | 배치 0750/2188 | 손실: 0.0934
에포크: 0001/0003 | 배치 1000/2188 | 손실: 0.4670
에포크: 0001/0003 | 배치 1250/2188 | 손실: 0.2621
에포크: 0001/0003 | 배치 1500/2188 | 손실: 0.4201
에포크: 0001/0003 | 배치 1750/2188 | 손실: 0.4579
에포크: 0001/0003 | 배치 2000/2188 | 손실: 0.2072
훈련 정확도: 96.36%
검증 정확도: 92.46%
소요 시간: 9.01 min
에포크: 0002/0003 | 배치 0000/2188 | 손실: 0.0444
에포크: 0002/0003 | 배치 0250/2188 | 손실: 0.3364
에포크: 0002/0003 | 배치 0500/2188 | 손실: 0.0954
에포크: 0002/0003 | 배치 0750/2188 | 손실: 0.0741
에포크: 0002/0003 | 배치 1000/2188 | 손실: 0.1052
에포크: 0002/0003 | 배치 1250/2188 | 손실: 0.0306
에포크: 0002/0003 | 배치 1500/2188 | 손실: 0.0603
에포크: 0002/0003 | 배치 1750/2188 | 손실: 0.0503
에포크: 0002/0003 | 배치 2000/2188 | 손실: 0.0492
훈련 정확도: 98.83%
검증 정확도: 93.34%
소요 시간: 17.96 min
에포크: 0003/0003 | 배치 0000/2188 | 손실: 0.0192
에포크: 0003/0003 | 배치 0250/2188 | 손실: 0.0064
에포크: 0003/0003 | 배치 0500/2188 | 손실: 0.0043
에포크:

In [19]:
# NOTE(review): `optim` was created from model.parameters() and is not deleted
# here, so the parameters may stay referenced until `optim` is rebound — confirm.
del model # free memory

### 트레이너 API를 사용하여 트랜스포머를 간편하게 미세 튜닝하기

사전 훈련된 모델 로드:

In [20]:
# Start again from the pretrained checkpoint: load it, move it to DEVICE and
# switch to training mode (the trailing ';' suppresses the notebook's echo).
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
).to(DEVICE)
model.train();

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import Trainer, TrainingArguments


# Adam optimizer over the model parameters; note that the Trainer constructed
# in this cell is not given this optimizer.
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

# Same epoch count and batch size as the manual loop; log every 10 steps.
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./results',
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
)

In [22]:
# Install the `datasets` package with `pip install datasets`.
from datasets import load_metric
import numpy as np


# Accuracy metric object used by compute_metrics.
# NOTE(review): `load_metric` appears to be deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm before
# upgrading the pinned versions.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score evaluation predictions with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)``; the logits are a NumPy array,
            not a PyTorch tensor.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        against the reference labels.
    """
    scores, references = eval_pred
    # Predicted class = index of the largest score along the last axis.
    predicted = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted, references=references)
    return result

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [23]:
# Optimizer handed to the Trainer below through `optimizers`.
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

# Same epoch count and batch size as the manual loop; log every 10 steps.
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./results',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optim, None),  # (optimizer, learning-rate scheduler)
)

# For a fair comparison with the earlier code, use only one GPU
# (even if several are available).
trainer.args._n_gpu = 1

In [24]:
# Time the Trainer-driven fine-tuning run and report the duration in minutes.
start_time = time.time()
trainer.train()
print(f'총 훈련 시간: {(time.time() - start_time)/60:.2f} min')

Step,Training Loss
10,0.6989
20,0.6905
30,0.6887
40,0.597
50,0.4391
60,0.4583
70,0.3618
80,0.425
90,0.436
100,0.3585


총 훈련 시간: 20.72 min


In [25]:
# Run the Trainer's evaluation and display the returned metrics dictionary.
trainer.evaluate()

{'eval_loss': 0.30645111203193665,
 'eval_accuracy': 0.936,
 'eval_runtime': 38.3605,
 'eval_samples_per_second': 260.685,
 'eval_steps_per_second': 16.293,
 'epoch': 3.0}

In [26]:
# Switch to evaluation mode, make sure the model is on DEVICE, and report test
# accuracy with the same compute_accuracy helper used by the manual loop.
model.eval()
model.to(DEVICE)
print(f'테스트 정확도: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

테스트 정확도: 93.60%


# 요약