In [1]:
# BERT 파인 튜닝 해보기
# 파이썬-딥러닝-파이토치 책의 일부를 가져옴

- Fine tuning 이란?  
기존에 학습되어져 있는 모델을 기반으로 아키텍쳐를 새로운 목적(나의 이미지 데이터에 맞게)변형하고,  
이미 학습된 모델 Weights로 부터 학습을 업데이트하는 방법을 말한다.

In [2]:
''' 1. Import Module '''
import re
import sys
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

from transformers import BertModel, BertTokenizer

In [3]:
''' 2. Tokenizer 만들기 '''
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# test
sentence = "My dog is cute. He likes playing. I bought a  pet food for him"
print(tokenizer.tokenize(sentence))

['my', 'dog', 'is', 'cute', '.', 'he', 'likes', 'playing', '.', 'i', 'bought', 'a', 'pet', 'food', 'for', 'him']


In [4]:
len(tokenizer.vocab) # vocab의 길이

30522

In [5]:
max_seq_length = tokenizer.max_model_input_sizes['bert-base-uncased']
max_seq_length # 512

512

In [6]:
def new_tokenizer(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_seq_length-2] # [CLS], [SEP] token 고려
    return tokens

# Text 전처리
def PreProcessingText(input_sentence):
    input_sentence = input_sentence.lower() # 소문자화
    input_sentence = re.sub('<[^>]*>', repl= ' ', string = input_sentence) # "<br />" 처리
    input_sentence = re.sub('[!"$%&\()*+,-./:;<=>?@[\\]^_`{|}~]', repl= ' ', string = input_sentence) # 특수문자 처리 ("'" 제외)
    input_sentence = re.sub('\s+', repl= ' ', string = input_sentence) # 연속된 띄어쓰기 처리
    if input_sentence:
        return input_sentence

# Token 설정(전처리)하기
def PreProc(list_sentence):
    return [tokenizer.convert_tokens_to_ids(PreProcessingText(x)) for x in list_sentence]

In [7]:
''' 3. Field 만들기 '''
TEXT = data.Field(batch_first = True,
                 use_vocab = False,
                 tokenize = new_tokenizer,
                 preprocessing = PreProc,
                 init_token = tokenizer.cls_token_id,
                 eos_token = tokenizer.eos_token_id,
                 pad_token = tokenizer.pad_token_id,
                 unk_token = tokenizer.unk_token_id)

LABEL = data.LabelField(dtype = torch.float)

In [8]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [9]:
LABEL.build_vocab(train_data)

In [10]:
train_data, valid_data = train_data.split(random_state = random.seed(0), split_ratio=0.8)

In [11]:
len(train_data.examples)

20000

In [12]:
len(test_data.examples)

25000

In [13]:
train_data.fields

{'text': <torchtext.data.field.Field at 0x17e0f2731f0>,
 'label': <torchtext.data.field.LabelField at 0x17e0f2730d0>}

In [14]:
tokenizer.convert_ids_to_tokens(vars(train_data.examples[2])['text'])

['another',
 'in',
 'a',
 'long',
 'line',
 'of',
 'flick',
 '##s',
 'made',
 'by',
 'people',
 'who',
 'think',
 'that',
 'knowing',
 'how',
 'to',
 'operate',
 'a',
 'camera',
 'is',
 'the',
 'same',
 'as',
 'telling',
 'a',
 'story',
 '[UNK]',
 'within',
 '15',
 'minutes',
 '[UNK]',
 'the',
 'entire',
 'premise',
 'is',
 'laid',
 'out',
 'in',
 'just',
 'a',
 'few',
 'lines',
 '[UNK]',
 'so',
 'there',
 'is',
 'absolutely',
 'no',
 'mystery',
 '[UNK]',
 'which',
 'eliminate',
 '##s',
 'a',
 'whole',
 'face',
 '##t',
 'of',
 'the',
 'suspense',
 '[UNK]',
 'the',
 'only',
 'half',
 '[UNK]',
 'way',
 'competent',
 'actor',
 'is',
 'killed',
 '10',
 'minutes',
 'into',
 'the',
 'film',
 '[UNK]',
 'so',
 'we',
 "'",
 're',
 'left',
 'with',
 'stupid',
 'characters',
 'running',
 'around',
 'doing',
 'stupid',
 'things',
 '[UNK]',
 'low',
 'budget',
 'films',
 'can',
 "'",
 't',
 'afford',
 'expensive',
 'special',
 'effects',
 '[UNK]',
 'so',
 'the',
 'c',
 '##gi',
 'portions',
 'are',
 

In [15]:
print('Lable Examples : ')
for idx, (k, v) in enumerate(LABEL.vocab.stoi.items()):
    print('\t', k, v)

Lable Examples : 
	 neg 0
	 pos 1


In [50]:
''' 4. Make variable '''
model_config = {}
model_config['batch_size'] = 4

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                                          batch_size = model_config['batch_size'],
                                                                          device = device)

In [51]:
# Check batch data
sample_for_check = next(iter(train_iterator))
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)


[torchtext.data.batch.Batch of size 4]
	[.text]:[torch.cuda.LongTensor of size 4x371 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 4 (GPU 0)]
tensor([[  101,  7929,   100,  ...,   100,  3185,   100],
        [  101,  2028,  2305,  ...,     0,     0,     0],
        [  101,  2053, 11967,  ...,     0,     0,     0],
        [  101,   100, 27594,  ...,     0,     0,     0]], device='cuda:0')
tensor([0., 0., 0., 1.], device='cuda:0')


In [52]:
''' 5. Making Model '''
BERT = BertModel.from_pretrained('bert-base-uncased')
model_config['emb_dim'] = BERT.config.to_dict()['hidden_size'] # 768

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [58]:
# Fine-tuning 작업
class SentenceClassification(nn.Module):
    def __init__(self, **model_config):
        super(SentenceClassification, self).__init__()
        
        self.bert = BERT
        self.fc = nn.Linear(model_config['emb_dim'],
                           model_config['output_dim'])
        
    def forward(self, x):
        pooled_cls_output = self.bert(x)[1] # [CLS] 토큰에 대한 부분
        return F.sigmoid(self.fc(pooled_cls_output))

In [59]:
''' 6. Setting '''
model_config.update(dict(output_dim=1))
model = SentenceClassification(**model_config)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
loss = nn.BCEWithLogitsLoss()
model = model.to(device)

In [63]:
''' 7. Train and evaluate function '''
def train(model, iterator, optimizer, loss_fn, idx_epoch, **model_config):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    batch_size = model_config['batch_size']
    
    for idx, batch in enumerate(iterator):
        # optimizer zero grad
        optimizer.zero_grad()
        
        # forward
        prediction = model(batch.text).squeeze()
        
        loss = loss_fn(prediction, batch.label)
        acc = Accuracy(prediction, batch.label)
        
        # backward
        loss.backward() # backpropagation
        optimizer.step() # weight update
        
        if idx % 200 == 0:
            print("Train Epoch: {}[{}/{}] \t Train loss: {:.4f} \t Train acc: {:.4f}".format(
                                    idx_epoch, idx*batch_size, len(iterator)*batch_size, loss.item(), acc.item()))
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

def evaluate(model, iterator, optimizer, loss_fn, idx_epoch, **model_config):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    batch_size = model_config['batch_size']
    
    with torch.no_grad():
        for idx, batch in enumerate(iterator):
            prediction = model(batch.text).squeeze()
            loss = loss_fn(prediction, batch.label)
            acc = Accuracy(prediction, batch.label)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

def Accuracy(pred, y):
    predict = torch.round(pred)
    acc = (predict==y).sum()/len(y)
    return acc

In [64]:
model_config

{'batch_size': 4, 'emb_dim': 768, 'output_dim': 1}

In [72]:
''' 8. Training '''
N_EPOCH = 5
model_name = "BERT"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')
for epoch in range(1,N_EPOCH+1):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, optimizer, loss, epoch, **model_config)
    print("\nEpoch: {} \t Train_loss: {:.4f} \t Train_acc: {:.4f} \t Valid_loss: {:.4f} \t Valid_acc: {:.4f}".format(
                                    epoch, train_loss, train_acc, valid_loss, valid_acc))
    

---------------------------------
Model name : BERT
---------------------------------
Train Epoch: 1[0/20000] 	 Train loss: 0.6928 	 Train acc: 0.2500
Train Epoch: 1[800/20000] 	 Train loss: 0.6927 	 Train acc: 0.2500
Train Epoch: 1[1600/20000] 	 Train loss: 0.6932 	 Train acc: 0.5000
Train Epoch: 1[2400/20000] 	 Train loss: 0.6936 	 Train acc: 0.7500
Train Epoch: 1[3200/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000
Train Epoch: 1[4000/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000
Train Epoch: 1[4800/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000
Train Epoch: 1[5600/20000] 	 Train loss: 0.6930 	 Train acc: 0.2500
Train Epoch: 1[6400/20000] 	 Train loss: 0.6929 	 Train acc: 0.2500
Train Epoch: 1[7200/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000
Train Epoch: 1[8000/20000] 	 Train loss: 0.6930 	 Train acc: 0.2500
Train Epoch: 1[8800/20000] 	 Train loss: 0.6932 	 Train acc: 0.5000
Train Epoch: 1[9600/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000
Train Epoch: 1[10400/20000] 	 Trai

Train Epoch: 5[11200/20000] 	 Train loss: 0.6927 	 Train acc: 0.2500
Train Epoch: 5[12000/20000] 	 Train loss: 0.6928 	 Train acc: 0.2500
Train Epoch: 5[12800/20000] 	 Train loss: 0.6929 	 Train acc: 0.2500
Train Epoch: 5[13600/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000
Train Epoch: 5[14400/20000] 	 Train loss: 0.6929 	 Train acc: 0.2500
Train Epoch: 5[15200/20000] 	 Train loss: 0.6933 	 Train acc: 0.7500
Train Epoch: 5[16000/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000
Train Epoch: 5[16800/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000
Train Epoch: 5[17600/20000] 	 Train loss: 0.6930 	 Train acc: 0.2500
Train Epoch: 5[18400/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000
Train Epoch: 5[19200/20000] 	 Train loss: 0.6931 	 Train acc: 0.5000

Epoch: 5 	 Train_loss: 0.6932 	 Train_acc: 0.4986 	 Valid_loss: 0.6932 	 Valid_acc: 0.5054


In [70]:
# GPU memory 부족할 때
import gc
gc.collect()
torch.cuda.empty_cache()