In [1]:
# Colab을 이용할 경우, GPU 사용을 위해 "런타임 - 런타임 유형 변경 - GPU 설정" 후 실행

! nvidia-smi

Mon Jun 29 00:46:14 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
! pip install transformers



In [None]:
import re
import sys
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

from transformers import BertTokenizer, BertModel

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

sentence = "My dog is cute. He likes playing. I bought a  pet food for him"
# sentence = '나는 책상 위에 사과를 먹었다. 알고 보니 그 사과는 Jason 것이었다. 그래서 Jason에게 사과를 했다'
print(tokenizer.tokenize(sentence))

['my', 'dog', 'is', 'cute', '.', 'he', 'likes', 'playing', '.', 'i', 'bought', 'a', 'pet', 'food', 'for', 'him']


In [5]:
len(tokenizer.vocab)

30522

In [6]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

def new_tokenizer(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

512


In [7]:
def PreProcessingText(input_sentence):
    input_sentence = input_sentence.lower() # 소문자화
    input_sentence = re.sub('<[^>]*>', repl= ' ', string = input_sentence) # "<br />" 처리
    input_sentence = re.sub('[!"$%&\()*+,-./:;<=>?@[\\]^_`{|}~]', repl= ' ', string = input_sentence) # 특수문자 처리 ("'" 제외)
    input_sentence = re.sub('\s+', repl= ' ', string = input_sentence) # 연속된 띄어쓰기 처리
    if input_sentence:
        return input_sentence

def PreProc(list_sentence):
    return [tokenizer.convert_tokens_to_ids(PreProcessingText(x)) for x in list_sentence]

In [9]:
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = new_tokenizer,
                  preprocessing = PreProc,
                  init_token = tokenizer.cls_token_id,
                  eos_token = tokenizer.sep_token_id,
                  pad_token = tokenizer.pad_token_id,
                  unk_token = tokenizer.unk_token_id)

LABEL = data.LabelField(dtype = torch.float)

In [10]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [11]:
LABEL.build_vocab(train_data)

In [12]:
train_data, valid_data = train_data.split(random_state = random.seed(0), split_ratio=0.8)

## Reading Data

In [13]:
# Data Length
print(f'Train Data Length : {len(train_data.examples)}')
print(f'Test Data Length : {len(test_data.examples)}')

Train Data Length : 20000
Test Data Length : 25000


In [14]:
# Data Fields
train_data.fields

{'label': <torchtext.data.field.LabelField at 0x7f26ea03def0>,
 'text': <torchtext.data.field.Field at 0x7f26ea03df28>}

In [15]:
# Data Sample
print('---- Data Sample ----')
print('Input : ')
print(tokenizer.convert_ids_to_tokens(vars(train_data.examples[2])['text']))


---- Data Sample ----
Input : 
['this', 'film', 'is', 'so', 'ridiculous', '##ly', 'idiot', 'that', 'you', 'may', 'actually', 'laugh', 'at', 'it', '[UNK]', 'but', 'no', '[UNK]', 'even', 'this', 'is', 'too', 'much', 'for', 'this', 'lost', 'meters', 'of', 'cell', '##ulo', '##id', '[UNK]', 'i', 'found', 'it', 'as', 'an', 'offer', 'in', 'a', 'magazine', 'and', 'that', "'", 's', 'why', 'i', "'", 've', 'seen', 'it', '[UNK]', 'i', 'regret', 'the', 'time', 'i', 'lost', 'to', 'see', 'this', '[UNK]', '1', 'out', 'of', '10', '[UNK]', 'because', 'they', 'don', "'", 't', 'have', 'a', 'lower', 'grade', '[UNK]', '[UNK]']


## Pre-processing Data

## Making Vocab & Setting Embedding

In [16]:
# Label Info
print(f'Label Size : {len(LABEL.vocab)}')

print('Lable Examples : ')
for idx, (k, v) in enumerate(LABEL.vocab.stoi.items()):
    print('\t', k, v)

Label Size : 2
Lable Examples : 
	 neg 0
	 pos 1


## Spliting Validation Data & Making Data Iterator

In [17]:
model_config = {}

In [18]:
model_config['batch_size'] = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=model_config['batch_size'],
    device=device)

In [19]:
# Check batch data
sample_for_check = next(iter(train_iterator))
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)


[torchtext.data.batch.Batch of size 10]
	[.text]:[torch.cuda.LongTensor of size 10x404 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 10 (GPU 0)]
tensor([[ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 1045, 3728,  ...,    0,    0,    0],
        [ 101, 2023, 2018,  ...,    0,    0,    0],
        ...,
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 2941, 1045,  ...,    0,    0,    0],
        [ 101, 2348, 2045,  ...,    0,    0,    0]], device='cuda:0')
tensor([1., 1., 0., 1., 1., 1., 0., 0., 1., 0.], device='cuda:0')


## Making Model

In [35]:
bert = BertModel.from_pretrained('bert-base-uncased')

In [21]:
model_config['emb_dim'] = bert.config.to_dict()['hidden_size']

In [22]:
print(model_config['emb_dim'])

768


In [23]:
class SentenceClassification(nn.Module):
    def __init__(self, **model_config):
        super(SentenceClassification, self).__init__()
        self.bert = bert
        self.fc = nn.Linear(model_config['emb_dim'],
                            model_config['output_dim'])
        
    def forward(self, x):
        pooled_cls_output = self.bert(x)[1]
        return self.fc(pooled_cls_output)

## Training

In [24]:
def train(model, iterator, optimizer, loss_fn, idx_epoch, **model_params):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train() 
    batch_size = model_params['batch_size']

    for idx, batch in enumerate(iterator):
        
        # Initializing
        optimizer.zero_grad()
        
        # Forward 
        predictions = model(batch.text).squeeze()
        loss = loss_fn(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label)
        
        sys.stdout.write(
                    "\r" + f"[Train] Epoch : {idx_epoch:^3}"\
                    f"[{(idx + 1) * batch_size} / {len(iterator) * batch_size} ({100. * (idx + 1) / len(iterator) :.4}%)]"\
                    f"  Loss: {loss.item():.4}"\
                    f"  Acc : {acc.item():.4}"\
                    )

        # Backward 
        loss.backward()
        optimizer.step()
        
        # Update Epoch Performance
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss/len(iterator) , epoch_acc/len(iterator) 

In [30]:
def evaluate(model, iterator, loss_fn, idx_epoch, **model_params):
    
    epoch_loss = 0
    epoch_acc = 0
    
    batch_size = model_params['batch_size']
    
    # evaluation mode
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(iterator):
            predictions = model(batch.text).squeeze()
            loss = loss_fn(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            sys.stdout.write(
                    "\r" + f"[Eval] Epoch : {idx_epoch:^3}"\
                    f"[{(idx + 1) * batch_size} / {len(iterator) * batch_size} ({100. * (idx + 1) / len(iterator) :.4}%)]"\
                    f"  Loss: {loss.item():.4}"\
                    f"  Acc : {acc.item():.4}"\
                    )
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

### bi-RNN

In [26]:
model_config.update(dict(output_dim = 1))

In [36]:
def binary_accuracy(preds, y):
    # rounded_preds = torch.argmax(preds, axis=1) 
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum()/len(correct)
    return acc


model = SentenceClassification(**model_config)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
loss_fn = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

In [28]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

109483009

In [37]:
N_EPOCH = 4

best_valid_loss = float('inf')
model_name = "BERT"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    print('')
    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn, epoch, **model_config)
    print('')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')
    # print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Model is saved at {epoch}-epoch')

    
    

---------------------------------
Model name : BERT
---------------------------------
	 Epoch : 0 | Train Loss : 0.2949 | Train Acc : 0.8676
	 Epoch : 0 | Valid Loss : 0.2053 | Valid Acc : 0.919
	 Model is saved at 0-epoch
	 Epoch : 1 | Train Loss : 0.152 | Train Acc : 0.9451
	 Epoch : 1 | Valid Loss : 0.2603 | Valid Acc : 0.9176
	 Epoch : 2 | Train Loss : 0.08783 | Train Acc : 0.9692
	 Epoch : 2 | Valid Loss : 0.2506 | Valid Acc : 0.9128
	 Epoch : 3 | Train Loss : 0.05724 | Train Acc : 0.9815
	 Epoch : 3 | Valid Loss : 0.2739 | Valid Acc : 0.9248


In [39]:
# Test set
# model.load_state_dict(torch.load(f'./{model_name}.pt'))
epoch = 0
test_loss, test_acc = evaluate(model, test_iterator, loss_fn, epoch, **model_config)
print('')
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.2686 | Test Acc : 0.9242


In [40]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
epoch = 0
test_loss, test_acc = evaluate(model, test_iterator, loss_fn, epoch, **model_config)
print('')
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.1899 | Test Acc : 0.9262
