In [8]:
import torch

from bert_data_utils import get_raw_imdb_data

In [None]:
# Load the dataset: the helper's three return values are bound to the
# training, validation and test splits respectively.
train_data, valid_data, test_data = get_raw_imdb_data()

In [None]:
from transformers import BertTokenizer

In [None]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint
# (the same checkpoint name the classification model is loaded from below).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
def bert_tokenized_data(tokenizer, data, max_seq_len=128, pad_to_max_len=True):
    """Encode every example of ``data`` into BERT input tensors.

    Args:
        tokenizer: Object exposing ``encode_plus(text, max_length=...,
            pad_to_max_length=...)`` whose result can be indexed with
            ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'``.
        data: Iterable of examples with a ``text`` attribute (a sequence of
            string tokens) and a ``label`` attribute. It is iterated twice.
        max_seq_len: Forwarded to the tokenizer as ``max_length``.
        pad_to_max_len: Forwarded to the tokenizer as ``pad_to_max_length``.

    Returns:
        A 4-tuple ``(input_ids, token_type_ids, attn_mask, labels)`` of
        parallel lists holding one tensor per example. Each label is a
        one-element tensor: ``1`` when the example's label equals ``'pos'``,
        ``0`` otherwise.
    """
    # First pass: rebuild each sentence from its tokens, e.g. "I am so ... good ."
    texts = []
    for example in data:
        texts.append(' '.join(example.text))

    # Second pass: binary targets, one single-element tensor per example.
    labels = []
    for example in data:
        labels.append(torch.tensor([1 if example.label == 'pos' else 0]))

    encodings = []
    for text in texts:
        encodings.append(
            tokenizer.encode_plus(text, max_length=max_seq_len, pad_to_max_length=pad_to_max_len)
        )

    input_ids = []
    token_type_ids = []
    attn_mask = []
    for encoded in encodings:
        input_ids.append(torch.tensor(encoded['input_ids']))
        token_type_ids.append(torch.tensor(encoded['token_type_ids']))
        attn_mask.append(torch.tensor(encoded['attention_mask']))

    return input_ids, token_type_ids, attn_mask, labels

In [None]:
# Sequence length passed to the tokenizer as `max_length` for every split.
# Kept in one place so train / validation / test are always encoded alike.
MAX_SEQ_LEN = 160

train_input_ids, train_token_type_ids, train_attn_mask, train_labels = bert_tokenized_data(
    tokenizer, train_data, max_seq_len=MAX_SEQ_LEN)
valid_input_ids, valid_token_type_ids, valid_attn_mask, valid_labels = bert_tokenized_data(
    tokenizer, valid_data, max_seq_len=MAX_SEQ_LEN)
test_input_ids, test_token_type_ids, test_attn_mask, test_labels = bert_tokenized_data(
    tokenizer, test_data, max_seq_len=MAX_SEQ_LEN)

In [None]:
from bert_dataset import Corpus
from torch.utils.data import Dataset, DataLoader

In [None]:
# Wrap each split's tensors in a Corpus dataset. The `_dataset` suffix keeps
# these names from colliding with the `train` function defined later in this
# notebook (re-running this cell used to overwrite that function).
train_dataset = Corpus(train_input_ids, train_token_type_ids, train_attn_mask, train_labels)
valid_dataset = Corpus(valid_input_ids, valid_token_type_ids, valid_attn_mask, valid_labels)
test_dataset = Corpus(test_input_ids, test_token_type_ids, test_attn_mask, test_labels)

params = {'batch_size': 32,
          'shuffle': True,
          'num_workers': 6}

# Evaluation reuses the same settings but visits examples in a fixed order:
# metrics are averaged per batch, so a shuffled (and possibly smaller) last
# batch would make the reported numbers vary from run to run.
eval_params = {**params, 'shuffle': False}

train_loader = DataLoader(train_dataset, **params)
valid_loader = DataLoader(valid_dataset, **eval_params)
test_loader = DataLoader(test_dataset, **eval_params)

In [None]:
from transformers import BertModel, BertPreTrainedModel, BertForSequenceClassification

In [None]:
# Define the BERT model: load the pretrained 'bert-base-cased' checkpoint
# into a sequence-classification model.
bert_config = 'bert-base-cased'
model = BertForSequenceClassification.from_pretrained(bert_config)

In [None]:
def get_device():
    """Return ``'cuda'`` when CUDA is available to PyTorch, else ``'cpu'``."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

def get_num_corrects(logits, labels):
    """Count how many rows of ``logits`` predict the class given in ``labels``.

    Args:
        logits: 2-D tensor of per-class scores; the prediction for a row is
            the index of its largest score.
        labels: Tensor of target class ids, either 1-D (one id per row) or
            2-D, in which case the row-wise maximum is used as the target
            (for a single-column tensor that is simply the stored id).

    Returns:
        A 0-dim tensor holding the number of matching rows.
    """
    predictions = logits.max(1)[1]
    # 1-D labels already are class ids; reducing them along dim 1 would raise.
    targets = labels if labels.dim() == 1 else labels.max(1)[0]
    return (predictions == targets).sum()

def train(model, optim, iterator, scheduler, device):
    """Run one training epoch over ``iterator``.

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` keyword arguments whose result
            holds the loss at index 0 and the logits at index 1.
        optim: Optimizer updating ``model``'s parameters.
        iterator: Sized iterable of batches; each batch is indexed as
            ``[input_ids, token_type_ids, attention_mask, labels]``.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)`` where both values are averaged over
        batches (not over samples) and the accuracy is in percent.

    Raises:
        ZeroDivisionError: If ``iterator`` has length 0.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optim.zero_grad()

        input_ids = batch[0].to(device)
        token_type_ids = batch[1].to(device)
        attn_masks = batch[2].to(device)
        labels = batch[3].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attn_masks,
                        token_type_ids=token_type_ids, labels=labels)
        # Index rather than unpack: this works both for plain tuples and for
        # dict-like model outputs, whose iteration yields field names.
        loss, logits = outputs[0], outputs[1]

        num_corrects = get_num_corrects(logits, labels)
        acc = 100.0 * num_corrects.item() / float(labels.size(0))
        epoch_loss += loss.item()
        epoch_acc += acc

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        # Update the learning-rate schedule.
        scheduler.step()

        # Clear the accumulated gradients on the optimizer that was passed in
        # (previously this reached for a global named `optimizer`).
        optim.zero_grad()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, otpim, iterator, device):
    """Compute the mean loss and accuracy of ``model`` over ``iterator``.

    The model is put in eval mode and gradients are disabled for the pass.

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` keyword arguments whose result
            holds the loss at index 0 and the logits at index 1.
        otpim: Unused. The (misspelt) name is kept so existing callers keep
            working.
        iterator: Sized iterable of batches; each batch is indexed as
            ``[input_ids, token_type_ids, attention_mask, labels]``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)`` where both values are averaged over
        batches (not over samples) and the accuracy is in percent.

    Raises:
        ZeroDivisionError: If ``iterator`` has length 0.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            input_ids = batch[0].to(device)
            token_type_ids = batch[1].to(device)
            attn_masks = batch[2].to(device)
            labels = batch[3].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attn_masks,
                            token_type_ids=token_type_ids, labels=labels)
            # Index rather than unpack: this works both for plain tuples and
            # for dict-like model outputs, whose iteration yields field names.
            loss, logits = outputs[0], outputs[1]

            num_corrects = get_num_corrects(logits, labels)
            acc = 100.0 * num_corrects.item() / float(labels.size(0))
            epoch_loss += loss.item()
            epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
import torch.optim as optim
import torch.nn as nn

from transformers import AdamW, get_linear_schedule_with_warmup

torch.backends.cudnn.deterministic = True

# Seed PyTorch's global RNG so that everything drawn from it from this point
# on is repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU at GPU_INDEX, but only select it when it actually exists;
# otherwise fall through to whatever get_device() picks (default GPU or CPU)
# instead of raising on CPU-only / single-GPU machines.
GPU_INDEX = 1
if torch.cuda.is_available() and torch.cuda.device_count() > GPU_INDEX:
    torch.cuda.set_device(GPU_INDEX)

N_EPOCHS = 5

best_valid_loss = float('inf')

WEIGHT_DECAY = 0.01
LR = 2e-5

# The linear schedule decays over every optimizer step of the whole run.
num_training_steps = len(train_loader) * N_EPOCHS
num_warmup_steps = 0

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': WEIGHT_DECAY},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

device = get_device()
model = model.to(device)

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, optimizer, train_loader, scheduler, device)
    valid_loss, valid_acc = evaluate(model, optimizer, valid_loader, device)

    # Checkpoint only when the validation loss improves on the best so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './bert_base_cased_sentence_classification.pt')

    print(f'Train Loss: {train_loss} | Train Acc: {train_acc}%')
    print(f'Val Loss: {valid_loss} |  Val Acc: {valid_acc}%')

In [None]:
# Shell escape: run the nvidia-smi command and show its output in the cell.
!nvidia-smi