In [None]:
!pip install kobert_transformers
!pip install torch
!pip install transformers==3.4.0

In [2]:
from kobert_transformers import get_tokenizer, get_kobert_model
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn.utils

from transformers import AdamW, BertForTokenClassification
from transformers.optimization import get_cosine_schedule_with_warmup

In [3]:
device = torch.device("cuda:0")

if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [4]:
tokenizer = get_tokenizer()
seqPATH = '/content/drive/MyDrive/call_bureng/data/gen_seq/data'

#이부분 util 파일에 Reader class 만들기
with open(seqPATH + '/seq_in2.txt', 'r', encoding = 'utf-8') as f:
    x = f.readlines()
with open(seqPATH + '/seq_out2.txt', 'r', encoding = 'utf-8') as f:
    y = f.readlines()

Downloading:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

In [5]:
class SLOTDataset(Dataset):
    def __init__(self, x, y, tokenizer, MAXLEN, label_dict):
        # x embedding
        x_embedding = [tokenizer.convert_tokens_to_ids(('[CLS] ' + sentence + ' [SEP]').split()) for sentence in x]
        for i in x_embedding:
            i += [0] * (MAXLEN - len(i))
        # y labeling
        y = [('[CLS] ' + sentence + ' [SEP]').split() for sentence in y]
        y_label = []
        for sentence in y:
            y_label.append([label_dict.get(l) for l in sentence])
        for i in y_label:
            i += [0] * (MAXLEN - len(i))
            
        attention_mask = []
        for seq in x_embedding:
            seq_mask = [float(i>0) for i in seq]
            attention_mask.append(seq_mask)
 
        self.x = torch.tensor(x_embedding)
        self.y = torch.tensor(y_label)
        self.attention_mask = torch.tensor(attention_mask)
        #self.segment_id = torch.tensor([1] * MAXLEN)

    def __getitem__(self, i):
        return {'token_ids' : self.x[i], 
                'attention_mask' : self.attention_mask[i],
                'labels' : self.y[i]}

    def __len__(self):
        return (len(self.x))

In [6]:
MAXLEN = 40

In [7]:
label_dict = {'0' : 0, 'DEP': 1, 'ARR' : 2, 'DATE' : 3, 'DEP_TIME' : 4, 'ARR_TIME' : 5, '[CLS]' : 6, '[SEP]' : 7 }

In [8]:
train = SLOTDataset(x, y, tokenizer, MAXLEN, label_dict)

In [25]:
batch_size = 8
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
learning_rate =  1e-5

In [12]:
train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, num_workers=5)

  cpuset_checked))


In [13]:
model = BertForTokenClassification.from_pretrained('monologg/kobert', num_labels = 8).to(device)

Downloading:   0%|          | 0.00/426 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
#정확도 측정
def calc_accuracy(pred_labels, test_target):
    num, total = 0, 0
    
    for i in range(0, len(pred_labels)):
    # 고정으로 들어가 있는 [CLS], [SEP] 는 제외함
        for pred, target in zip(pred_labels[i][1:-1], test_target[i][1:-1]):
            if pred == target:
                num +=1
        total += len(pred_labels[i])-2 # [CLS], [SEP] 태그 수 빼기

    return num/total*100

In [26]:
#optimizer와 schedule 설정
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
    
for e in range(num_epochs):
    model.train()

    train_loss = 0.0
    train_acc = 0.0   

    for batch_id, batch in enumerate(tqdm(train_dataloader)):

        batch_token_ids = batch['token_ids'].long().to(device)
        batch_attention_mask = batch['attention_mask'].long().to(device)
        batch_label = batch['labels'].long().to(device)

        out = model(batch_token_ids, token_type_ids = None, attention_mask = batch_attention_mask, labels = batch_label)
        loss = out[0]
        prediction = out[1]

        loss.sum().backward()
        
        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        # 그래디언트를 통해 가중치 업데이트
        optimizer.step()
        #스케줄러로 학습률 감소
        scheduler.step()
        # loss 구하기
        train_loss += loss
        # 그래디언트 초기화
        optimizer.zero_grad()

        pred = np.argmax(prediction.detach().to('cpu').numpy(), axis = 2)
        label = batch_label.to('cpu').numpy()
        train_acc += calc_accuracy(pred, label)

    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    

  cpuset_checked))
100%|██████████| 2250/2250 [03:24<00:00, 11.00it/s]


epoch 1 train acc 82.70190058479537


100%|██████████| 2250/2250 [03:32<00:00, 10.60it/s]


epoch 2 train acc 84.73728070175441


100%|██████████| 2250/2250 [03:33<00:00, 10.54it/s]


epoch 3 train acc 84.72499999999984


100%|██████████| 2250/2250 [03:34<00:00, 10.49it/s]


epoch 4 train acc 83.63040935672494


100%|██████████| 2250/2250 [03:34<00:00, 10.51it/s]


epoch 5 train acc 82.67470760233921


100%|██████████| 2250/2250 [03:33<00:00, 10.54it/s]


epoch 6 train acc 77.85029239766082


100%|██████████| 2250/2250 [03:33<00:00, 10.54it/s]


epoch 7 train acc 78.56257309941525


100%|██████████| 2250/2250 [03:32<00:00, 10.57it/s]


epoch 8 train acc 77.7422514619884


100%|██████████| 2250/2250 [03:32<00:00, 10.59it/s]


epoch 9 train acc 79.0766081871345


100%|██████████| 2250/2250 [03:32<00:00, 10.57it/s]

epoch 10 train acc 79.14283625730975





In [19]:
torch.save(model, '/content/drive/MyDrive/call_bureng/KoBERTSoltModel02_0812.pt')