In [1]:
!pip install transformers
!pip install seqeval

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 66.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [2]:
import os 
import random
import numpy as np
import pandas as pd

from tqdm import tqdm
import gc

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, RandomSampler, DataLoader
from transformers import BertTokenizer, BertTokenizerFast, BertForTokenClassification
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score


from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [4]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# 1. Data Load

In [None]:
# train
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1jatBP8yZkWn6Kg6mjN7nWLnYVwXE_sY_' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1jatBP8yZkWn6Kg6mjN7nWLnYVwXE_sY_" -O ner_train_data.csv && rm -rf /tmp/cookies.txt

# test
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1YVYShKCtWfigXBOb5ie7s6QmA-dHnjt3' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1YVYShKCtWfigXBOb5ie7s6QmA-dHnjt3" -O ner_test_data.csv && rm -rf /tmp/cookies.txt

# label
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1_DPfdY1Q5Xt2md7QVbKcQLRDYHm3qgVQ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1_DPfdY1Q5Xt2md7QVbKcQLRDYHm3qgVQ" -O ner_label.txt && rm -rf /tmp/cookies.txt

In [14]:
train_data = pd.read_csv("ner_train_data.csv")

test_data = pd.read_csv('ner_test_data.csv')

ner_tag = [label.strip() for label in open('ner_label.txt','r',encoding='utf-8')]

tag_to_index = {tag: index for index, tag in enumerate(ner_tag)}
index_to_tag = {index: tag for index, tag in enumerate(ner_tag)}

In [8]:
tokenizer = BertTokenizerFast.from_pretrained('klue/bert-base')

model = BertForTokenClassification.from_pretrained('klue/bert-base',num_labels=len(ner_tag))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [15]:
batch_input = tokenizer(train_data['Sentence'].tolist(), padding=True, truncation=True)

document = [str(i).split() for i in train_data['Sentence']]
tag_label = [i.split() for i in train_data['Tag']]

In [17]:
pad_token_label_id = -100
max_len = len(batch_input['input_ids'][0])
batch_size = 32

label = []
for words,labels in zip(document,tag_label):
    tokens = []
    label_ids = []
    for word, slot_label in zip(words, labels):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            word_tokens = [tokenizer.unk_token]  # For handling the bad-encoded word
        tokens.extend(word_tokens)
        label_ids.extend([tag_to_index.get(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))

    # Account for [CLS] and [SEP]
    # 토큰의 길이가 max_seq_len을 넘으면 잘라줌
    special_tokens_count = 2
    if len(label_ids) > max_len - special_tokens_count:
        label_ids = label_ids[: (max_len - special_tokens_count)]

    label_ids = [pad_token_label_id] + label_ids # [CLS]
    label_ids += [pad_token_label_id] # [SEP]
    label_ids += [pad_token_label_id]* (max_len - len(label_ids))  # [PAD]

    label.append(label_ids)

# 데이터를 tensor로 변환
batch_input = {k : torch.tensor(v) for k,v in batch_input.items()}
label = torch.tensor(label)

train_input, val_input, train_labels, val_labels = train_test_split(batch_input['input_ids'],
                                                                    label,
                                                                    random_state=42,
                                                                    test_size = 0.1)

train_mask, val_mask, train_token, val_token = train_test_split(batch_input['attention_mask'],
                                                batch_input['token_type_ids'],
                                                random_state=42,
                                                test_size = 0.1)

train_set = TensorDataset(train_input, train_token, train_mask, train_labels)
train_sampler = RandomSampler(train_set)
train_dataloader = DataLoader(train_set, sampler=train_sampler, batch_size=batch_size)

val_set = TensorDataset(val_input, val_token, val_mask, val_labels)
val_dataloader = DataLoader(val_set, batch_size=batch_size)

In [None]:
epochs = 3

gc.collect()
torch.cuda.empty_cache()

optimizer = Adam(model.parameters(),
                lr = 1e-5, # 학습률
                eps = 1e-8)

total_steps = len(train_set) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

for epoch in range(1,epochs+1):
#=================================================================================
#                                Training
#=================================================================================
    model.train()

    train_loss = 0.0

    for batchs in tqdm(train_dataloader):
        batch = tuple(b.to(device) for b in batchs)

        l_input_ids, l_segment, l_mask, l_labels = batch

        optimizer.zero_grad()

        output = model(l_input_ids,
                        token_type_ids = l_segment if len(torch.unique(l_segment.flatten())) > 1 else None,
                        attention_mask=l_mask,
                        labels=l_labels)
        
        loss = output[0]
        
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_dataloader)
    print("LOSS : ", avg_train_loss) 
#=================================================================================
#                                   Evaluation
#=================================================================================
    model.eval()

    eval_loss =  0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None


    for batchs in tqdm(val_dataloader):
        batch = tuple(b.to(device) for b in batchs)
        l_input_ids, l_segment, l_mask, l_labels = batch

        with torch.no_grad():
                output = model(l_input_ids,
                            token_type_ids = None,
                            attention_mask=l_mask)
                
        logits = output[0]

        # CPU로 데이터 이동
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids  = l_labels.to('cpu').numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, l_labels.detach().cpu().numpy(), axis=0)

    preds = np.argmax(preds, axis=2)

    # output을 담을 리스트 생성
    true_labels = [[] for _ in range(out_label_ids.shape[0])]
    pred_labels = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != -100:
                true_labels[i].append(index_to_tag[out_label_ids[i][j]])
                pred_labels[i].append(index_to_tag[preds[i][j]])

    f1 = f1_score(true_labels, pred_labels, suffix=True)
    print(f1)

100%|██████████| 4557/4557 [1:05:53<00:00,  1.15it/s]


LOSS :  0.19781042334033205


100%|██████████| 507/507 [03:02<00:00,  2.78it/s]


0.8411817636472705


100%|██████████| 4557/4557 [1:05:54<00:00,  1.15it/s]


LOSS :  0.15177669847089392


100%|██████████| 507/507 [03:01<00:00,  2.79it/s]


0.850146877665119


100%|██████████| 4557/4557 [1:05:54<00:00,  1.15it/s]


LOSS :  0.11818927909589491


100%|██████████| 507/507 [03:01<00:00,  2.79it/s]


0.8540508154056432


In [None]:
path = 'ner_test'

model.save_pretrained(f'/gdrive/My Drive/NLP/models/{path}')
tokenizer.save_pretrained(f'/gdrive/My Drive/NLP/models/{path}')

# TEST

In [20]:
path = '/gdrive/My Drive/NLP/models/ner_test'

tokenizer = BertTokenizerFast.from_pretrained(path)

model = BertForTokenClassification.from_pretrained(path,num_labels=len(ner_tag))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [46]:
batch_input = tokenizer(test_data['Sentence'].tolist(), padding=True, truncation=True)

document = [str(i).split() for i in test_data['Sentence']]
tag_label = [i.split() for i in test_data['Tag']]


pad_token_label_id = -100
max_len = len(batch_input['input_ids'][0])
batch_size = 32

label = []
for words,labels in zip(document,tag_label):
    tokens = []
    label_ids = []
    for word, slot_label in zip(words, labels):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            word_tokens = [tokenizer.unk_token]  # For handling the bad-encoded word
        tokens.extend(word_tokens)
        label_ids.extend([tag_to_index.get(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))

    # Account for [CLS] and [SEP]
    # 토큰의 길이가 max_seq_len을 넘으면 잘라줌
    special_tokens_count = 2
    if len(label_ids) > max_len - special_tokens_count:
        label_ids = label_ids[: (max_len - special_tokens_count)]

    label_ids = [pad_token_label_id] + label_ids # [CLS]
    label_ids += [pad_token_label_id] # [SEP]
    label_ids += [pad_token_label_id]* (max_len - len(label_ids))  # [PAD]

    label.append(label_ids)

batch_input = {key : torch.tensor(value) for key, value in batch_input.items()}

test_set = TensorDataset(batch_input['input_ids'], batch_input['token_type_ids'], batch_input['attention_mask'], torch.tensor(label))
test_dataloader = DataLoader(test_set, batch_size=32)

In [80]:
examples = test_data['Sentence'].tolist()[:5]


cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token_id = tokenizer.pad_token_id
pad_token_id_for_segment=0
pad_token_id_for_label=-100
max_seq_len = 140

input_ids, attention_masks, token_type_ids, label_masks = [], [], [], []

for example in tqdm(examples):
    tokens = []
    label_mask = []
    for one_word in example:
        subword_tokens = tokenizer.tokenize(one_word)
        tokens.extend(subword_tokens)
        label_mask.extend([0]+ [pad_token_id_for_label] * (len(subword_tokens) - 1))
    
    special_tokens_count = 2
    if len(tokens) > max_seq_len - special_tokens_count:
        tokens = tokens[:(max_seq_len - special_tokens_count)]
        label_mask = label_mask[:(max_seq_len - special_tokens_count)]
    
    tokens += [sep_token]
    tokens = [cls_token] + tokens

    label_mask += [pad_token_id_for_label]
    label_mask = [pad_token_id_for_label] + label_mask

    input_id = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(input_id)
    
    padding_count = max_seq_len - len(input_id)
    input_id = input_id + ([pad_token_id] * padding_count)
    attention_mask = attention_mask + ([0] * padding_count)
    token_type_id = [pad_token_id_for_segment] * max_seq_len
    label_mask = label_mask + ([pad_token_id_for_label] * padding_count)

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)
    label_masks.append(label_mask)

100%|██████████| 5/5 [00:00<00:00, 181.25it/s]

['[CLS]', '라', '티', '은', '-', '원', '윤', '정', ',', '휘', '닉', '스', '파', '크', '클', '래', '식', '프', '로', '골', '퍼', '[SEP]']
[2, 942, 1819, 1497, 17, 1478, 1492, 1543, 16, 1952, 802, 1316, 1826, 1750, 1752, 952, 1326, 1878, 991, 598, 1844, 3]
['[CLS]', '5', '원', '으', '로', '맺', '어', '진', '애', '인', '까', '지', '돈', '이', '라', '는', '민', '감', '한', '원', '자', '재', '를', '통', '해', '현', '대', '인', '의', '물', '질', '만', '능', '주', '의', '를', '꼬', '집', '고', '있', '는', '이', '무', '비', '는', '.', '[SEP]']
[2, 25, 1478, 1495, 991, 1057, 1406, 1585, 1389, 1506, 653, 1583, 850, 1504, 942, 793, 1109, 548, 1891, 1478, 1517, 1528, 1022, 1799, 1897, 1919, 823, 1506, 1503, 1093, 1586, 1038, 797, 1564, 1503, 1022, 676, 1589, 594, 1513, 793, 1504, 1088, 1187, 793, 18, 3]
['[CLS]', '-', '날', '로', '삼', '키', '면', '맛', '이', '어', '떤', '지', '일', '차', '드', '셔', '보', '시', '겠', '어', '요', '.', '[SEP]']
[2, 17, 721, 991, 1238, 1754, 1073, 1045, 1504, 1406, 911, 1583, 1507, 1632, 878, 1276, 1160, 1325, 582, 1406, 1464, 18, 3]
['[CLS]', 




In [76]:
len(label_masks[0])

142

In [47]:
model.eval()

eval_loss =  0
nb_eval_steps = 0
preds = None
out_label_ids = None


for batchs in tqdm(test_dataloader):
    batch = tuple(b.to(device) for b in batchs)
    l_input_ids, l_segment, l_mask, l_labels = batch

    with torch.no_grad():
            output = model(l_input_ids,
                        token_type_ids = None,
                        attention_mask=l_mask)
            
    logits = output[0]

    # CPU로 데이터 이동
    if preds is None:
        preds = logits.detach().cpu().numpy()
        out_label_ids  = l_labels.to('cpu').numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
        out_label_ids = np.append(out_label_ids, l_labels.detach().cpu().numpy(), axis=0)

preds = np.argmax(preds, axis=2)

# output을 담을 리스트 생성
true_labels = [[] for _ in range(out_label_ids.shape[0])]
pred_labels = [[] for _ in range(out_label_ids.shape[0])]

for i in range(out_label_ids.shape[0]):
    for j in range(out_label_ids.shape[1]):
        if out_label_ids[i, j] != -100:
            true_labels[i].append(index_to_tag[out_label_ids[i][j]])
            pred_labels[i].append(index_to_tag[preds[i][j]])

100%|██████████| 282/282 [01:57<00:00,  2.40it/s]


In [48]:
f1 = f1_score(true_labels, pred_labels, suffix=True)
print(f1)

0.8590292047023951


In [None]:
preds[0], out_label_ids[0]

In [49]:
true_labels[0], pred_labels[0]

(['PER-B', 'EVT-B', 'CVL-B'], ['PER-B', 'EVT-B', 'CVL-B'])

In [31]:
# output을 담을 리스트 생성
pred_labels = [[] for _ in range(preds.shape[0])]

for i in range(out_label_ids.shape[0]):
    for j in range(out_label_ids.shape[1]):
        if out_label_ids[i, j] != -100:
            pred_labels[i].append(index_to_tag[preds[i][j]])

IndexError: ignored

In [35]:
test_data

Unnamed: 0,Sentence,Tag
0,"라티은-원윤정, 휘닉스파크클래식 프로골퍼",PER-B EVT-B CVL-B
1,5원으로 맺어진 애인까지 돈이라는 민감한 원자재를 통해 현대인의 물질만능주의를 꼬집...,NUM-B O O O O O O O O O O O FLD-B O
2,-날로 삼키면 맛이 어떤지 일차 드셔보시겠어요 .,O O O O NUM-B O O
3,"-네, 지었습니다 .",O O O
4,◇신규 투자촉진에 방점=이번 접속료 조정결과에서 눈에 띄는 지점은 WCDMA/HSD...,O O O O O O O O TRM-B O TRM-B TRM-I ORG-B O TR...
...,...,...
8995,제가 뉴사우스웨일즈주로 가서 프레드를 도와줘야겠어요 .,O LOC-B O PER-B O O
8996,17쿼터 들어 윤석금의 실점이 폭렬했고 KT&G는 주도권을 잡아갔다 .,NUM-B O PER-B O O ORG-B O O O
8997,"오병욱 신임총재는 그동안 울산수산업협동조합장, 검은콩차 출신이 도맡다시피했던 프로스...",PER-B CVL-B O CVL-B CVL-B O O O CVL-B O O O CV...
8998,신판 ‘프로페셔널의 원칙’은 프로페셔널(전문직업인)으로서의 직업적 근기를 내면있게 ...,O AFW-B AFW-I CVL-B O O O O O


In [36]:
pred_labels[]

['O',
 'O',
 'PER-B',
 'O',
 'O',
 'O',
 'O',
 'O',
 'PER-B',
 'PER-B',
 'O',
 'O',
 'O',
 'PER-B']

['PER-B', 'EVT-B', 'CVL-B']