In [1]:
import pandas as pd
import numpy as np
import warnings
import random
import os
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

device = torch.device("cuda:0") if torch.cuda.is_available() else "cpu"

In [2]:
!nvidia-smi

Sun Oct 22 17:17:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.72       Driver Version: 410.72       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   31C    P0    32W / 250W |   2084MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [3]:
# 각 데이터셋 마다 따로 조정해줄 필요가 있다.
CFG = {
    "EPOCHS": 50,
    "SEED": 0,
    "learning_rate": 4e-5,
    "batch_size": 128
}
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED'])

In [4]:
dataset = pd.read_csv("./final.csv", encoding = 'utf-8')
dataset.dropna(inplace = True)
dataset.reset_index(inplace = True)
dataset.drop(["index"], inplace = True, axis = 1)

Unnamed: 0,PAPER_TEXT,target,TARGET
0,대식세포이동저해인자 (Macrophage Migration Inhibitory Fa...,ND10,260
1,북방전복 (Haliotis discus hannai) 은 우리나라의 전복 양식생산량...,ND10,260
2,"Tegillarca granosa is, ecologically warmwater ...",ND10,260
3,"Matallothionein (MT) 은 약 60여 개의 아미노산으로 구성되고, 분...",ND10,260
4,굴 (Crassostrea gigas) 은 2015년 기준 세계 총생산량이 약 60...,ND10,260


In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54242 entries, 0 to 54241
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PAPER_TEXT  54242 non-null  object
 1   target      54242 non-null  object
 2   TARGET      54242 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [9]:
# 우선 class가 하나만 있는 것은 train에 다 포함시킬 것
# 해당 셀은 class가 하나만 있는 것은 train에 포함시키기 전에 따로 떼어놓기 위함
# stratify를 통해 균일하게 train과 valid를 분리하고자 한다.
classes = dataset["TARGET"].value_counts()
classes = pd.DataFrame(classes)
classes.reset_index(inplace = True)
uniq_cls = list(classes[classes["count"] == 1]["TARGET"])
train_1 = []
train_1_lbl = []
for i in range(len(dataset)):
    if dataset.loc[i, "TARGET"] in uniq_cls:
        train_1.append(dataset.loc[i, :])
        train_1_lbl.append(i)
train_1 = pd.DataFrame(train_1)
dataset.drop(train_1_lbl, axis = 0, inplace = True)
dataset.reset_index(inplace = True)
dataset.drop(["index"], axis = 1, inplace = True)
train_1.reset_index(inplace = True)
train_1.drop(["index"], axis = 1, inplace = True)

In [12]:
from sklearn.model_selection import train_test_split

# 데이터를 train, validation 나누기
train_ratio = 0.8
validation_ratio = 0.2

# train-validation 데이터를 먼저 나누고 나머지를 test 데이터로 나눕니다.
train, valid = train_test_split(dataset, test_size = 1 - train_ratio, random_state=0, stratify = dataset["TARGET"])

# # 데이터 개수 확인
print("학습 데이터 개수:", len(train))
print("검증 데이터 개수:", len(valid))
train = pd.concat([train_1, train], axis = 0)
train.reset_index(inplace = True)
train.drop(["index"], inplace = True, axis = 1)
valid.reset_index(inplace = True)
valid.drop(["index"], inplace = True, axis = 1)

학습 데이터 개수: 43386
검증 데이터 개수: 10847


In [18]:
def make_dataset(mode):
    data = None
    if mode == "train":
        data = train
    if mode == "valid":
        data = valid
    if mode == "test":
        data = test
    mk_data = []
    for sentence, label in tqdm(zip(data["PAPER_TEXT"], data["TARGET"])):
        data_ = [sentence, label]
        mk_data.append(data_)
    return mk_data

train_set = make_dataset("train")
valid_set = make_dataset("valid")

43395it [00:00, 862088.21it/s]
10847it [00:00, 903389.84it/s]


In [19]:
# BERTDataset 형태로 변경하는 과정
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair, mode = 'train'):
        # bert_tokenizer: 텍스트를 토큰으로 분할하는 데 사용되는 토크나이저
        # max_seq_length: 최대 시퀀스의 길이
        # pad: 패딩 여부를 나타내는 boolean
        # pair: 2개의 문장을 입력으로 받는 경우
        # BERT 모델을 사용하는 자연어 처리 작업에서 데이터를 준비하고 변환하는 역할
        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair, vocab = vocab)
        self.mode = mode
        if self.mode == "train":
            self.sentences = [transform([i[sent_idx]]) for i in dataset]
            self.labels = [np.int32(i[label_idx]) for i in dataset]
        else:
            self.sentences = [transform(i) for i in dataset]

    def __getitem__(self, i):
        if self.mode == "train":
            return (self.sentences[i] + (self.labels[i], ))
        else:
            return self.sentences[i]

    def __len__(self):
        return (len(self.sentences))

In [20]:
# Setup Hyperparameters
# tokenizer를 통해서 우선 어느 정도의 max_len을 설정할 지 체크한다.
# BERT모델의 입력 제한이 512
max_len = 128
batch_size = CFG["batch_size"]
warmup_ratio = 0.1
num_epochs = CFG["EPOCHS"]
# Gradient Clipping process
max_grad_norm = 1
log_interval = 200
learning_rate = CFG["learning_rate"]

In [22]:
from gluonnlp.vocab import Vocab
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertTokenizer
from transformers import BertModel
import json
# 이를 불러오고자 한다면 return_dict는 무조건 false로 해야한다.(transformers 버전 문제)
bertmodel = BertModel.from_pretrained("skt/kobert-base-v1", return_dict = False)
# 커스텀 trained vocab txt 파일을 가지고 BertTokenzier 생성
tokenizer = BertTokenizer.from_pretrained("./vocab.txt", do_basic_tokenize = False)
tok = tokenizer.tokenize

# customize vocab을 직접 불러온다.
with open("vocab.txt", "r") as f:
    vocab = f.readlines()
vocab = [voc.rstrip("\n") for voc in vocab]
dic = dict()
for i in range(len(vocab)):
    dic[vocab[i]] = i
# 최종적으로 vocab 형태를 만들어준다.
vocab = Vocab(dic)

In [26]:
"""
dataset, sent_idx: dataset의 0번은 embedding을 시켜야 할 문장, label_idx: 1번 index는 label에 해당, tok: tokenizer, max_len: 문장의 길이, True: padding 여부
"""
data_train = BERTDataset(dataset = train_set, sent_idx = 0, label_idx = 1, bert_tokenizer = tok, vocab = vocab, max_len = max_len, pad = True, pair = False)
data_valid = BERTDataset(dataset = valid_set, sent_idx = 0, label_idx = 1, bert_tokenizer = tok, vocab = vocab, max_len = max_len, pad = True, pair = False)

train_dataloader = DataLoader(data_train, batch_size = batch_size, num_workers = 4)
valid_dataloader = DataLoader(data_valid, batch_size = batch_size, num_workers = 4)

In [27]:
# customize vocab을 이용하기 위함
bertmodel.resize_token_embeddings(len(vocab))

Embedding(8005, 768)

In [28]:
# vocab 파일과 매칭을 통하여 어떤식으로 tokenize가 되었는지를 확인하는 셀이다.
# 직접 실행시켜보고 진행해도 좋다
# data_train[0][0] -> data_train[데이터 index][input_ids의 index]
voc_list = vocab.idx_to_token
cnt = 1
for i in data_train[0][0]:
    print("{}번째 token: {}".format(cnt, voc_list[cnt-1]))
    cnt += 1
print("label은 {}".format(data_train[0][3]))

1번째 token: <unk>
2번째 token: <pad>
3번째 token: <bos>
4번째 token: <eos>
5번째 token: Eu
6번째 token: ##ygen
7번째 token: cause
8번째 token: Each
9번째 token: disper
10번째 token: cho
11번째 token: ##화로
12번째 token: sources
13번째 token: ##ashion
14번째 token: 해석할
15번째 token: 계산하였다
16번째 token: 값의
17번째 token: 연구결과를
18번째 token: arg
19번째 token: ##만의
20번째 token: particle
21번째 token: 결정하는
22번째 token: ##inant
23번째 token: brand
24번째 token: 비교해
25번째 token: 재생
26번째 token: 아니다
27번째 token: his
28번째 token: ##위치
29번째 token: ke
30번째 token: meaning
31번째 token: conventional
32번째 token: ##ising
33번째 token: ##by
34번째 token: failure
35번째 token: 단위로
36번째 token: 구별
37번째 token: draw
38번째 token: ##으로서의
39번째 token: recomm
40번째 token: ##프라
41번째 token: 화면
42번째 token: sat
43번째 token: get
44번째 token: ess
45번째 token: ##tual
46번째 token: 퇴적
47번째 token: 스테
48번째 token: 작업을
49번째 token: 경기
50번째 token: 사람들이
51번째 token: Ge
52번째 token: 이루어졌
53번째 token: ##cos
54번째 token: Second
55번째 token: ##화한
56번째 token: calculation
57번째 token: contain
58번째 toke

In [29]:
class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size = 768, num_classes=365, dr_rate=None, params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
        torch.nn.init.kaiming_uniform(self.classifier.weight)
    
    def get_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.get_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [30]:
"""
추후에 class imbalance를 해결하기 위한 방안 중, weighted cross entropy를 이용하고자 한다.
"""
target_unique = train.loc[:, "TARGET"].value_counts()
target_unique = pd.DataFrame(target_unique)
target_unique.loc[:, "count"]
target_unique.reset_index(inplace = True)

In [35]:
model = BERTClassifier(bertmodel, dr_rate = 0.1).to(device)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

weighted = torch.tensor([1.0] * 365, device = device)
for x in range(len(target_unique)):
    target = target_unique.loc[x, "TARGET"]
    count = target_unique.loc[x, "count"]
    if count <= 150:
        weighted[target] = 2.0
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss(weight=weighted)

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=t_total)   

In [32]:
from sklearn.metrics import f1_score

def calc_accuracy(pred,label):
    max_vals, max_indices = torch.max(pred, 1)
    train_acc = (max_indices == label).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

def calc_f1_score(pred, label):
    y_pred = torch.argmax(pred, dim = -1)
    label = label.cpu().detach().numpy()
    y_pred = y_pred.cpu().detach().numpy()
    f1 = f1_score(label, y_pred, average = "micro")
    return f1

In [None]:
best_model = None
best_loss = 999999
train_loss_graph = []
valid_loss_graph = []
for e in range(num_epochs):
    train_f1 = 0.0
    train_loss = 0.0
    valid_f1 = 0.0
    valid_loss = 0.0
    train_acc = 0.0
    valid_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()    # scheduler 적용
        train_f1 += calc_f1_score(out, label)
        train_acc += calc_accuracy(out, label)
        train_loss += loss.item()
    print("---------------------------------------------------")
    print("epoch {} train f1 {}".format(e+1, train_f1 / (batch_id+1)))
    print("epoch {} train accuracy {}".format(e+1, train_acc / (batch_id + 1)))
    print("epoch {} train loss {}".format(e+1, train_loss / len(train_dataloader)))
    train_loss_graph.append(train_loss / len(train_dataloader))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(valid_dataloader), total=len(valid_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        valid_f1 += calc_f1_score(out, label)
        valid_acc += calc_accuracy(out, label)
        valid_loss += loss.item()
    print("epoch {} valid f1 {}".format(e+1, valid_f1 / (batch_id+1)))
    print("epoch {} valid accuracy {}".format(e+1, valid_acc / (batch_id + 1)))
    # validation loss 설정
    print("epoch {} valid loss {}".format(e+1, valid_loss / len(valid_dataloader)))
    val_loss = valid_loss / len(valid_dataloader)
    valid_loss_graph.append(val_loss)
    if best_loss > val_loss:
        best_model = model
        best_loss = val_loss
        print("epoch {} best valid loss {}".format(e+1, best_loss))
    print("---------------------------------------------------")

100%|██████████| 340/340 [04:50<00:00,  1.17it/s]

---------------------------------------------------
epoch 1 train f1 0.500390625
epoch 1 train accuracy 0.500390625
epoch 1 train loss 3.0883198096471673



100%|██████████| 85/85 [00:24<00:00,  3.47it/s]

epoch 1 valid f1 0.1906762770897833
epoch 1 valid accuracy 0.1906762770897833
epoch 1 valid loss 4.608705307455624
epoch 1 best valid loss 4.608705307455624
---------------------------------------------------



100%|██████████| 340/340 [04:50<00:00,  1.17it/s]

---------------------------------------------------
epoch 2 train f1 0.8025965073529412
epoch 2 train accuracy 0.8025965073529412
epoch 2 train loss 1.2337225429275456



100%|██████████| 85/85 [00:24<00:00,  3.49it/s]

epoch 2 valid f1 0.18467395510835913
epoch 2 valid accuracy 0.18467395510835913
epoch 2 valid loss 4.962245638230268
---------------------------------------------------



100%|██████████| 340/340 [04:50<00:00,  1.17it/s]

---------------------------------------------------
epoch 3 train f1 0.8780330882352941
epoch 3 train accuracy 0.8780330882352941
epoch 3 train loss 0.7218527290531818



100%|██████████| 85/85 [00:24<00:00,  3.48it/s]

epoch 3 valid f1 0.17700561145510837
epoch 3 valid accuracy 0.17700561145510837
epoch 3 valid loss 5.281047720067641
---------------------------------------------------



100%|██████████| 340/340 [04:50<00:00,  1.17it/s]

---------------------------------------------------
epoch 4 train f1 0.9067555147058823
epoch 4 train accuracy 0.9067555147058823
epoch 4 train loss 0.5031645392451216



100%|██████████| 85/85 [00:24<00:00,  3.46it/s]

epoch 4 valid f1 0.182312306501548
epoch 4 valid accuracy 0.182312306501548
epoch 4 valid loss 5.421407738853904
---------------------------------------------------



100%|██████████| 340/340 [04:50<00:00,  1.17it/s]

---------------------------------------------------
epoch 5 train f1 0.9206801470588235
epoch 5 train accuracy 0.9206801470588235
epoch 5 train loss 0.3922219845137614



100%|██████████| 85/85 [00:24<00:00,  3.48it/s]

epoch 5 valid f1 0.18017414860681114
epoch 5 valid accuracy 0.18017414860681114
epoch 5 valid loss 5.601568805470186
---------------------------------------------------



100%|██████████| 340/340 [04:49<00:00,  1.17it/s]

---------------------------------------------------
epoch 6 train f1 0.9328584558823529
epoch 6 train accuracy 0.9328584558823529
epoch 6 train loss 0.3155915729852174



100%|██████████| 85/85 [00:24<00:00,  3.50it/s]

epoch 6 valid f1 0.17296923374613002
epoch 6 valid accuracy 0.17296923374613002
epoch 6 valid loss 5.946795216728659
---------------------------------------------------



100%|██████████| 340/340 [04:49<00:00,  1.17it/s]

---------------------------------------------------
epoch 7 train f1 0.9383042279411765
epoch 7 train accuracy 0.9383042279411765
epoch 7 train loss 0.2701249728910625



100%|██████████| 85/85 [00:24<00:00,  3.49it/s]

epoch 7 valid f1 0.18525348297213623
epoch 7 valid accuracy 0.18525348297213623
epoch 7 valid loss 6.079554282917696
---------------------------------------------------



100%|██████████| 340/340 [04:50<00:00,  1.17it/s]

---------------------------------------------------
epoch 8 train f1 0.9448069852941177
epoch 8 train accuracy 0.9448069852941177
epoch 8 train loss 0.2333099403706215



100%|██████████| 85/85 [00:24<00:00,  3.49it/s]

epoch 8 valid f1 0.18295568885448918
epoch 8 valid accuracy 0.18295568885448918
epoch 8 valid loss 6.161913473465863
---------------------------------------------------



100%|██████████| 340/340 [04:49<00:00,  1.17it/s]

---------------------------------------------------
epoch 9 train f1 0.9471737132352941
epoch 9 train accuracy 0.9471737132352941
epoch 9 train loss 0.21202269897467513



100%|██████████| 85/85 [00:24<00:00,  3.48it/s]

epoch 9 valid f1 0.1831153250773994
epoch 9 valid accuracy 0.1831153250773994
epoch 9 valid loss 6.277985701841467
---------------------------------------------------



100%|██████████| 340/340 [04:49<00:00,  1.18it/s]

---------------------------------------------------
epoch 10 train f1 0.9539751838235294
epoch 10 train accuracy 0.9539751838235294
epoch 10 train loss 0.1818415974965319



100%|██████████| 85/85 [00:24<00:00,  3.50it/s]

epoch 10 valid f1 0.1811174535603715
epoch 10 valid accuracy 0.1811174535603715
epoch 10 valid loss 6.384032810435576
---------------------------------------------------



100%|██████████| 340/340 [04:50<00:00,  1.17it/s]

---------------------------------------------------
epoch 11 train f1 0.9590533088235295
epoch 11 train accuracy 0.9590533088235295
epoch 11 train loss 0.15921705133137842



100%|██████████| 85/85 [00:24<00:00,  3.48it/s]

epoch 11 valid f1 0.17820820433436532
epoch 11 valid accuracy 0.17820820433436532
epoch 11 valid loss 6.420739863900577
---------------------------------------------------



100%|██████████| 340/340 [04:49<00:00,  1.18it/s]

---------------------------------------------------
epoch 12 train f1 0.9601792279411765
epoch 12 train accuracy 0.9601792279411765
epoch 12 train loss 0.14895111673069872



100%|██████████| 85/85 [00:24<00:00,  3.48it/s]

epoch 12 valid f1 0.18090944272445822
epoch 12 valid accuracy 0.18090944272445822
epoch 12 valid loss 6.5383161713095275
---------------------------------------------------



100%|██████████| 340/340 [04:50<00:00,  1.17it/s]

---------------------------------------------------
epoch 13 train f1 0.9645220588235294
epoch 13 train accuracy 0.9645220588235294
epoch 13 train loss 0.1330773980661487



100%|██████████| 85/85 [00:24<00:00,  3.48it/s]

epoch 13 valid f1 0.18774284055727555
epoch 13 valid accuracy 0.18774284055727555
epoch 13 valid loss 6.574300592085894
---------------------------------------------------



100%|██████████| 340/340 [04:50<00:00,  1.17it/s]

---------------------------------------------------
epoch 14 train f1 0.9650735294117647
epoch 14 train accuracy 0.9650735294117647
epoch 14 train loss 0.12733330144529806



100%|██████████| 85/85 [00:24<00:00,  3.50it/s]

epoch 14 valid f1 0.1865441176470588
epoch 14 valid accuracy 0.1865441176470588
epoch 14 valid loss 6.732658341351677
---------------------------------------------------



100%|██████████| 340/340 [04:49<00:00,  1.17it/s]

---------------------------------------------------
epoch 15 train f1 0.9682444852941177
epoch 15 train accuracy 0.9682444852941177
epoch 15 train loss 0.1154056545756141



100%|██████████| 85/85 [00:24<00:00,  3.46it/s]

epoch 15 valid f1 0.18525348297213623
epoch 15 valid accuracy 0.18525348297213623
epoch 15 valid loss 6.833712157081155
---------------------------------------------------



 96%|█████████▌| 325/340 [04:38<00:12,  1.17it/s]

In [34]:
torch.save(best_model, 'best_1020_version2.pt')
torch.save(model, 'last_1020_version2.pt')