In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
from tqdm.notebook import tqdm

In [None]:
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_linear_schedule_with_warmup

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import time
import datetime

In [None]:
# Use the first CUDA GPU when one is present; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 전처리된 데이터 불러오기

In [None]:
# Read the preprocessed sentence table from Google Drive and preview the
# first rows.
sentence = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/aiContest/aicontest_data10.csv',
)
sentence.head()

Unnamed: 0,태그,new_sent
0,문제 정의,재난관리체계의 개선방법으로는 여러 가지가 있을 수 있겠으나 여기서는 재난관리분야에서...
1,문제 정의,이 연구는 재난관리에 관여하는 조직 및 부처들 간의 얼마만큼 재난관리와 관련한 공조...
2,대상 데이터,본 연구의 사용된 표본의 특성은 TABLE 5와 같다. 설문지 응답자들의 근무처를 ...
3,대상 데이터,우리나라에서 재난관리 활동에 영향을 주는 외적요인을 다중응답 분석을 이용해서 살펴보...
4,대상 데이터,연구자는 국가통합재난관리행정체계에 대한 연구목적을 달성하기 위하여 소방방재청과 지...


# 모델 훈련

In [None]:
# Map the Korean column names onto the names used below ('문장' -> 'new_sent',
# '태그' -> 'tag'), keep only the label and text columns, and report the
# number of rows.
sentence = (
    sentence
    .rename(columns={'문장': 'new_sent', '태그': 'tag'})
    [['tag', 'new_sent']]
)
print(len(sentence))

63319


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the string tags; `result` holds the integer code of
# every row and `le_idx` maps each tag name to its code.
le = LabelEncoder()
result = le.fit_transform(sentence['tag'])
le_idx = {
    tag_name: tag_code
    for tag_name, tag_code in zip(le.classes_, le.transform(le.classes_))
}
print(le_idx)

{'가설 설정': 0, '기술 정의': 1, '대상 데이터': 2, '데이터처리': 3, '문제 정의': 4, '성능/효과': 5, '이론/모형': 6, '제안 방법': 7, '후속연구': 8}


In [None]:
# Replace the string tags with their integer codes. `assign` returns a new
# frame rather than writing into `sentence` in place; `sentence` was produced
# by a column selection, which is the situation pandas may flag with
# SettingWithCopyWarning on an in-place column write.
sentence = sentence.assign(tag=result)

In [None]:
# Show how many sentences carry each encoded tag.
sentence.tag.value_counts()

5    17454
7     7361
2     7221
3     7028
8     6882
4     6241
1     5973
6     3238
0     1921
Name: tag, dtype: int64

In [None]:
# Hold out 25% of the rows for evaluation (fixed seed for repeatability) and
# renumber both splits from zero.
sentence_train, sentence_test = train_test_split(
    sentence,
    test_size=0.25,
    random_state=117,
)
sentence_train = sentence_train.reset_index(drop=True)
sentence_test = sentence_test.reset_index(drop=True)

In [None]:
# Persist both splits as tab-separated files without the index column; they
# are read back below as TSV datasets.
sentence_train.to_csv(
    "drive/MyDrive/train_sen.csv",
    sep='\t',
    index=False,
    encoding='utf-8',
)
sentence_test.to_csv(
    "drive/MyDrive/test_sen.csv",
    sep='\t',
    index=False,
    encoding='utf-8',
)

In [None]:
# Obtain the KoBERT model and its vocabulary from the kobert helper.
(bertmodel, vocab) = get_pytorch_kobert_model()

[██████████████████████████████████████████████████]
[██████████████████████████████████████████████████]


In [None]:
# Re-read the saved splits. The first line (header) is discarded and the
# columns are taken in the order [1, 0], so each record is (sentence, tag).
dataset_train = nlp.data.TSVDataset(
    "drive/MyDrive/train_sen.csv",
    num_discard_samples=1,
    field_indices=[1, 0],
)
dataset_test = nlp.data.TSVDataset(
    "drive/MyDrive/test_sen.csv",
    num_discard_samples=1,
    field_indices=[1, 0],
)

In [None]:
# Build the tokenizer used to encode sentences: the kobert tokenizer resource
# combined with the model vocabulary, without lower-casing.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(
    tokenizer,
    vocab,
    lower=False,
)

using cached model


In [None]:
class BERTDataset(Dataset):
    """Torch dataset that pre-encodes every sentence for BERT.

    All records of ``dataset`` are transformed once at construction time and
    kept in memory together with their integer labels.

    Args:
        dataset: Iterable of indexable records.
        sent_idx: Position of the sentence text inside a record.
        label_idx: Position of the label inside a record.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Maximum sequence length passed to the transform.
        pad: Forwarded to the transform's ``pad`` argument.
        pair: Forwarded to the transform's ``pair`` argument.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        encode = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # Encoded features, one entry per record.
        self.sentences = []
        for record in dataset:
            self.sentences.append(encode([record[sent_idx]]))

        # Labels stored as int32 scalars, aligned with `self.sentences`.
        self.labels = []
        for record in dataset:
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        """Return the encoded features of item ``i`` with its label appended."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label, )

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.labels)

In [None]:
## Setting parameters
# Data / batching
max_len = 128          # maximum token length per encoded sentence
batch_size = 32

# Optimisation schedule
num_epochs = 4
learning_rate = 5e-5
warmup_ratio = 0.1     # fraction of all steps used for linear warm-up
max_grad_norm = 1      # gradient-norm clipping threshold

# Logging
log_interval = 200     # print training stats every N batches

In [None]:
# Encode both splits: field 0 is the sentence, field 1 the label; sequences
# are padded and treated as single sentences (not pairs).
data_train = BERTDataset(
    dataset_train,
    sent_idx=0,
    label_idx=1,
    bert_tokenizer=tok,
    max_len=max_len,
    pad=True,
    pair=False,
)
data_test = BERTDataset(
    dataset_test,
    sent_idx=0,
    label_idx=1,
    bert_tokenizer=tok,
    max_len=max_len,
    pad=True,
    pair=False,
)

In [None]:
# Batch both splits. The training loader reshuffles every epoch so the model
# does not see the batches in the same order each time; the evaluation loader
# keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(
    data_train, batch_size=batch_size, shuffle=True, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(
    data_test, batch_size=batch_size, shuffle=False, num_workers=5)

In [None]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and must return a
            2-tuple whose second element is used as the sentence
            representation fed to the classifier.
        hidden_size: Width of that representation (input size of the head).
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied before the head; ``None`` or
            ``0`` disables dropout.
        params: Unused; kept for interface compatibility.
    """

    def __init__(self, bert, hidden_size = 768, num_classes=9, dr_rate=None, params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: Token id tensor; assumed to be 2-D (batch, seq_len).
            valid_length: Per-row count of real (non-padding) tokens.

        Returns:
            Float tensor on ``token_ids``'s device with one row per entry of
            ``valid_length`` and ``seq_len`` columns.
        """
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        # Broadcasting compares every position index with its row's length,
        # replacing a Python loop over the batch.
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return unnormalised class scores for a batch.

        Args:
            token_ids: Token id tensor.
            valid_length: Per-row count of real tokens (used for the mask).
            segment_ids: Token type ids; cast to ``long`` before use.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Start from the encoder output so `out` is defined even when dropout
        # is disabled (previously this path raised UnboundLocalError).
        out = pooler
        if self.dr_rate:
            out = self.dropout(out)
        return self.classifier(out)

In [None]:
# Wrap the KoBERT encoder with the classification head (dropout 0.5) and move
# all parameters to the selected device.
model = BERTClassifier(bertmodel, dr_rate=0.5)
model = model.to(device)

In [None]:
# Prepare optimizer and schedule (linear warmup and decay)
# Parameters whose name contains one of these substrings get no weight decay;
# every other parameter is decayed with 0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [
            param
            for name, param in model.named_parameters()
            if all(marker not in name for marker in no_decay)
        ],
        'weight_decay': 0.01,
    },
    {
        'params': [
            param
            for name, param in model.named_parameters()
            if any(marker in name for marker in no_decay)
        ],
        'weight_decay': 0.0,
    },
]

In [None]:
# Cross-entropy over the class scores, optimised with AdamW on the two
# parameter groups defined above.
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=learning_rate,
)

In [None]:
# Total number of optimiser steps (batches per epoch x epochs) and how many
# of them are spent warming up the learning rate.
t_total = num_epochs * len(train_dataloader)
warmup_step = int(warmup_ratio * t_total)

In [None]:
# Linear warm-up for `warmup_step` steps, then linear decay until `t_total`.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=t_total,
    num_warmup_steps=warmup_step,
)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per example.
        Y: Tensor of target class indices, one per row of ``X``.

    Returns:
        NumPy scalar: number of correct predictions divided by the number of
        rows.
    """
    _, predicted = X.max(dim=1)
    n_correct = predicted.eq(Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

In [None]:
# Fine-tune for `num_epochs` epochs; after each epoch report the mean
# per-batch accuracy on the training data and on the held-out split.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is passed through unchanged; the model derives the
        # attention mask from it.
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building graphs for every test batch (less memory, faster).
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

HBox(children=(FloatProgress(value=0.0, max=1485.0), HTML(value='')))

epoch 1 batch id 1 loss 2.2600109577178955 train acc 0.03125
epoch 1 batch id 201 loss 1.3609724044799805 train acc 0.3126554726368159
epoch 1 batch id 401 loss 0.9024076461791992 train acc 0.48215399002493764
epoch 1 batch id 601 loss 0.7137834429740906 train acc 0.5566763727121464
epoch 1 batch id 801 loss 0.7249074578285217 train acc 0.5966370162297129
epoch 1 batch id 1001 loss 0.490646630525589 train acc 0.6229395604395604
epoch 1 batch id 1201 loss 0.8339254856109619 train acc 0.6408982098251457
epoch 1 batch id 1401 loss 0.9258242845535278 train acc 0.6556700571020699

epoch 1 train acc 0.6606271043771044


HBox(children=(FloatProgress(value=0.0, max=495.0), HTML(value='')))


epoch 1 test acc 0.7564451331496787


HBox(children=(FloatProgress(value=0.0, max=1485.0), HTML(value='')))

epoch 2 batch id 1 loss 1.0229741334915161 train acc 0.6875
epoch 2 batch id 201 loss 0.8908522129058838 train acc 0.7552860696517413
epoch 2 batch id 401 loss 0.5340965986251831 train acc 0.7557668329177057
epoch 2 batch id 601 loss 0.5275629162788391 train acc 0.7639351081530782
epoch 2 batch id 801 loss 0.6021588444709778 train acc 0.7693898252184769
epoch 2 batch id 1001 loss 0.2693697214126587 train acc 0.7741633366633367
epoch 2 batch id 1201 loss 0.5205455422401428 train acc 0.7783878018318068
epoch 2 batch id 1401 loss 0.8857640624046326 train acc 0.781160778015703

epoch 2 train acc 0.7825968013468013


HBox(children=(FloatProgress(value=0.0, max=495.0), HTML(value='')))


epoch 2 test acc 0.7557506887052342


HBox(children=(FloatProgress(value=0.0, max=1485.0), HTML(value='')))

epoch 3 batch id 1 loss 0.7048078775405884 train acc 0.71875
epoch 3 batch id 201 loss 0.6391580700874329 train acc 0.8075248756218906
epoch 3 batch id 401 loss 0.298938125371933 train acc 0.8093048628428927
epoch 3 batch id 601 loss 0.333867609500885 train acc 0.8191555740432612
epoch 3 batch id 801 loss 0.3212060332298279 train acc 0.8254915730337079
epoch 3 batch id 1001 loss 0.1722629964351654 train acc 0.8300137362637363
epoch 3 batch id 1201 loss 0.4430111050605774 train acc 0.8346169858451291
epoch 3 batch id 1401 loss 0.6498923897743225 train acc 0.8366345467523197

epoch 3 train acc 0.8379419191919192


HBox(children=(FloatProgress(value=0.0, max=495.0), HTML(value='')))


epoch 3 test acc 0.7663280532598714


HBox(children=(FloatProgress(value=0.0, max=1485.0), HTML(value='')))

epoch 4 batch id 1 loss 0.489019513130188 train acc 0.8125
epoch 4 batch id 201 loss 0.47308632731437683 train acc 0.8641169154228856
epoch 4 batch id 401 loss 0.26152294874191284 train acc 0.8656483790523691
epoch 4 batch id 601 loss 0.2710314989089966 train acc 0.8723481697171381
epoch 4 batch id 801 loss 0.16103209555149078 train acc 0.8772237827715356
epoch 4 batch id 1001 loss 0.12040784955024719 train acc 0.8808066933066933
epoch 4 batch id 1201 loss 0.3581717610359192 train acc 0.8846534138218152
epoch 4 batch id 1401 loss 0.6224435567855835 train acc 0.8846582798001428

epoch 4 train acc 0.8851430976430976


HBox(children=(FloatProgress(value=0.0, max=495.0), HTML(value='')))


epoch 4 test acc 0.7704315886134068


In [None]:
# Serialise the whole fine-tuned model object (not just its state dict) to
# Google Drive.
# NOTE(review): loading this file requires the BERTClassifier class to be
# importable under the same name.
path = 'drive/MyDrive/aiContest/'
torch.save(
    model,
    path + 'model_KoBert_전체데이터_Rhino영어부분제거모델.pt',
)