1. Settings

In [1]:
import torch

print(torch.__version__)

1.10.1+cu102


In [2]:
!python --version

Python 3.8.10


In [3]:
pip install gluonnlp==0.8.0 pandas tqdm

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
pip install mxnet

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
pip install sentencepiece

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
pip install openpyxl

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [7]:
pip install numpy==1.21.0

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [8]:
pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-uedp8uf2
  Running command git clone --filter=blob:none --quiet 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-uedp8uf2
  Resolved https://****@github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25ldone
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [9]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
import pandas as pd
import random
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split

#KoBERT
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
#transformer
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup



In [10]:
#GPU 설정
device = torch.device("cuda:0")

import os

n_devices = torch.cuda.device_count()
print(n_devices)

for i in range(n_devices):
    print(torch.cuda.get_device_name(i))

2
Tesla V100-SXM2-16GB
Tesla V100-SXM2-16GB


In [11]:
#bertmodel, vocabulary 불러오기
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /root/kobert/dataset/.cache/kobert_v1.zip
using cached model. /root/kobert/dataset/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


2. Load Dataset

(1) preprocessing

In [12]:
def preparing_data_1(df:pd.DataFrame) -> pd.DataFrame:
    # SettingWithCopyWarning
    data = df.copy()

    def func(c):
        if c == "sad" : return "sadness"
        elif c == "anger" : return "angry"
        else : return c
        
    data = data[["발화문", "상황"]]
    data.columns = ["sentence", "class"]
    data.reset_index(drop=True, inplace=True)
    data["class"] = data["class"].apply(func)
        
    return data

In [13]:
def preparing_data_2(df:pd.DataFrame) -> pd.DataFrame:
    # SettingWithCopyWarning
    data = df.copy()
    
    def func(c):
        if c == "분노" : return "angry"
        elif c == "혐오" : return "disgust"
        elif c == "중립" : return "neutral"
        elif c == "놀람" : return "surprise"
        elif c == "행복" : return "happiness"
        elif c == "공포" : return "fear"
        elif c == "슬픔" : return "sadness"
        else : return np.nan

    # cleaning data    
    data = data[["Unnamed: 1" ,"Unnamed: 2"]]
    data.columns = ["sentence", "class"]
    data.reset_index(drop=True, inplace=True)
    data["class"] = data["class"].apply(func)
    data.dropna(inplace=True)

    # imbalanced data 
    """
    
    # all data size
    data.shape >= 50000

    # one column data size
    data["neutral"].shape >= 40000
    
    # apply random undersampling
    data["neutral"].shape : 10000

    """
    neutral_index = list(data[data["class"] == "neutral"].index)
    remove_index = random.sample(neutral_index,33786)
    data.drop(remove_index, inplace=True)
    
    return data

In [14]:
#
train1 = preparing_data_1(pd.read_csv('/root/kobert/dataset/year_4.csv', index_col=0, encoding="cp949"))
train2 = preparing_data_1(pd.read_csv('/root/kobert/dataset/year_5_1.csv', index_col=0, encoding="cp949"))
train3 = preparing_data_1(pd.read_csv('/root/kobert/dataset/year_5_2.csv', index_col=0, encoding="cp949"))

#
train4 = preparing_data_2(pd.read_excel('/root/kobert/dataset/chat_korean.xlsx', index_col=0))

# concat
train = pd.DataFrame()
train = pd.concat([train, train1], axis=0, ignore_index=True)
train = pd.concat([train, train2], axis=0, ignore_index=True)
train = pd.concat([train, train3], axis=0, ignore_index=True)

In [15]:
from collections import Counter
Counter(train['class'])

Counter({'angry': 11635,
         'sadness': 14000,
         'fear': 4131,
         'disgust': 4660,
         'neutral': 3262,
         'happiness': 4548,
         'surprise': 1755})

(2) label encoding

In [16]:
train.loc[(train['class'] == "sadness"), 'class'] = 0 
train.loc[(train['class'] == "fear"), 'class'] = 1  
train.loc[(train['class'] == "disgust"), 'class'] = 2  
train.loc[(train['class'] == "neutral"), 'class'] = 3  
train.loc[(train['class'] == "happiness"), 'class'] = 4  
train.loc[(train['class'] == "angry"), 'class'] = 5  
train.loc[(train['class'] == "surprise"), 'class'] = 6  

data_list = []
for sen, label in zip(train['sentence'], train['class'])  :
    data = []   
    data.append(sen)
    data.append(str(label))

    data_list.append(data)

(3) split train / test

In [17]:
train, test = train_test_split(data_list, test_size=0.2, shuffle=True, random_state = 1234)

(4) tokenizing

In [18]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /root/kobert/dataset/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [19]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

(5) Setting parameters

In [20]:
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 7
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [21]:
data_train = BERTDataset(train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(test, 0, 1, tok, max_len, True, False)

train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [22]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

3. Model Training

(1) load model

In [23]:
#BERT 모델 불러오기
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
 
#optimizer와 schedule 설정
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss() # 다중분류를 위한 대표적인 loss func

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

#정확도 측정을 위한 함수 정의
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc
    
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f83f45f9b50>

(2) training

In [24]:
from tqdm.notebook import tqdm
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/550 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.9907768964767456 train acc 0.1875
epoch 1 batch id 201 loss 0.7478275299072266 train acc 0.4095926616915423
epoch 1 batch id 401 loss 0.48260119557380676 train acc 0.6115180798004988
epoch 1 train acc 0.6803733766233767


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 1 test acc 0.8833420991117344


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.2890363931655884 train acc 0.921875
epoch 2 batch id 201 loss 0.1554868370294571 train acc 0.8877487562189055
epoch 2 batch id 401 loss 0.22342266142368317 train acc 0.8984180174563591
epoch 2 train acc 0.9040990259740259


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 2 test acc 0.9126672802711547


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.15736445784568787 train acc 0.984375
epoch 3 batch id 201 loss 0.032751016318798065 train acc 0.9290267412935324
epoch 3 batch id 401 loss 0.19341477751731873 train acc 0.9342269326683291
epoch 3 train acc 0.9377191558441558


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 3 test acc 0.9193475338943432


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.10736618936061859 train acc 0.953125
epoch 4 batch id 201 loss 0.07959021627902985 train acc 0.9506374378109452
epoch 4 batch id 401 loss 0.14458265900611877 train acc 0.9514884663341646
epoch 4 train acc 0.953713474025974


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 4 test acc 0.925574888966807


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.02203626185655594 train acc 1.0
epoch 5 batch id 201 loss 0.0033633182756602764 train acc 0.966806592039801
epoch 5 batch id 401 loss 0.11451438814401627 train acc 0.96875
epoch 5 train acc 0.9700730519480519


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 5 test acc 0.9256881136044881


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.002368479035794735 train acc 1.0
epoch 6 batch id 201 loss 0.001440887339413166 train acc 0.9761349502487562
epoch 6 batch id 401 loss 0.05691201984882355 train acc 0.9776730049875312
epoch 6 train acc 0.978676948051948


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 6 test acc 0.9279526063581113


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.001987518509849906 train acc 1.0
epoch 7 batch id 201 loss 0.004437095019966364 train acc 0.982431592039801
epoch 7 batch id 401 loss 0.051512446254491806 train acc 0.9834008728179551
epoch 7 train acc 0.9839366883116882


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 7 test acc 0.929658280738663


(3) save the trained model

In [25]:
PATH = '/root/kobert/' 
torch.save(model, PATH + 'KoBERT_ver1.pt')  # 전체 모델 저장
torch.save(model.state_dict(), PATH + 'KoBERT_ver1_state_dict.pt')  # 모델 객체의 state_dict 저장
torch.save({
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict()
}, PATH + 'all.tar')  # 여러 가지 값 저장, 학습 중 진행 상황 저장을 위해 epoch, loss 값 등 일반 scalar값 저장 가능

(4) save the requirements

In [26]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


3. Model Testing

In [27]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

def predict(predict_sentence):

    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)


        test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()

            if np.argmax(logits) == 0:
                test_eval.append("슬픔이")
            elif np.argmax(logits) == 1:
                test_eval.append("공포가")
            elif np.argmax(logits) == 2:
                test_eval.append("혐오가")
            elif np.argmax(logits) == 3:
                test_eval.append("중립이")
            elif np.argmax(logits) == 4:
                test_eval.append("행복이")
            elif np.argmax(logits) == 5:
                test_eval.append("화남이")
            elif np.argmax(logits) == 6:
                test_eval.append("놀람이")

            #test_eval.append(np.argmax(logits))

        print(">> 입력하신 내용에서 " + test_eval[0] + " 느껴집니다.")

using cached model. /root/kobert/dataset/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [28]:
#질문 무한반복하기! 0 입력시 종료
end = 1
while end == 1 :
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == "0" :
        break
    predict(sentence)
    print("\n")

하고싶은 말을 입력해주세요 :  안녕?


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 중립이 느껴집니다.




하고싶은 말을 입력해주세요 :  너 덕분에 할 수 있었어 고마워ㅠㅠ


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 행복이 느껴집니다.




하고싶은 말을 입력해주세요 :  엥 진짜로 ?? 심각한데..;


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 중립이 느껴집니다.




하고싶은 말을 입력해주세요 :  왜 자꾸 하지 말란 짓을 골라서 하냐.. 뭐하자는거임?


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 화남이 느껴집니다.




하고싶은 말을 입력해주세요 :  엥 진짜로 ?? 심각한데,,


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 중립이 느껴집니다.




하고싶은 말을 입력해주세요 :  3시까지 만나자


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 슬픔이 느껴집니다.




하고싶은 말을 입력해주세요 :   갑자기 무섭게 왜그래..


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 공포가 느껴집니다.




하고싶은 말을 입력해주세요 :  만나서 얘기해 힘들다.. 이제


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 화남이 느껴집니다.




하고싶은 말을 입력해주세요 :  이번에는 회의 참석 못할것 같아 미안,,


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 슬픔이 느껴집니다.




하고싶은 말을 입력해주세요 :  오키~ 내일봐


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 중립이 느껴집니다.




하고싶은 말을 입력해주세요 :  


  0%|          | 0/1 [00:00<?, ?it/s]

>> 입력하신 내용에서 혐오가 느껴집니다.




하고싶은 말을 입력해주세요 :  0
