## 필요한 패키지 및 기본 제공 함수 (Requirements)

In [None]:
!pip install PyKomoran
!pip install nltk

Collecting PyKomoran
[?25l  Downloading https://files.pythonhosted.org/packages/75/b5/d6d45db7b150ba9be3811283c919bf80707c3bb031f43207b53fd3e88631/PyKomoran-0.1.6.post1-py3-none-any.whl (6.4MB)
[K     |████████████████████████████████| 6.4MB 9.9MB/s 
[?25hCollecting py4j==0.10.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/2b/e2/543019a6e620b759a59f134158b4595766f9bf520a1081a2ba1a1809ba32/py4j-0.10.9.2-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 39.3MB/s 
[?25hInstalling collected packages: py4j, PyKomoran
Successfully installed PyKomoran-0.1.6.post1 py4j-0.10.9.2


In [None]:
# Download NLTK's "punkt" tokenizer data.
# NOTE(review): presumably this is what sent_tokenize (used in preprocessing) needs — confirm for the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import json
import torch
from tqdm import trange
import numpy as np
import random
from nltk import sent_tokenize
from PyKomoran import *

In [None]:
def load_data(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding, so non-ASCII (e.g. Korean) text loads the
    same way on every operating system.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The parsed JSON content (whatever ``json.load`` produces for the file).
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, every CUDA device is seeded as well.

    Args:
        seed: Integer seed forwarded to each random number generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# end of set_seed

## 데이터 로드 및 전처리 (Preprocess)

In [None]:
# Paths of the train/test JSON files, relative to the working directory.
train_path = "./newsdata_train.json"
test_path = "./newsdata_test.json"
# Topic labels; a label's position in this list becomes its class index.
label_list = ['IT', '경제', '문화', '스포츠', '정치']

train_data = load_data(train_path)
test_data = load_data(test_path)

# Morphological analyzer shared by vocabulary building and feature conversion.
# NOTE(review): "EXP" selects one of PyKomoran's bundled models — confirm which model is intended.
komoran = Komoran("EXP")

In [None]:
def create_w2i_l2i_i2l(data, label_list, komoran):
    """Build the token vocabulary and the label <-> index lookup tables.

    Args:
        data: Iterable of documents; each document's ``"content"`` text is
            split into sentences and analysed with ``komoran``.
        label_list: Labels in class-index order.
        komoran: Analyzer exposing ``get_plain_text(sentence)``; its output is
            split on single spaces to obtain tokens.

    Returns:
        Tuple ``(word2idx, labels2idx, idx2labels)``. ``word2idx`` reserves
        0 for ``'<PAD>'`` and 1 for ``'<UNK>'``; the remaining tokens receive
        consecutive ids starting at 2, in sorted order.
    """
    labels2idx = {}
    for position, name in enumerate(label_list):
        labels2idx[name] = position
    idx2labels = dict(enumerate(label_list))

    # Unique tokens over every sentence of every document.
    vocabulary = {
        token
        for doc in data
        for sentence in sent_tokenize(doc["content"])
        for token in komoran.get_plain_text(sentence).split(' ')
    }

    word2idx = {'<PAD>': 0, '<UNK>': 1}
    word2idx.update(
        (token, token_id) for token_id, token in enumerate(sorted(vocabulary), start=2)
    )

    return word2idx, labels2idx, idx2labels

In [None]:
# The vocabulary is built from the training split only; tokens missing from it are mapped to <UNK> during feature conversion.
word2idx, labels2idx, idx2labels = create_w2i_l2i_i2l(train_data, label_list, komoran)

In [None]:
def convert_examples_to_features(data, word2idx, labels2idx, komoran, max_length=512):
    """Turn documents into fixed-length token-id lists plus label ids.

    Args:
        data: Iterable of documents with ``"content"`` (text) and ``"topic"``
            (label) entries.
        word2idx: Token -> id mapping; tokens not present use the ``'<UNK>'``
            id and short documents are filled with the ``'<PAD>'`` id.
        labels2idx: Label -> class-index mapping.
        komoran: Analyzer exposing ``get_plain_text(sentence)``; its output is
            split on single spaces to obtain tokens.
        max_length: Length every id list is padded or truncated to.

    Returns:
        Tuple ``(input_ids, labels)``: one id list and one label id per document.
    """
    input_ids, labels = [], []

    for doc in data:
        token_ids = []
        for sentence in sent_tokenize(doc["content"]):
            for token in komoran.get_plain_text(sentence).split(' '):
                key = token if token in word2idx else '<UNK>'
                token_ids.append(word2idx[key])

        # Positive shortfall -> pad on the right; negative -> cut the tail off.
        shortfall = max_length - len(token_ids)
        if shortfall > 0:
            token_ids.extend([word2idx['<PAD>']] * shortfall)
        elif shortfall < 0:
            token_ids = token_ids[:max_length]

        input_ids.append(token_ids)
        labels.append(labels2idx[doc["topic"]])

    return input_ids, labels


def make_dataset(input_ids, labels):
    """Wrap id lists and label ids in a ``TensorDataset`` of int64 tensors.

    Args:
        input_ids: Equal-length lists of token ids, one per example.
        labels: One class index per example.

    Returns:
        ``torch.utils.data.TensorDataset`` yielding ``(input_ids, label)`` pairs.
    """
    features = torch.tensor(input_ids, dtype=torch.long)
    targets = torch.tensor(labels, dtype=torch.long)
    return torch.utils.data.TensorDataset(features, targets)

In [None]:
# Both splits are encoded with the same vocabulary and padded/truncated to 512 token ids per document.
train_inputs, train_labels = convert_examples_to_features(train_data, word2idx, labels2idx, komoran, max_length=512)
train_dataset = make_dataset(train_inputs, train_labels)

test_inputs, test_labels = convert_examples_to_features(test_data, word2idx, labels2idx, komoran, max_length=512)
test_dataset = make_dataset(test_inputs, test_labels)

## 모델 학습 (Train)

In [None]:
class CNN(torch.nn.Module):
    """Text classifier: embedding -> three parallel 1-D convolutions -> linear layer.

    Token ids are embedded into 128 dimensions (id 0 is the padding index).
    Three convolutions with kernel sizes 3, 4 and 5 produce 30 channels each;
    every channel is max-pooled over the sequence axis, passed through ReLU,
    and the 90 pooled features go through dropout (p=0.1) and a linear layer.
    """

    def __init__(self, vocab_size, output_dim):
        """
        Args:
            vocab_size: Number of rows in the embedding table.
            output_dim: Number of output logits (classes).
        """
        super().__init__()
        nn = torch.nn
        self.word_embed = nn.Embedding(vocab_size, 128, padding_idx=0)
        self.conv_layer1 = nn.Conv1d(in_channels=128, out_channels=30, kernel_size=3)
        self.conv_layer2 = nn.Conv1d(in_channels=128, out_channels=30, kernel_size=4)
        self.conv_layer3 = nn.Conv1d(in_channels=128, out_channels=30, kernel_size=5)
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(90, output_dim)

    def forward(self, inputs):
        """Return class logits of shape (batch, output_dim) for a (batch, seq_len) id tensor."""
        # Conv1d expects channels first: (batch, seq_len, 128) -> (batch, 128, seq_len).
        channels_first = self.word_embed(inputs).transpose(1, 2)

        pooled = []
        for conv in (self.conv_layer1, self.conv_layer2, self.conv_layer3):
            feature_map = conv(channels_first)
            # Max over the sequence axis, then ReLU.
            strongest = feature_map.max(dim=2)[0]
            pooled.append(torch.relu(strongest))

        merged = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(merged))

In [None]:
def train(model, train_dataset, args):
    """Train ``model`` in place with Adam and cross-entropy loss.

    Every 10th epoch the mean training loss of that epoch (averaged over all
    examples seen in the epoch) is printed.

    Args:
        model: ``torch.nn.Module`` mapping an input batch to class logits.
        train_dataset: Dataset yielding ``(input, label)`` pairs.
        args: Dict with the keys ``"train_batch_size"``, ``"num_train_epochs"``,
            ``"device"`` and ``"learning_rate"``. An optional ``"seed"`` entry
            overrides the default seed of 42.

    Returns:
        None. ``model`` is moved to ``args["device"]`` and left in eval mode.
    """
    set_seed(args.get("seed", 42))

    train_batch_size = args["train_batch_size"]
    num_train_epochs = args["num_train_epochs"]
    device = args["device"]
    learning_rate = args["learning_rate"]

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Loss function
    criterion = torch.nn.CrossEntropyLoss()

    train_dataLoader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)

    train_iterator = trange(num_train_epochs, desc="Epoch")

    print("\n***** Running training *****")
    print("  Num examples = {}".format(len(train_dataset)))
    print("  Num Epochs = {}".format(num_train_epochs))
    print("  Train Batch size = {}".format(train_batch_size))
    print("  Device = ", device)

    model.to(device)
    model.train(True)
    for epoch in train_iterator:
        # Plain-float accumulators, kept separate from the loss tensor so the
        # tensor used for backward() is never modified and the reported value
        # covers the whole epoch rather than only the final mini-batch.
        loss_sum = 0.0
        examples_seen = 0
        for batch in train_dataLoader:
            input_vector = batch[0].to(device)
            label = batch[1].to(device)

            optimizer.zero_grad()
            predict = model(input_vector)
            loss = criterion(predict, label)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so
            # a smaller last batch does not skew the epoch average.
            batch_count = label.size(0)
            loss_sum += loss.item() * batch_count
            examples_seen += batch_count

        if (epoch + 1) % 10 == 0:
            print("\n********** Train Result **********")
            print("  Epoch / Total Epoch : {} / {}".format(epoch + 1, num_train_epochs))
            print("  Loss : {:.4f}".format(loss_sum / max(examples_seen, 1)))

    model.train(False)
# end of train

In [None]:
# Model dimensions follow from the vocabulary size and the number of labels.
vocab_size = len(word2idx)
output_dim = len(label_list)

# Settings read by train(); evaluate() also reads "device".
args = dict()
args["train_batch_size"] = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
args["device"] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args["learning_rate"] = 0.0005
args["num_train_epochs"] = 500

model = CNN(vocab_size, output_dim)

train(model, train_dataset, args)

Epoch:   0%|          | 0/500 [00:00<?, ?it/s]


***** Running training *****
  Num examples = 800
  Num Epochs = 500
  Train Batch size = 64
  Device =  cuda


Epoch:   2%|▏         | 11/500 [00:14<04:08,  1.97it/s]


********** Train Result **********
  Epoch / Total Epoch : 10 / 500
  Loss : 1.8675


Epoch:   4%|▍         | 21/500 [00:15<01:18,  6.07it/s]


********** Train Result **********
  Epoch / Total Epoch : 20 / 500
  Loss : 0.7596


Epoch:   6%|▌         | 31/500 [00:17<01:04,  7.30it/s]


********** Train Result **********
  Epoch / Total Epoch : 30 / 500
  Loss : 0.2431


Epoch:   8%|▊         | 41/500 [00:18<01:04,  7.11it/s]


********** Train Result **********
  Epoch / Total Epoch : 40 / 500
  Loss : 0.2090


Epoch:  10%|█         | 51/500 [00:20<00:54,  8.25it/s]


********** Train Result **********
  Epoch / Total Epoch : 50 / 500
  Loss : 0.1025


Epoch:  12%|█▏        | 61/500 [00:21<00:57,  7.59it/s]


********** Train Result **********
  Epoch / Total Epoch : 60 / 500
  Loss : 0.0511


Epoch:  14%|█▍        | 71/500 [00:22<00:56,  7.63it/s]


********** Train Result **********
  Epoch / Total Epoch : 70 / 500
  Loss : 0.1128


Epoch:  16%|█▌        | 81/500 [00:24<00:59,  7.10it/s]


********** Train Result **********
  Epoch / Total Epoch : 80 / 500
  Loss : 0.0865


Epoch:  18%|█▊        | 91/500 [00:25<00:54,  7.48it/s]


********** Train Result **********
  Epoch / Total Epoch : 90 / 500
  Loss : 0.0450


Epoch:  20%|██        | 101/500 [00:27<01:05,  6.09it/s]


********** Train Result **********
  Epoch / Total Epoch : 100 / 500
  Loss : 0.0318


Epoch:  22%|██▏       | 111/500 [00:28<00:51,  7.57it/s]


********** Train Result **********
  Epoch / Total Epoch : 110 / 500
  Loss : 0.0208


Epoch:  24%|██▍       | 121/500 [00:29<00:42,  8.95it/s]


********** Train Result **********
  Epoch / Total Epoch : 120 / 500
  Loss : 0.0199


Epoch:  26%|██▌       | 131/500 [00:30<00:41,  8.98it/s]


********** Train Result **********
  Epoch / Total Epoch : 130 / 500
  Loss : 0.0102


Epoch:  28%|██▊       | 141/500 [00:31<00:40,  8.94it/s]


********** Train Result **********
  Epoch / Total Epoch : 140 / 500
  Loss : 0.0084


Epoch:  30%|███       | 151/500 [00:32<00:38,  8.98it/s]


********** Train Result **********
  Epoch / Total Epoch : 150 / 500
  Loss : 0.0062


Epoch:  32%|███▏      | 161/500 [00:33<00:37,  8.93it/s]


********** Train Result **********
  Epoch / Total Epoch : 160 / 500
  Loss : 0.1659


Epoch:  34%|███▍      | 171/500 [00:35<00:36,  8.91it/s]


********** Train Result **********
  Epoch / Total Epoch : 170 / 500
  Loss : 0.0070


Epoch:  36%|███▌      | 181/500 [00:36<00:35,  8.94it/s]


********** Train Result **********
  Epoch / Total Epoch : 180 / 500
  Loss : 0.0207


Epoch:  38%|███▊      | 191/500 [00:37<00:35,  8.80it/s]


********** Train Result **********
  Epoch / Total Epoch : 190 / 500
  Loss : 0.0030


Epoch:  40%|████      | 201/500 [00:38<00:33,  8.87it/s]


********** Train Result **********
  Epoch / Total Epoch : 200 / 500
  Loss : 0.0022


Epoch:  42%|████▏     | 211/500 [00:39<00:32,  8.89it/s]


********** Train Result **********
  Epoch / Total Epoch : 210 / 500
  Loss : 0.0029


Epoch:  44%|████▍     | 221/500 [00:40<00:31,  8.82it/s]


********** Train Result **********
  Epoch / Total Epoch : 220 / 500
  Loss : 0.0052


Epoch:  46%|████▌     | 231/500 [00:41<00:30,  8.92it/s]


********** Train Result **********
  Epoch / Total Epoch : 230 / 500
  Loss : 0.0078


Epoch:  48%|████▊     | 241/500 [00:42<00:28,  8.95it/s]


********** Train Result **********
  Epoch / Total Epoch : 240 / 500
  Loss : 0.0036


Epoch:  50%|█████     | 251/500 [00:44<00:27,  8.92it/s]


********** Train Result **********
  Epoch / Total Epoch : 250 / 500
  Loss : 0.0017


Epoch:  52%|█████▏    | 261/500 [00:45<00:27,  8.85it/s]


********** Train Result **********
  Epoch / Total Epoch : 260 / 500
  Loss : 0.0024


Epoch:  54%|█████▍    | 271/500 [00:46<00:26,  8.79it/s]


********** Train Result **********
  Epoch / Total Epoch : 270 / 500
  Loss : 0.0005


Epoch:  56%|█████▌    | 281/500 [00:47<00:24,  8.90it/s]


********** Train Result **********
  Epoch / Total Epoch : 280 / 500
  Loss : 0.0012


Epoch:  58%|█████▊    | 291/500 [00:48<00:23,  8.89it/s]


********** Train Result **********
  Epoch / Total Epoch : 290 / 500
  Loss : 0.0010


Epoch:  60%|██████    | 301/500 [00:49<00:22,  8.87it/s]


********** Train Result **********
  Epoch / Total Epoch : 300 / 500
  Loss : 0.0009


Epoch:  62%|██████▏   | 311/500 [00:50<00:21,  8.80it/s]


********** Train Result **********
  Epoch / Total Epoch : 310 / 500
  Loss : 0.0106


Epoch:  64%|██████▍   | 321/500 [00:51<00:20,  8.78it/s]


********** Train Result **********
  Epoch / Total Epoch : 320 / 500
  Loss : 0.0039


Epoch:  66%|██████▌   | 331/500 [00:53<00:19,  8.85it/s]


********** Train Result **********
  Epoch / Total Epoch : 330 / 500
  Loss : 0.0004


Epoch:  68%|██████▊   | 341/500 [00:54<00:17,  8.85it/s]


********** Train Result **********
  Epoch / Total Epoch : 340 / 500
  Loss : 0.0037


Epoch:  70%|███████   | 351/500 [00:55<00:17,  8.75it/s]


********** Train Result **********
  Epoch / Total Epoch : 350 / 500
  Loss : 0.0042


Epoch:  72%|███████▏  | 361/500 [00:56<00:15,  8.78it/s]


********** Train Result **********
  Epoch / Total Epoch : 360 / 500
  Loss : 0.0006


Epoch:  74%|███████▍  | 371/500 [00:57<00:14,  8.87it/s]


********** Train Result **********
  Epoch / Total Epoch : 370 / 500
  Loss : 0.0006


Epoch:  76%|███████▌  | 381/500 [00:58<00:13,  8.85it/s]


********** Train Result **********
  Epoch / Total Epoch : 380 / 500
  Loss : 0.0014


Epoch:  78%|███████▊  | 391/500 [00:59<00:12,  8.80it/s]


********** Train Result **********
  Epoch / Total Epoch : 390 / 500
  Loss : 0.0008


Epoch:  80%|████████  | 401/500 [01:01<00:11,  8.83it/s]


********** Train Result **********
  Epoch / Total Epoch : 400 / 500
  Loss : 0.0003


Epoch:  82%|████████▏ | 411/500 [01:02<00:10,  8.82it/s]


********** Train Result **********
  Epoch / Total Epoch : 410 / 500
  Loss : 0.0003


Epoch:  84%|████████▍ | 421/500 [01:03<00:08,  8.88it/s]


********** Train Result **********
  Epoch / Total Epoch : 420 / 500
  Loss : 0.0018


Epoch:  86%|████████▌ | 431/500 [01:04<00:07,  8.85it/s]


********** Train Result **********
  Epoch / Total Epoch : 430 / 500
  Loss : 0.0014


Epoch:  88%|████████▊ | 441/500 [01:05<00:08,  6.96it/s]


********** Train Result **********
  Epoch / Total Epoch : 440 / 500
  Loss : 0.0750


Epoch:  90%|█████████ | 451/500 [01:07<00:05,  8.58it/s]


********** Train Result **********
  Epoch / Total Epoch : 450 / 500
  Loss : 0.0002


Epoch:  92%|█████████▏| 461/500 [01:08<00:04,  8.78it/s]


********** Train Result **********
  Epoch / Total Epoch : 460 / 500
  Loss : 0.0017


Epoch:  94%|█████████▍| 471/500 [01:09<00:03,  8.77it/s]


********** Train Result **********
  Epoch / Total Epoch : 470 / 500
  Loss : 0.0001


Epoch:  96%|█████████▌| 481/500 [01:10<00:02,  8.63it/s]


********** Train Result **********
  Epoch / Total Epoch : 480 / 500
  Loss : 0.0004


Epoch:  98%|█████████▊| 491/500 [01:11<00:01,  8.78it/s]


********** Train Result **********
  Epoch / Total Epoch : 490 / 500
  Loss : 0.0003


Epoch: 100%|██████████| 500/500 [01:12<00:00,  6.88it/s]


********** Train Result **********
  Epoch / Total Epoch : 500 / 500
  Loss : 0.0002





## 모델 학습 후 평가 (Evaluation)

In [None]:
def evaluate(model, test_dataset, args, news_num=900, news_offset=800):
    """Compute test accuracy and return the prediction for one chosen example.

    Args:
        model: ``torch.nn.Module`` mapping an input batch to class logits.
        test_dataset: Dataset yielding ``(input, label)`` pairs.
        args: Dict providing ``"device"``.
        news_num: Number of the news item to report. The position inside
            ``test_dataset`` is ``news_num - news_offset``.
        news_offset: Value subtracted from ``news_num`` to obtain the test-set
            position. The default of 800 keeps the previous behaviour.
            NOTE(review): 800 matches the training-set size in the recorded
            training output — confirm that this is the intended meaning.

    Returns:
        Tuple ``(accuracy, sample_result)``: ``accuracy`` is the fraction of
        examples whose argmax prediction equals the label, and
        ``sample_result`` is ``{"pred": ..., "label": ...}`` for the chosen item.

    Raises:
        ValueError: If ``test_dataset`` yields no examples.
        IndexError: If ``news_num - news_offset`` is outside the test set.
    """
    test_dataLoader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=1)

    device = args["device"]

    print("***** Running evaluation *****")
    print("  Num examples = {}".format(len(test_dataset)))
    print("  Test Batch size = 1")

    # Keep the model and its inputs on the same device even if train() was not run first.
    model.to(device)
    model.eval()

    # Collect per-batch arrays and concatenate once, instead of re-copying the
    # growing arrays on every iteration.
    logits_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in test_dataLoader:
            predict = model(batch[0].to(device))
            logits_chunks.append(predict.cpu().numpy())
            label_chunks.append(batch[1].numpy())

    if not logits_chunks:
        raise ValueError("test_dataset is empty; nothing to evaluate")

    pred = np.argmax(np.concatenate(logits_chunks, axis=0), axis=1)
    label = np.concatenate(label_chunks, axis=0)

    sample_idx = news_num - news_offset
    # A negative index would silently wrap around to the end of the array.
    if not 0 <= sample_idx < len(label):
        raise IndexError(
            "news_num {} maps to test index {}, outside [0, {})".format(news_num, sample_idx, len(label))
        )
    sample_result = {"pred": pred[sample_idx], "label": label[sample_idx]}

    # Divide by the real number of evaluated examples, not a fixed 200.
    accuracy = (pred == label).sum() / len(label)

    return accuracy, sample_result

In [None]:
# evaluate() subtracts 800 from news_num, so news_num=900 reports the test example at index 100.
accuracy, sample_result = evaluate(model, test_dataset, args, news_num=900)

print("\n********** Total Test Result **********")
print("  Accuracy {}".format(accuracy))
# pred/label are class indices (positions in label_list).
print("  Sample pred : {}".format(sample_result["pred"]))
print("  Sample label : {}".format(sample_result["label"]))

***** Running evaluation *****
  Num examples = 200
  Test Batch size = 1

********** Total Test Result **********
  Accuracy 0.75
  Sample pred : 0
  Sample label : 3


## 전체 코드

In [None]:
# main
# End-to-end run of the steps from the earlier cells: load data, build the
# vocabulary, encode both splits, train a new CNN and evaluate it.
# It relies on the functions and the CNN class defined in the earlier cells.

train_path = "./newsdata_train.json"
test_path = "./newsdata_test.json"
label_list = ['IT', '경제', '문화', '스포츠', '정치']

train_data = load_data(train_path)
test_data = load_data(test_path)

komoran = Komoran("EXP")

# Settings read by train(); evaluate() also reads "device".
args = dict()
args["train_batch_size"] = 64
args["device"] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args["learning_rate"] = 0.0005
args["num_train_epochs"] = 500

# Vocabulary comes from the training split only.
word2idx, labels2idx, idx2labels = create_w2i_l2i_i2l(train_data, label_list, komoran)

train_inputs, train_labels = convert_examples_to_features(train_data, word2idx, labels2idx, komoran, max_length=512)
train_dataset = make_dataset(train_inputs, train_labels)

test_inputs, test_labels = convert_examples_to_features(test_data, word2idx, labels2idx, komoran, max_length=512)
test_dataset = make_dataset(test_inputs, test_labels)

# input_dim is the vocabulary size and is passed as CNN's vocab_size argument.
input_dim = len(word2idx)
output_dim = len(label_list)

# A new model instance is created here, so running this cell trains from scratch.
model = CNN(input_dim, output_dim)

train(model, train_dataset, args)

accuracy, sample_result = evaluate(model, test_dataset, args, news_num=900)

print("\n********** Total Test Result **********")
print("  Accuracy {}".format(accuracy))
print("  Sample pred : {}".format(sample_result["pred"]))
print("  Sample label : {}".format(sample_result["label"]))