In [1]:
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from pathlit import Path

import os

In [4]:
from easydict import EasyDict as edict

args = edict()
args.gpu = True
args.batch_size = 4
args.num_epochs = 15
args.learning_rate =  5e-5

device = 'cuda' if torch.cuda.is_available() and args.gpu else 'cpu'

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_category_clf = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=8)

In [6]:
dataset_train = []
dataset_val = []
dataset_test = []

root = Path('dataset/category')
list = os.listdir(root)
for cat in list:
    files = os.listdir(root / cat)
    for i,f in enumerate(files):
        fname = root / cat / f
        with open(fname, "r", encoding="utf-8") as file:
            strings = file.read()
            if i < 150:
                dataset_train.append([strings, cat])
            elif i < 175:
                dataset_val.append([strings, cat])
            else:
                dataset_test.append([strings,cat])

print(len(dataset_train), len(dataset_val), len(dataset_test))

1360 240


In [7]:
print(dataset_train[0][0][:64]) #sentence
print(dataset_train[0][1]) #label

동남아 담당' 北 최희철 부상 베이징 도착…싱가포르행 주목	최 부상, 행선지·방문 목적 질문에는 '묵묵부답'

(베
0


In [8]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx):
        # 현재 i[sent_idx] 가 본문
        self.sentences = [i[sent_idx][:64] for i in dataset]
        self.labels = [i[label_idx] for i in dataset]

    def __getitem__(self, i):
        return self.sentences[i], self.labels[i]

    def __len__(self):
        return (len(self.labels))

In [10]:
data_train = BERTDataset(dataset_train, 0, 1)
data_val = BERTDataset(dataset_val, 0, 1)
data_test = BERTDataset(dataset_test, 0, 1)

In [11]:
train_dataloader = DataLoader(data_train, batch_size=args.batch_size, num_workers=5, shuffle=True)
val_dataloader = DataLoader(data_val, batch_size=args.batch_size, num_workers=5, shuffle=True)
test_dataloader = DataLoader(data_test, batch_size=args.batch_size, num_workers=5, shuffle=True)

In [12]:
bert_category_clf.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
def save_model(model, mode='last'):
    torch.save(model.state_dict(),  Path('pretrained_models') / f'{type(model).__name__}_Category_{mode}.ckpt')

In [15]:
#training step
optimizer = optim.AdamW(bert_category_clf.parameters(), lr=1e-6)

for epoch in range(args.num_epochs):
    train_loss = 0
    total_len = 0
    total_correct = 0
    bert_category_clf.train()
    for sentence, label in train_dataloader:
        optimizer.zero_grad()
        
        encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        label = tuple((int(x[0])) for x in label)
        label = torch.tensor(label)
        sample = sample.to(device)
        label = label.to(device)
        
        labels = torch.tensor(label)
        loss, logits = bert_category_clf(sample, labels=labels)

        pred = torch.argmax(F.softmax(logits), dim=1)        
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        train_loss += loss.item()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        #scheduler.step()        
    print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, args.num_epochs, train_loss, total_correct/total_len))

    with torch.no_grad():
        bert_category_clf.eval()
        val_loss = 0
        v_total_correct = 0
        v_total_len = 0
        for sentence, label in val_dataloader:
            encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
            padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
            
            sample = torch.tensor(padded_list)
            label = tuple((int(x[0])) for x in label)
            label = torch.tensor(label)
            sample = sample.to(device)
            label = label.to(device)
            
            labels = torch.tensor(label)
            loss, logits = bert_category_clf(sample, labels=labels)
            
            pred = torch.argmax(F.softmax(logits), dim=1)        
            correct = pred.eq(labels)
            val_loss += loss.item()
            v_total_correct += correct.sum().item()
            v_total_len += len(labels)
        print('[Epoch {}/{}] -> Validation Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch + 1, args.num_epochs, val_loss, v_total_correct / v_total_len))
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        save_model(bert_category_clf, 'best')
    save_model(bert_category_clf)



[Epoch 1/10] Iteration 100 -> Train Loss: 2.0944, Accuracy: 0.142
[Epoch 1/10] Iteration 200 -> Train Loss: 2.1053, Accuracy: 0.125
[Epoch 1/10] Iteration 300 -> Train Loss: 2.1012, Accuracy: 0.107
[Epoch 2/10] Iteration 400 -> Train Loss: 2.0895, Accuracy: 0.115
[Epoch 2/10] Iteration 500 -> Train Loss: 2.0900, Accuracy: 0.122
[Epoch 2/10] Iteration 600 -> Train Loss: 2.0718, Accuracy: 0.133
[Epoch 3/10] Iteration 700 -> Train Loss: 2.0424, Accuracy: 0.188
[Epoch 3/10] Iteration 800 -> Train Loss: 2.0199, Accuracy: 0.253
[Epoch 3/10] Iteration 900 -> Train Loss: 1.9576, Accuracy: 0.287
[Epoch 3/10] Iteration 1000 -> Train Loss: 1.8790, Accuracy: 0.380
[Epoch 4/10] Iteration 1100 -> Train Loss: 1.7712, Accuracy: 0.470
[Epoch 4/10] Iteration 1200 -> Train Loss: 1.6708, Accuracy: 0.535
[Epoch 4/10] Iteration 1300 -> Train Loss: 1.5861, Accuracy: 0.547
[Epoch 5/10] Iteration 1400 -> Train Loss: 1.4713, Accuracy: 0.610
[Epoch 5/10] Iteration 1500 -> Train Loss: 1.4193, Accuracy: 0.650
[Epo

In [16]:
bert_category_clf.eval()

t_total_len = 0
t_total_correct = 0
with torch.no_grad():
    for sentence, label in test_dataloader:
        encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        sample = torch.tensor(padded_list)
        label = tuple((int(x[0])) for x in label)
        label = torch.tensor(label)
        sample = sample.to(device)
        label = label.to(device)
            
        labels = torch.tensor(label)
        _, logits = bert_category_clf(sample, labels=labels)

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        t_total_correct += correct.sum().item()
        t_total_len += len(labels)

print('Test accuracy: ', t_total_correct / t_total_len)



Test accuracy:  0.675


In [17]:
def test_model(model, seq):
    cate = ["정치","경제","사회", "생활/문화","세계","기술/IT", "연예", "스포츠"]
    tmp = [seq]
    encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in tmp]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)

    labels = torch.tensor([1]).unsqueeze(0)
    sample = sample.to(device)
    labels = labels.to(device)
    _, logits = model(sample, labels=labels)

    pred = torch.argmax(F.softmax(logits), dim=1)

    print("뉴스의 카테고리는:", cate[pred])
    print("신뢰도는:", "{:.2f}%".format(torch.max(F.softmax(logits), dim=1).item() * 100))

In [22]:
test_model(bert_category_clf, "SK텔레콤 분사 의결... '2025년 순자산 75조, 연 매출 22조 목표'")

뉴스의 카테고리는: 세계
신뢰도는: 61.70%


