In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

import os

In [2]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [3]:
import pandas as pd
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig

In [4]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Load the tokenizer and a sequence-classification model (8 output labels)
# from the same pretrained checkpoint name.
PRETRAINED_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    num_labels=8,
)

In [6]:
# Build the train/test record lists from a directory tree laid out as
# <root>/<category>/<article file>. Every record is [article text, category name].
dataset_train = []
dataset_test = []

root = "newsData/"
# Number of files per category that go to the training split; the rest of
# that category's files go to the test split.
TRAIN_FILES_PER_CATEGORY = 170

# os.listdir returns entries in arbitrary order, so sort both levels to make
# the train/test split reproducible across runs and machines.
for cat in sorted(os.listdir(root)):
    cat_dir = os.path.join(root, cat)
    if not os.path.isdir(cat_dir):
        # Skip stray files sitting next to the category folders.
        continue
    for i, f in enumerate(sorted(os.listdir(cat_dir))):
        fname = os.path.join(cat_dir, f)
        # The context manager closes the file even if read() raises.
        with open(fname, "r", encoding="utf-8") as file:
            strings = file.read()
        if i < TRAIN_FILES_PER_CATEGORY:
            dataset_train.append([strings, cat])
        else:
            dataset_test.append([strings, cat])

print(len(dataset_train), len(dataset_test))

1360 240


In [7]:
# Show the first training record: the leading 64 characters of its text,
# followed by its label.
first_text, first_label = dataset_train[0][0], dataset_train[0][1]
print(first_text[:64])  # sentence
print(first_label)  # label

동남아 담당' 北 최희철 부상 베이징 도착…싱가포르행 주목	최 부상, 행선지·방문 목적 질문에는 '묵묵부답'

(베
0


In [8]:
class BERTDataset(Dataset):
    """Dataset of (truncated text, label) pairs built from indexable records.

    Args:
        dataset: Sequence of records; each record is indexed with
            ``sent_idx`` and ``label_idx``.
        sent_idx: Position of the text inside each record.
        label_idx: Position of the label inside each record.
        max_chars: Number of leading characters kept from each text.
            Defaults to 64 (the previously hard-coded value); ``None`` keeps
            the full text.
    """

    def __init__(self, dataset, sent_idx, label_idx, max_chars=64):
        # record[sent_idx] currently holds the article body.
        self.sentences = [record[sent_idx][:max_chars] for record in dataset]
        # Labels are stored exactly as they appear in the records.
        self.labels = [record[label_idx] for record in dataset]

    def __getitem__(self, i):
        """Return the ``(text, label)`` pair at position ``i``."""
        return self.sentences[i], self.labels[i]

    def __len__(self):
        """Return the number of records."""
        return len(self.labels)

In [9]:
# --- Training hyperparameters ---

# Loop sizing
num_epochs = 10
batch_size = 4
log_interval = 200  # not read by any active code in the visible cells

# Optimisation
# NOTE(review): the training cell passes its own literal lr to the optimizer
# instead of reading this value — confirm which one is intended.
learning_rate = 5e-5
warmup_ratio = 0.1  # not read by any active code in the visible cells
max_grad_norm = 1  # not read by any active code in the visible cells

In [10]:
# Wrap the raw records: element 0 of each record is the text, element 1 the label.
data_train = BERTDataset(dataset=dataset_train, sent_idx=0, label_idx=1)
data_test = BERTDataset(dataset=dataset_test, sent_idx=0, label_idx=1)

In [11]:
# Batch both splits with the same batch size and worker count.
# NOTE(review): the test loader is shuffled as well; the accuracy computed
# later is a total over all batches, so shuffling there looks unnecessary.
train_dataloader = DataLoader(
    data_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=5,
)
test_dataloader = DataLoader(
    data_test,
    batch_size=batch_size,
    shuffle=True,
    num_workers=5,
)

In [12]:
# Move the model onto `device`; the value of this call (the module itself)
# is what the notebook displays as the cell output.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [13]:
# Peek at a single training batch to check the collated text/label format.
for text, label in train_dataloader:
    print(text)
    # Each label's first character is parsed as the integer class id.
    first_chars = [int(x[0]) for x in label]
    result = torch.tensor(first_chars)
    print(result)
    break

("美 대사관 예루살렘 이전 임박…가자지구 '일촉즉발'\t[뉴스데스크]◀ 앵커 ▶\n\n이스라엘에 빼앗긴 땅으로 돌아가겠다는", '경남지사 후보, 김경수 42.5% vs 김태호 26.3%\t드루킹 연루 확인땐 김경수 계속 지지 29.4%…철회 26', '판교 넘보는 네이버…부동산 펀드에 2000억 투자\t미래에셋이 운영하는 부동산펀드에 1963억원(45.1%) 출자\n알', "성추행·협박' 이서원→반성·사죄→'어바웃타임' 하차→'뮤뱅' MC는? [종합]\t[OSEN=하수정 기자] 이서원이 성")
tensor([4, 0, 5, 6])


In [14]:
import gc
# Run a garbage-collection pass and then empty PyTorch's CUDA cache before
# the training cell starts.
gc.collect()
torch.cuda.empty_cache()

In [15]:
# Training step: fine-tune the classifier on the training batches.
#
# NOTE: an earlier draft of this cell (kept only as a disabled string) grouped
# parameters by weight decay and attached a cosine warm-up schedule; the run
# below uses plain AdamW with a fixed learning rate and no scheduler.
# NOTE(review): inputs are padded to 512 tokens and no attention mask is
# passed, so padding positions take part in attention. Changing that must be
# done together with the evaluation and inference code to keep them consistent.
optimizer = optim.AdamW(model.parameters(), lr=1e-6)
itr = 1  # global iteration counter, continues across epochs
p_itr = 100  # report running metrics every p_itr iterations
total_loss = 0
total_len = 0
total_correct = 0

model.train()
for epoch in range(num_epochs):

    for sentence, label in train_dataloader:
        optimizer.zero_grad()

        # Tokenize each text and right-pad with id 0 up to 512 tokens so the
        # batch can be stacked into one tensor.
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in sentence]
        padded_list = [e + [0] * (512 - len(e)) for e in encoded_list]
        sample = torch.tensor(padded_list).to(device)

        # The first character of each label is parsed as the class id. Build
        # the tensor once instead of re-wrapping an existing tensor with
        # torch.tensor(), which warns and copies on every step.
        labels = torch.tensor([int(x[0]) for x in label]).to(device)

        outputs = model(sample, labels=labels)
        loss, logits = outputs

        # argmax over the class dimension; softmax is monotonic, so applying
        # it first does not change the predicted class.
        pred = torch.argmax(logits, dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        #scheduler.step()

        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, num_epochs, itr, total_loss/p_itr, total_correct/total_len))
            # Reset the running window after each report.
            total_loss = 0
            total_len = 0
            total_correct = 0

        itr += 1



[Epoch 1/10] Iteration 100 -> Train Loss: 2.0944, Accuracy: 0.142
[Epoch 1/10] Iteration 200 -> Train Loss: 2.1053, Accuracy: 0.125
[Epoch 1/10] Iteration 300 -> Train Loss: 2.1012, Accuracy: 0.107
[Epoch 2/10] Iteration 400 -> Train Loss: 2.0895, Accuracy: 0.115
[Epoch 2/10] Iteration 500 -> Train Loss: 2.0900, Accuracy: 0.122
[Epoch 2/10] Iteration 600 -> Train Loss: 2.0718, Accuracy: 0.133
[Epoch 3/10] Iteration 700 -> Train Loss: 2.0424, Accuracy: 0.188
[Epoch 3/10] Iteration 800 -> Train Loss: 2.0199, Accuracy: 0.253
[Epoch 3/10] Iteration 900 -> Train Loss: 1.9576, Accuracy: 0.287
[Epoch 3/10] Iteration 1000 -> Train Loss: 1.8790, Accuracy: 0.380
[Epoch 4/10] Iteration 1100 -> Train Loss: 1.7712, Accuracy: 0.470
[Epoch 4/10] Iteration 1200 -> Train Loss: 1.6708, Accuracy: 0.535
[Epoch 4/10] Iteration 1300 -> Train Loss: 1.5861, Accuracy: 0.547
[Epoch 5/10] Iteration 1400 -> Train Loss: 1.4713, Accuracy: 0.610
[Epoch 5/10] Iteration 1500 -> Train Loss: 1.4193, Accuracy: 0.650
[Epo

In [16]:
# Testing step: measure classification accuracy over the test batches.

model.eval()

total_len = 0
total_correct = 0

# Inference only: disable autograd so no graph is built or kept per batch.
with torch.no_grad():
    for sentence, label in test_dataloader:
        # Same preprocessing as the training cell: tokenize, then right-pad
        # with id 0 up to 512 tokens.
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in sentence]
        padded_list = [e + [0] * (512 - len(e)) for e in encoded_list]
        sample = torch.tensor(padded_list).to(device)

        # The first character of each label is parsed as the class id.
        labels = torch.tensor([int(x[0]) for x in label]).to(device)

        outputs = model(sample, labels=labels)
        logits = outputs[1]  # outputs is (loss, logits) when labels are given

        # argmax over the class dimension; softmax would not change the result.
        pred = torch.argmax(logits, dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)



Test accuracy:  0.675


In [17]:
def softmax(vals, idx):
    """Return the softmax probability of class ``idx`` as a percentage.

    Args:
        vals: Tensor of logits, either 1-D or with a leading dimension of
            size 1 (that dimension is squeezed away).
        idx: Class index, given as a Python int or a single-element tensor
            (on any device).

    Returns:
        The probability of class ``idx`` scaled to the range 0-100, as a
        Python float.
    """
    # Work on a detached CPU copy in double precision.
    logits = vals.detach().cpu().squeeze(0).double()
    # torch.softmax is numerically stable, unlike summing raw exp() values,
    # which overflows to inf/NaN for large logits.
    probs = torch.softmax(logits, dim=-1)
    # int() accepts both plain ints and single-element tensors and avoids
    # indexing a CPU tensor with an index tensor that lives on another device.
    return probs[int(idx)].item() * 100

def testModel(model, seq):
    """Classify one text and print the predicted category and its confidence.

    Args:
        model: Sequence-classification model. It is called with the token-id
            tensor only and is expected to return a tuple whose first element
            is the logits.
        seq: Raw text to classify.

    Returns:
        None. The category name and the confidence percentage are printed.
    """
    cate = ["정치","경제","사회", "생활/문화","세계","기술/IT", "연예", "스포츠"]

    # Tokenize, cap at 512 tokens so over-long input cannot exceed the padded
    # length, then right-pad with id 0 exactly as the training cell does.
    encoded = tokenizer.encode(seq, add_special_tokens=True)[:512]
    padded = encoded + [0] * (512 - len(encoded))
    sample = torch.tensor([padded]).to(device)

    # Run in eval mode without autograd, then restore whatever mode the
    # caller had, so a prediction never depends on an earlier cell's state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # No labels are passed: a prediction needs no loss, so there is
            # no reason to fabricate a dummy target.
            logits = model(sample)[0]
    finally:
        model.train(was_training)

    # Plain int so it can index the category list directly.
    pred = int(torch.argmax(logits, dim=1))

    print("뉴스의 카테고리는:", cate[pred])
    print("신뢰도는:", "{:.2f}%".format(softmax(logits,pred)))

In [22]:
# Try the classifier on a short string of country names.
testModel(model, "베트남 태국 일본 미국 중국")

뉴스의 카테고리는: 세계
신뢰도는: 61.70%


