In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

import os

In [2]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [3]:
import pandas as pd
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig

In [4]:
device = torch.device("cuda")

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=4)

In [6]:
dataset_train = []
dataset_test = []

root = "exp_2020/"
list = os.listdir(root)
for cat in list:
    files = os.listdir(root + cat)
    for i,f in enumerate(files):
        if (f == '.ipynb_checkpoints'):
            continue
        fname = root + cat + "/" + f
        file = open(fname, "r", encoding="utf-8")
        strings = file.read()
        #mynum = len(files) * 0.75
        if i<480:
            dataset_train.append([strings, cat])
        elif i<604:
            dataset_test.append([strings,cat])
        file.close()
        
root2 = "exp_2019/"
list = os.listdir(root2)
for cat in list:
    files = os.listdir(root2 + cat)
    for i,f in enumerate(files):
        if (f == '.ipynb_checkpoints'):
            continue
        fname = root2 + cat + "/" + f
        file = open(fname, "r", encoding="utf-8")
        strings = file.read()
        #mynum = len(files) * 0.75
        if i<201:
            dataset_train.append([strings, cat])
        elif i<278:
            dataset_test.append([strings,cat])
        file.close()

print(len(dataset_train), len(dataset_test))

2721 802


In [7]:
print(dataset_train[0][0][:64]) #sentence
print(dataset_train[0][1]) #label

삼성전자, 2020년 지속가능경영보고서 발간 [단독 인터뷰] 中 기업 가는 장원기 전 삼성전자 사장 “기술유출? 삼
0


In [8]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx):
        # 현재 i[sent_idx] 가 본문
        self.sentences = [i[sent_idx][:64] for i in dataset]
        self.labels = [i[label_idx] for i in dataset]

    def __getitem__(self, i):
        return self.sentences[i], self.labels[i]

    def __len__(self):
        return (len(self.labels))

In [9]:
batch_size = 4
num_epochs = 40
warmup_ratio = 0.1
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [10]:
data_train = BERTDataset(dataset_train, 0, 1)
data_test = BERTDataset(dataset_test, 0, 1)

In [11]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5, shuffle=True)

In [12]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [13]:
for text,label in train_dataloader:
    print(text)
    result = tuple((int(x[0])) for x in label)
    result = torch.tensor(result)
    print(result)
    break

('HMM, 싱가포르항에 전용 터미널 확보 추진', "현대차 美서 질주…첨단기술 만족도 1위 현대차, 미국 소비자가 뽑은 첨단 기술 만족도 1위 車브랜드 '모델명? 서브", 'LG디스플레이, TV 줄이고 IT 중심 다각화 눈길 LG디스플레이 파주사업장 직원 코로나19 확진 LG디스플레이 파', "삼바 vs 셀트리온, 美서 '허셉틴' 바이오시밀러 경쟁 예고")
tensor([0, 2, 0, 1])


In [14]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [15]:
optimizer = optim.AdamW(model.parameters(), lr=1e-6)
itr=1
p_itr=200
total_loss = 0
total_len = 0
total_correct = 0

for epoch in range(num_epochs):
    
    model.train()
    total_loss = 0
    total_len = 0
    total_correct = 0
    for sentence, label in train_dataloader:
        optimizer.zero_grad()
        
        encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        label = tuple((int(x[0])) for x in label)
        label = torch.tensor(label)
        sample = sample.to(device)
        label = label.to(device)
        
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=1)        
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        #scheduler.step()    
    print('[Epoch {}/{}] -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, num_epochs, total_loss, total_correct/total_len))
    
    model.eval()
    t_total_correct = 0
    t_total_len = 0
    for sentence, label in test_dataloader:
        encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        label = tuple((int(x[0])) for x in label)
        label = torch.tensor(label)
        sample = sample.to(device)
        label = label.to(device)
        
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        _, logits = outputs
        
        pred = torch.argmax(F.softmax(logits), dim=1)        
        correct = pred.eq(labels)
        t_total_correct += correct.sum().item()
        t_total_len += len(labels)
    print('[Epoch {}/{}] -> Test Accuracy: {:.3f}'.format(epoch+1, num_epochs, t_total_correct/t_total_len))



[Epoch 1/40] -> Train Loss: 951.0277, Accuracy: 0.264




[Epoch 1/40] -> Test Accuracy: 0.248
[Epoch 2/40] -> Train Loss: 947.5280, Accuracy: 0.251
[Epoch 2/40] -> Test Accuracy: 0.251
[Epoch 3/40] -> Train Loss: 946.9815, Accuracy: 0.249
[Epoch 3/40] -> Test Accuracy: 0.251
[Epoch 4/40] -> Train Loss: 948.1542, Accuracy: 0.258
[Epoch 4/40] -> Test Accuracy: 0.246
[Epoch 5/40] -> Train Loss: 947.4507, Accuracy: 0.244
[Epoch 5/40] -> Test Accuracy: 0.233
[Epoch 6/40] -> Train Loss: 947.4057, Accuracy: 0.249
[Epoch 6/40] -> Test Accuracy: 0.259
[Epoch 7/40] -> Train Loss: 945.6025, Accuracy: 0.263
[Epoch 7/40] -> Test Accuracy: 0.231
[Epoch 8/40] -> Train Loss: 945.1024, Accuracy: 0.259
[Epoch 8/40] -> Test Accuracy: 0.221
[Epoch 9/40] -> Train Loss: 941.9778, Accuracy: 0.276
[Epoch 9/40] -> Test Accuracy: 0.248
[Epoch 10/40] -> Train Loss: 944.1598, Accuracy: 0.252
[Epoch 10/40] -> Test Accuracy: 0.221
[Epoch 11/40] -> Train Loss: 941.2367, Accuracy: 0.272
[Epoch 11/40] -> Test Accuracy: 0.222
[Epoch 12/40] -> Train Loss: 937.5149, Accuracy: 

In [16]:
model.eval()

total_loss = 0
total_len = 0
total_correct = 0

for sentence, label in test_dataloader:
    encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    label = tuple((int(x[0])) for x in label)
    label = torch.tensor(label)
    sample = sample.to(device)
    label = label.to(device)
        
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)

  app.launch_new_instance()


Test accuracy:  0.28233830845771146


In [17]:
def softmax(vals, idx):
    valscpu = vals.cpu().detach().squeeze(0)
    a = 0
    for i in valscpu:
        a += np.exp(i)
    return ((np.exp(valscpu[idx]))/a).item() * 100

def testModel(model, seq):
    cate = ["대폭 하락","소폭 하락","소폭 상승", "대폭 상승"]
    tmp = [seq]
    encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in tmp]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)

    labels = torch.tensor([1]).unsqueeze(0)
    sample = sample.to(device)
    labels = labels.to(device)
    outputs = model(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)

    print("주가는:", cate[pred])
    print("신뢰도는:", "{:.2f}%".format(softmax(logits,pred)))

In [20]:
testModel(model, "삼성전자 실적 폭락 공장 불나고 난리남")

주가는: 대폭 상승
신뢰도는: 88.53%




In [19]:
testModel(model, "대박 갤럭시 최고 증가 1등")

주가는: 대폭 상승
신뢰도는: 95.60%


