In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import datetime
import FinanceDataReader as fdr
from requests.adapters import HTTPAdapter

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

import os
import re

In [2]:
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

In [3]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [4]:
device = torch.device("cuda")

In [5]:
bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [6]:
dataset_train = []
dataset_test = []

root = "newsData/"
list = os.listdir(root)
for cat in list:
    files = os.listdir(root + cat)
    for i,f in enumerate(files):
        fname = root + cat + "/" + f
        file = open(fname, "r", encoding="utf-8")
        strings = file.read()
        if i<170:
            dataset_train.append([strings, cat])
        else:
            dataset_test.append([strings,cat])
        file.close()

print(len(dataset_train), len(dataset_test))

1360 240


In [7]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [8]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [9]:
l1 = [len(i[0]) for i in dataset_train]
l2 = [len(i[0]) for i in dataset_test]
max(max(l1),max(l2))

3153

In [10]:
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [11]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [12]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5, shuffle=True)

In [13]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=8,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device), return_dict=False)

        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [14]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [15]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [16]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [17]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [18]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [19]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [20]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        # if batch_id % log_interval == 0:
        #     print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 1 train acc 0.13920454545454544


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 1 test acc 0.4375


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 2 train acc 0.5980113636363636


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 2 test acc 0.7682291666666666


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 3 train acc 0.8515625


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 3 test acc 0.8385416666666666


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 4 train acc 0.9183238636363636


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 4 test acc 0.8502604166666666


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 5 train acc 0.9360795454545454


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 5 test acc 0.8333333333333334


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 6 train acc 0.9637784090909091


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 6 test acc 0.8880208333333334


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 7 train acc 0.9715909090909091


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 7 test acc 0.8802083333333334


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 8 train acc 0.9886363636363636


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 8 test acc 0.8958333333333334


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 9 train acc 0.9900568181818182


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 9 test acc 0.8958333333333334


  0%|          | 0/22 [00:00<?, ?it/s]

epoch 10 train acc 0.9914772727272727


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 10 test acc 0.8880208333333334


In [21]:
def softmax(vals, idx):
    valscpu = vals.cpu().detach().squeeze(0)
    a = 0
    for i in valscpu:
        a += np.exp(i)
    return ((np.exp(valscpu[idx]))/a).item() * 100

In [22]:
torch.save(model.state_dict(), "news.pt")
modelload = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
modelload.load_state_dict(torch.load("news.pt", device))

<All keys matched successfully>

In [23]:
def loadStockData(symbol, startDate, endDate):
    df_stock = fdr.DataReader(symbol, startDate.isoformat(), endDate.isoformat())
    df_stock = df_stock[['Close']]
    df_stock['Fluctuation'] = df_stock['Close'].div(df_stock['Close'].shift(1)).apply(lambda x : (x - 1) * 100)
    return df_stock

In [24]:
def aggregateTitles(companyName, url):
    resp = requests.get(url)
    titles = []
    cate = ["정치","경제","사회", "생활/문화","세계","기술/IT", "연예", "스포츠"]

    for item in bs(resp.text, 'xml').find_all('item'):
        title = item.title.string
        source = item.source.string # 언론사
        if(companyName in title):
            newtitle = str(title.encode('utf-8'))
            newtitle = re.sub(companyName,"",title)
            tmp = [newtitle]
            transform = nlp.data.BERTSentenceTransform(tok, max_len, pad=True, pair=False)
            tokenized = transform(tmp)
            
            modelload.eval()
            result = model(torch.tensor([tokenized[0]]).to(device), [tokenized[1]], torch.tensor(tokenized[2]).to(device))
            idx = result.argmax().cpu().item()
            if(idx==1 or idx==5):
                titles.append(title[:title.find(source) - 3])

    return ' '.join(titles)

In [25]:
def classifyFluctuation(fluctuation):
    if fluctuation < -2.5:
        return 0
    elif fluctuation < 0:
        return 1
    elif fluctuation < 2.5:
        return 2
    else:
        return 3

In [26]:
def crawl(companyName, startDate, endDate, isKor=True): 
    if isKor:
        country = ('ko', 'KR')
        symbol = str(df_kospi.loc[df_kospi['Name'] == companyName]['Symbol'].values[0])
        print(f'Start crawling for {companyName} in Google News Korea')
    else:
        country = ('en', 'US')
        symbol = df_snp.loc[df_snp['Name'] == companyName]['Symbol'].values[0]
        print(f'Start crawling for {companyName} in Google News US')

    df_stock = loadStockData(symbol, startDate - datetime.timedelta(days=1), endDate)
    # df_stock.to_csv(f'./stock/{country[1]}/{companyName}_{startDate.isoformat()}_{endDate.isoformat()}.csv')
    print(f'Loaded {companyName} price info, from {startDate.isoformat()} to {endDate.isoformat()}!')

    dateList = df_stock.index.map(lambda x: datetime.datetime.strftime(x, '%Y-%m-%d')).values
    fluctuationList = df_stock.loc[:, 'Fluctuation'].values

    idx = 1
    while idx < len(dateList):
        url = f'https://news.google.com/rss/search?q={companyName}+after:{dateList[idx - 1]}+before:{dateList[idx]}& \
                hl={country[0]}&gl={country[1]}&ceid={country[1]}:{country[0]}'
        aggTitle = aggregateTitles(companyName, url)
        if aggTitle:
            with open(f'./new_exp_2018/{classifyFluctuation(fluctuationList[idx])}/{companyName}_{dateList[idx]}.txt', 
                        'w', encoding='UTF-8') as file:
                file.write(aggTitle)
        idx += 1

In [28]:
if __name__=="__main__":
    df_kospi = fdr.StockListing('KOSPI')
    df_snp = fdr.StockListing('S&P500')
    # May replace w/ fixed dictionary

    startDate = datetime.date(2018, 1, 1) # inclusive
    endDate = datetime.date(2018, 12, 31) # inclusive
    companyListK = ['삼성전자', 'SK하이닉스', 'NAVER', '삼성바이오로직스', '카카오', 'LG화학', '삼성SDI', 
                    '현대차', '기아', '셀트리온', '카카오뱅크', '크래프톤', 'POSCO', 'KB금융', '현대모비스', 
                    '카카오페이', '삼성물산', 'SK이노베이션', 'LG전자', '신한지주', 'LG생활건강', 'SK바이오사이언스', 
                    '하이브', '엔씨소프트', '한국전력', '삼성생명', '두산중공업', '하나금융지주', 'HMM', '삼성전기', 
                    '삼성에스디에스', 'SK아이이테크놀로지', 'KT&G', '넷마블', '포스코케미칼', '아모레퍼시픽', '삼성화재', 
                    '대한항공', 'S-Oil', '우리금융지주', '현대중공업', '고려아연', '기업은행', 'KT', 'SK바이오팜', 'LG디스플레이', '한온시스템']
    # 우리금융지주 수집 중 "Remote end closed connection without" urllib3.exceptions.ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
    # KOSPI 시총 상위 50개 종목, 지주회사 제외

    for companyName in companyListK:
        crawl(companyName, startDate, endDate)

    # companyListUS = ['Apple', 'IBM', 'Delta Air Lines']
    # for companyName in companyListUS:
    #     crawl(companyName, startDate, endDate, False)

Start crawling for 삼성전자 in Google News Korea
Loaded 삼성전자 price info, from 2018-01-01 to 2018-12-31!
Start crawling for SK하이닉스 in Google News Korea
Loaded SK하이닉스 price info, from 2018-01-01 to 2018-12-31!
Start crawling for NAVER in Google News Korea
Loaded NAVER price info, from 2018-01-01 to 2018-12-31!
Start crawling for 삼성바이오로직스 in Google News Korea
Loaded 삼성바이오로직스 price info, from 2018-01-01 to 2018-12-31!
Start crawling for 카카오 in Google News Korea
Loaded 카카오 price info, from 2018-01-01 to 2018-12-31!
Start crawling for LG화학 in Google News Korea
Loaded LG화학 price info, from 2018-01-01 to 2018-12-31!
Start crawling for 삼성SDI in Google News Korea
Loaded 삼성SDI price info, from 2018-01-01 to 2018-12-31!
Start crawling for 현대차 in Google News Korea
Loaded 현대차 price info, from 2018-01-01 to 2018-12-31!
Start crawling for 기아 in Google News Korea
Loaded 기아 price info, from 2018-01-01 to 2018-12-31!
Start crawling for 셀트리온 in Google News Korea
Loaded 셀트리온 price info, from 2018-01-01 to 2018