In [None]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch

import gluonnlp as nlp
from kobert import get_pytorch_kobert_model
from kobert import get_tokenizer

In [None]:
# Read the full set of Naver Finance thread titles from the tab-separated
# file `title.txt` in the current working directory.
# Only field index 1 of each row is kept, and the first sample is discarded
# (num_discard_samples=1) -- presumably the header row; confirm against the file.

title_path = os.path.join(os.getcwd(), 'title.txt')
dataset = nlp.data.TSVDataset(
    title_path,
    field_indices=[1],
    num_discard_samples=1,
)

In [None]:
# Build the KoBERT sentence-piece tokenizer.
# `get_tokenizer()` supplies the tokenizer resource, while the vocabulary is
# the second value returned by `get_pytorch_kobert_model` (the first returned
# value is not used in this notebook and is bound to `_`).

tokenizer = get_tokenizer()

_, vocab = get_pytorch_kobert_model(cachedir=".cache")

# lower=False: the tokenizer is asked not to lowercase its input.
tok = nlp.data.BERTSPTokenizer(
    tokenizer,
    vocab,
    lower=False,
)

In [None]:
# Per-sentence preprocessor built on top of the KoBERT tokenizer `tok`.
# Inputs are single sentences (pair=False) and padding is enabled (pad=True).
# max_seq_length=48 was chosen as the largest sequence length found in the
# given dataset; revisit it if the dataset changes.

transform = nlp.data.BERTSentenceTransform(
    tok,
    max_seq_length=48,
    pad=True,
    pair=False,
)

In [None]:
# Get token ids, valid lengths and segment ids for every sentence (text input)
# using the preprocessor defined above.
#
# The transform is applied exactly ONCE per sentence and its three outputs are
# collected together; running it separately for each output would triple the
# tokenization time on the full dataset.

token_rows = []
valid_lengths = []
segment_rows = []

for sample in tqdm(dataset):
    tok_ids, valid_len, seg_ids = transform([sample[0]])
    token_rows.append(tok_ids)
    valid_lengths.append(valid_len.item())  # plain Python int, as before
    segment_rows.append(seg_ids)

# Stack into a single ndarray before converting: building a tensor directly
# from a list of ndarrays is very slow and triggers a PyTorch UserWarning.
# np.stack needs at least one row, so an empty dataset fails loudly here.
token_id_all = torch.from_numpy(np.stack(token_rows))
valid_length_all = torch.unsqueeze(torch.tensor(valid_lengths), 1)
segment_id_all = torch.from_numpy(np.stack(segment_rows))

100%|██████████| 1400932/1400932 [00:55<00:00, 25119.07it/s]
  token_id_all = torch.tensor([transform([i[0]])[0] for i in tqdm(dataset)])
100%|██████████| 1400932/1400932 [00:56<00:00, 24588.99it/s]
100%|██████████| 1400932/1400932 [00:55<00:00, 25078.02it/s]


In [None]:
# Select the compute device: the first CUDA GPU when one is available,
# otherwise the CPU, so the notebook still runs on machines without a GPU.
# To force CPU execution, replace the expression with torch.device("cpu").

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the trained model for the Naver Finance sentiment classification.
#
# - map_location=device lets a checkpoint saved on a GPU be restored on
#   whichever device was selected above (including CPU-only machines).
# - .eval() switches the module to inference mode so layers such as dropout
#   do not randomize the predicted scores.
#
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source. Recent PyTorch releases
# default to weights_only=True, which may reject a fully pickled model;
# confirm against the installed version. The model's class presumably has to
# be importable in this kernel for unpickling to succeed -- verify.

model_path = os.path.join(os.getcwd(), 'trained_model.pt')
model = torch.load(model_path, map_location=device).to(device).eval()

In [None]:
# Run the model on the entire Naver Finance dataset to get the per-class
# scores and the classified sentiment (index of the larger score) for every
# sentence, and store them in two separate lists.

scores_list = []
sentiments_list = []
batch_size = 100

# Ceiling division: the previous `len // batch_size + 1` produced one extra,
# EMPTY batch whenever the dataset size was an exact multiple of batch_size.
num_samples = len(dataset)
num_batches = (num_samples + batch_size - 1) // batch_size

model.eval()  # inference mode (e.g. disables dropout), safe to call repeatedly

# no_grad: nothing is back-propagated here, so skip building the autograd
# graph for each batch (saves memory and time).
with torch.no_grad():
    for batch_id in tqdm(range(num_batches)):
        start = batch_id * batch_size
        stop = min(start + batch_size, num_samples)

        token_ids = token_id_all[start:stop].long().to(device)
        # valid_length is passed to the model without moving it to `device`,
        # exactly as before.
        valid_length = valid_length_all[start:stop]
        segment_ids = segment_id_all[start:stop].long().to(device)

        model_result = model(token_ids, valid_length, segment_ids)
        scores = model_result.cpu().tolist()
        sentiments = torch.max(model_result, 1)[1].cpu().tolist()

        scores_list.extend(scores)
        sentiments_list.extend(sentiments)

100%|██████████| 14010/14010 [1:05:09<00:00,  3.58it/s]


In [None]:
# Persist the inference results as CSV files in the current working directory:
#   scores.csv     -- output column 0 labelled 'negative_score',
#                     output column 1 labelled 'positive_score'
#   sentiments.csv -- the predicted class index, labelled 'sentiment'

output_dir = os.getcwd()

scores_df = pd.DataFrame(scores_list)
scores_df = scores_df.rename(columns={0: 'negative_score', 1: 'positive_score'})
scores_df.to_csv(os.path.join(output_dir, 'scores.csv'), index=False)

sentiments_df = pd.DataFrame(sentiments_list)
sentiments_df = sentiments_df.rename(columns={0: 'sentiment'})
sentiments_df.to_csv(os.path.join(output_dir, 'sentiments.csv'), index=False)

In [None]:
# Add the sentiment scores and classified sentiments as columns of the entire
# dataset and save it as the final data (all_sentiments.csv).

all_df = pd.read_csv(os.path.join(os.getcwd(), 'all.csv'))
# Keep only rows that have a title, then renumber rows from 0.
all_df = all_df[all_df['title'].notnull()].reset_index(drop=True)

# The predictions are attached row-by-row, so the row counts must match.
# Without this check pandas would align on the index and silently fill
# missing rows with NaN (or attach scores to the wrong rows' neighbours).
if not (len(all_df) == len(sentiments_df) == len(scores_df)):
    raise ValueError(
        'Row count mismatch: all.csv has %d rows with a title, but there are '
        '%d sentiments and %d score rows'
        % (len(all_df), len(sentiments_df), len(scores_df))
    )

# .to_numpy() makes the assignment explicitly positional.
all_df['sentiment'] = sentiments_df['sentiment'].to_numpy()
all_df['negative_score'] = scores_df['negative_score'].to_numpy()
all_df['positive_score'] = scores_df['positive_score'].to_numpy()

all_df.to_csv(os.path.join(os.getcwd(), 'all_sentiments.csv'), index=False)

In [None]:
# Preview the last 20 rows of the merged frame as a quick sanity check.
all_df.tail(n=20)

Unnamed: 0,datetime,title,up,down,code,sentiment,negative_score,positive_score
1400912,2017.07.18 10:19,도이치 잘도 판다 부럽당,2,0,32830,0,4.345006,-4.727149
1400913,2017.07.17 11:48,[삭제된 게시물의 답글]제2의 합병전 ...,9,4,32830,0,0.401274,-0.220131
1400914,2017.07.16 21:32,"삼생, 이건희 리스크 땜에 안 간다? [2]",6,5,32830,0,3.465622,-3.718369
1400915,2017.07.14 12:37,상장폐지하라,11,1,32830,0,4.201516,-4.536732
1400916,2017.07.13 15:49,해도해도 너무하는군! [1],10,3,32830,0,4.00235,-4.391201
1400917,2017.07.13 14:38,삼성전기한데 조만간 추월당할꺼 같네요!제...,4,2,32830,1,-4.232312,4.638601
1400918,2017.07.13 09:22,[삭제된 게시물의 답글]13일북한동해상에...,2,0,32830,0,0.401274,-0.220131
1400919,2017.07.12 16:10,상속후 폭탄 배당예상됨 [3],9,3,32830,1,-4.096404,4.533157
1400920,2017.07.12 12:28,먼가없음.그냥 삼생은 썩을주식이죠!!,4,1,32830,0,1.736688,-1.82595
1400921,2017.07.12 11:16,모종의 뭔가 있는듯 [1],5,3,32830,1,-2.335604,2.67272
