In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Load the data file containing the info from Naver Finance

df = pd.read_csv(os.path.join(os.getcwd(),'all.csv')).reset_index().drop('index',axis=1)

In [4]:
# Save the "title" column of all.csv as title_df
# Most of the titles end with "[n]" where n is the number of replies that the thread has
# Since it is redundant info as of now, we drop such info at the end of each title
title_df = pd.DataFrame(df['title'].dropna().apply(lambda x: x.split('[')[0]))

# To use the KOBERT model, we need to input the data in tsv format
title_df.to_csv(os.path.join(os.getcwd(),'title.txt'),sep='\t')

title_list = title_df['title'].to_list() 

In [7]:
# This research leverages on the pre-trained word embeddings from
# Naver Sentiment Movie Corpus project (https://github.com/e9t/nsmc) based on KOBERT
# It consists of 100K positive and 100K negative reviews from Naver Movie

!wget -O .cache/ratings_train.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_train.txt
!wget -O .cache/ratings_test.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_test.txt

--2022-12-29 14:03:34--  http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_train.txt
Resolving skt-lsl-nlp-model.s3.amazonaws.com (skt-lsl-nlp-model.s3.amazonaws.com)... 52.219.58.126
Connecting to skt-lsl-nlp-model.s3.amazonaws.com (skt-lsl-nlp-model.s3.amazonaws.com)|52.219.58.126|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘.cache/ratings_train.txt’


2022-12-29 14:03:35 (70.6 MB/s) - ‘.cache/ratings_train.txt’ saved [14628807/14628807]

--2022-12-29 14:03:35--  http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_test.txt
Resolving skt-lsl-nlp-model.s3.amazonaws.com (skt-lsl-nlp-model.s3.amazonaws.com)... 52.219.58.126
Connecting to skt-lsl-nlp-model.s3.amazonaws.com (skt-lsl-nlp-model.s3.amazonaws.com)|52.219.58.126|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4893335 (4.7M) [text/plain]
Saving to: ‘.cache/ratings_test.txt’


2022-12-29 14:03:35 (3

In [8]:
# We can verify that NSMC corpus consists of 75% of train data and 25% of test data
# Therefore, additional word inputs in this research will maintain the same train-test ratio
pd.read_csv('.cache/ratings_train.txt',sep='\t').shape[0] / pd.read_csv('.cache/ratings_test.txt',sep='\t').shape[0]

3.0

In [6]:
# Directly classifying Naver Finance sentiments based on NSMC corpus may lead to incomplete results
# So I designate several intuitive and definite positive/neagtive keywords
# and classify each "titles" as positive thread if
# it contains any positive keyword and does not contain negative keywords
# and classify each "titles" as negative thread if
# it contains any negative keyword and does not contain positive keywords
# Other threads are classified in the subsequent process

positive_keyword = ['매수','오르','오른','오를','상승','폭등','산다','안판다','안 판다','안내릴','안 내릴','안내린','안 내린','안내리','안 내리','벌']
negative_keyword = ['매도','내리','내린','내릴','하락','폭락','판다','안산다','안 산다','안오를','안 오를','안오른','안 오른','안오르','안 오르','비싸','락','떨어','잃','공매']

# Thread with signal=1 is positive, signal=-1 is negative, and signal=0 is indefinite
title_df['signal'] = title_df['title'].str.contains('|'.join(positive_keyword))*1+title_df['title'].str.contains('|'.join(negative_keyword))*(-1)
title_df.head()

Unnamed: 0,title,signal
0,물 탄 나를 칭찬한다!,0
1,안티들은 왜이리 차트차트,0
2,지엠 2025년 EV 100만대?,0
3,참 어이없네,0
4,"포스코 케미칼, 세계 전기차 배터리 최강...",0


In [10]:
# Extract positive/negative threads based on the above criteria
# and change the signal value of negative threads from -1 to 0

nonneutral_df = title_df[title_df['signal']!=0]
nonneutral_df['signal'].replace(-1,0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonneutral_df['signal'].replace(-1,0,inplace=True)


In [11]:
# Shuffle nonneutral_df
nonneutral_df = nonneutral_df.sample(frac=1).reset_index().drop('index',axis=1)

# Split nonentural_df into the train and test dataset
nonneutral_train = nonneutral_df.loc[:nonneutral_df.shape[0]//4*3]
nonneutral_test = nonneutral_df.loc[nonneutral_df.shape[0]//4*3+1:]

In [12]:
# Save the entire/train/test dataset of nonneutral df, respectively

nonneutral_df.to_csv('nonneutral.txt',sep='\t')
nonneutral_train.to_csv('nonneutral_train.txt',sep='\t')
nonneutral_test.to_csv('nonneutral_test.txt',sep='\t')

In [13]:
# Concatenate the nonneutral_train with NSMC train dataset
# and save it as train_df and train_all.txt (tsv format)
 
train_df1 = pd.read_csv(os.path.join(os.getcwd(),'nonneutral_train.txt'),sep='\t').rename(columns={'Unnamed: 0':'id','title':'document','signal':'label'})
train_df2 = pd.read_csv('.cache/ratings_train.txt',sep='\t')
train_df = pd.concat([train_df1,train_df2],axis=0).sample(frac=1).reset_index().drop('index',axis=1)

train_df.to_csv(os.path.join(os.getcwd(),'train_all.txt'),sep='\t',index=False)

In [14]:
# Concatenate the nonneutral_test with NSMC test dataset
# and save it as test_df and test_all.txt (tsv format)

test_df1 = pd.read_csv(os.path.join(os.getcwd(),'nonneutral_test.txt'),sep='\t').rename(columns={'Unnamed: 0':'id','title':'document','signal':'label'})
test_df2 = pd.read_csv('.cache/ratings_test.txt',sep='\t')
test_df = pd.concat([test_df1,test_df2],axis=0).sample(frac=1).reset_index().drop('index',axis=1)

test_df.to_csv(os.path.join(os.getcwd(),'test_all.txt'),sep='\t',index=False)