# Library

In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the session. A single catch-all filter
# is sufficient: simplefilter('ignore') and filterwarnings('ignore') with no
# further arguments register the same "ignore everything" rule.
# NOTE: this also hides library deprecation warnings.
warnings.filterwarnings('ignore')

import numpy as np
from gensim.models import word2vec #  word2vec library  
import sentencepiece as spm # For sentencepiece model

# Load Data

In [7]:
# Directory that holds the train/test CSV files (relative to this notebook).
data_path = '../../../../data/klue clf/'

# Shuffle the training rows once. A fixed random_state keeps the shuffled order
# identical across kernel restarts, so the preview and the title file written
# from this frame are repeatable.
train = pd.read_csv(data_path + 'train_data.csv').sample(frac=1, random_state=42)
test = pd.read_csv(data_path + 'test_data.csv')

In [8]:
# Preview the first rows of the shuffled training frame.
train.head()

Unnamed: 0,index,title,topic_idx
2114,2114,그래픽 자문안으로 본 대통령 개헌안 주요 내용,6
44885,44885,관악구 여름방학 아르바이트 대학생 200명 모집,2
8968,8968,남북정상회담 준비委 발족…문 대통령 국제지지 받도록 만전종합2보,6
2964,2964,아시안게임 일본전 결승 솔로포 김하성 금메달 꼭 ...,5
30826,30826,文대통령 모친 빈소에 野대표들 잇따라 조문…고인 애도,6


## Write Titles to a Text File

In [11]:
# Dump the training titles to a UTF-8 text file, one title per line.
with open('title_train.txt', 'w', encoding='utf-8') as f:
    # Iterate the column directly: iterrows() materialises a Series per row,
    # which is needless overhead when only one column is read.
    for title in train['title']:
        f.write(title + '\n')
# No explicit f.close() is needed: leaving the with-block already closed the file.

# SentencePiece & Word2Vec

## Set Parameters

In [12]:
# Word2Vec hyperparameters shared by both models trained in this notebook.
num_features = 300 # embedding vector dimensionality (passed as `size`)
negative = 10 # number of noise words drawn per positive example (negative sampling)
min_word_count = 6 # passed as `min_count`: tokens whose total corpus frequency is below this are ignored
window = 5 # context window size: max distance between the target token and a context token
# Passed as `sample`, gensim's threshold for randomly down-sampling frequent tokens.
# NOTE(review): gensim documents the useful range as (0, 1e-5) and defaults to 1e-3;
# 0.75 may have been confused with the negative-sampling exponent (`ns_exponent`,
# default 0.75) — confirm this value is intended.
downsampling = 0.75
epoch = 20 # number of training passes over the corpus (passed as `iter`)

In [14]:
%%time

# Fit a SentencePiece model on the dumped titles (writes files under the 'sp'
# prefix), then load the resulting 'sp.model' into a processor for encoding.
spm.SentencePieceTrainer.train(
    input="./title_train.txt",
    model_prefix='sp',
    vocab_size=3000,
    user_defined_symbols=['[CLS]', '[SEP]'],
)
sp = spm.SentencePieceProcessor(model_file='./sp.model')

Wall time: 2.89 s


In [15]:
# Tokenize every training title two ways:
#   titles    - naive whitespace tokens
#   sp_titles - SentencePiece subword pieces, kept as strings
#
# split() (no argument) is used instead of split(' ') so that repeated, leading
# or trailing spaces do not produce empty-string tokens that would otherwise
# end up in the Word2Vec vocabulary.
titles = [title.split() for title in train['title']]
sp_titles = [sp.encode(title, out_type=str) for title in train['title']]

In [17]:
%%time

# skip-gram model training with naive splitted data
# Skip-gram Word2Vec trained on the whitespace-tokenized titles.
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4.x renamed
# them to `vector_size` and `epochs`.
premises_normal = word2vec.Word2Vec(
    sentences=titles,
    size=num_features,
    window=window,
    min_count=min_word_count,
    sample=downsampling,
    negative=negative,
    sg=1,  # 1 selects skip-gram; 0 would select CBOW
    iter=epoch,
)


Wall time: 11.7 s


In [18]:
%%time
# skip-gram model training with sentencepiece data
# Skip-gram Word2Vec trained on the SentencePiece-tokenized titles, using the
# same hyperparameters as the whitespace-token model.
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4.x renamed
# them to `vector_size` and `epochs`.
premises_SP = word2vec.Word2Vec(
    sentences=sp_titles,
    size=num_features,
    window=window,
    min_count=min_word_count,
    sample=downsampling,
    negative=negative,
    sg=1,  # 1 selects skip-gram; 0 would select CBOW
    iter=epoch,
)

Wall time: 1min 15s


In [22]:
# Nearest neighbours of '대통령' in the whitespace-token model.
premises_normal.wv.most_similar('대통령')

[('대선', 0.674289345741272),
 ('평양정상회담', 0.6496884226799011),
 ('탄핵', 0.6275056004524231),
 ('호세프', 0.6253585815429688),
 ('개헌안', 0.6250211000442505),
 ('대통령의', 0.6233727931976318),
 ('대통령에', 0.6217567920684814),
 ('대통령과', 0.6090984344482422),
 ('엘시시', 0.5995229482650757),
 ('문', 0.5868440866470337)]

In [23]:
# Same query against the SentencePiece-based model.
# NOTE(review): the query '대통령' lacks the '▁' word-start marker, so it is a
# different vocabulary entry from '▁대통령' (which appears among the neighbours
# in the recorded output). To mirror the whitespace-token query, querying
# '▁대통령' is presumably what is wanted — confirm.
premises_SP.wv.most_similar('대통령')

[('▁朴', 0.7994265556335449),
 ('▁文', 0.43013668060302734),
 ('潘', 0.39542412757873535),
 ('前', 0.39519405364990234),
 ('▁새누리', 0.36737316846847534),
 ('風', 0.3659006953239441),
 ('▁대통령', 0.3566558361053467),
 ('▁靑', 0.3528180718421936),
 ('▁최순실', 0.3427363634109497),
 ('▁여야', 0.3374297618865967)]