## Wikipedia 텍스트 전처리

In [6]:
import os
import re
import multiprocessing

from konlpy.tag import Mecab

In [2]:
# 품사태깅 후 
# 관계어, 기호를 제거하는 함수
def keywords_extractor(mecab, sentence):
    
    tagged = mecab.pos(sentence)
    tag_list = ['NNG', 'NNP','NNB', 'NNBC', 'NR', 'NP', # 체언
                'VV', 'VA', 'VX', 'VCP', 'VCN', # 용언
                'MM', 'MAG', 'MAJ', # 수식언
                'IC', # 독립언
                'XPN','XSN','XSV','XSA','XR' # 접사
                'SL', 'SH', 'SN' # 외국어, 한자, 숫자
               ]
    
    result = []
     
    for word, tag in tagged:
        # 필요한 태그 이외의 단어 제거
        if tag in tag_list:
            result.append(word)
    return result

In [3]:
# 파싱한 텍스트를 품사별로 분류하여
# 관계언 또는 기호를 제거하여 wiki_step2 폴더에 저장하는 함수
def corpus_preprocessor(data):
    mecab = Mecab() 
    remove_special_char = re.compile(r'[^가-힣^A-z^0-9^.^,^?^!^ ]') # 한글, 영어, 기본 기호를 제외한 문자들
    
    path, file_name = data
    print("process file: {}\n".format(file_name))
    with open(os.path.join(path, file_name), 'r', encoding='utf-8') as input: # wiki_step1 폴더에 있는 텍스트 로딩
        with open(os.path.join(os.getcwd(), 'wiki_step2', file_name), 'w', encoding='utf-8') as output: 
            for line in input:
                    
                # 특수 문자 제거 후 품사 분석 진행, 파일에 기록
                text = remove_special_char.sub(' ', line)
                keyword = keywords_extractor(mecab, text)
                
                output.write(' '.join(keyword))
                output.write('\n')

In [4]:
if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=4)
    
    data = [] 
    # path: dirs 와 files 가 있는 경로
    # dirs: path 아래의 폴더들
    # files: path 아래의 파일들
    for path, dirs, files in os.walk('wiki_step1/'):
        for file_name in files: 
            data.append( (path, file_name) )
            
    pool.map(corpus_preprocessor, data) # 리스트의 각 요소에 대해 함수 적용
    pool.close()
    pool.join()

process file: wiki_14
process file: wiki_16
process file: wiki_03



process file: wiki_04

process file: wiki_08

process file: wiki_17

process file: wiki_12

process file: wiki_09

process file: wiki_11

process file: wiki_07

process file: wiki_15

process file: wiki_05

process file: wiki_10

process file: wiki_13

process file: wiki_02

process file: wiki_00

process file: wiki_06

process file: wiki_00-checkpoint

process file: wiki_01

process file: wiki_03-checkpoint



## Word2Vec 생성

In [1]:
import gensim
from gensim.models.callbacks import CallbackAny2Vec

from tqdm import tqdm
from time import time

  from cryptography.hazmat.backends import default_backend


In [2]:
# Corpus 를 읽어와서 
# 문장을 공백 단위로 끊어서 토큰화 시킴
class Loader(object):
    def __init__(self, source_dir):
        self.source_dir = source_dir
        
    def __iter__(self):
        for path, dirs, files in os.walk(self.source_dir):
            for file in files:
                with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                    for line in f:
                        yield line.replace('\n', '').replace(',', '').split(' ')

In [3]:
class callback(CallbackAny2Vec):
    def __init__(self): 
        self.epoch = 0 
        self.loss_to_be_subed = 0 
        
    def on_epoch_end(self, model): 
        loss = model.get_latest_training_loss() 
        loss_now = loss - self.loss_to_be_subed 
        self.loss_to_be_subed = loss 
        print('Loss after epoch {}: {}'.format(self.epoch, loss_now)) 
        self.epoch += 1

In [4]:
sentences_vocab = Loader('wiki_step2/') # 단어사전 생성을 위한 데이터
sentences_train = Loader('wiki_step2/') # 학습을 위한 데이터

In [7]:
config = {
    'vector_size': 300, # 100차원 Embedding Vector 생성
    'min_count': 5, # 단어 최소 빈도 수 제한 (빈도가 적은 단어들은 학습하지 않는다.)
    
    'batch_words': 10000, # 사전을 구축할때 한번에 읽을 단어 수
    'workers': multiprocessing.cpu_count(), # 학습을 위한 프로세스의 수
    
    'sg': 1, # 0이면 CBOW, 1이면 skip-gram을 사용한다
}

In [8]:
# Word2vec 모델 생성
model_w2v = gensim.models.Word2Vec(**config)

# 단어사전 생성
t = time()
model_w2v.build_vocab(sentences_vocab)

print('model.corpus_count: {}'.format(model_w2v.corpus_count))
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

model.corpus_count: 11692878
Time to build vocab: 0.86 mins


In [9]:
# Word2Vec 학습
t = time()

model_w2v.train(sentences_train, total_examples=model_w2v.corpus_count, epochs=10, callbacks=[callback()])
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Loss after epoch 0: 0.0
Loss after epoch 1: 0.0
Loss after epoch 2: 0.0
Loss after epoch 3: 0.0
Loss after epoch 4: 0.0
Loss after epoch 5: 0.0
Loss after epoch 6: 0.0
Loss after epoch 7: 0.0
Loss after epoch 8: 0.0
Loss after epoch 9: 0.0
Time to build vocab: 39.05 mins


In [10]:
# 모델 저장
#model_w2v.wv.save_word2vec_format('output/kor_w2v_save')
model_w2v.wv.save('output/kor_w2v')

## Word2Vec 사용

In [16]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# 모델 로딩 
my_model = KeyedVectors.load('output/kor_w2v')

In [17]:
my_model.vectors.shape

(222084, 300)

In [20]:
# 컴퓨터의 word vector 출력
print(my_model.word_vec("컴퓨터"))

[-0.16263857 -0.39753538 -0.03941252 -0.2815207  -0.2504322  -0.62062925
 -0.1720547  -0.21039955  0.5109284   0.12770298  0.13278005 -0.289716
 -0.16259088  0.4745358   0.6118672   0.14764686 -0.16223449  0.54996294
  0.37376636  0.15286238  0.07673191  0.15515211  0.23582846  0.42239177
 -0.34096035 -0.09232024  0.27925557  0.2600624   0.18914807  0.54072857
 -0.17400432 -0.2149311   0.33158877 -0.06285863  0.6355217  -0.26313585
 -0.45463905 -0.13413161  0.04916585 -0.12672752 -0.09002984 -0.3089855
 -0.22445966 -0.17693715  0.6590697   0.10830338 -0.57296455 -0.01345103
 -0.16993403  0.07687641  0.13357393  0.1320958   0.08439571 -0.28127018
  0.00430054 -0.00850803 -0.03924747 -0.01759771 -0.23929155 -0.11643726
  0.02570553 -0.22309546 -0.11471161 -0.5833107  -0.1534721   0.27088657
 -0.3325114   0.25515217 -0.01697638  0.02127138 -0.0621896   0.30365613
 -0.33461082  0.15380535 -0.15494534  0.28394535 -0.38480294 -0.28978726
  0.06811369 -0.01378507  0.14118852 -0.4666334  -0.28

  


In [21]:
print(my_model.most_similar(positive=['전주시', '대학교'], negative=[], topn=30)) 

[('7327', 0.6948805451393127), ('완산구', 0.6878467798233032), ('중노송동', 0.6541063785552979), ('7526', 0.6446976661682129), ('남노송동', 0.6415050029754639), ('7326', 0.6390484571456909), ('동흥남동', 0.6365260481834412), ('조촌동', 0.635801374912262), ('7325', 0.6354224681854248), ('대학', 0.6339321136474609), ('덕진구', 0.6289939880371094), ('김세웅', 0.6245746612548828), ('구정문', 0.6237240433692932), ('7423', 0.6217697858810425), ('흥남동', 0.6212611198425293), ('7130', 0.6143994927406311), ('용순리', 0.612825870513916), ('7389', 0.6110311150550842), ('항가리', 0.6103420257568359), ('인후동', 0.6073480248451233), ('7528', 0.604963481426239), ('가톨릭상지대학', 0.6039727330207825), ('7866', 0.6037932634353638), ('연세대', 0.6037074327468872), ('7863', 0.6023799180984497), ('7483', 0.6023637652397156), ('7525', 0.6016020178794861), ('7387', 0.5990268588066101), ('28775', 0.5990211963653564), ('중앙교육연구소', 0.5987944006919861)]
