In [2]:
import logging
import gensim
import pandas as pd
import sys
import os
import multiprocessing
from ast import literal_eval
from collections import namedtuple
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts, get_tmpfile
from sklearn.model_selection import train_test_split
import json
sys.path.append(os.pardir)
from models.word_embedding import word2vec_model, doc2vec_model

## word embedding training

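Model hyperparameters are passed to `main` through kwargs:
+ function : which model to train — `'w2v'` (word2vec) or `'d2v'` (doc2vec)
+ min_count : minimum word frequency; words that occur fewer times are dropped from the embedding vocabulary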

See ./models/word_embedding.py for the detailed model-building code.

In [5]:
def main(**kwargs):
    """Train a word2vec or doc2vec embedding model on the training split and save it.

    The tokenised documents are read from the project spreadsheet, shuffled
    with a fixed seed and split 70/30 (stratified on both class columns).
    Only the training part is used to fit the embedding model.

    Keyword Args:
        function: 'w2v' to train word2vec, 'd2v' to train doc2vec.
        num_features: forwarded to the model helper as ``num_features``
            (word2vec) or ``size`` (doc2vec).
        window: forwarded as ``window``.
        min_count: forwarded as ``min_word_count`` (word2vec) or
            ``min_count`` (doc2vec).
        min_alpha: forwarded as ``min_alpha``.
        iter: forwarded as ``iter``.

    Returns:
        The trained model after it has been saved, or None when ``function``
        is neither 'w2v' nor 'd2v' (a message is printed; nothing is loaded
        or trained in that case).

    Raises:
        KeyError: if ``function`` or, for a valid ``function``, any of the
            hyperparameters above is missing.
    """
    model_select = kwargs['function']
    if model_select not in ('w2v', 'd2v'):
        # Reject an unknown choice before paying for the spreadsheet load.
        print("2가지 방식 중에 고르시오")
        return None

    # Read every hyperparameter up front so a missing key fails immediately
    # instead of after the data has been loaded and split.
    num_features = kwargs['num_features']
    window = kwargs['window']
    min_count = kwargs['min_count']
    min_alpha = kwargs['min_alpha']
    n_iter = kwargs['iter']

    if model_select == 'w2v':
        model_name = '../model_save/embedding_model/Word2vec1.model'
    else:
        model_name = '../model_save/embedding_model/Doc2vec_new.model'

    data = pd.read_excel("../assets/data/doc_set_final_version3.xlsx")
    # Tokens are stored as the string repr of a list; parse them back.
    data.token = data.token.apply(literal_eval)
    data = data.sample(frac=1, random_state=1234)

    token_list = data.token.tolist()
    target = data[['new_class', 'new_small_class']]
    # The held-out 30% is not needed here; the split is kept only so the
    # embedding is fitted on the same training documents every run.
    train_x_data, _, train_y, _ = train_test_split(token_list, target,
                                                   test_size=0.3,
                                                   stratify=target,
                                                   shuffle=True,
                                                   random_state=1234)

    # Create the output directory before training so a missing folder does
    # not throw away a long training run at save time.
    os.makedirs(os.path.dirname(model_name), exist_ok=True)

    if model_select == 'w2v':
        print("모델 학습")
        word2vec_kargs = {'num_features': num_features,
                          'num_workers': 4,
                          'window': window,
                          'seed': 1234,
                          'min_word_count': min_count,
                          'min_alpha': min_alpha,
                          'iter': n_iter}
        model = word2vec_model(train_x_data, **word2vec_kargs)
    else:
        # Each document is tagged with both its class and its sub-class.
        # Uses gensim's TaggedDocument (imported at the top of the notebook)
        # instead of shadowing it with a locally built namedtuple.
        tagged_train_docs = [
            TaggedDocument(words, [row['new_class'], row['new_small_class']])
            for words, (_, row) in zip(train_x_data, train_y.iterrows())
        ]
        print("모델 학습")
        doc2vec_kargs = {'size': num_features,
                         'window': window,
                         'min_count': min_count,
                         'alpha': 0.025,
                         'min_alpha': min_alpha,
                         'workers': 4,
                         'seed': 1234,
                         'iter': n_iter}
        model = doc2vec_model(tagged_train_docs, **doc2vec_kargs)

    print("모델 저장")
    model.save(model_name)
    return model

In [6]:
if __name__ == '__main__':
    # The command-line route (parse_args) is disabled in this notebook;
    # the hyperparameters are fixed inline instead.
    kwargs = dict(
        function='w2v',
        iter=30,
        min_alpha=0.025,
        min_count=5,
        num_features=300,
        window=8,
    )
    print(kwargs)
    main(**kwargs)

{'function': 'w2v', 'iter': 30, 'min_alpha': 0.025, 'min_count': 5, 'num_features': 300, 'window': 8}


2020-05-28 14:23:06,066 : INFO : collecting all words and their counts
2020-05-28 14:23:06,067 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


모델 학습


2020-05-28 14:23:06,420 : INFO : PROGRESS: at sentence #10000, processed 1590403 words, keeping 37975 word types
2020-05-28 14:23:06,449 : INFO : collected 38083 word types from a corpus of 1716401 raw words and 10789 sentences
2020-05-28 14:23:06,450 : INFO : Loading a fresh vocabulary
2020-05-28 14:23:06,597 : INFO : effective_min_count=5 retains 30223 unique words (79% of original 38083, drops 7860)
2020-05-28 14:23:06,598 : INFO : effective_min_count=5 leaves 1692506 word corpus (98% of original 1716401, drops 23895)
2020-05-28 14:23:06,701 : INFO : deleting the raw counts dictionary of 38083 items
2020-05-28 14:23:06,703 : INFO : sample=0.001 downsamples 25 most-common words
2020-05-28 14:23:06,703 : INFO : downsampling leaves estimated 1633826 word corpus (96.5% of prior 1692506)
2020-05-28 14:23:06,733 : INFO : constructing a huffman tree from 30223 words
2020-05-28 14:23:07,678 : INFO : built huffman tree with maximum node depth 18
2020-05-28 14:23:07,748 : INFO : estimated req

2020-05-28 14:24:17,282 : INFO : EPOCH 4 - PROGRESS: at 26.04% examples, 85998 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:24:18,567 : INFO : EPOCH 4 - PROGRESS: at 33.00% examples, 86425 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:24:19,830 : INFO : EPOCH 4 - PROGRESS: at 39.84% examples, 86855 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:24:21,071 : INFO : EPOCH 4 - PROGRESS: at 46.75% examples, 87539 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:24:22,310 : INFO : EPOCH 4 - PROGRESS: at 53.59% examples, 88004 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:24:23,536 : INFO : EPOCH 4 - PROGRESS: at 60.70% examples, 88541 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:24:24,774 : INFO : EPOCH 4 - PROGRESS: at 67.67% examples, 88794 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:24:26,017 : INFO : EPOCH 4 - PROGRESS: at 74.55% examples, 89004 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:24:27,281 : INFO : EPOCH 4 - PROGRESS: at 81.31% examples, 89046 words/s, in_qsize 8, out_

2020-05-28 14:25:28,274 : INFO : EPOCH 8 - PROGRESS: at 19.21% examples, 86113 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:25:29,455 : INFO : EPOCH 8 - PROGRESS: at 26.04% examples, 88424 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:25:30,620 : INFO : EPOCH 8 - PROGRESS: at 33.00% examples, 90053 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:25:31,792 : INFO : EPOCH 8 - PROGRESS: at 39.84% examples, 91065 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:25:32,979 : INFO : EPOCH 8 - PROGRESS: at 46.75% examples, 91826 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:25:34,166 : INFO : EPOCH 8 - PROGRESS: at 53.59% examples, 92128 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:25:35,391 : INFO : EPOCH 8 - PROGRESS: at 60.70% examples, 92282 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:25:36,609 : INFO : EPOCH 8 - PROGRESS: at 67.67% examples, 92296 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:25:37,792 : INFO : EPOCH 8 - PROGRESS: at 74.55% examples, 92516 words/s, in_qsize 8, out_

2020-05-28 14:26:35,090 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-28 14:26:35,121 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-05-28 14:26:35,121 : INFO : EPOCH - 11 : training on 1716401 raw words (1633802 effective words) took 18.0s, 90777 effective words/s
2020-05-28 14:26:36,296 : INFO : EPOCH 12 - PROGRESS: at 5.21% examples, 72063 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:26:37,335 : INFO : EPOCH 12 - PROGRESS: at 11.07% examples, 81134 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:26:38,545 : INFO : EPOCH 12 - PROGRESS: at 17.44% examples, 82805 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:26:39,583 : INFO : EPOCH 12 - PROGRESS: at 23.76% examples, 86727 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:26:40,605 : INFO : EPOCH 12 - PROGRESS: at 28.91% examples, 85863 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:26:41,741 : INFO : EPOCH 12 - PROGRESS: at 34.74% examples, 85389 words/s, in_qsize 8, out_qsize 0
202

2020-05-28 14:27:43,462 : INFO : EPOCH 15 - PROGRESS: at 90.09% examples, 93243 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:27:44,694 : INFO : EPOCH 15 - PROGRESS: at 96.87% examples, 93147 words/s, in_qsize 6, out_qsize 0
2020-05-28 14:27:44,816 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-05-28 14:27:44,914 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-28 14:27:45,066 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-28 14:27:45,100 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-05-28 14:27:45,100 : INFO : EPOCH - 15 : training on 1716401 raw words (1633711 effective words) took 17.4s, 93780 effective words/s
2020-05-28 14:27:46,276 : INFO : EPOCH 16 - PROGRESS: at 5.21% examples, 72991 words/s, in_qsize 7, out_qsize 1
2020-05-28 14:27:47,443 : INFO : EPOCH 16 - PROGRESS: at 12.23% examples, 85007 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:27:48,588 : INFO : EPOCH 16 - PRO

2020-05-28 14:28:48,837 : INFO : EPOCH 19 - PROGRESS: at 70.53% examples, 93133 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:28:50,058 : INFO : EPOCH 19 - PROGRESS: at 77.35% examples, 93035 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:28:51,211 : INFO : EPOCH 19 - PROGRESS: at 84.27% examples, 93421 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:28:52,261 : INFO : EPOCH 19 - PROGRESS: at 90.63% examples, 93839 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:28:53,414 : INFO : EPOCH 19 - PROGRESS: at 96.87% examples, 93519 words/s, in_qsize 6, out_qsize 0
2020-05-28 14:28:53,575 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-05-28 14:28:53,656 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-28 14:28:53,797 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-28 14:28:53,828 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-05-28 14:28:53,828 : INFO : EPOCH - 19 : training on 1716401 raw w

2020-05-28 14:29:54,277 : INFO : EPOCH 23 - PROGRESS: at 57.29% examples, 94159 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:29:55,355 : INFO : EPOCH 23 - PROGRESS: at 63.59% examples, 94363 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:29:56,399 : INFO : EPOCH 23 - PROGRESS: at 69.95% examples, 94731 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:29:57,462 : INFO : EPOCH 23 - PROGRESS: at 75.66% examples, 94209 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:29:58,613 : INFO : EPOCH 23 - PROGRESS: at 81.92% examples, 93945 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:29:59,812 : INFO : EPOCH 23 - PROGRESS: at 88.91% examples, 93908 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:30:00,997 : INFO : EPOCH 23 - PROGRESS: at 95.74% examples, 94092 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:30:01,415 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-05-28 14:30:01,473 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-28 14:30:01,474 : I

2020-05-28 14:30:58,408 : INFO : EPOCH 27 - PROGRESS: at 31.85% examples, 93047 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:30:59,437 : INFO : EPOCH 27 - PROGRESS: at 38.12% examples, 94208 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:31:00,493 : INFO : EPOCH 27 - PROGRESS: at 43.85% examples, 93605 words/s, in_qsize 7, out_qsize 0
2020-05-28 14:31:01,592 : INFO : EPOCH 27 - PROGRESS: at 50.14% examples, 93729 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:31:02,761 : INFO : EPOCH 27 - PROGRESS: at 57.29% examples, 94019 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:31:03,776 : INFO : EPOCH 27 - PROGRESS: at 63.59% examples, 94731 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:31:04,858 : INFO : EPOCH 27 - PROGRESS: at 69.95% examples, 94858 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:31:05,896 : INFO : EPOCH 27 - PROGRESS: at 75.66% examples, 94598 words/s, in_qsize 8, out_qsize 0
2020-05-28 14:31:07,027 : INFO : EPOCH 27 - PROGRESS: at 81.92% examples, 94272 words/s, in_qsiz

2020-05-28 14:32:00,520 : INFO : saving Word2Vec object under ../model_save/embedding_model/Word2vec1.model, separately None
2020-05-28 14:32:00,520 : INFO : not storing attribute vectors_norm
2020-05-28 14:32:00,520 : INFO : not storing attribute cum_table


모델 저장


2020-05-28 14:32:02,626 : INFO : saved ../model_save/embedding_model/Word2vec1.model
