In [8]:
import os
import sqlite3
import pandas as pd
import tqdm

In [9]:
from konlpy.tag import Twitter

In [10]:
db_name = '[Full_TFIDF]Credibility_research_20180906.db' #DB 파일명
# db 생성
con = sqlite3.connect( db_name )
cur = con.cursor()

In [42]:
sql = '''
select p.Post_id, u.User_id, group_concat(DISTINCT t.Post_tag) as Tag
    FROM post as p
    Left Join User as u
        ON p.User_id = u.User_id
    Left JOIN Tag as t
        ON p.Post_id = t.Post_id
    GROUP BY p.Post_id;
'''
cur.execute(sql)
rows = cur.fetchall()

In [43]:
df = pd.DataFrame(rows).dropna()

In [46]:
Tag_text = df[2].tolist()

In [48]:
Tag_pre_processing_text = list(map(lambda x : x.replace(',',' '),Tag_text))

In [None]:
Tag_pre_processing_text

## TFIDF

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer(max_features=100)
x = v.fit_transform(Tag_pre_processing_text)

### TFIDF model save

In [68]:
import pickle
import numpy as np
#Save vectorizer.vocabulary_
pickle.dump(v.vocabulary_,open("[Tag]features_100.pkl","wb"))

## Doc2vec

In [56]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [57]:
sentences = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(Tag_pre_processing_text)]

In [59]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [60]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [61]:
doc_vectorizer = Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=8,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=20,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=cores,   # multi cpu
    hs = 1,          # hierarchical softmax / default 0
    negative = 10,   # negative sampling / default 5
)



In [62]:
doc_vectorizer.build_vocab(sentences)

2018-09-06 19:45:00,178 : INFO : collecting all words and their counts
2018-09-06 19:45:00,182 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-09-06 19:45:00,267 : INFO : PROGRESS: at example #10000, processed 364764 words (4312139/s), 1415 word types, 10000 tags
2018-09-06 19:45:00,350 : INFO : PROGRESS: at example #20000, processed 746632 words (4634779/s), 1682 word types, 20000 tags
2018-09-06 19:45:00,473 : INFO : PROGRESS: at example #30000, processed 1134561 words (3175632/s), 1814 word types, 30000 tags
2018-09-06 19:45:00,492 : INFO : collected 1863 word types and 32229 unique tags from a corpus of 32229 examples and 1223536 words
2018-09-06 19:45:00,493 : INFO : Loading a fresh vocabulary
2018-09-06 19:45:00,495 : INFO : min_count=20 retains 1022 unique words (54% of original 1863, drops 841)
2018-09-06 19:45:00,496 : INFO : min_count=20 leaves 1219712 word corpus (99% of original 1223536, drops 3824)
2018-09-06 19:45:00,501 : INFO : delet

In [63]:
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w8,mc20,s0.001,t4)


In [64]:
doc_vectorizer.iter

  """Entry point for launching an IPython kernel.


5

In [65]:
import time
start = time.time()
for epoch in range(10):
    doc_vectorizer.train(sentences, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay
end = time.time()
print("During Time: {}".format(end-start))

  after removing the cwd from sys.path.
2018-09-06 19:45:21,282 : INFO : training model with 4 workers on 1022 vocabulary and 100 features, using sg=1 hs=1 sample=0.001 negative=10 window=8
2018-09-06 19:45:22,342 : INFO : EPOCH 1 - PROGRESS: at 14.31% examples, 111333 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:45:23,390 : INFO : EPOCH 1 - PROGRESS: at 27.17% examples, 110707 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:45:24,396 : INFO : EPOCH 1 - PROGRESS: at 40.61% examples, 112181 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:45:25,435 : INFO : EPOCH 1 - PROGRESS: at 53.29% examples, 112045 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:45:26,499 : INFO : EPOCH 1 - PROGRESS: at 67.80% examples, 114144 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:45:27,538 : INFO : EPOCH 1 - PROGRESS: at 82.99% examples, 117206 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:45:28,608 : INFO : EPOCH 1 - PROGRESS: at 96.42% examples, 116897 words/s, in_qsize 5, out_qsize 0
2018-09-06 19:45:28

2018-09-06 19:46:07,299 : INFO : EPOCH - 1 : training on 1223536 raw words (889028 effective words) took 7.7s, 115679 effective words/s
2018-09-06 19:46:08,323 : INFO : EPOCH 2 - PROGRESS: at 12.65% examples, 100973 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:46:09,463 : INFO : EPOCH 2 - PROGRESS: at 28.02% examples, 111020 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:46:10,517 : INFO : EPOCH 2 - PROGRESS: at 43.05% examples, 115362 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:46:11,559 : INFO : EPOCH 2 - PROGRESS: at 55.53% examples, 114299 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:46:12,567 : INFO : EPOCH 2 - PROGRESS: at 69.42% examples, 115781 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:46:13,577 : INFO : EPOCH 2 - PROGRESS: at 82.22% examples, 115638 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:46:14,616 : INFO : EPOCH 2 - PROGRESS: at 94.88% examples, 115060 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:46:14,883 : INFO : worker thread finished; awaiting finish 

2018-09-06 19:46:57,127 : INFO : EPOCH 3 - PROGRESS: at 26.31% examples, 106086 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:46:58,285 : INFO : EPOCH 3 - PROGRESS: at 40.61% examples, 106188 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:46:59,408 : INFO : EPOCH 3 - PROGRESS: at 54.82% examples, 108739 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:47:00,413 : INFO : EPOCH 3 - PROGRESS: at 67.96% examples, 109988 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:47:01,444 : INFO : EPOCH 3 - PROGRESS: at 78.96% examples, 108094 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:47:02,492 : INFO : EPOCH 3 - PROGRESS: at 90.92% examples, 107519 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:47:03,116 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-06 19:47:03,249 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-06 19:47:03,270 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-06 19:47:03,317 : INFO : worker thread fi

2018-09-06 19:47:50,028 : INFO : EPOCH 4 - PROGRESS: at 37.90% examples, 107504 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:47:51,040 : INFO : EPOCH 4 - PROGRESS: at 50.35% examples, 107636 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:47:52,068 : INFO : EPOCH 4 - PROGRESS: at 63.06% examples, 108624 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:47:53,125 : INFO : EPOCH 4 - PROGRESS: at 77.32% examples, 111101 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:47:54,236 : INFO : EPOCH 4 - PROGRESS: at 91.67% examples, 112056 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:47:54,738 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-06 19:47:54,769 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-06 19:47:54,816 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-06 19:47:54,817 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-06 19:47:54,818 : INFO : EPOCH - 4 : training on 1223536 raw wo

2018-09-06 19:48:39,251 : INFO : EPOCH 5 - PROGRESS: at 37.90% examples, 102852 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:48:40,288 : INFO : EPOCH 5 - PROGRESS: at 47.16% examples, 96580 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:48:41,311 : INFO : EPOCH 5 - PROGRESS: at 56.44% examples, 94243 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:48:42,338 : INFO : EPOCH 5 - PROGRESS: at 67.15% examples, 93848 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:48:43,368 : INFO : EPOCH 5 - PROGRESS: at 78.96% examples, 95475 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:48:44,369 : INFO : EPOCH 5 - PROGRESS: at 90.15% examples, 96213 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:48:45,053 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-06 19:48:45,122 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-06 19:48:45,139 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-06 19:48:45,169 : INFO : worker thread finishe

2018-09-06 19:49:26,697 : INFO : EPOCH 1 - PROGRESS: at 12.70% examples, 101593 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:49:27,751 : INFO : EPOCH 1 - PROGRESS: at 24.61% examples, 101823 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:49:28,763 : INFO : EPOCH 1 - PROGRESS: at 37.90% examples, 106009 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:49:29,898 : INFO : EPOCH 1 - PROGRESS: at 51.86% examples, 106774 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:49:30,939 : INFO : EPOCH 1 - PROGRESS: at 63.06% examples, 104897 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:49:31,976 : INFO : EPOCH 1 - PROGRESS: at 74.90% examples, 104852 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:49:32,985 : INFO : EPOCH 1 - PROGRESS: at 87.82% examples, 106275 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:49:34,037 : INFO : EPOCH 1 - PROGRESS: at 97.16% examples, 103300 words/s, in_qsize 4, out_qsize 0
2018-09-06 19:49:34,100 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-

2018-09-06 19:50:15,481 : INFO : EPOCH 2 - PROGRESS: at 12.70% examples, 96412 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:50:16,500 : INFO : EPOCH 2 - PROGRESS: at 25.42% examples, 104375 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:50:17,518 : INFO : EPOCH 2 - PROGRESS: at 39.57% examples, 109793 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:50:18,539 : INFO : EPOCH 2 - PROGRESS: at 53.98% examples, 114310 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:50:19,582 : INFO : EPOCH 2 - PROGRESS: at 67.15% examples, 113696 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:50:20,654 : INFO : EPOCH 2 - PROGRESS: at 80.61% examples, 113880 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:50:21,721 : INFO : EPOCH 2 - PROGRESS: at 94.06% examples, 114096 words/s, in_qsize 8, out_qsize 0
2018-09-06 19:50:21,965 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-06 19:50:21,989 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-06 19:50:22,011 : IN

2018-09-06 19:51:03,219 : INFO : EPOCH 3 - PROGRESS: at 41.52% examples, 113808 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:51:04,225 : INFO : EPOCH 3 - PROGRESS: at 55.53% examples, 117627 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:51:05,329 : INFO : EPOCH 3 - PROGRESS: at 70.15% examples, 117728 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:51:06,340 : INFO : EPOCH 3 - PROGRESS: at 82.99% examples, 117249 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:51:07,385 : INFO : EPOCH 3 - PROGRESS: at 95.58% examples, 116327 words/s, in_qsize 6, out_qsize 0
2018-09-06 19:51:07,531 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-06 19:51:07,611 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-06 19:51:07,633 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-06 19:51:07,637 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-06 19:51:07,638 : INFO : EPOCH - 3 : training on 1223536 raw wo

2018-09-06 19:51:51,611 : INFO : EPOCH 4 - PROGRESS: at 82.22% examples, 116506 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:51:52,808 : INFO : EPOCH 4 - PROGRESS: at 97.16% examples, 116274 words/s, in_qsize 4, out_qsize 0
2018-09-06 19:51:52,830 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-06 19:51:52,856 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-06 19:51:52,875 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-06 19:51:52,924 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-06 19:51:52,924 : INFO : EPOCH - 4 : training on 1223536 raw words (888740 effective words) took 7.5s, 117964 effective words/s
2018-09-06 19:51:54,077 : INFO : EPOCH 5 - PROGRESS: at 15.13% examples, 108392 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:51:55,097 : INFO : EPOCH 5 - PROGRESS: at 29.78% examples, 117076 words/s, in_qsize 7, out_qsize 0
2018-09-06 19:51:56,119 : INFO : EPOCH 5 - PROGR

During Time: 399.1652069091797


## Model_save

In [66]:
model_name = '[Tag]Doc2vec_features_100.model'
doc_vectorizer.save(model_name)

2018-09-06 19:55:41,034 : INFO : saving Doc2Vec object under [Tag]Doc2vec_features_100.model, separately None
2018-09-06 19:55:41,317 : INFO : saved [Tag]Doc2vec_features_100.model
