In [78]:
import requests
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle

In [13]:
data = []

In [14]:
for i in tqdm(range(1, 972)) :
    table_url = 'https://kin.naver.com/best/listaha.nhn?page=' + str(i)
    table_request = requests.get(table_url)
    table_soup = BeautifulSoup(table_request.text, 'html.parser')
    url_list = table_soup.find('tbody', {'id' : 'au_board_list'}).find_all('td', {'class' : 'title'})
    for link in url_list :
        doc_url = 'https://kin.naver.com' + link.find('a').get('href')
        doc_request = requests.get(doc_url)
        # 네이버 서버에서 튕기는 것을 막기 위해 request마다 2초씩 term
#         time.sleep(2)
        
        doc_soup = BeautifulSoup(doc_request.text, 'html.parser')
        
        # doc_infomation
        if not doc_soup.find('ul', {'class' : 'location'}) :
            continue   # 삭제된 게시물은 continue
        title = link.text.strip()
        category = doc_soup.find('ul', {'class' : 'location'}).find_all('a')[2].text.strip()
        q_id = doc_url.split('=')[-1]
        
        # question part
        question = doc_soup.find('div', {'class' : 'end_question'})
        temp_dict = {}
        if question :
            temp_dict['title'] = title
            temp_dict['text'] = question.find('div', {'class' : '_endContentsText'}).text.strip()
            temp_dict['category'] = category
            temp_dict['datetime'] = [temp for temp in map(lambda x : re.search(r'(\d{4}.\d{2}.\d{2})', x), map(lambda x : x.text, question.find_all('dd', {'class' : 'date'}))) if temp is not None][0].group(1)
            temp_dict['q_id'] = q_id
            temp_dict['qa'] = 0   # 0 : question, 1 : adopted answser, 2 : other answers
            data.append(temp_dict)
        else :
            pass
        
        
        #answer part
        answers = doc_soup.find_all('div', {'class' : 'end_answer'})
        for answer in answers :
            temp_dict = {}
            try:
                if '삭제' in answer.find('h3').text.strip() :
                    continue
                else :
                    if answer.find('div', {'class' : 'end_title_ico adopt_question'}) :
                        temp_dict['qa'] = 1
                    else :
                        temp_dict['qa'] = 2
                    temp_dict['title'] = title
                    temp_dict['text'] = answer.find('div', {'class' : '_endContentsText'}).text.strip()
                    temp_dict['category'] = category
                    temp_dict['datetime'] = answer.find('div', {'class' : 'end_date'}).text.strip()
                    temp_dict['q_id'] = q_id
                    data.append(temp_dict)
            except :
                continue
        

100%|██████████| 971/971 [1:24:47<00:00,  5.42s/it]


### 중간정리
 - time.sleep 안해도 잘 스크래핑 됨
 - 리스트를 나누지 말고 dict 타입으로 instance 하나씩 넣기

In [50]:
# raw_df = pd.DataFrame(data)

In [51]:
# raw_df.to_csv('raw_kin_data.csv', index = False)

In [56]:
raw_df = pd.read_csv('raw_kin_data.csv', na_filter = False)

In [35]:
def clean_str(s) :
    # only include alphanumerics and Korean
    s = re.sub(r"[^A-Za-z0-9가-힣,!?\'\`]", " ", s)
    
    # insert spaces in words with apostrophes
    s = re.sub(r"\'s", " \'s", s)
    s = re.sub(r"\'ve", " \'ve", s)
    s = re.sub(r"n\'t", " n\'t", s)
    s = re.sub(r"\'re", " \'re", s)
    s = re.sub(r"\'d", " \'d", s)
    s = re.sub(r"\'ll", " \'ll", s)
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\?", " \? ", s)
    
    # only include alphanumerics and Korean again
    s = re.sub(r"[^A-Za-z0-9가-힣(),!?\'\`]", " ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip().lower()

In [57]:
raw_text = raw_df.text

In [61]:
clean_text = [clean_str(x) for x in raw_text]

In [62]:
from gensim.models import FastText, Word2Vec

In [11]:
#embedding_model = Word2Vec(list(map(lambda x : x.split(), clean_text)), size = 300, min_count = 10, workers = 12, negative = 10, iter = 10)

In [None]:
#embedding_model.save('word2vec_model')

In [63]:
embedding_model = Word2Vec.load('word2vec_model')

### Character별로 초성, 중성, 종성 나누기

In [64]:
import hgtk

In [65]:
word_vocab = embedding_model.wv.index2word

In [66]:
char_bag = ''.join(word_vocab)

In [67]:
char_bag =re.sub('[^가-힣]', '', char_bag)

In [68]:
first = []
middle = []
end = []
for s in char_bag :
    if s :
        f, m, e = hgtk.letter.decompose(s)
        first.append(f)
        middle.append(m)
        end.append(e)
        

In [69]:
first = list(set(first))
middle = list(set(middle))
end = list(set(end))

In [70]:
first_dict = {}
for i, v in enumerate(first) :
    first_dict[v] = i

In [71]:
middle_dict = {}
for i, v in enumerate(middle) :
    middle_dict[v] = i

In [72]:
end_dict = {}
for i, v in enumerate(end) :
    end_dict[v] = i

In [73]:
hangul_vocab = [word for word in word_vocab if hgtk.checker.is_hangul(word)]

In [74]:
length = list(map(lambda x : len(x), hangul_vocab))

In [75]:
def get_index(word_list) :
    return[first_dict[word_list[0]], middle_dict[word_list[1]], end_dict[word_list[2]]]

In [76]:
# train_text = list()
# for text in tqdm(clean_text) :
#     train_text.append([word for word in text.split() if word in hangul_vocab])

100%|██████████| 67717/67717 [1:39:50<00:00, 11.30it/s]  


In [79]:
# with open('train_text.pkl', 'wb') as f:
#     pickle.dump(train_text, f)

In [81]:
with open('train_text.pkl', 'rb') as f :
    train_text = pickle.load(f)

In [89]:
tokens = list()
lengths = list()
targets = list()
for word in train_text :
    temp = list(map(lambda x : hgtk.letter.decompose(x), word))
    temp = list(map(get_index, temp))
    pad_length = max(length) - len(word)
    temp = np.pad(temp, [[0, pad_length], [0, 0]], 'constant', constant_values = 0).tolist()
    tokens.append(temp)
    length.append(len(word))
    targets.append(embedding_model.wv.get_vector(word))

TypeError: ord() expected a character, but string of length 3 found

In [None]:
tokens = np.array(tokens)

In [None]:
batch_size = 256
embedding_size = 100
num_sample = 32
epochs = 3

train_inputs = tf.placeholder(tf.int32, shape = [batch_size])
train_y = tf.placeholder(tf.int32, shape = [batch_size, 300])

In [None]:
first_embeddings = tf.Variable(
    tf.random_uniform([len(first), embedding_size], -1.0, 1.0))

middle_embeddings = tf.Variable(
    tf.random_uniform([len(middle), embedding_size], -1.0, 1.0))

end_embeddings = tf.Variable(
    tf.random_uniform([len(end), embedding_size], -1.0, 1.0))

In [None]:
nce_weights = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [None]:
with tf.Session() as sess :
    train_inputs.eval()

In [None]:
first_index = list(map(lambda x : first.index(x), train_input))
first_index = list(map(lambda x : first.index(x), ['ㅁ', 'ㅈ', 'ㅂ', 'ㅁ', 'ㅅ']))

In [None]:
first_embed = tf.nn.embedding_lookup(first_embeddings, first_index)
middle_embed = tf.nn.embedding_lookup(middle_embeddings, middle_index)
end_embed = tf.nn.embedding_lookup(end_embeddings, end_index)