In [1]:
import argparse
import os
import logging
import numpy as np
import tensorflow as tf

from dataset import KinQueryDataset, preprocess

In [2]:
def _batch_loader(iterable, n=1):
    """
    데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다

    :param iterable: 데이터 list, 혹은 다른 포맷
    :param n: 배치 사이즈
    :return:
    """
    length = len(iterable)
    for n_idx in range(0, length, n):
        yield iterable[n_idx:min(n_idx + n, length)]


def weight_variable(shape):
    """Return a ``tf.Variable`` of the given shape, initialised from a truncated normal with stddev 0.1."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def bias_variable(shape):
    """Return a ``tf.Variable`` of the given shape with every element initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

In [3]:
def load_dict(dict_path, max_vocab=None):
    """
    Load a vocabulary file in the ``<id> <token>`` per-line format that
    ``create_dict`` writes.

    :param dict_path: path of the dictionary file
    :param max_vocab: if truthy, keep only entries whose id is below this value
    :return: ``(tok2id, id2tok)`` dictionaries, or ``None`` when the file
             cannot be opened or read (the caller then creates it)
    :raises ValueError: if the first field of a line is not an integer
    """
    logging.info("Try load dict from {}.".format(dict_path))
    try:
        # The context manager closes the handle even if reading fails part-way.
        with open(dict_path) as dict_file:
            dict_data = dict_file.readlines()
    except (OSError, UnicodeError):
        # Only I/O and decoding problems mean "no usable dictionary";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        logging.info(
            "Load dict {dict} failed, create later.".format(dict=dict_path))
        return None

    tok2id = {}
    id2tok = {}
    for line in dict_data:
        fields = line.split()
        if len(fields) < 2:
            # Blank or truncated line: there is no (id, token) pair to record.
            continue
        idx, tok = int(fields[0]), fields[1]
        if max_vocab and idx >= max_vocab:
            continue
        tok2id[tok] = idx
        id2tok[idx] = tok
    logging.info(
        "Load dict {} with {} words.".format(dict_path, len(tok2id)))
    return (tok2id, id2tok)

def create_dict(dict_path, corpus, max_vocab=None):
    """
    Build a frequency-ordered vocabulary from ``corpus`` and write it to
    ``dict_path`` as one ``<id> <token>`` pair per line.

    Ids 0 and 1 are assigned to ``MARK_PAD`` and ``MARK_UNK``; the remaining
    ids follow descending word frequency.

    :param dict_path: file to (over)write with the vocabulary
    :param corpus: iterable of documents, each an iterable of words
    :param max_vocab: if truthy, keep only the first ``max_vocab`` entries
                      (the two marker tokens count towards the limit)
    :return: ``(tok2id, id2tok)`` dictionaries
    """
    logging.info("Create dict {}.".format(dict_path))
    counter = {}
    for line in corpus:
        for word in line:
            # dict.get avoids using a catch-all exception handler for counting.
            counter[word] = counter.get(word, 0) + 1

    # Marker tokens are reserved, so they must not also be counted as words.
    # NOTE(review): MARKS, MARK_PAD and MARK_UNK are not defined in the visible
    # notebook cells - confirm where they are expected to come from.
    for mark_t in MARKS:
        if mark_t in counter:
            del counter[mark_t]
            logging.warning("{} appears in corpus.".format(mark_t))

    # The sort is stable: words with equal counts keep the order in which
    # they were inserted into `counter`.
    ranked = sorted(counter.items(), key=lambda item: -item[1])
    words = [MARK_PAD, MARK_UNK] + [word for word, _ in ranked]
    if max_vocab:
        words = words[:max_vocab]

    tok2id = {}
    id2tok = {}
    with open(dict_path, 'w') as dict_file:
        for idx, tok in enumerate(words):
            print(idx, tok, file=dict_file)
            tok2id[tok] = idx
            id2tok[idx] = tok

    logging.info(
        "Create dict {} with {} words.".format(dict_path, len(words)))
    return (tok2id, id2tok)

def corpus_map2id(data, tok2id):
    """
    Replace every word of every document by its id.

    Words missing from ``tok2id`` are mapped to ``ID_UNK``.

    :param data: iterable of documents, each an iterable of words
    :param tok2id: mapping from word to integer id
    :return: ``(ids, coverage)`` where ``ids`` mirrors the structure of
             ``data`` and ``coverage`` is the fraction of words found in
             ``tok2id``; ``0.0`` is returned when there are no words at all
             (previously this case raised ZeroDivisionError)
    """
    ret = []
    unk = 0
    tot = 0
    for doc in data:
        ids = []
        for word in doc:
            try:
                ids.append(tok2id[word])
            except KeyError:
                # ID_UNK is deliberately evaluated only on a miss.
                # NOTE(review): ID_UNK is not defined in the visible notebook
                # cells - confirm where it is expected to come from.
                ids.append(ID_UNK)
                unk += 1
        tot += len(ids)
        ret.append(ids)
    coverage = (tot - unk) / tot if tot else 0.0
    return ret, coverage


def sen_map2tok(sen, id2tok):
    """Translate a sequence of ids back into a list of tokens via ``id2tok``."""
    return [id2tok[tok_id] for tok_id in sen]


def load_data(doc_filename,
              doc_dict_path,
              max_doc_vocab=None):
    """
    Read a whitespace-tokenised document file and convert it to id sequences.

    The dictionary at ``doc_dict_path`` is loaded with ``load_dict``; when that
    returns ``None`` a new one is built from the documents with ``create_dict``.

    :param doc_filename: text file with one document per line
    :param doc_dict_path: path of the dictionary file to load or create
    :param max_doc_vocab: vocabulary limit forwarded to ``load_dict`` / ``create_dict``
    :return: ``(docid, doc_dict)`` - the id sequences and the
             ``(tok2id, id2tok)`` pair that produced them
    """
    logging.info("Load document from {}.".format(doc_filename))

    with open(doc_filename) as docfile:
        docs = [line.split() for line in docfile.readlines()]

    doc_dict = load_dict(doc_dict_path, max_doc_vocab)
    if doc_dict is None:
        doc_dict = create_dict(doc_dict_path, docs, max_doc_vocab)

    tok2id = doc_dict[0]
    docid, cover = corpus_map2id(docs, tok2id)
    logging.info("Doc dict covers {:.2f}% words.".format(cover * 100))

    return docid, doc_dict

def corpus_preprocess(corpus):
    """Return a new list in which every digit of every line is replaced by ``#``."""
    import re
    digit = re.compile('\\d')
    return [digit.sub('#', line) for line in corpus]


def sen_postprocess(sen):
    """Return ``sen`` unchanged (currently an identity function)."""
    return sen

In [4]:
class KinQueryDataset:
    """
    In-memory training set pairing id-encoded queries with float labels.

    ``queries`` holds the output of ``preprocess`` and ``labels`` is an array
    with one float32 per line of the label file, each stored as a length-1 row.
    """

    def __init__(self, dataset_path: str, max_length: int):
        # Locations of the data file and of the label file.
        train_dir = os.path.join(dataset_path, 'train')
        queries_path = os.path.join(train_dir, 'train_data')
        labels_path = os.path.join(train_dir, 'train_label')
        # Read the queries, map them to ids, then run them through preprocess.
        query_ids, _vocab = load_data(queries_path, "doc_dict", 3000)
        self.queries = preprocess(query_ids, max_length)
        # Read the labels: one value per line.
        with open(labels_path) as label_file:
            label_rows = [[np.float32(line)] for line in label_file.readlines()]
        self.labels = np.array(label_rows)

    def __len__(self):
        """Number of queries in the dataset."""
        return len(self.queries)

    def __getitem__(self, idx):
        """Return the ``(queries, labels)`` pair selected by ``idx`` (index or slice)."""
        return self.queries[idx], self.labels[idx]



In [5]:
def preprocess(data: list, max_length: int):
    """
    Convert variable-length id sequences into a fixed-width int32 matrix.

    Sequences with at least ``max_length`` items keep only their first
    ``max_length`` items. Shorter sequences are right-aligned and padded with
    zeros on the left. An empty sequence produces an all-zero row.

    :param data: list of sequences of integer ids
    :param max_length: number of columns in the result
    :return: ``np.ndarray`` of shape ``(len(data), max_length)`` and dtype int32
    """
    padded = np.zeros((len(data), max_length), dtype=np.int32)
    for idx, seq in enumerate(data):
        length = len(seq)
        if length >= max_length:
            padded[idx, :max_length] = np.asarray(seq)[:max_length]
        elif length:
            # Write the sequence into the tail of the row. The previous
            # `row[:-length]` slicing degenerated to an empty slice when
            # length == 0 and made the row assignment fail.
            padded[idx, max_length - length:] = seq
        # length == 0: the row simply stays all zeros.
    return padded

In [6]:
# User options
# NOTE(review): `batch` is reassigned to 1 at the top of the training cell below.
batch = 200
epochs = 100

# Width of each embedding vector, and number of ids per input row.
embedding = 10
strmaxlen = 50
DATASET_PATH = '../sample_data/movie_review/'

# Model specification
# NOTE(review): input_size is not referenced anywhere else in the visible cells.
input_size = embedding*strmaxlen
output_size = 1
hidden_layer_size = 200
learning_rate = 0.001
# Number of rows in the embedding table.
# NOTE(review): KinQueryDataset builds its dictionary with a vocabulary limit
# of 3000, so ids >= 424 would fall outside this table - confirm the two agree.
character_size = 424

# Inputs: a batch of id rows of width strmaxlen, and one float target per row.
x = tf.placeholder(tf.int32, [None, strmaxlen])
y_ = tf.placeholder(tf.float32, [None, output_size])
# Embedding
char_embedding = tf.get_variable('char_embedding', [character_size, embedding])
embedded = tf.nn.embedding_lookup(char_embedding, x)

# First layer: a single LSTM cell unrolled over the embedded sequence.
cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_layer_size)
#cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
#cell2 = tf.nn.rnn_cell.BasicLSTMCell(hidden_layer_size)

#multi_cell = tf.nn.rnn_cell.MultiRNNCell([cell, cell2])

output, state = tf.nn.dynamic_rnn(cell, embedded, dtype=tf.float32)
# Swap the first two axes so that indexing with -1 selects the output of the
# last time step for every example (printed shape below: (?, 200)).
output = tf.transpose(output, [1, 0, 2])
output = output[-1]
print("output shaep",output.shape)

# Second (output) layer: linear projection to output_size values, no activation.
second_layer_weight = weight_variable([hidden_layer_size, output_size])
second_layer_bias = bias_variable([output_size])
foutput = tf.matmul(output, second_layer_weight) + second_layer_bias
#output_sigmoid = tf.sigmoid(output)
print("output shaep",foutput.shape)
print("lables shaep",y_.shape)
# Loss and optimizer
#global_step = tf.Variable(0)

# Mean squared error between the targets and the linear output.
# NOTE(review): tf.losses.mean_squared_error already returns a reduced scalar
# with its default reduction, so the outer reduce_mean looks redundant - confirm.
cost = tf.reduce_mean(tf.losses.mean_squared_error(labels=y_, predictions=foutput))
#cost = tf.reduce_mean(-(y_ * tf.log(output)) - (1-y_) * tf.log(1-output))
#learning_rate= tf.train.exponential_decay(learning_rate, global_step, 10000, 0.75)
train_step = tf.train.AdamOptimizer(learning_rate).minimize(cost)
#train_step = tf.train.AdadeltaOptimizer(1.0, 0.95, 1e-6).minimize(cost)

# The interactive session is what lets the initializer be run with a bare
# `.run()`; later cells use `sess` explicitly.
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()



output shaep (?, 200)
output shaep (?, 1)
lables shaep (?, 1)


In [7]:
# Load the training data; every query becomes a row of strmaxlen ids.
dataset = KinQueryDataset(DATASET_PATH, strmaxlen)
dataset_len = len(dataset)

In [8]:
# Number of examples in the training set.
print(dataset_len)

107


In [9]:
import time
import math

def timeSince(since):
    """
    Format the wall-clock time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    :param since: a timestamp previously obtained from ``time.time()``
    :return: string with whole minutes and the remaining seconds truncated to an integer
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()
# NOTE(review): this overrides the `batch = 200` user option set earlier, so
# every training step sees a single example - confirm this is intended.
batch = 1
# NOTE(review): the dataset was already built in an earlier cell; it is
# rebuilt here with the same arguments.
dataset = KinQueryDataset(DATASET_PATH, strmaxlen)
dataset_len = len(dataset)
# Number of mini-batches per epoch: dataset_len / batch, rounded up.
one_batch_size = dataset_len//batch
if dataset_len % batch != 0:
    one_batch_size += 1
# Run one training pass over the data per epoch.
for epoch in range(epochs):
    avg_loss = 0.0
    for i, (data, labels) in enumerate(_batch_loader(dataset, batch)):
        #zero = np.zeros([batch,1])
        #zero[0] =
        # One optimisation step; also fetch the loss and the raw model output.
        # `o` and `labels` stay bound after the loop and are printed by the next cell.
        _, loss, o = sess.run([train_step, cost, foutput],
                           feed_dict={x: data, y_: labels})
        # NOTE(review): the disabled print below says "BCE", but `cost` is a mean squared error.
        #print('Batch : ', i + 1, '/', one_batch_size,', BCE in this minibatch: ', (loss))
        avg_loss += float(loss)
        #print("output",o.reshape((1,-1)))
        #print("lablels", labels.reshape((1,-1)))
    # Report the mean of the per-batch losses and the elapsed time.
    print('epoch:', epoch, ' train_loss:', float(avg_loss/one_batch_size),timeSince(start))

epoch: 0  train_loss: 14.000910227328417 0m 1s
epoch: 1  train_loss: 7.394086530561304 0m 3s
epoch: 2  train_loss: 7.2915475905843845 0m 5s
epoch: 3  train_loss: 7.275028882750716 0m 7s
epoch: 4  train_loss: 7.255075617391141 0m 9s
epoch: 5  train_loss: 7.2094476247123644 0m 11s
epoch: 6  train_loss: 7.12834645439566 0m 12s
epoch: 7  train_loss: 6.963680754406987 0m 14s
epoch: 8  train_loss: 6.4129705685870855 0m 16s
epoch: 9  train_loss: 4.052321985119034 0m 18s
epoch: 10  train_loss: 2.4801265441935874 0m 20s
epoch: 11  train_loss: 1.6705004655090068 0m 22s
epoch: 12  train_loss: 1.2692343617034516 0m 23s
epoch: 13  train_loss: 0.8191741406489175 0m 25s
epoch: 14  train_loss: 0.57157738192323 0m 27s
epoch: 15  train_loss: 0.3581218612378966 0m 29s
epoch: 16  train_loss: 0.2804601522301004 0m 31s
epoch: 17  train_loss: 0.2933378097546223 0m 32s
epoch: 18  train_loss: 0.2424758811995527 0m 34s
epoch: 19  train_loss: 0.21722462545877985 0m 36s
epoch: 20  train_loss: 0.17963464545993413 

In [10]:
# Compare the model output and the target of the last mini-batch processed
# by the training loop (`o` and `labels` are left over from that loop).
print("output",o.reshape((1,-1)))
print("lablels", labels.reshape((1,-1)))

output [[ 9.63156509]]
lablels [[ 10.]]


In [29]:
import data_util
def infer(raw_data, **kwargs):
    """
    Score ``raw_data`` with the model held in the global session ``sess``.

    :param raw_data: passed unchanged to ``data_util.load_data`` as its first argument
    :param kwargs: accepted but not used
    :return: list of ``(0.0, score)`` tuples, one per prediction, with every
             score limited to the range [0, 10]
    """
    # Turn the raw text into id sequences, then into fixed-width rows.
    # NOTE(review): strmaxlen is passed as the third argument here, whereas
    # KinQueryDataset passes 3000 in that position to the notebook's own
    # load_data - confirm what data_util.load_data expects.
    id_seqs, _ = data_util.load_data(raw_data, 'doc_dict', strmaxlen)
    queries = preprocess(id_seqs, strmaxlen)
    # Feed the rows to the model and fetch its predictions.
    pred = sess.run(foutput, feed_dict={x: queries})

    # Clamp out-of-range predictions to the nearest bound.
    below = pred < 0
    above = pred > 10
    if below.any() or above.any():
        pred[below] = 0
        pred[above] = 10
    point = pred.flatten()
    # DONOTCHANGE: They are reserved for nsml
    # The result must have the form [(probability, 0 or 1)] to be accepted by
    # the leaderboard; the probability value does not affect the leaderboard result.
    return list(zip(np.zeros(len(point)), point))


In [13]:
def _batch_loader(iterable, n=1):

    length = len(iterable)
    for n_idx in range(0, length, n):
        yield iterable[n_idx:min(n_idx + n, length)]

In [30]:
# Re-read the raw training queries and score them in chunks of up to 200 lines.
with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
    queries = f.readlines()
res = []
# NOTE(review): this loop rebinds the global `batch`, which the training cell
# uses as the batch size - re-running that cell afterwards resets it to 1.
for batch in _batch_loader(queries, 200):
    temp_res = infer(batch)
    res += temp_res
# Prints every (0.0, score) pair returned by infer.
print(res)

[(0.0, 6.7659721), (0.0, 6.8927441), (0.0, 8.9276371), (0.0, 6.8059282), (0.0, 9.0036631), (0.0, 10.0), (0.0, 6.3613091), (0.0, 10.0), (0.0, 6.2177), (0.0, 7.7517004), (0.0, 9.5121126), (0.0, 6.1602783), (0.0, 9.2664976), (0.0, 8.8918076), (0.0, 9.0356712), (0.0, 8.9323883), (0.0, 9.4920063), (0.0, 8.9323883), (0.0, 8.6010523), (0.0, 9.4178371), (0.0, 6.941514), (0.0, 7.1960545), (0.0, 6.9128475), (0.0, 7.7637119), (0.0, 6.8020983), (0.0, 8.300272), (0.0, 9.2337198), (0.0, 9.2530594), (0.0, 6.5494986), (0.0, 6.5494986), (0.0, 8.9254465), (0.0, 9.2337198), (0.0, 7.668149), (0.0, 8.3516264), (0.0, 6.5494986), (0.0, 4.5342312), (0.0, 6.5494986), (0.0, 6.6981001), (0.0, 5.1153784), (0.0, 2.7503002), (0.0, 7.1914301), (0.0, 8.7108364), (0.0, 8.9117308), (0.0, 1.9282413), (0.0, 7.1914301), (0.0, 7.0425749), (0.0, 7.4378223), (0.0, 6.5494986), (0.0, 6.8590784), (0.0, 7.1914301), (0.0, 6.8583755), (0.0, 9.720521), (0.0, 4.2906008), (0.0, 9.6816006), (0.0, 9.0883255), (0.0, 6.5494986), (0.0, 6.