In [1]:
import argparse
import os
import logging
import numpy as np
import tensorflow as tf

from dataset import KinQueryDataset, preprocess

In [2]:
# Special vocabulary markers: the padding filler and the out-of-vocabulary
# placeholder.
MARK_PAD = "<PAD>"
MARK_UNK = "<UNK>"

# A marker's position in this list is the token id reserved for it.
MARKS = [MARK_PAD, MARK_UNK]
ID_PAD = MARKS.index(MARK_PAD)
ID_UNK = MARKS.index(MARK_UNK)

In [3]:
def _batch_loader(iterable, n=1):
    """
    데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다

    :param iterable: 데이터 list, 혹은 다른 포맷
    :param n: 배치 사이즈
    :return:
    """
    length = len(iterable)
    for n_idx in range(0, length, n):
        yield iterable[n_idx:min(n_idx + n, length)]


def weight_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def bias_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` with every element initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

In [4]:
def load_dict(dict_path, max_vocab=None):
    """
    Load a vocabulary file made of ``<id> <token>`` lines.

    :param dict_path: path of the dictionary file
    :param max_vocab: when truthy, keep only the entries whose id is below
        this value
    :return: ``(tok2id, id2tok)`` dictionaries, or ``None`` when the file
        cannot be read (the caller then creates the dictionary)
    """
    logging.info("Try load dict from {}.".format(dict_path))
    try:
        # No explicit encoding: this mirrors how create_dict writes the file.
        with open(dict_path) as dict_file:
            dict_data = dict_file.readlines()
    except (OSError, UnicodeError):
        # Missing or unreadable file is an expected situation, not an error.
        logging.info(
            "Load dict {dict} failed, create later.".format(dict=dict_path))
        return None

    # Blank lines carry no entry; skipping them avoids an IndexError below.
    entries = [fields for fields in (line.split() for line in dict_data)
               if fields]
    if max_vocab:
        entries = [fields for fields in entries if int(fields[0]) < max_vocab]
    tok2id = {fields[1]: int(fields[0]) for fields in entries}
    id2tok = {int(fields[0]): fields[1] for fields in entries}
    logging.info(
        "Load dict {} with {} words.".format(dict_path, len(tok2id)))
    return (tok2id, id2tok)

def create_dict(dict_path, corpus, max_vocab=None):
    """
    Build a frequency-ranked vocabulary from ``corpus`` and write it to disk.

    The two special markers always take ids 0 and 1; the remaining tokens
    follow in order of decreasing frequency. The file holds one
    ``<id> <token>`` line per entry.

    :param dict_path: path of the dictionary file to write
    :param corpus: iterable of documents, each an iterable of tokens
    :param max_vocab: when truthy, keep at most this many entries
        (special markers included)
    :return: ``(tok2id, id2tok)`` dictionaries
    """
    logging.info("Create dict {}.".format(dict_path))
    counter = {}
    for line in corpus:
        for word in line:
            counter[word] = counter.get(word, 0) + 1

    # The markers are reserved; a literal occurrence in the corpus must not
    # get a second id.
    for mark_t in MARKS:
        if mark_t in counter:
            del counter[mark_t]
            logging.warning("{} appears in corpus.".format(mark_t))

    # sorted() is stable, so tokens with equal counts keep first-seen order.
    ranked = sorted(counter.items(), key=lambda item: -item[1])
    words = [MARK_PAD, MARK_UNK] + [word for word, _ in ranked]
    if max_vocab:
        words = words[:max_vocab]

    tok2id = dict()
    id2tok = dict()
    with open(dict_path, 'w') as dict_file:
        for idx, tok in enumerate(words):
            print(idx, tok, file=dict_file)
            tok2id[tok] = idx
            id2tok[idx] = tok

    logging.info(
        "Create dict {} with {} words.".format(dict_path, len(words)))
    return (tok2id, id2tok)

def corpus_map2id(data, tok2id):
    """
    Convert tokenised documents to token ids.

    Tokens missing from ``tok2id`` are replaced by ``ID_UNK``.

    :param data: iterable of documents, each an iterable of tokens
    :param tok2id: mapping from token to id
    :return: ``(ids, coverage)`` where ``ids`` is a list of id lists and
        ``coverage`` is the fraction of tokens found in ``tok2id``
        (``0.0`` when the corpus contains no tokens at all)
    """
    ret = []
    unk = 0
    tot = 0
    for doc in data:
        ids = []
        for word in doc:
            tot += 1
            if word in tok2id:
                ids.append(tok2id[word])
            else:
                ids.append(ID_UNK)
                unk += 1
        ret.append(ids)
    # An empty corpus would otherwise divide by zero.
    coverage = (tot - unk) / tot if tot else 0.0
    return ret, coverage


def sen_map2tok(sen, id2tok):
    """Translate a sequence of token ids back into tokens through ``id2tok``."""
    return [id2tok[token_id] for token_id in sen]


def load_data(data,
              doc_dict_path,
              max_doc_vocab=None):
    """
    Read a whitespace-tokenised text file and convert it to token ids.

    The vocabulary is loaded from ``doc_dict_path`` when that file can be
    read; otherwise it is built from the documents and written there.

    :param data: path of the UTF-8 text file, one document per line
    :param doc_dict_path: path of the vocabulary file to load or create
    :param max_doc_vocab: optional vocabulary size cap
    :return: ``(docid, doc_dict)`` where ``docid`` is a list of id lists and
        ``doc_dict`` is the ``(tok2id, id2tok)`` pair
    """
    with open(data, 'rt', encoding="utf-8") as docfile:
        docs = docfile.readlines()

    # Tokenise the lines that were read, not `data`, which is only the path.
    # NOTE(review): load_test_data runs corpus_preprocess (digits -> '#')
    # before tokenising and this function does not; confirm whether the
    # training text should be normalised the same way.
    docs = [line.split() for line in docs]

    doc_dict = load_dict(doc_dict_path, max_doc_vocab)
    if doc_dict is None:
        doc_dict = create_dict(doc_dict_path, docs, max_doc_vocab)

    docid, cover = corpus_map2id(docs, doc_dict[0])
    print("Doc dict covers %.2f%% words."%(cover*100))

    return docid, doc_dict

def load_test_data(doc, doc_dict):
    """
    Convert raw test documents to token ids with an existing vocabulary.

    :param doc: iterable of raw document strings
    :param doc_dict: ``(tok2id, id2tok)`` pair; only ``tok2id`` is used
    :return: list of id lists, one per document
    """
    docs = corpus_preprocess(doc)

    print("Load %d testing documents."%(len(docs)))
    docs = [line.split() for line in docs]

    docid, cover = corpus_map2id(docs, doc_dict[0])
    # The value is a percentage, so print the '%' sign as load_data does.
    print("Doc dict covers %.2f%% words."%(cover*100))

    return docid

def corpus_preprocess(corpus):
    """Return a copy of ``corpus`` in which every digit of every line is replaced by '#'."""
    import re
    digit = re.compile('\\d')
    return [digit.sub('#', line) for line in corpus]


def sen_postprocess(sen):
    """Return ``sen`` unchanged (no post-processing is applied)."""
    return sen

In [5]:
class KinQueryDataset:
    """
    Training queries and their labels, held in memory.

    ``queries`` is whatever ``preprocess`` returns for the id sequences
    produced by ``load_data``; ``labels`` holds one ``np.float32`` per line
    of the label file, each in its own row. Supports ``len()`` and indexing.
    """

    def __init__(self, dataset_path: str, max_length: int, vocab_size: int):
        """
        :param dataset_path: directory containing ``train/train_data`` and
            ``train/train_label``
        :param max_length: sequence length passed to ``preprocess``
        :param vocab_size: vocabulary cap passed to ``load_data``; it also
            names the dictionary file (``"<vocab_size>dict"``)
        """
        # Paths of the query data and of the labels
        train_dir = os.path.join(dataset_path, 'train')
        queries_path = os.path.join(train_dir, 'train_data')
        labels_path = os.path.join(train_dir, 'train_label')

        # Read the query data and run it through preprocess
        dict_name = str(vocab_size) + "dict"
        doc_ids, doc_dict = load_data(queries_path, dict_name, vocab_size)
        self.queries = preprocess(doc_ids, doc_dict, max_length)

        # Read the labels: one float per line, each wrapped in its own row
        with open(labels_path) as label_file:
            label_rows = [[np.float32(line)] for line in label_file]
            self.labels = np.array(label_rows)

    def __len__(self):
        """Number of queries."""
        return len(self.queries)

    def __getitem__(self, idx):
        """Return the ``(queries, labels)`` pair selected by ``idx``."""
        return self.queries[idx], self.labels[idx]

In [6]:
def preprocess(data: list, dic: list, max_length: int):
    """
    Pack variable-length id sequences into one fixed-width int32 matrix.

    A sequence with ``max_length`` or more items keeps its first
    ``max_length`` items. A shorter one is right-aligned, with zeros filling
    the columns to its left. An empty sequence gives an all-zero row.

    :param data: list of id sequences
    :param dic: not used by this function
    :param max_length: number of columns of the result
    :return: ``np.int32`` array of shape ``(len(data), max_length)``
    """
    padded = np.zeros((len(data), max_length), dtype=np.int32)
    for row, seq in enumerate(data):
        size = len(seq)
        if size >= max_length:
            # Too long (or exact fit): keep the leading max_length ids.
            padded[row, :max_length] = np.array(seq)[:max_length]
        elif size > 0:
            # Too short: write the ids into the trailing columns.
            padded[row, max_length - size:] = np.array(seq)
    return padded

In [7]:
#DATASET_PATH = '../sample_data/movie_review/'
#queries_path = os.path.join(DATASET_PATH, 'train', 'train_data')
#with open(queries_path, 'rt', encoding='utf8') as f:
#            print(f.readlines())
#dataset = KinQueryDataset(DATASET_PATH, strmaxlen)

In [8]:
def label_one_hot(labels, num_classes=None):
    """
    One-hot encode class indices by comparing them against ``arange(num_classes)``.

    :param labels: array of class indices; it must broadcast against a
        vector of length ``num_classes`` (assumes shape ``(N, 1)``; TODO
        confirm with callers)
    :param num_classes: number of classes; when omitted, the module-level
        ``output_size`` is used, as before
    :return: ``np.int32`` array with 1 where the class index matches, else 0
    """
    if num_classes is None:
        # Fall back to the notebook-level setting for existing callers.
        num_classes = output_size
    return (np.arange(num_classes) == labels).astype(np.int32)
def catFromOut(output):
    """Return a NumPy array holding, for each row of ``output``, the index of its largest entry."""
    return np.array([np.argmax(row, axis=0) for row in output])

In [9]:
# User options
# `batch` is the mini-batch size and `epochs` the number of training passes;
# both are read by the training cell.
batch = 1
epochs = 100

# `embedding` is the width of each embedding vector; `strmaxlen` is the fixed
# sequence length (number of columns of `x`).
embedding = 30
strmaxlen = 50
DATASET_PATH = '../sample_data/movie_review/'

# Model specification
# NOTE(review): input_size is not referenced anywhere else in the visible cells.
input_size = embedding*strmaxlen
output_size = 1
hidden_layer_size = 200
learning_rate = 0.01
# Number of rows in the embedding table, so every id fed to `x` must be below it.
# NOTE(review): the training cell builds KinQueryDataset with a vocabulary cap
# of 3000, which is larger than this value; confirm the two agree for the data.
dict_size = 424

# Inputs: a batch of token-id sequences and the matching target values.
x = tf.placeholder(tf.int32, [None, strmaxlen])
y_ = tf.placeholder(tf.float32, [None, output_size])
# Embedding
char_embedding = tf.get_variable('char_embedding', [dict_size, embedding])
embedded = tf.nn.embedding_lookup(char_embedding, x)

# First layer
cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_layer_size)
#cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
#cell2 = tf.nn.rnn_cell.BasicLSTMCell(hidden_layer_size)

#multi_cell = tf.nn.rnn_cell.MultiRNNCell([cell, cell2])

# Swap the first two axes of the RNN output so that indexing with [-1] picks
# the last time step; the recorded output shows the result is (?, 200).
output, state = tf.nn.dynamic_rnn(cell, embedded, dtype=tf.float32)
output = tf.transpose(output, [1, 0, 2])
output = output[-1]
print("output shaep",output.shape)

# Second (output) layer
second_layer_weight = weight_variable([hidden_layer_size, output_size])
second_layer_bias = bias_variable([output_size])
output = tf.matmul(output, second_layer_weight) + second_layer_bias
#output_sigmoid = tf.sigmoid(output)
print("output shape",output.shape)
print("lables shaep",y_.shape)
# Loss and optimizer
#global_step = tf.Variable(0)
# NOTE(review): foutput is not used by the loss below or by any other visible cell.
foutput = tf.nn.log_softmax(output)
# Mean squared error between the targets and the raw layer output.
cost = tf.reduce_mean(tf.losses.mean_squared_error(labels=y_, predictions=output))
#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y_))
#learning_rate= tf.train.exponential_decay(learning_rate, global_step, 10000, 0.75)
train_step = tf.train.AdamOptimizer(learning_rate).minimize(cost)


# Open an interactive session and initialise every variable created above.
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()



output shaep (?, 200)
output shape (?, 1)
lables shaep (?, 1)


In [10]:
import time
import math
from sklearn.metrics import mean_squared_error
# Loss history list read by the plotting cell (plt.plot(all_losses)).
all_losses=[]
# NOTE(review): current_loss is not referenced anywhere else in the visible cells.
current_loss=0
def timeSince(since):
    """Format the time elapsed since ``since`` (a ``time.time()`` stamp) as ``'<minutes>m <seconds>s'``."""
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()
#epochs = 10
dataset = KinQueryDataset(DATASET_PATH, strmaxlen, 3000)
dataset_len = len(dataset)

# The embedding table has `dict_size` rows, so every token id fed to `x` must
# be below it. Fail early with a clear message instead of letting the lookup
# receive an out-of-range id.
if dataset.queries.size and int(np.max(dataset.queries)) >= dict_size:
    raise ValueError(
        "token id {} is out of range for an embedding table with {} rows; "
        "dict_size must be at least the vocabulary size".format(
            int(np.max(dataset.queries)), dict_size))

# Number of mini-batches per epoch (dataset_len / batch, rounded up).
one_batch_size = dataset_len//batch
if dataset_len % batch != 0:
    one_batch_size += 1

# Run one training pass over the data per epoch. A single loop is used here;
# nesting a second identical loop would train for epochs * epochs passes.
for epoch in range(epochs):
    avg_loss = 0.0
    for i, (data, labels) in enumerate(_batch_loader(dataset, batch)):
        _, loss = sess.run([train_step, cost],
                           feed_dict={x: data, y_: labels})
        print('Batch : ', i + 1, '/', one_batch_size,
              ', MSE in this minibatch: ', float(loss))
        avg_loss += float(loss)
    # max(..., 1) keeps an empty dataset from dividing by zero.
    epoch_loss = float(avg_loss / max(one_batch_size, 1))
    # Record the epoch average so the plotting cell has data to draw.
    all_losses.append(epoch_loss)
    print('epoch:', epoch, ' train_loss:', epoch_loss, timeSince(start))

NameError: name 'data_util' is not defined

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Draw the recorded loss values on a fresh figure.
fig, ax = plt.subplots()
ax.plot(all_losses)

plt.show()