In [1]:
from os import listdir, path
from tqdm import tqdm
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

  from ._conv import register_converters as _register_converters


<h2><center> Подготовка данных</center></h2>

In [2]:
SOURCE_PATH = './data/sources/'

'''
sources_ws - split by white spaces and \n
sources_ch - split by chars
'''
sources_ws = []
sources_ch = []
for file_name in tqdm(listdir(SOURCE_PATH)):
    with open(path.join(SOURCE_PATH, file_name), 'r', encoding="ansi") as file:
        data=file.read()
        sources_ws.append(data.split())
        sources_ch.append([ord(char) for char in list(data)])

100%|███████████████████████████████████| 21941/21941 [00:41<00:00, 526.75it/s]


In [3]:
print(len(sources_ws))
print(len(sources_ch))

21941
21941


In [4]:
lengths_ws = [len(data) for data in sources_ws]
lengths_ch = [len(data) for data in sources_ch]
max_l_ws = max(lengths_ws)
min_l_ws = min(lengths_ws)
max_l_ch = max(lengths_ch)
min_l_ch = min(lengths_ch)
print('Максимальная длина разбинения по пробелам: ', max_l_ws)
print('Минимальная длина разбинения по пробелам: ', min_l_ws)
print('Максимальная длина разбинения по символам: ', max_l_ch)
print('Минимальная длина разбинения по символам: ', min_l_ch)

Максимальная длина разбинения по пробелам:  289
Минимальная длина разбинения по пробелам:  1
Максимальная длина разбинения по символам:  1023
Минимальная длина разбинения по символам:  7


In [5]:
PAD_SYMBOL = ord(' ')
flat_sources_ch = [ch for source in sources_ch for ch in source]
flat_sources_ch.append(PAD_SYMBOL)
unique_ch = set(flat_sources_ch)
ch2index = {ch: i for (i, ch) in enumerate(unique_ch)}
index2ch = {i: ch for (i, ch) in enumerate(unique_ch)}

In [6]:
a = [ch for source in sources_ch for ch in source]
ch_encoder = OneHotEncoder()
ch_one_hot = ch_encoder.fit_transform(np.reshape(a, (-1, 1)))

In [7]:
ch_one_hot_sources = []
end_index = 0
for l in lengths_ch:
    ch_one_hot_sources.append(ch_one_hot[end_index:end_index+l])
    end_index += l

In [8]:
def get_batch(x, batch_size):
    x_batch = np.array([seq.toarray() for seq in x[:batch_size]])
    seq_lengths = [seq.shape[0] for seq in x_batch]
    return x_batch, seq_lengths

def next_batch(x, size):
    idx = np.arange(0 , len(x))
    np.random.shuffle(idx)
    idx = idx[:size]
    x_batch = np.array([x[i].toarray() for i in idx])
    seq_lengths = [seq.shape[0] for seq in x_batch]
    return x_batch, seq_lengths

def add_padding(batch, pad_size, padding):
    result = np.empty((0, pad_size, batch[0].shape[1]), np.float32)
    for seq in batch:
        if len(seq) < pad_size:
            seq = np.concatenate([seq, [padding for _ in range(pad_size - len(seq))]])
        result = np.append(result, [seq], axis=0)
    return result

In [9]:
tf.reset_default_graph()

ENCODER_LAYERS = [1024, 512]
DECODER_LAYERS = [1024]
VEC_DIM = len(unique_ch)

X = tf.placeholder(tf.float32, [None, None, VEC_DIM])
Y = tf.placeholder(tf.float32, [None, None, VEC_DIM])
seq_length = tf.placeholder(tf.int32, [None])

with tf.variable_scope('encoder'):
    encoder_layers = [tf.contrib.rnn.BasicRNNCell(size) for size in ENCODER_LAYERS]
    encoder_layer_cell = tf.contrib.rnn.MultiRNNCell(encoder_layers)
    _, encoder_state = tf.nn.dynamic_rnn(cell=encoder_layer_cell,
                                         inputs=X,
                                         dtype=tf.float32,
                                         sequence_length = seq_length)


with tf.variable_scope('decoder'):
    decoder_input_pad = tf.zeros((tf.shape(X)[0], tf.shape(X)[1]-1, ENCODER_LAYERS[-1]))
    decoder_input = tf.concat([tf.reshape(encoder_state[-1], (-1, 1, ENCODER_LAYERS[-1])), decoder_input_pad], axis=1)
    decoder_layers = [tf.contrib.rnn.BasicRNNCell(size) for size in DECODER_LAYERS + [VEC_DIM]]
    decoder_layer_cell = tf.contrib.rnn.MultiRNNCell(decoder_layers)
    decoder_out, decoder_state = tf.nn.dynamic_rnn(cell=decoder_layer_cell,
                                                   inputs=decoder_input,
                                                   dtype=tf.float32,
                                                   sequence_length = seq_length)
    logit = tf.nn.softmax(decoder_out, axis=2)
    
    target = tf.argmax(Y, axis=2)
#     t = tf.reduce_sum(logit, axis=2)
    target_weights = tf.cast(tf.sequence_mask(seq_length, tf.reduce_max(seq_length)), tf.float32)
#     rmse_loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(targets, outputs))))
    loss = tf.contrib.seq2seq.sequence_loss(logit, target, target_weights)

In [10]:
BATCH_SIZE = 1

init = tf.global_variables_initializer()
sess = tf.Session()

sess.run(init)

x_batch, batch_seq_len = next_batch(ch_one_hot_sources, BATCH_SIZE)
print(batch_seq_len)
print(x_batch.shape)
# print(x_batch[:,:10,:].shape)
# print(x_batch[1].shape)
print('---=-=')
x_batch = add_padding(x_batch, max(batch_seq_len), np.eye(VEC_DIM)[PAD_SYMBOL])
print(x_batch.shape)
x_batch = x_batch[:,:10,:]
batch_seq_len = [10]
print(x_batch.shape)
# train_x = np.array(ch_one_hot_sources[:2])
# dim = train_x[0].toarray()
# print(dim)
# print(train_x)
print('---')
print(sess.run(encoder_state, feed_dict={X: x_batch, seq_length: batch_seq_len})[-1].shape)
# print(sess.run(decoder_input_pad, feed_dict={X: x_batch, seq_length: batch_seq_len}))
print(sess.run(decoder_input, feed_dict={X: x_batch, seq_length: batch_seq_len}).shape)
print(sess.run(decoder_out, feed_dict={X: x_batch, seq_length: batch_seq_len}).shape)
print(sess.run(logit, feed_dict={X: x_batch, seq_length: batch_seq_len}))
print(sess.run(target, feed_dict={Y: x_batch, seq_length: batch_seq_len}))
print(sess.run(target_weights, feed_dict={Y: x_batch, seq_length: batch_seq_len}))
print(sess.run(loss, feed_dict={X: x_batch, Y: x_batch, seq_length: batch_seq_len}))

# print(sess.run(t, feed_dict={X: x_batch, seq_length: batch_seq_len}))


[515]
(1, 515, 113)
---=-=
(1, 515, 113)
(1, 10, 113)
---
(1, 512)
(1, 10, 512)
(1, 10, 113)
[[[0.00823788 0.01020501 0.00885439 ... 0.01071051 0.00840354 0.01011684]
  [0.00990466 0.01107283 0.00885214 ... 0.00907372 0.00723025 0.00864497]
  [0.00848261 0.00834738 0.00850012 ... 0.00746135 0.00885945 0.00923185]
  ...
  [0.00851918 0.00858712 0.00948008 ... 0.00922577 0.00895549 0.00855347]
  [0.00863538 0.00919989 0.00831656 ... 0.00937999 0.00932554 0.00901209]
  [0.00853429 0.00920118 0.00876028 ... 0.00854011 0.00897201 0.00857319]]]
[[20 15  5 59 74 87 78 75 94 31]]
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
4.727465


In [11]:
train_size = 0.99
X_train = ch_one_hot_sources[:int(train_size*len(ch_one_hot_sources))]
X_test = np.array([seq.toarray() for seq in ch_one_hot_sources[int(train_size*len(ch_one_hot_sources)):]])
print('Train set size: ', len(X_train))
print('Test set size: ', len(X_test))
test_seq_length = [seq.shape[0] for seq in X_test]
max_test_seq_length = max(test_seq_length)
X_test = add_padding(X_test, max_test_seq_length, np.eye(VEC_DIM)[PAD_SYMBOL])

Train set size:  21721
Test set size:  220


In [12]:
LEARNING_RATE = 0.01
BATCH_SIZE = 10
EPOCHES = 1
N_BATHCES = int(len(X_train)/BATCH_SIZE)
print('Batches: ', N_BATHCES)

optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
sess = tf.Session()

sess.run(init)

for epoch in range(EPOCHES):
    print(X_test.shape)
    test_feed_dict = {
        X: X_test,
        Y: X_test,
        seq_length: test_seq_length
    }
    test_loss = sess.run(loss, feed_dict=test_feed_dict)
    print('Epoch %u, test loss: %f' %(epoch, test_loss))
    for i in range(N_BATHCES):
        x_batch, batch_seq_len = next_batch(X_train, BATCH_SIZE)
        x_batch = add_padding(x_batch, max(batch_seq_len), np.eye(VEC_DIM)[PAD_SYMBOL])
        feed_dict = {
            X: x_batch,
            Y: x_batch,
            seq_length: batch_seq_len
        }
        sess.run(loss, feed_dict=feed_dict)
        train_loss = sess.run(loss, feed_dict=feed_dict)
        print(train_loss)

max_test_seq_length

Batches:  2172
(220, 1021, 113)
Epoch 0, test loss: 4.727473
4.7273808
4.727367
4.727393
4.727385
4.727394
4.727403
4.7273717
4.727387


KeyboardInterrupt: 