In [None]:
%load_ext autoreload
%autoreload 2

import tensorflow as tf
import numpy as np
import os
from tqdm import tqdm
import random

In [None]:
# Paragraphs shorter than this many characters are dropped by load_data.
# The length is measured after load_data appends its trailing '\n'.
MIN_PARAGRAPH_LEN = 5

In [None]:
def load_data(_dir, min_len=None):
    """Read every ``*txt`` file in ``_dir`` and return its paragraphs.

    Each file is decoded as UTF-8 and split on blank lines (``'\\n\\n'``).
    Every paragraph has its leading/trailing newlines stripped and exactly one
    ``'\\n'`` appended, so the newline doubles as an end-of-paragraph marker.

    Args:
        _dir: Directory to scan (not recursive).
        min_len: Paragraphs whose length (including the appended newline) is
            below this value are discarded. Defaults to the module-level
            ``MIN_PARAGRAPH_LEN``.

    Returns:
        list[str]: Paragraphs, ordered by file name and then by position
        inside the file.
    """
    if min_len is None:
        min_len = MIN_PARAGRAPH_LEN

    ret = []
    # os.listdir returns entries in arbitrary order; sort so the paragraph
    # order (and every index derived from it) is reproducible across runs.
    for each in sorted(os.listdir(_dir)):
        # NOTE(review): matches any name ending in "txt", not only ".txt".
        if not each.endswith("txt"):
            continue
        full_path = os.path.join(_dir, each)
        with open(full_path, "rb") as f:
            text = f.read().decode("utf-8")
        # The file is already closed here; only the decoded text is needed.
        for paragraph in text.split('\n\n'):
            paragraph = paragraph.strip('\n') + '\n'
            if len(paragraph) >= min_len:
                ret.append(paragraph)
    return ret

In [None]:
# All paragraphs found in the text files under ./speeches/ (a list of strings).
ps = load_data("./speeches/")

In [None]:
# Summary statistics of paragraph lengths (in characters).
print("Number of Paragraphs: {}".format(len(ps)))

arr = np.asarray(list(map(len, ps)))

# Each entry pairs the output template with the reducer applied to `arr`.
_length_stats = (
    ("Mean {}", np.mean),
    ("Median {}", np.median),
    ("Std {}", np.std),
    ("Max {}", np.max),
    ("Min {}", np.min),
)
for _template, _reducer in _length_stats:
    print(_template.format(_reducer(arr)))

In [None]:
# Print three randomly chosen paragraphs, each followed by a separator line.
_positions = range(len(ps))
_shown = 0
while _shown < 3:
    idx = random.choice(_positions)
    print(ps[idx])
    print("----------------------------------")
    _shown += 1

In [None]:
def preprocess(paragraphs):
    """One-hot encode paragraphs into a dense, newline-padded tensor.

    Every paragraph is prefixed with a ``"<START>"`` token and right-padded
    with ``'\\n'`` up to the length of the longest paragraph.

    Args:
        paragraphs: Non-empty sequence of strings.

    Returns:
        tuple ``(ret, lens, char_to_ix, ix_to_char)`` where

        * ``ret`` is a float array of shape
          ``(len(paragraphs), max_len + 1, vocab_size)`` with one-hot rows,
        * ``lens`` is a float array holding ``len(paragraph) + 1`` (the extra
          one accounts for the START token) for each paragraph,
        * ``char_to_ix`` / ``ix_to_char`` map characters to indices and back;
          ``"<START>"`` always gets the last index.

    Raises:
        ValueError: If ``paragraphs`` is empty.
    """
    if len(paragraphs) == 0:
        raise ValueError("preprocess() needs at least one paragraph")

    chars = set()
    for each in paragraphs:
        chars.update(each)

    longest = max(len(p) for p in paragraphs)

    # Padding is done with '\n', so it must be encodable whenever at least
    # one paragraph is shorter than the longest one.
    if any(len(p) < longest for p in paragraphs):
        chars.add('\n')

    # Sort the vocabulary: iterating a set of strings directly yields a
    # different order on every interpreter run (hash randomisation), which
    # would make the index mapping - and any saved model - non-reproducible.
    vocab = sorted(chars)
    char_to_ix = {ch: i for i, ch in enumerate(vocab)}
    ix_to_char = {i: ch for i, ch in enumerate(vocab)}

    # Add START token
    start_ix = len(char_to_ix)
    char_to_ix["<START>"] = start_ix
    ix_to_char[start_ix] = "<START>"

    vocab_size = len(char_to_ix)
    max_p = longest + 1  # Plus one because of the START token

    ret = np.zeros(shape=(len(paragraphs), max_p, vocab_size))
    lens = np.zeros(shape=len(paragraphs))
    positions = np.arange(1, max_p)

    for idx, each in enumerate(paragraphs):
        lens[idx] = len(each) + 1
        padded = each + '\n' * (longest - len(each))
        columns = np.array([char_to_ix[c] for c in padded], dtype=np.intp)

        ret[idx, 0, start_ix] = 1
        # Position i+1 of the row encodes character i of the padded paragraph.
        ret[idx, positions, columns] = 1

    return ret, lens, char_to_ix, ix_to_char

In [None]:
# One-hot tensor, per-paragraph lengths (START token included) and the
# character <-> index lookup tables for the loaded paragraphs.
data, lens, char_to_ix, ix_to_char = preprocess(ps)

In [None]:
# (number of paragraphs, longest paragraph + 1 for START, vocabulary size)
print(data.shape)

# Defining model

In [None]:
# Hyper-parameters of the character-level model.
BATCH_SIZE = 64
# Vocabulary size; ix_to_char includes the "<START>" token added by preprocess.
INPUT_SIZE = len(ix_to_char)

# Number of time steps the RNN is unrolled for in a single run.
TIMES = 32
# Number of units in each of the two LSTM layers.
N_HIDDEN = 512

# Start from an empty default graph so re-running this cell does not pile up
# duplicate ops from a previous execution.
tf.reset_default_graph()


init = tf.contrib.layers.xavier_initializer()
# x: one-hot inputs, y: one-hot targets, both (batch, TIMES, INPUT_SIZE).
x = tf.placeholder(tf.float32, shape=(None, TIMES, INPUT_SIZE), name="x")
y = tf.placeholder(tf.float32, shape=(None, TIMES, INPUT_SIZE))
# NOTE(review): `(None)` is just `None` (shape left unspecified), not the
# one-element tuple `(None,)` - confirm which was intended.
seq_len = tf.placeholder(tf.int64, shape=(None), name="seq_len")

# static_rnn takes a Python list with one (batch, INPUT_SIZE) tensor per step.
x_2 = tf.unstack(x, axis=1)

# Initial cell (c) and hidden (h) state of each LSTM layer are fed from
# outside the graph, so state can be carried from one run to the next.
init_state_c_1 = tf.placeholder(tf.float32, shape=[None, N_HIDDEN], name="init_state_c_1")
init_state_h_1 = tf.placeholder(tf.float32, shape=[None, N_HIDDEN], name="init_state_h_1")

init_state_c_2 = tf.placeholder(tf.float32, shape=[None, N_HIDDEN], name="init_state_c_2")
init_state_h_2 = tf.placeholder(tf.float32, shape=[None, N_HIDDEN], name="init_state_h_2")

cell_1 = tf.contrib.rnn.BasicLSTMCell(N_HIDDEN)
cell_2 = tf.contrib.rnn.BasicLSTMCell(N_HIDDEN)

# Two stacked LSTM layers.
cell = tf.contrib.rnn.MultiRNNCell([cell_1, cell_2])
    
t_1 = tf.contrib.rnn.LSTMStateTuple(init_state_c_1, init_state_h_1)
t_2 = tf.contrib.rnn.LSTMStateTuple(init_state_c_2, init_state_h_2)

# Unrolled RNN over the TIMES steps; seq_len gives per-row valid lengths.
outputs, states = tf.contrib.rnn.static_rnn(cell, x_2, dtype=tf.float32, sequence_length=seq_len, initial_state=(t_1, t_2))

# Re-wrap the final states behind explicitly named identity ops
# (presumably so they can be fetched by name from an exported graph - confirm).
states_0 = tf.nn.rnn_cell.LSTMStateTuple(tf.identity(states[0][0], name="states_0_c"), tf.identity(states[0][1], name="states_0_h"))
states_1 = tf.nn.rnn_cell.LSTMStateTuple(tf.identity(states[1][0], name="states_1_c"), tf.identity(states[1][1], name="states_1_h"))

states = (states_0, states_1)

# Back from a list of per-step tensors to a single (batch, TIMES, N_HIDDEN) tensor.
outputs_2 = tf.stack(outputs, axis=1)

# Per-step projection of the RNN output onto vocabulary logits.
out = tf.layers.dense(outputs_2, units=INPUT_SIZE, kernel_initializer=init, name="out")

out_softmax = tf.nn.softmax(out, name="out_softmax")

# Mean cross-entropy over every row and every time step.
# NOTE(review): no mask based on seq_len is applied here, so steps past a
# row's valid length also contribute to the loss - confirm this is intended.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=y))

tf.summary.scalar('loss', loss)

merge = tf.summary.merge_all()

# Training op: Adam with its default hyper-parameters.
upd = tf.train.AdamOptimizer().minimize(loss)

# Running RNN

In [None]:
# Reminder of the training tensor's shape: (paragraphs, time steps, vocabulary).
print(data.shape)

In [None]:
def test(max_=1000, T=None):
    """Sample one paragraph from the network, one character at a time.

    Generation starts from the ``"<START>"`` token with all-zero LSTM state.
    At every step the previously sampled character is fed as the first time
    step of the input (``seq_len`` is 1), the returned LSTM state is carried
    over to the next step, and the next character is drawn from the predicted
    distribution.

    Uses the notebook-level ``sess`` and the graph tensors defined in the
    model cell.

    Args:
        max_: Maximum number of characters to generate.
        T: Sampling temperature. ``None`` samples from the network's own
            softmax output; otherwise the logits are divided by ``T`` before
            the softmax (lower values give more conservative samples).

    Returns:
        list[str]: The sampled characters. Sampling stops after a ``'\\n'``
        (which is included) or once ``max_`` characters were produced.
    """
    pred = "<START>"

    c_1 = np.zeros((1, N_HIDDEN))
    h_1 = np.zeros((1, N_HIDDEN))

    c_2 = np.zeros((1, N_HIDDEN))
    h_2 = np.zeros((1, N_HIDDEN))

    # Without a temperature the graph's softmax is used directly; with one,
    # the raw logits are fetched and rescaled below.
    fetch = out_softmax if T is None else out
    one_step = np.ones(shape=(1,))

    ret = []

    while True:
        # Only time step 0 carries data; seq_len == 1 tells the RNN to stop there.
        in_ = np.zeros(shape=(1, TIMES, INPUT_SIZE), dtype=np.uint)
        in_[0, 0, char_to_ix[pred]] = 1

        net_out, net_states = sess.run(
            [fetch, states],
            feed_dict={
                x: in_,
                init_state_c_1: c_1,
                init_state_h_1: h_1,
                init_state_c_2: c_2,
                init_state_h_2: h_2,
                seq_len: one_step,
            },
        )
        c_1, h_1 = net_states[0].c, net_states[0].h
        c_2, h_2 = net_states[1].c, net_states[1].h

        # Explicit indexing instead of squeeze()[0]: squeeze would also drop
        # the time or vocabulary axis if either happened to have size 1.
        p = net_out[0, 0]

        if T is not None:
            # Numerically stable softmax: float64 and max-subtraction keep
            # exp() from overflowing to inf (-> NaN probabilities) for small T.
            scaled = p.astype(np.float64) / T
            scaled -= np.max(scaled)
            p = np.exp(scaled)
            p /= np.sum(p)

        char_out = ix_to_char[int(np.random.choice(np.arange(INPUT_SIZE), p=p))]
        ret.append(char_out)

        pred = char_out

        # ">=" so that at most max_ characters are returned (the previous
        # ">" let one extra character through).
        if char_out == '\n' or len(ret) >= max_:
            break

    return ret

In [None]:
# Number of training iterations. Each one draws a single random batch of
# paragraphs and walks through it in chunks of TIMES steps.
EPOCHS = 1000

N, M, V = data.shape

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# np.random.choice(..., replace=False) cannot draw more rows than exist.
batch_size = min(BATCH_SIZE, N)

# Number of full TIMES-sized chunks; a trailing partial chunk is skipped so
# that the shifted target slice always has TIMES steps.
ts = (M-1) // TIMES # + 1

train_writer = tf.summary.FileWriter('./logs/train', sess.graph)

counter = 0
try:
    for e in tqdm(range(EPOCHS)):

        idxs = np.random.choice(N, batch_size, replace=False)
        batch = data[idxs]
        batch_lens = lens[idxs].astype(np.int32)

        # Initial state: every new batch starts from zeros.
        c_1 = np.zeros((batch_size, N_HIDDEN))
        h_1 = np.zeros((batch_size, N_HIDDEN))

        c_2 = np.zeros((batch_size, N_HIDDEN))
        h_2 = np.zeros((batch_size, N_HIDDEN))

        # Print a sample every 10 iterations to eyeball progress.
        if e % 10 == 0:
            print("".join(test(max_=100)))

        for t in range(ts):
            # Remaining length of every paragraph inside this chunk, limited
            # to [0, TIMES - 1]. int64 matches the seq_len placeholder and,
            # unlike uint8, cannot wrap around for large TIMES.
            # NOTE(review): the upper bound TIMES - 1 means the state carried
            # to the next chunk reflects at most TIMES - 1 steps - confirm
            # this is intended.
            batch_lens_aux = np.clip(batch_lens - TIMES * t, 0, TIMES - 1).astype(np.int64)

            # Rows whose paragraph already ended take no part in this chunk.
            non_zero_idxs = batch_lens_aux > 0
            if not non_zero_idxs.any():
                break

            batch_lens_aux = batch_lens_aux[non_zero_idxs]

            # Targets are the inputs shifted one step to the left.
            batch_x = batch[:, t*TIMES:TIMES*(t+1), :][non_zero_idxs]
            batch_y = batch[:, t*TIMES+1:TIMES*(t+1)+1, :][non_zero_idxs]

            feed = {
                x: batch_x,
                y: batch_y,
                init_state_c_1: c_1[non_zero_idxs],
                init_state_h_1: h_1[non_zero_idxs],
                init_state_c_2: c_2[non_zero_idxs],
                init_state_h_2: h_2[non_zero_idxs],
                seq_len: batch_lens_aux,
            }
            m, states_, _ = sess.run([merge, states, upd], feed_dict=feed)
            train_writer.add_summary(m, counter)

            counter += 1

            # Carry the LSTM state of the active rows over to the next chunk.
            c_1[non_zero_idxs] = states_[0].c
            h_1[non_zero_idxs] = states_[0].h

            c_2[non_zero_idxs] = states_[1].c
            h_2[non_zero_idxs] = states_[1].h
finally:
    # Flush pending summaries to disk even if training is interrupted.
    # The session is deliberately left open so test() can still be called.
    train_writer.close()
        