### Create Dataset

In [18]:
import copy as cp
import numpy as np


def fill_with(matrix, thing):
    """
    Fills list of lists with 'thing' from the left so its matrix-like with no empty spaces
    :param matrix: The list of lists
    :param thing: What we fill with
    :return: The matrix with all the rows the same len
    """
    length = max([len(i) for i in matrix])
    return [[thing] * (length - len(i)) + i for i in matrix]


def one_hot_encode(x, y, tokens):
    """
    One-hot encodes x and y
    :param x: The x element to One-hot encode
    :param y: The target to One-hot encode
    :param tokens: The token list so we can encode
    :return: x and y One-hot encoded
    """
    # we know len(x[0]) = 2*max_len-1
    oh_x = np.zeros((len(x), len(x[0]), len(tokens)), dtype=np.bool)
    oh_y = np.zeros((len(x), len(tokens)), dtype=np.bool)
    for i, sentence in enumerate(x):
        for j, char in enumerate(sentence):
            oh_x[i, j, char] = 1
        oh_y[i, y[i]] = 1
    return oh_x, oh_y

def one_hot_decode(x, tokens):
    xn = []
    for thing in x:
        for i, other in enumerate(thing):
            if other != False:
                xn.append(i)
    return xn

def create_dataset(path, max_len=30, n_of_sentences=50, tokens=None):
    """
    Prepares a dataset so we can use it in a NN
    :param max_len: The maximum length of the sentences
    :param n_of_sentences: The number of sencences
    :return: x y and the tokens
    """
    fail = open(path, "r+")
    list_lines = fail.readlines()
    fail.close()
    # We always get the same encoding this way
    if tokens is None:
        tokens = sorted(list(set("".join(list_lines))))
        if "\n" in tokens:
            tokens.remove("\n")
        tokens.insert(0, "Null")
    # print("Total tokens: " + str(len(tokens)))
    # print("Tokens " + str(tokens))
    char2index = dict((c, i) for i, c in enumerate(tokens))
    # print("char2index " + str(char2index))
    index2char = dict((i, c) for i, c in enumerate(tokens))
    count = 0
    final_y = []
    x = []
    for line in list_lines:
        if (line != "\n") and (count < n_of_sentences):
            # We get the example text from lorem web and texts have 4 blank spaces at the begining
            # with the next line, we delete them
            inserted_line = line[4:]
            if len(inserted_line) > max_len:
                inserted_line = inserted_line[:max_len-1] + "."
            elif "\n" in inserted_line:  # We use this only for this example. We remove the \n from the end of the list.
                inserted_line = inserted_line[:len(inserted_line)-1]
            # We get the reversed line in list format
            reversed_line = ' '.join(inserted_line.split()[::-1])
            x.append(list(inserted_line))
            # We encode x elements. Now on, every insertion will be encoded
            for i, letter in enumerate(x[-1]):
                x[-1][i] = char2index[letter]
            y = []
            # This is to know where to start inserting
            start = len(x) - 1
            for j in range(len(reversed_line)-1):
                anterior = cp.deepcopy(x[start+j])
                anterior.append(char2index[reversed_line[j]])
                x.append(anterior)
                y.append(char2index[reversed_line[j]])
            y.append(char2index[reversed_line[-1]])
            final_y += y
            count += 1

    # We fill with "Null" values, we do the One-hot encoding and we return the values
    x = fill_with(x, char2index["Null"])
    oh_x, oh_y = one_hot_encode(x, final_y, tokens)
    return oh_x, oh_y, tokens, index2char


### Rewrite Words

In [2]:
def create_reverse_by_words_dataset():
    fail = open("Files/lorem2.txt", "r+")
    list_lines = fail.readlines()
    # max_len = len(max(list_lines, key=len))
    fail.seek(0)
    count = 0
    for line in list_lines:
        if (line != "\n") and (count < 80):
            count += 1
            # We get the example text from lorem web and texts have 4 blank spaces at the begining
            # with the next line, we delete them
            inserted_line = line[4:]
            if len(inserted_line) > 40:
                # We want our sentences with len=40
                inserted_line = inserted_line[:39] + "."
            # while len(inserted_line) < max_len:
            #     inserted_line = "0" + inserted_line
            fail.write(inserted_line[:len(inserted_line)] +
                       "," +
                       " ".join(inserted_line.split()[::-1]) +
                       "\n")
    fail.truncate()
    fail.close()


### Rewrite Total

In [3]:
def create_reverse_dataset():
    fail = open("Files/lorem.txt", "r+")
    list_lines = fail.readlines()
    # max_len = len(max(list_lines, key=len))
    fail.seek(0)
    count = 0
    for line in list_lines:
        if (line != "\n") and (count < 80):
            count += 1
            # We get the example text from lorem web and texts have 4 blank spaces at the begining
            # with the next line, we delete them
            inserted_line = line[4:]
            if len(inserted_line) > 40:
                # We want our sentences with len=40
                inserted_line = inserted_line[:39] + "."
            # while len(inserted_line) < max_len:
            #     inserted_line = "0" + inserted_line
            fail.write(inserted_line[:len(inserted_line)] +
                       "," +
                       inserted_line[len(inserted_line)-2::-1] +
                       "\n")
    fail.truncate()
    fail.close()


### Run LSTM

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [5]:
maxlen = 30
parameters = 50
units = 16
path = "../Data/lorem2.txt"
path2 = "../Data/lorem3.txt"

In [6]:
x_train, y_train, tokens = create_dataset(path, maxlen, parameters)

In [81]:
a = (parameters*maxlen, 2*maxlen-1, len(tokens))
a

(1500, 59, 40)

In [82]:
# x.shape = (parameters*maxlen, 2*maxlen-1, len(tokens))
x_train.shape

(1500, 59, 40)

In [83]:
# y.shape = (parameters*maxlen, len(tokens))
y_train.shape

(1500, 40)

In [84]:
x_test, y_test, tok = create_dataset(path2, maxlen, parameters//3, tokens)

In [85]:
x_test.shape

(480, 59, 40)

In [86]:
y_test.shape

(480, 40)

In [95]:
model = Sequential()

In [88]:
# model.add(Embedding(len(tokens)+1, 32))  # , input_length=2*maxlen-1))
# imput length se puede quitar en teoria

In [97]:
model.add(LSTM(units, input_shape=(2*maxlen-1, len(tokens))))

ValueError: Input 0 is incompatible with layer lstm_10: expected ndim=3, found ndim=2

In [98]:
model.add(Dense(len(tokens), activation='softmax'))

In [91]:
# model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [99]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])

In [100]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 16)                3648      
_________________________________________________________________
dense_9 (Dense)              (None, 40)                680       
Total params: 4,328
Trainable params: 4,328
Non-trainable params: 0
_________________________________________________________________


In [104]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, shuffle=True, verbose=1)

Train on 1500 samples, validate on 480 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff21c14acf8>

In [107]:
model.evaluate(steps=10)

AttributeError: 'NoneType' object has no attribute 'shape'

In [110]:
x_train.shape

(1500, 59, 40)

In [131]:
x_train[0].shape

(59, 40)

In [15]:
x_train, y_train, tokens, index2 = create_dataset(path, maxlen, parameters)

In [19]:
cosas = one_hot_decode(x_train[0], tokens)

In [24]:
decoded = [index2[cosa] for cosa in cosas]
''.join(decoded)

'NullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullNullLorem ipsum dolor sit amet, c.'