### Create Dataset

In [None]:
import copy as cp
import numpy as np


def fill_with(matrix, thing):
    """
    Fills list of lists with 'thing' from the left so it is matrix-like with no empty spaces
    :param matrix: The list of lists (an empty list is allowed)
    :param thing: What we fill with
    :return: A new matrix with all the rows the same len (the len of the longest row)
    """
    # default=0 keeps an empty matrix from crashing max(); the result is then just []
    length = max((len(row) for row in matrix), default=0)
    # Padding goes in front of the row, so the original content stays right-aligned
    return [[thing] * (length - len(row)) + list(row) for row in matrix]


def one_hot_encode(x, y, tokens):
    """
    One-hot encodes x and y
    :param x: The samples: a list of equally long lists of token indices
    :param y: The targets: one token index per sample
    :param tokens: The token list so we can encode (only its length is used)
    :return: x with shape (len(x), len(x[0]), len(tokens)) and y with shape
             (len(x), len(tokens)), both boolean arrays
    """
    n_samples = len(x)
    # Every row is expected to have the length of the first one; no samples means no width
    seq_len = len(x[0]) if n_samples else 0
    n_tokens = len(tokens)
    # The builtin bool is used because the np.bool alias does not exist in NumPy 1.24-1.26
    oh_x = np.zeros((n_samples, seq_len, n_tokens), dtype=bool)
    oh_y = np.zeros((n_samples, n_tokens), dtype=bool)
    for i, sentence in enumerate(x):
        for j, char in enumerate(sentence):
            oh_x[i, j, char] = True
        oh_y[i, y[i]] = True
    return oh_x, oh_y


def create_dataset(max_len=30, n_of_sentences=50, n=1):
    """
    Prepares a dataset so we can use it in a NN

    Every non-blank line of the file loses its first 4 characters and is limited to max_len
    characters. For a sentence s whose words in reverse order give r, the samples are
    s, s + r[:1], s + r[:2], ... and the target of each sample is the next character of r.
    :param max_len: The maximum length of the sentences
    :param n_of_sentences: The maximum number of sentences taken from the file
    :param n: The file to open: 1 is "Files/lorem2.txt", anything else is "Files/lorem3.txt"
    :return: x and y One-hot encoded and the tokens
    :raises ValueError: If the file gives no usable sentence
    """
    path = "Files/lorem2.txt" if n == 1 else "Files/lorem3.txt"
    # The file is only read; the context manager closes it even if reading fails
    with open(path, "r") as text_file:
        list_lines = text_file.readlines()
    # We always get the same encoding this way
    tokens = sorted(set("".join(list_lines)))
    if "\n" in tokens:
        tokens.remove("\n")
    # Index 0 is the padding token
    tokens.insert(0, "Null")
    char2index = {c: i for i, c in enumerate(tokens)}
    count = 0
    final_y = []
    x = []
    for line in list_lines:
        if count >= n_of_sentences:
            break
        if line == "\n":
            continue
        # We get the example text from lorem web and texts have 4 blank spaces at the beginning
        # with the next line, we delete them
        inserted_line = line[4:]
        if len(inserted_line) > max_len:
            # NOTE: "." only has an index if it appears somewhere in the file
            inserted_line = inserted_line[:max_len - 1] + "."
        elif "\n" in inserted_line:
            # We remove the \n from the end of the line
            inserted_line = inserted_line[:-1]
        # The same sentence with its words in reverse order
        reversed_line = " ".join(inserted_line.split()[::-1])
        if not reversed_line:
            # Whitespace-only line: there is no character to predict, so it gives no samples
            continue
        sample = [char2index[letter] for letter in inserted_line]
        for letter in reversed_line:
            target = char2index[letter]
            x.append(sample)
            final_y.append(target)
            # Concatenation builds a new list, so the rows already stored are never modified
            sample = sample + [target]
        count += 1

    if not x:
        raise ValueError("No usable sentences found in " + path)
    # We fill with "Null" values, we do the One-hot encoding and we return the values
    # NOTE(review): rows are padded to the longest sample, which is at most 2*max_len-1
    x = fill_with(x, char2index["Null"])
    oh_x, oh_y = one_hot_encode(x, final_y, tokens)
    return oh_x, oh_y, tokens


### Rewrite Words

In [None]:
def create_reverse_by_words_dataset(path="Files/lorem2.txt", n_of_sentences=80, max_len=40):
    """
    Rewrites a text file in place as "sentence,sentence with its words in reverse order" lines

    WARNING: the file is overwritten. Blank lines and everything after the first
    n_of_sentences non-blank lines are dropped.
    :param path: The file to rewrite
    :param n_of_sentences: The maximum number of sentences we keep
    :param max_len: The maximum length of the sentences
    :return: None
    """
    # The context manager closes the file even if something fails half way
    with open(path, "r+") as text_file:
        list_lines = text_file.readlines()
        # Everything is already in memory, so we can write again from the start
        text_file.seek(0)
        count = 0
        for line in list_lines:
            if count >= n_of_sentences:
                break
            if line == "\n":
                continue
            count += 1
            # We get the example text from lorem web and texts have 4 blank spaces at the beginning
            # with the next line, we delete them
            inserted_line = line[4:]
            if len(inserted_line) > max_len:
                # We want our sentences with len=max_len, always closed with a "."
                inserted_line = inserted_line[:max_len - 1] + "."
            elif inserted_line.endswith("\n"):
                # Without this the \n would end up before the "," and break the record in two
                inserted_line = inserted_line[:-1]
            reversed_words = " ".join(inserted_line.split()[::-1])
            text_file.write(inserted_line + "," + reversed_words + "\n")
        # Removes what is left of the old content after the last written line
        text_file.truncate()


### Rewrite Total

In [None]:
def create_reverse_dataset(path="Files/lorem.txt", n_of_sentences=80, max_len=40):
    """
    Rewrites a text file in place as "sentence,sentence reversed character by character" lines

    WARNING: the file is overwritten. Blank lines and everything after the first
    n_of_sentences non-blank lines are dropped.
    :param path: The file to rewrite
    :param n_of_sentences: The maximum number of sentences we keep
    :param max_len: The maximum length of the sentences
    :return: None
    """
    # The context manager closes the file even if something fails half way
    with open(path, "r+") as text_file:
        list_lines = text_file.readlines()
        # Everything is already in memory, so we can write again from the start
        text_file.seek(0)
        count = 0
        for line in list_lines:
            if count >= n_of_sentences:
                break
            if line == "\n":
                continue
            count += 1
            # We get the example text from lorem web and texts have 4 blank spaces at the beginning
            # with the next line, we delete them
            inserted_line = line[4:]
            if len(inserted_line) > max_len:
                # We want our sentences with len=max_len, always closed with a "."
                inserted_line = inserted_line[:max_len - 1] + "."
            elif inserted_line.endswith("\n"):
                # Without this the \n would end up before the "," and break the record in two
                inserted_line = inserted_line[:-1]
            # The \n is already gone, so the whole sentence is reversed (the closing "." too)
            text_file.write(inserted_line + "," + inserted_line[::-1] + "\n")
        # Removes what is left of the old content after the last written line
        text_file.truncate()


### Run LSTM

In [None]:
from create_dataset import create_dataset
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.layers.embeddings import Embedding


def run_lstm(maxlen=50, parameters=100, units=32):
    """
    Builds an LSTM classifier over the datasets from create_dataset and trains it
    :param maxlen: The maximum length of the sentences (given to create_dataset)
    :param parameters: The number of sentences for training; a third of it is used for validation
    :param units: The number of units of the LSTM layer
    :return: The trained model
    """
    x_train, y_train, tokens = create_dataset(maxlen, parameters, 1)
    # x.shape = (samples, time steps, len(tokens))
    # y.shape = (samples, len(tokens))
    x_test, y_test, test_tokens = create_dataset(maxlen, parameters // 3, 2)
    # NOTE(review): test_tokens comes from another file, so its encoding may not match
    # tokens; validation only makes sense if both token lists are the same - TODO confirm
    model = Sequential()
    # The number of time steps is left open (None) because the datasets are padded to their
    # own longest sample, which is not always 2*maxlen-1
    model.add(LSTM(units, input_shape=(None, len(tokens))))
    # One output per token, so the softmax matches the One-hot encoded targets
    model.add(Dense(len(tokens)))
    model.add(Activation('softmax'))
    # Exactly one token is right for each sample: this is a multi-class problem
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
    model.summary()
    model.fit(x_train, y_train, validation_data=(x_test, y_test))
    return model
