In [1]:
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
import pandas as pd
import numpy as np


def create_raw_set(string_length, letter_base, sequence, target_set_size):
    """Build two lists of random strings: with and without ``sequence``.

    Random strings of ``string_length`` characters are drawn from
    ``letter_base`` until the two lists together hold ``target_set_size``
    entries. Each list is capped at half of ``target_set_size``.

    A string that already contains ``sequence`` goes to the "with" list, or is
    discarded when that list is full. A string that lacks it goes to the
    "without" list; once that list is full, ``sequence`` is written into the
    string with ``insert_into_string`` and the result joins the "with" list.

    Returns:
        A ``(with_sequences, without_sequences)`` tuple of string lists.
    """
    half_target = target_set_size / 2
    positives, negatives = [], []

    while len(positives) + len(negatives) < target_set_size:
        # One random.choice call per character, in order.
        candidate = "".join(random.choice(letter_base) for _ in range(string_length))

        if sequence in candidate:
            if len(positives) < half_target:
                positives.append(candidate)
            continue

        if len(negatives) < half_target:
            negatives.append(candidate)
        else:
            # Negatives are full: turn this string into a positive instead.
            positives.append(insert_into_string(candidate, sequence))

    return positives, negatives


def insert_into_string(source_str, sequence):
    """Overwrite a random window of ``source_str`` with ``sequence``.

    Despite the name, nothing is inserted: ``len(sequence)`` consecutive
    characters of ``source_str`` are replaced, so the returned string has the
    same length as ``source_str``. The window start is chosen uniformly among
    all positions where ``sequence`` fits completely.

    Args:
        source_str: String to modify (a new string is returned).
        sequence: Substring to write into ``source_str``.

    Returns:
        A copy of ``source_str`` containing ``sequence``.

    Raises:
        ValueError: If ``sequence`` is longer than ``source_str``.
    """
    last_possible_index = len(source_str) - len(sequence)
    if last_possible_index < 0:
        # Fail with a clear message instead of randint's "empty range" error.
        raise ValueError(
            "sequence (length {}) is longer than source_str (length {})".format(
                len(sequence), len(source_str)
            )
        )
    start = random.randint(0, last_possible_index)
    end = start + len(sequence)
    return source_str[:start] + sequence + source_str[end:]


# THIS ENCODING IS ORDER SENSITIVE!!!
def create_one_hot_encoding(string_list, letter_base):
    """One-hot encode each string against the alphabet ``letter_base``.

    Every string becomes an integer array with one row per character position
    and one column per letter of ``letter_base``; column ``j`` stands for
    ``letter_base[j]``, so the encoding depends on the order of the alphabet.
    Works for an alphabet of any size.

    The row count is taken from the first string. Shorter strings are padded
    with all-zero rows; a longer string raises ``IndexError``. Characters that
    are not in ``letter_base`` produce an all-zero row.

    Args:
        string_list: Sequence of strings to encode.
        letter_base: Ordered alphabet (string or sequence of characters).

    Returns:
        A list of ``numpy`` integer arrays of shape
        ``(len(string_list[0]), len(letter_base))``; an empty list when
        ``string_list`` is empty.
    """
    if len(string_list) == 0:
        return []

    rows = len(string_list[0])
    columns = len(letter_base)

    # Letter -> column lookup. setdefault keeps the first occurrence, matching
    # a left-to-right comparison if the alphabet contains duplicates.
    column_of = {}
    for index, letter in enumerate(letter_base):
        column_of.setdefault(letter, index)

    one_hot_encoded_list = []
    for string in string_list:
        encoded = np.zeros((rows, columns), dtype=int)
        for position, letter in enumerate(string):
            column = column_of.get(letter)
            if column is not None:
                encoded[position, column] = 1
        one_hot_encoded_list.append(encoded)
    return one_hot_encoded_list


def get_labels_from_dataset(dataset):
    """Separate ``[sample, label]`` pairs into two parallel lists.

    Each element of ``dataset`` is indexed at position 0 for the sample and
    position 1 for the label.

    Returns:
        A ``(data, labels)`` tuple of lists in the original order.
    """
    # Single pass so one-shot iterables are handled too.
    pairs = [(entry[0], entry[1]) for entry in dataset]
    samples = [sample for sample, _ in pairs]
    targets = [target for _, target in pairs]
    return samples, targets


def split_into_training_and_testing_sets(dataset, target_set_size):
    """Randomly split ``dataset`` into training and test portions.

    ``int(target_set_size * 0.8)`` randomly chosen elements form the training
    set (clamped to the number of elements available); every remaining element
    forms the test set. The caller's ``dataset`` is left untouched.

    Args:
        dataset: Iterable of ``[sample, label]`` pairs.
        target_set_size: Nominal dataset size used to compute the 80% share.

    Returns:
        ``(training_tuple, test_tuple)`` where each tuple is the
        ``(data, labels)`` pair produced by ``get_labels_from_dataset``.
    """
    # Work on a copy so the caller's list is neither shuffled nor emptied.
    shuffled = list(dataset)
    random.shuffle(shuffled)

    # 80-20 split. After a uniform shuffle, the first k elements are already a
    # uniform random sample, so slicing replaces the repeated random pops.
    training_size = int(target_set_size * 0.8)
    training_size = max(0, min(training_size, len(shuffled)))

    training_tuple = get_labels_from_dataset(shuffled[:training_size])
    test_tuple = get_labels_from_dataset(shuffled[training_size:])
    return training_tuple, test_tuple


# --- Configuration -----------------------------------------------------------
str_length = 15
letter_base = "abcd"
letter_sequence = "abcda"
target_set_size = 10000

# --- Build and encode the raw strings ------------------------------------------
output = create_raw_set(str_length, letter_base, letter_sequence, target_set_size)
encoded_with = create_one_hot_encoding(output[0], letter_base)
encoded_without = create_one_hot_encoding(output[1], letter_base)
# Encode the target sequence itself; only its shape is used below, as the
# convolution kernel size.
fltr = create_one_hot_encoding([letter_sequence], letter_base)
letter_filter = fltr[0]

# Pair every encoded sample with its label: 1 = contains the sequence, 0 = not.
combined_with = [[ele, 1] for ele in encoded_with]
combined_without = [[ele, 0] for ele in encoded_without]

# Plain list concatenation. np.concatenate would first have to coerce these
# ragged [ndarray, int] pairs into an array, which newer NumPy releases refuse
# to do implicitly.
dataset = combined_with + combined_without

num_filters = 1
kernel_size = letter_filter.shape

# --- Train / test split --------------------------------------------------------
# Pass a copy so `dataset` keeps all samples whatever the splitter does to its
# argument.
tuples = split_into_training_and_testing_sets(list(dataset), target_set_size)

td = tuples[0][0]
tl = tuples[0][1]
testd = tuples[1][0]
testl = tuples[1][1]

training_data = np.array(td)
training_labels = np.array(tl)
test_data = np.array(testd)
test_labels = np.array(testl)

# Add the trailing channel axis expected by Conv2D (channels_last). Shapes are
# derived from the configuration above rather than hard-coded, and the test
# split is shaped the same way as the training split.
sample_shape = (str_length, len(letter_base), 1)
training_data = training_data.reshape((-1,) + sample_shape)
training_labels = training_labels.reshape((-1, 1))
test_data = test_data.reshape((-1,) + sample_shape)
test_labels = test_labels.reshape((-1, 1))
print(training_data.shape)

# --- Model -------------------------------------------------------------------
# input_shape describes a single sample; the batch dimension is not part of it.
model = Sequential([
    Conv2D(num_filters, kernel_size, strides=1, padding='same', input_shape=sample_shape, data_format='channels_last'),
    Flatten(),
    # binary_crossentropy expects a probability in [0, 1], so the output unit
    # uses a sigmoid (ReLU is unbounded above and can output exactly 0).
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])
model.fit(training_data, training_labels)



(8000, 15, 4, 1)


<tensorflow.python.keras.callbacks.History at 0x17a03f4bf48>

In [4]:
# Display the reshaped training array built in the first cell to eyeball the
# one-hot rows (one inner [x] per letter column, because of the channel axis).
training_data

array([[[[1],
         [0],
         [0],
         [0]],

        [[0],
         [1],
         [0],
         [0]],

        [[0],
         [1],
         [0],
         [0]],

        ...,

        [[1],
         [0],
         [0],
         [0]],

        [[0],
         [0],
         [0],
         [1]],

        [[0],
         [0],
         [1],
         [0]]],


       [[[0],
         [0],
         [1],
         [0]],

        [[0],
         [0],
         [0],
         [1]],

        [[0],
         [0],
         [0],
         [1]],

        ...,

        [[0],
         [0],
         [1],
         [0]],

        [[1],
         [0],
         [0],
         [0]],

        [[0],
         [0],
         [1],
         [0]]],


       [[[0],
         [1],
         [0],
         [0]],

        [[0],
         [0],
         [0],
         [1]],

        [[0],
         [0],
         [0],
         [1]],

        ...,

        [[0],
         [1],
         [0],
         [0]],

        [[0],
         [0]

In [7]:
# Display the training samples as returned by the split (tuples[0][0]), before
# they are converted with np.array and reshaped.
td

[array([[1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0]]),
 array([[0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 0]]),
 array([[0, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [1, 0, 0, 0]]),
 arr

In [13]:
# Training samples from the split; the first cell binds this same value to `td`.
# NOTE(review): the recorded output below is an AttributeError about `.shape`
# on a list, which this expression alone cannot raise. The cell was presumably
# edited after it last ran; re-run to confirm.
tuples[0][0]

AttributeError: 'list' object has no attribute 'shape'