Neural network designer
========================

In [None]:
# Extensies en dependencies laden
%load_ext tensorboard

import shutil
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.callbacks import TensorBoard

Training dataset inlezen
------------------------

In [None]:
# Path of the tab-separated training dataset.
file = "dataset.txt"
values = []
labels = []

# The handle gets its own name so that `file` keeps holding the path.
with open(file, "r", encoding="UTF-8") as dataset_file:
    # Skip leading lines until the header row, i.e. the first line that
    # starts with "#". Iterating the handle ends at end-of-file, so a file
    # whose first line is not the header can no longer loop forever.
    header = None
    for line in dataset_file:
        line = line.strip()
        if line.startswith("#"):
            header = line.replace("#", "").split("\t")
            break
    if header is None:
        raise ValueError(f"No header line starting with '#' found in {file!r}")

    # Locate the columns by name; a missing column raises ValueError.
    col_values = header.index("Value")
    col_labels = header.index("Label")

    # Every remaining non-blank line is one tab-separated sample.
    for line in dataset_file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. trailing ones) carry no sample.
            continue
        splitted_line = line.split("\t")
        values.append(splitted_line[col_values])
        labels.append(int(splitted_line[col_labels]))

print(f"Count of values: {len(values)}")
print(f"Count of labels: {len(labels)}")

Pre-processing van dataset
--------------------------

In [None]:
# Build the vocabulary of all unique characters: one token per character,
# without lower-casing the input.
tokenizer = Tokenizer(lower=False, char_level=True)
tokenizer.fit_on_texts(values)

# Show the characters found, their integer indices and the vocabulary size.
unique_characters = sorted(tokenizer.word_counts.keys())
character_index = tokenizer.word_index
print(f"Unieke karakters: {unique_characters}", end="\n\n")
print(f"Index: {character_index}", end="\n\n")
print(f"Aantal: {len(character_index)}")

In [None]:
# Convert every string into a sequence of integer character indices.
sequences = tokenizer.texts_to_sequences(values)

# Preview the first five strings next to their sequences; the string is
# left-aligned in a 12-character column.
for preview_value, preview_sequence in zip(values[:5], sequences[:5]):
    print(f"{preview_value:<12}: {preview_sequence}")

In [None]:
# Convert the sequences into a binary matrix with one row per input string.
# In "binary" mode a cell only records whether a token index occurs in the
# sequence, so character order and repetition are not represented.
values_binary_matrix = tokenizer.sequences_to_matrix(sequences, mode="binary")

# Show the first three rows and the overall matrix shape.
first_rows = values_binary_matrix[:3]
print(first_rows)
print(f"Vorm van matrix: {values_binary_matrix.shape}")

In [None]:
# Turn the list of integer labels into a NumPy array (used as `y` for training).
labels_array = np.asarray(labels)
print(labels_array)

Model opbouwen
--------------

In [None]:
# Build the model and print a summary.
# Fully connected network: a 60-unit input layer sized to one row of the
# binary matrix, four ReLU hidden layers and a 2-unit softmax output.
hidden_layer_sizes = (50, 50, 40, 20)

input_layer = Dense(units=60, activation="relu", input_shape=values_binary_matrix[0].shape)
hidden_layers = [Dense(units=size, activation="relu") for size in hidden_layer_sizes]
output_layer = Dense(units=2, activation="softmax")

model = Sequential([input_layer, *hidden_layers, output_layer])

model.summary()

In [None]:
# Compile the model.
# The sparse categorical loss takes integer class ids as targets.
# NOTE(review): with a 2-unit softmax output this assumes the labels are
# 0 or 1 — confirm against the dataset.
optimizer = Adam(learning_rate=1e-5)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

Model trainen
-------------

In [None]:
# Prepare TensorBoard: the callback writes training logs to `log_dir` and
# records weight histograms every epoch (histogram_freq=1).
# NOTE(review): the directory is fixed, so repeated training runs write to
# the same location — confirm whether old logs should be cleared first.
log_dir = "logs/fit/"
tensorboard_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)

In [None]:
# Train the model, logging to TensorBoard through the callback.
# validation_split=0.2 holds out the last 20% of the samples for validation;
# Keras takes that slice before shuffling.
# NOTE(review): if the dataset file is ordered by label, the validation set
# will be skewed — confirm the row order or shuffle the data beforehand.
model.fit(
    values_binary_matrix,
    labels_array,
    epochs=5,
    batch_size=10,
    shuffle=True,
    validation_split=0.2,
    callbacks=[tensorboard_callback],
)

In [None]:
# Start TensorBoard on the training log directory.
%tensorboard --logdir logs/fit

In [None]:
# Encode one hand-written sample with the same tokenizer and the same
# binary matrix mode that were used for the training data.
# NOTE(review): characters not seen during fitting appear to be dropped
# by the tokenizer (no OOV token is configured) — confirm.
test_value = ("potverdriedubbeltjesnogantoe",)

test_sequence = tokenizer.texts_to_sequences(test_value)
print(f"Sequence: {test_sequence}", end="\n\n")

test_matrix = tokenizer.sequences_to_matrix(test_sequence, mode="binary")
print(f"Matrix: {test_matrix}")

In [None]:
# Run the encoded test sample through the model; the cell output is the
# softmax output of the final 2-unit layer for that sample.
model.predict(test_matrix)