Neural network designer
========================

In [None]:
# Extensies en dependencies laden

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.callbacks import TensorBoard

Training-dataset inlezen
------------------------

In [None]:
def read_variables(filename: str) -> tuple:
    """Read a tab-separated dataset file and return its values and labels.

    The file may start with any number of blank or free-text lines. The
    first line that starts with "#" is taken as the header; it must name a
    "Value" and a "Label" column (surrounding whitespace around column
    names is ignored). Every non-blank line after the header is treated as
    a data row; blank lines are skipped.

    Input = filename to read the contents from (str).
    Output = -list of all values in the file (list of str).
             -list of all labels in the file (list of int).
    Raises = ValueError when no header line is found, when the header has
             no "Value" or "Label" column, or when a label is not an
             integer.
             IndexError when a data row has fewer columns than the header
             positions require.
    """
    values = []
    labels = []
    with open(filename, "r", encoding="UTF-8") as content:
        # Search the header and index the columns. Iterating over the file
        # always advances to the next line and stops at end-of-file, so a
        # file whose first line is not the header can no longer hang here.
        header = None
        for line in content:
            line = line.strip()
            if line.startswith("#"):
                header = [name.strip()
                          for name in line.replace("#", "").split("\t")]
                break
        if header is None:
            raise ValueError(
                f"No header line starting with '#' found in {filename!r}."
            )
        col_values = header.index("Value")
        col_labels = header.index("Label")
        # Put the rest of the file in the corresponding lists. The same file
        # iterator is reused, so this continues right after the header.
        for line in content:
            stripped_line = line.strip()
            if not stripped_line:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            splitted_line = stripped_line.split("\t")
            values.append(str(splitted_line[col_values]))
            labels.append(int(splitted_line[col_labels]))
    return values, labels

In [None]:
# Load the training samples and their labels from the dataset file.
file_training = "dataset.txt"
values_training, labels_training = read_variables(filename=file_training)

# Report how many values and labels were read.
print(f"Count of values (training): {len(values_training)}")
print(f"Count of labels (training): {len(labels_training)}", end="\n\n")

# Preview the first five value/label pairs as a quick sanity check.
preview_pairs = zip(values_training[:5], labels_training[:5])
for value, label in preview_pairs:
    print(f"{value} {label}")

Pre-processing van dataset
--------------------------

In [None]:
# Build the vocabulary of all unique characters in the training values.
# char_level=True treats every character as a token; lower=False keeps the
# distinction between upper- and lowercase characters.
tokenizer = Tokenizer(
    lower=False,
    char_level=True,
)
tokenizer.fit_on_texts(values_training)

# Show the characters that were seen, their assigned indices, and the
# vocabulary size.
print(f"Unieke karakters: {sorted(tokenizer.word_counts.keys())}", end="\n\n")
print(f"Index: {tokenizer.word_index}", end="\n\n")
print(f"Aantal: {len(tokenizer.word_index)}")

In [None]:
# Encode every training string as a sequence of token indices, using the
# vocabulary fitted on the training values.
sequences = tokenizer.texts_to_sequences(values_training)

# Preview the first five strings next to their encoded form.
preview_sequences = zip(values_training[:5], sequences[:5])
for value, sequence in preview_sequences:
    print(f"{value.ljust(12, ' ')}: {str(sequence)}")

In [None]:
# Convert the index sequences into one binary matrix (one row per training
# value). This matrix is the network's input.
values_binary_matrix = tokenizer.sequences_to_matrix(
    sequences,
    mode="binary",
)

# Show the first three rows and the overall shape of the matrix.
print(values_binary_matrix[:3], end="\n\n")
print(f"Vorm van matrix: {values_binary_matrix.shape}")

In [None]:
# Convert the list of integer labels into a NumPy array so it can be passed
# to model.fit() as the training target.
labels_array = np.asarray(labels_training)
print(labels_array)

Model opbouwen
--------------

In [None]:
# Build the model and print a summary.
# Architecture: an input layer sized to one row of the binary matrix,
# a stack of ReLU hidden layers of decreasing width, and a 2-unit softmax
# output (one unit per class).
hidden_units = (50, 50, 40, 20)

model_layers = [
    Dense(units=60, activation="relu", input_shape=values_binary_matrix[0].shape)
]
model_layers += [Dense(units=units, activation="relu") for units in hidden_units]
model_layers.append(Dense(units=2, activation="softmax"))

model = Sequential(model_layers)

model.summary()

In [None]:
# Compile the model.
# The sparse categorical cross-entropy loss takes the integer class labels
# directly, so the labels do not need to be one-hot encoded.
optimizer = Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

Model trainen
-------------

In [None]:
# Train the model on the binary matrix; 20% of the samples are held out for
# validation and the training data is shuffled before each epoch.
training_options = dict(
    shuffle=True,
    validation_split=0.2,
    batch_size=10,
    epochs=2,
)
model.fit(x=values_binary_matrix, y=labels_array, **training_options)

Demo: voorspellingen doen
=========================

In [None]:
# Example inputs to classify with the trained model.
example_values = (
    "Rhodopsin",
    "VS",
    "RHO",
    "rho1",
    "brandweerauto",
    "Covid-19",
    "kattenpoot",
    "envelope",
    "rhodopsinterklaas",
)
# Width of the longest example, used to align the printed table.
longest_example = max(len(value) for value in example_values)

# Convert the values to sequences with the tokenizer fitted on the training data.
example_sequences = tokenizer.texts_to_sequences(example_values)

# Convert the sequences to a binary matrix ("binary" is also the default mode).
example_matrix = tokenizer.sequences_to_matrix(example_sequences, mode="binary")

# Predict on the matrix; the predicted class is the index of the highest
# softmax output in each row.
example_predictions = model.predict(example_matrix)
example_rounded_predictions = np.argmax(example_predictions, axis=-1)

# Print each example with its predicted class and that class's probability.
print("Voorspellingen, 0=woord, 1=symbool:")
prediction_rows = zip(example_values, example_rounded_predictions, example_predictions)
for value, prediction, percentage in prediction_rows:
    print(f"{value.ljust(longest_example, ' ')} : {prediction} ({round(percentage[prediction]*100)}%)")