In [1]:
import numpy as np
from pathlib import Path
import os
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import pandas as pd

2023-12-18 14:44:21.872779: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 14:44:21.911253: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-18 14:44:21.911292: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-18 14:44:21.912656: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-18 14:44:21.919035: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 14:44:21.919908: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Integer id -> symbol for the seven symbols used by the grammars below.
REBER_VOCAB = dict(enumerate("BTSXPVE"))

# Each grammar maps a state number to the list of transitions leaving it.
# A transition is (emitted symbol, next state); a next state of None ends
# the string.
default_reber = {
    0: [('B', 1)],
    1: [('T', 2), ('P', 3)],
    2: [('S', 2), ('X', 4)],
    3: [('T', 3), ('V', 5)],
    4: [('X', 3), ('S', 6)],
    5: [('P', 4), ('V', 6)],
    6: [('E', None)],
}

# Same layout as above, except that states 2 and 3 emit a whole
# `default_reber` string (the grammar dict itself stands in for the symbol)
# before moving on to the closing 'T' or 'P'.
embedded_reber = {
    0: [('B', 1)],
    1: [('T', 2), ('P', 3)],
    2: [(default_reber, 4)],
    3: [(default_reber, 5)],
    4: [('T', 6)],
    5: [('P', 6)],
    6: [('E', None)],
}

In [3]:
def generate_string(grammar):
    """Generate one random string by walking `grammar` from state 0.

    `grammar` maps a state to a list of `(symbol, next_state)` transitions.
    At every state one transition is picked uniformly at random with
    `np.random.randint`. A symbol that is itself a dict is treated as a nested
    grammar and expanded recursively. The walk ends when the chosen
    transition's next state is None.

    Returns the concatenation of all emitted symbols.
    """
    pieces = []
    state = 0
    while True:
        transitions = grammar[state]
        symbol, state = transitions[np.random.randint(len(transitions))]
        if isinstance(symbol, dict):
            # Nested grammar: substitute a freshly generated sub-string.
            symbol = generate_string(symbol)
        pieces.append(symbol)
        if state is None:
            break
    return ''.join(pieces)

def generate_error_string(grammar):
    """Generate a corrupted variant of a random string drawn from `grammar`.

    A legal string is generated first. Then between 1 and `len(string)`
    positions are drawn (with replacement), and each drawn position is
    overwritten with a symbol from `REBER_VOCAB` that differs from the symbol
    the legal string had there.

    The original implementation could draw zero positions, or redraw the same
    symbol, and so return the untouched legal string. The result here is
    guaranteed to differ from the generated legal string in at least one
    position.

    NOTE: differing from the source string does not by itself prove the result
    is ungrammatical; a corrupted string can coincide with some other legal
    string. No grammar check is performed here.
    """
    legal_string = generate_string(grammar)
    illegal_string = list(legal_string)
    n_chars = len(illegal_string)

    # Lower bound of 1 so that at least one position is always corrupted.
    n_errors = np.random.randint(1, n_chars + 1)
    rnd_indices = np.random.randint(n_chars, size=n_errors)

    symbols = list(REBER_VOCAB.values())
    for i in rnd_indices:
        # Compare against the legal string (not the partially corrupted one)
        # so a position hit twice can never be restored to its original value.
        alternatives = [symbol for symbol in symbols if symbol != legal_string[i]]
        illegal_string[i] = alternatives[np.random.randint(len(alternatives))]
    return ''.join(illegal_string)

In [4]:
def generate_data(n_samples, legal_perc, illegal_perc):
    """Generate legal and corrupted embedded-Reber strings.

    `legal_perc` and `illegal_perc` are percentages of `n_samples`; each count
    is `n_samples * perc // 100`. All legal strings are generated before any
    corrupted ones.

    Returns a `(legal_strings, illegal_strings)` pair of lists.
    """
    n_legal = n_samples * legal_perc // 100
    legal_strings = [generate_string(embedded_reber) for _ in range(n_legal)]

    n_illegal = n_samples * illegal_perc // 100
    illegal_strings = [generate_error_string(embedded_reber) for _ in range(n_illegal)]

    return legal_strings, illegal_strings

In [5]:
def generate_data_file(filepath, n_samples, legal_perc=50, illegal_perc=50):
    """Generate labelled strings and write them to `filepath`.

    One `string,label` line is written per sample: all legal strings first
    (label `legal`), then all corrupted strings (label `illegal`). An existing
    file is overwritten.

    Args:
        filepath: destination file; its parent directory is created if missing.
        n_samples: total sample budget the percentages apply to.
        legal_perc: percentage of `n_samples` generated as legal strings.
        illegal_perc: percentage of `n_samples` generated as corrupted strings.
    """
    # The original passed `legal_perc` twice, so `illegal_perc` had no effect.
    legal_strings, illegal_strings = generate_data(n_samples, legal_perc, illegal_perc)

    # Create the target directory so a fresh checkout can run this cell.
    # `dirname` is empty for a bare file name, and makedirs('') would raise.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w') as file:
        for legal_string in legal_strings:
            file.write(legal_string + ',' + "legal")
            file.write('\n')
        for illegal_string in illegal_strings:
            file.write(illegal_string + ',' + "illegal")
            file.write('\n')

In [6]:
# Locations of the generated train / test string files.
dataset_dir = os.path.join('datasets', 'reber_grammar')
file_path_train = os.path.join(dataset_dir, "reber_strings_train.txt")
file_path_test = os.path.join(dataset_dir, "reber_strings_test.txt")

# Total sample budget, split 80% / 20% between train and test.
N_SAMPLES = 15000
train_size = (80 * N_SAMPLES) // 100
test_size = (20 * N_SAMPLES) // 100

# Each call writes a fresh, independently generated file
# (default 50/50 legal vs. corrupted percentages).
generate_data_file(filepath=file_path_train, n_samples=train_size)
generate_data_file(filepath=file_path_test, n_samples=test_size)

In [7]:
def load_data(filepath):
    """Read a `string,label` file and return its two columns.

    Every line of the file is split on ',' and must yield exactly two fields;
    otherwise the unpacking raises ValueError.

    Returns `(X, Y)`: NumPy arrays holding the strings and the labels, in file
    order.
    """
    strings, labels = [], []
    for row in Path(filepath).read_text().splitlines():
        string, label = row.split(',')
        strings.append(string)
        labels.append(label)
    return np.array(strings), np.array(labels)

In [8]:
def vectorize_data(string):
    """Encode `string` as the list of `REBER_VOCAB` ids of its characters.

    The symbol -> id lookup is built once per call rather than rebuilding the
    key and value lists and scanning them for every character.

    Raises:
        ValueError: if a character of `string` is not a `REBER_VOCAB` symbol
            (the same exception type the list-search version raised).
    """
    symbol_to_id = {}
    for index, symbol in REBER_VOCAB.items():
        # setdefault keeps the first id for a symbol, matching list.index().
        symbol_to_id.setdefault(symbol, index)

    try:
        return [symbol_to_id[char] for char in string]
    except KeyError as err:
        raise ValueError(
            "{!r} is not a symbol of REBER_VOCAB".format(err.args[0])
        ) from None

In [9]:
def create_dataset(data, label, buffer_size, batch_size=32):
    """Build a shuffled, batched `tf.data.Dataset` of `(data, label)` pairs.

    The inputs are sliced along their first axis, shuffled with a shuffle
    buffer of `buffer_size` elements, then grouped into batches of
    `batch_size`.
    """
    return (
        tf.data.Dataset.from_tensor_slices((data, label))
        .shuffle(buffer_size)
        .batch(batch_size)
    )

In [10]:
def process_data(X, Y):
    """Convert raw strings and text labels into tensors.

    Each string in `X` is encoded with `vectorize_data`, and the encodings are
    packed into a ragged tensor. Each label in `Y` becomes 0 when it equals
    "illegal" and 1 otherwise.

    Returns `(X, Y)` as `(tf.RaggedTensor, tf.Tensor)`.
    """
    encoded_strings = [vectorize_data(sequence) for sequence in X]
    features = tf.ragged.constant(encoded_strings)

    targets = tf.constant([1 if label != "illegal" else 0 for label in Y])
    return features, targets

In [11]:
# Read each generated file back and turn it straight into tensors
# (load_data already wraps its argument in Path, so plain strings suffice).
X_train, Y_train = process_data(*load_data(file_path_train))
X_test, Y_test = process_data(*load_data(file_path_test))

# The shuffle buffer spans the whole split, so each pass is fully shuffled.
train_dataset = create_dataset(data=X_train, label=Y_train, buffer_size=train_size)
test_dataset = create_dataset(data=X_test, label=Y_test, buffer_size=test_size)

# Disabled: carve a validation subset out of the test dataset.
# val_dataset = test_dataset.take(5 * test_size // 100)
# test_dataset = test_dataset.skip(5 * test_size // 100)

2023-12-18 14:44:24.406320: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-18 14:44:24.447552: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [12]:
# Binary classifier over variable-length id sequences:
# ragged int32 input -> embedding -> GRU -> single sigmoid unit.
model = keras.models.Sequential()
model.add(keras.layers.InputLayer(input_shape=[None], dtype=tf.int32, ragged=True))
model.add(keras.layers.Embedding(input_dim=len(REBER_VOCAB), output_dim=512))
model.add(keras.layers.GRU(30))
model.add(keras.layers.Dense(1, activation="sigmoid"))

optimizer = keras.optimizers.SGD(learning_rate=1e-02, momentum=0.95, nesterov=True)
model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

# NOTE(review): the test split doubles as validation data here, so the
# reported validation metrics are measured on the test set itself.
history = model.fit(
    train_dataset,
    epochs=20,
    validation_data=test_dataset,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [41]:
# Two hand-written probes that differ only in their second-to-last symbol.
test_strings = [
    "BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE",
    "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE",
]

# NOTE(review): this rebinds X_test, replacing the test-split tensor that was
# built when the datasets were loaded.
X_test = tf.ragged.constant(
    list(map(vectorize_data, test_strings)),
    ragged_rank=1,
)

y_proba = model.predict(X_test)

print()
print("Estimated probability that these are Reber strings:")
for index, string in enumerate(test_strings):
    # One row per input string; column 0 is the single sigmoid output.
    print("{}: {:.2f}%".format(string, 100 * y_proba[index][0]))


Estimated probability that these are Reber strings:
BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE: 92.39%
BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE: 92.47%
