In [1]:
import numpy as np
from pathlib import Path
import os

In [2]:
# Reber alphabet: integer index -> symbol.
REBER_VOCAB = dict(enumerate('BTPSXVE'))

# A grammar maps each state number to the list of its outgoing transitions.
# A transition is a (symbol, next_state) pair; a next_state of None ends the string.
default_reber = dict(enumerate([
    [('B', 1)],
    [('T', 2), ('P', 3)],
    [('S', 2), ('X', 4)],
    [('T', 3), ('V', 5)],
    [('X', 3), ('S', 6)],
    [('P', 4), ('V', 6)],
    [('E', None)],
]))

# Same layout as default_reber, except that the transition symbol of states 2
# and 3 is the default_reber grammar itself (expanded recursively by
# generate_string) rather than a single character.
embedded_reber = dict(enumerate([
    [('B', 1)],
    [('T', 2), ('P', 3)],
    [(default_reber, 4)],
    [(default_reber, 5)],
    [('T', 6)],
    [('P', 6)],
    [('E', None)],
]))

In [3]:
def generate_string(grammar):
    """Produce one random string by walking `grammar` from state 0.

    At every state one outgoing transition is picked uniformly at random with
    np.random.randint. A transition whose symbol is itself a grammar dict is
    expanded by a recursive call. The walk ends when the next state is None.

    Args:
        grammar: dict mapping a state to a list of (symbol, next_state) pairs.

    Returns:
        The concatenation of all emitted symbols, as a str.
    """
    pieces = []
    current = 0
    while current is not None:
        transitions = grammar[current]
        pick = np.random.randint(len(transitions))
        symbol, current = transitions[pick]
        # Nested grammars contribute a whole sub-string instead of one symbol.
        pieces.append(generate_string(symbol) if isinstance(symbol, dict) else symbol)
    return ''.join(pieces)

def _match_ends(grammar, text, start=0):
    """Return the set of indices at which `grammar` can finish reading `text`.

    Reading begins at index `start` in state 0. Transitions whose symbol is a
    grammar dict are matched recursively, mirroring how generate_string
    expands them.
    """
    ends = set()
    visited = set()
    pending = [(0, start)]
    while pending:
        state, pos = pending.pop()
        if state is None:
            # Terminal transition taken: the grammar is done at this index.
            ends.add(pos)
            continue
        if (state, pos) in visited:
            continue
        visited.add((state, pos))
        for symbol, next_state in grammar[state]:
            if isinstance(symbol, dict):
                pending.extend((next_state, end) for end in _match_ends(symbol, text, pos))
            elif text.startswith(symbol, pos):
                pending.append((next_state, pos + len(symbol)))
    return ends


def _is_legal_string(grammar, text):
    """Tell whether `grammar` can produce exactly `text` (no leftover characters)."""
    return len(text) in _match_ends(grammar, text)


def generate_error_string(grammar):
    """Generate a legal string together with a corrupted, illegal variant.

    A legal string is drawn with generate_string. Between 1 and len - 1 distinct
    positions (exactly 1 for a one-character string) are then overwritten with a
    *different* symbol from REBER_VOCAB. The candidate is kept only if the
    grammar rejects it, so the second element of the result is never a legal
    string.

    Args:
        grammar: dict mapping a state to a list of (symbol, next_state) pairs.

    Returns:
        (legal_string, illegal_string) as a tuple of two str of equal length.

    Raises:
        ValueError: if the generated legal string is empty.
        RuntimeError: if no illegal variant is found within the attempt budget.
    """
    legal_string = generate_string(grammar)
    length = len(legal_string)
    if length == 0:
        raise ValueError("cannot corrupt an empty string")

    vocab = list(REBER_VOCAB.values())
    max_attempts = 1000  # safety net; a handful of attempts is normally enough
    for _ in range(max_attempts):
        symbols = list(legal_string)
        # At least one corruption; the upper bound (length - 1) matches the
        # previous sampling range.
        n_errors = np.random.randint(1, max(length, 2))
        # Distinct positions, so one substitution cannot undo another.
        positions = np.random.choice(length, size=n_errors, replace=False)
        for i in positions:
            alternatives = [s for s in vocab if s != symbols[i]]
            symbols[i] = alternatives[np.random.randint(len(alternatives))]
        illegal_string = ''.join(symbols)
        # Some substitutions still form a legal string (e.g. flipping both
        # embedded markers consistently); reject those and retry.
        if not _is_legal_string(grammar, illegal_string):
            return legal_string, illegal_string
    raise RuntimeError(
        "could not derive an illegal string from %r in %d attempts"
        % (legal_string, max_attempts)
    )

In [4]:
# Seed the global NumPy RNG so that Restart & Run All regenerates the exact
# same dataset.
RANDOM_SEED = 42
# Number of (legal, illegal) string pairs to generate.
N_SAMPLES = 15000

np.random.seed(RANDOM_SEED)

legal_strings = []
illegal_strings = []
for _ in range(N_SAMPLES):
    legal_string, illegal_string = generate_error_string(embedded_reber)
    legal_strings.append(legal_string)
    illegal_strings.append(illegal_string)

In [5]:
# Export the dataset as plain text, one "<string> <label>" record per line:
# all legal strings first, then all illegal ones.
dataset_dir = os.path.join('datasets', 'reber_grammar')
# open(..., 'w') does not create missing folders, so make sure the target exists.
os.makedirs(dataset_dir, exist_ok=True)
file_path = os.path.join(dataset_dir, "reber_strings.txt")
with open(file_path, 'w', encoding='utf-8') as file:
    # Both labels use the same single-space separator so every row parses alike.
    for label, strings in (("legal", legal_strings), ("illegal", illegal_strings)):
        for string in strings:
            file.write(string + ' ' + label + '\n')