In [1]:
import tensorflow as tf
import os
import re

2023-06-05 17:40:21.188195: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-05 17:40:21.474773: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Length (in tokens) of every input/target window sliced from the encoded text.
block_size = 256
# Number of windows grouped together by `.batch()` when the shards are built.
batch_size = 64

In [3]:
# Read every file under ./data into memory, one string per file.
data = []
# os.listdir() returns names in arbitrary order; sorting keeps the order in
# which the texts are concatenated (and therefore the encoded token stream)
# identical from run to run.
for file in sorted(os.listdir("./data")):
    print(f"Working on file : {file}")
    # Explicit encoding so decoding does not depend on the platform default.
    with open(f"./data/{file}", "r", encoding="utf-8") as f:
        data.append(f.read())

Working on file : republic


In [4]:
# Concatenate the per-file texts into a single string, separated by newlines.
# Guarded so that re-running this cell once `data` is already a string does
# not insert a newline between every character ("\n".join over a str iterates
# its characters).
if not isinstance(data, str):
    data = "\n".join(data)

In [5]:
# Collapse every run of consecutive newlines into a single newline.
data = re.compile("\n+").sub("\n", data)

In [6]:
# Preview the first 500 characters of the cleaned text.
data[:500]

'I went down yesterday to the Piraeus with Glaucon the son of Ariston, that I might offer up my prayers to the goddess (Bendis, the Thracian Artemis.); and also because I wanted to see in what manner they would celebrate the festival, which was a new thing. I was delighted with the procession of the inhabitants; but that of the Thracians was equally, if not more, beautiful. When we had finished our prayers and viewed the spectacle, we turned in the direction of the city; and at that instant Polem'

In [7]:
# Character-level vocabulary: every distinct character in the text, sorted.
unique_char = sorted(set(data))
vocab_size = len(unique_char)

# Lookup tables in both directions; ids are positions in `unique_char`.
str_to_int = dict(zip(unique_char, range(vocab_size)))
int_to_str = dict(enumerate(unique_char))


def encode(s):
    """Map each character of `s` to its integer id (KeyError on unknown chars)."""
    return [str_to_int[c] for c in s]


def decode(l):
    """Map an iterable of integer ids back to the string they spell."""
    return ''.join(int_to_str[i] for i in l)

In [8]:
# Sanity check: the char -> id table has exactly one entry per vocabulary symbol.
vocab_size == len(str_to_int)

True

In [9]:
# Persist both lookup tables to disk so the training notebook can load them.
import pickle

for _fname, _table in (("str_to_int", str_to_int), ("int_to_str", int_to_str)):
    with open(_fname, "wb") as f:
        pickle.dump(_table, f)

In [10]:
# Show the first 12 token ids of the encoded text.
encode(data)[:12]

[32, 1, 72, 54, 63, 69, 1, 53, 64, 72, 63, 1]

In [11]:
# Encode the whole text and turn the id list into a 1-D tensor.
# NOTE(review): the ids are stored as float16, which represents integers
# exactly only up to 2048, so this is lossless only while vocab_size stays
# below that. Consumers that index an embedding with these values will
# presumably need to cast to an integer dtype first — confirm in the
# training notebook before changing the dtype here.
data = tf.convert_to_tensor(encode(data), tf.float16)

2023-06-05 17:40:37.683794: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-05 17:40:37.856436: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-05 17:40:37.856744: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-05 17:40:37.860592: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the

In [12]:
# Total number of encoded tokens.
data.shape

TensorShape([637087])

In [13]:
# Build every (input, target) window pair in one vectorised op instead of
# ~2 * len(data) eager Python-level slices.
# A stride-1 window of block_size + 1 tokens holds both the input (its first
# block_size tokens) and the next-token target (its last block_size tokens).
# This yields len(data) - block_size rows, the same rows the per-index
# slicing data[idx:idx+block_size] / data[idx+1:idx+block_size+1] produced.
_windows = tf.signal.frame(data, block_size + 1, 1)
x = _windows[:, :-1]
y = _windows[:, 1:]

In [14]:
# Expected: (len(data) - block_size, block_size).
x.shape

TensorShape([636831, 256])

In [15]:
# Wrap the window pairs as a tf.data pipeline: one element per (x row, y row).
# Note that `data` is rebound from the token tensor to a Dataset here.
data = tf.data.Dataset.from_tensor_slices((x, y))  

In [16]:
# Number of (x, y) pairs in the dataset.
len(data)

636831

In [17]:
def train_val_test_split(dataset, train_split, val_split, test_split, seed=None):
    """Shuffle `dataset` once and cut it into disjoint train/val/test subsets.

    The shuffle is built with ``reshuffle_each_iteration=False``. With the
    default (``True``) every traversal of a derived dataset draws a fresh
    permutation, so the ``take``/``skip`` windows for train, val and test
    (and any later ``skip``/``take`` sharding of them) would be cut from
    different orderings and could contain the same elements, leaking data
    between the splits.

    Args:
        dataset: finite ``tf.data.Dataset`` that supports ``len()``.
        train_split: fraction of the elements assigned to the training subset.
        val_split: fraction of the elements assigned to the validation subset.
        test_split: nominal test fraction, kept for call-site readability.
            It is not used in the computation: the test subset is whatever
            remains after the train and validation elements are removed.
        seed: optional integer seed for the shuffle, for runs that must be
            reproducible across kernel restarts. Defaults to ``None``.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of datasets.
    """
    dataset_size = len(dataset)
    # Full-size buffer gives a uniform permutation; the fixed order is what
    # makes the three slices below mutually exclusive.
    dataset = dataset.shuffle(
        dataset_size, seed=seed, reshuffle_each_iteration=False
    )
    train_size = int(train_split * dataset_size)
    val_size = int(val_split * dataset_size)

    train_data = dataset.take(train_size)
    val_data = dataset.skip(train_size).take(val_size)
    # Everything after train + val, including elements lost to int() rounding.
    test_data = dataset.skip(train_size + val_size)
    return train_data, val_data, test_data

In [18]:
# Split the windowed dataset, passing 0.7 / 0.2 / 0.1 as the
# train / validation / test fractions.
train, val, test = train_val_test_split(data, 0.7, 0.2, 0.1)

In [19]:
# For the train split:
# it is cut into 4 shards. `offset` is the number of examples per shard
# (integer division, so any remainder goes to the last shard built below).
offset = len(train) // 4
offset

111445

In [20]:
# Cut `train` into 4 consecutive pieces of `offset` examples each; the last
# piece is left open-ended so it also takes whatever remains after 3 * offset.
shard_list = []
for i in range(4):
    shard = train.skip(offset * i)
    if i != 3:
        shard = shard.take(offset)
    # Shuffle inside the shard, then group into batches.
    shard_list.append(shard.shuffle(offset).batch(batch_size))

In [21]:
# The first three shards should hold the same number of batches.
len(shard_list[0]) == len(shard_list[1]) == len(shard_list[2])

True

In [22]:
# Batch count of the last shard (the one built without `.take(offset)`).
len(shard_list[3])

1742

In [23]:
# Display the element spec of every shard.
shard_list

[<BatchDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.float16, name=None), TensorSpec(shape=(None, 256), dtype=tf.float16, name=None))>,
 <BatchDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.float16, name=None), TensorSpec(shape=(None, 256), dtype=tf.float16, name=None))>,
 <BatchDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.float16, name=None), TensorSpec(shape=(None, 256), dtype=tf.float16, name=None))>,
 <BatchDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.float16, name=None), TensorSpec(shape=(None, 256), dtype=tf.float16, name=None))>]

In [24]:
# Write each training shard to its own directory, numbered by position.
for ind, ds in enumerate(shard_list):
    shard_dir = f"./shards/shard_simple_{ind}"
    ds.save(shard_dir)

In [25]:
# Batch the validation split and write it to disk.
# `val` is rebound to the batched dataset, so running this cell a second time
# would batch the already-batched dataset again.
val = val.batch(batch_size)
val.save("./shards/val_shard_simple")

In [26]:
# Batch the test split and write it to disk.
# `test` is rebound to the batched dataset, so running this cell a second time
# would batch the already-batched dataset again.
test = test.batch(batch_size)
test.save("./shards/test_shard_simple")

In [27]:
# Reload the saved training shards to verify that they round-trip.
path = "./shards"
shards = []
# os.listdir() returns names in arbitrary order; sorting makes shards[i]
# refer to the same saved shard on every run.
for file in sorted(os.listdir(path)):
    # Only the training shards: the val/test directories use other prefixes.
    if file.startswith("shard_simple"):
        shards.append(tf.data.Dataset.load(f"{path}/{file}"))

In [28]:
# Display the element spec of every reloaded shard.
shards

[<_LoadDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.float16, name=None), TensorSpec(shape=(None, 256), dtype=tf.float16, name=None))>,
 <_LoadDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.float16, name=None), TensorSpec(shape=(None, 256), dtype=tf.float16, name=None))>,
 <_LoadDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.float16, name=None), TensorSpec(shape=(None, 256), dtype=tf.float16, name=None))>,
 <_LoadDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.float16, name=None), TensorSpec(shape=(None, 256), dtype=tf.float16, name=None))>]

In [29]:
# Pull one (inputs, targets) batch from the first reloaded shard.
next(iter(shards[0]))

(<tf.Tensor: shape=(64, 256), dtype=float16, numpy=
 array([[67., 60., 50., ...,  1., 61., 70.],
        [69.,  1., 69., ..., 63.,  1., 69.],
        [69., 57., 54., ..., 64., 67., 53.],
        ...,
        [58., 52., 57., ...,  1., 57., 54.],
        [62.,  1., 69., ...,  1., 68., 50.],
        [57., 58., 63., ..., 68., 23.,  1.]], dtype=float16)>,
 <tf.Tensor: shape=(64, 256), dtype=float16, numpy=
 array([[60., 50., 51., ..., 61., 70., 53.],
        [ 1., 69., 74., ...,  1., 69., 57.],
        [57., 54.,  1., ..., 67., 53., 68.],
        ...,
        [52., 57.,  1., ..., 57., 54.,  1.],
        [ 1., 69., 57., ..., 68., 50., 58.],
        [58., 63., 60., ..., 23.,  1., 46.]], dtype=float16)>)

In [30]:
# Number of batches in the first reloaded shard.
len(shards[0])

1742