In [1]:
import tensorflow as tf
import numpy as np
import random

In [2]:
from datetime import datetime
# Display the current wall-clock time as the cell output, which records when
# this notebook run started.
datetime.now()

datetime.datetime(2021, 1, 5, 17, 34, 29, 859321)

In [3]:
# Fixed seed so the in-place shuffle below, and the train/test split that is
# cut from the shuffled order, is identical on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# One record per number in [0, 1e6). np.arange is called with a float
# (10e5 == 1e6), so each x is a float and str(x) renders as e.g. '427973.0';
# the 'label' field stores the same value cast to int32.
data = np.array(
    [(x, str(x)) for x in np.arange(10e5)],
    dtype=[('label', 'int32'), ('input', 'U10')],
)
# Shuffle whole records in place; label and input stay paired.
np.random.shuffle(data)

In [4]:
# Fraction of the (already shuffled) records that go to the training split.
training_fraction = 0.85
train_end = int(len(data) * training_fraction)

# Read each field of the structured array as a whole column instead of
# unpacking every record with zip(*data), which materialises one Python
# object per record for a million records.
labels, inputs = data['label'], data['input']
# np.array(...) copies each slice so the splits do not alias `data`.
train_labels, train_inputs = np.array(labels[:train_end]), np.array(inputs[:train_end])
test_labels, test_inputs = np.array(labels[train_end:]), np.array(inputs[train_end:])

# The two splits must partition the data exactly.
assert len(train_labels) + len(test_labels) == len(data)

In [5]:
# Show where the value zero sits in the training split, once by integer label
# and once by its string form; an empty index array means it is not in this split.
zero_label_hits = np.where(train_labels == 0.0)
zero_input_hits = np.where(train_inputs == '0.0')
for hits in (zero_label_hits, zero_input_hits):
    print(hits)

(array([], dtype=int64),)
(array([], dtype=int64),)


In [6]:
# Slice the parallel arrays into (input string, integer label) pairs,
# one dataset element per example.
make_dataset = tf.data.Dataset.from_tensor_slices
train_dataset = make_dataset((train_inputs, train_labels))
test_dataset = make_dataset((test_inputs, test_labels))

In [7]:
# Display the repr of a one-element view of the dataset to check the element
# shapes and dtypes; nothing is iterated here.
train_dataset.take(1)

<TakeDataset shapes: ((), ()), types: (tf.string, tf.int32)>

In [8]:
# Number of examples held in the shuffle buffer that each next element is sampled from
BUFFER_SIZE = 100000
# Batch for gradient averaging
BATCH_SIZE = 64
# Prefetch parallelises loading + execution (the data is not huge, so this is
# not strictly necessary). prefetch() is applied after batch(), so its buffer
# is counted in batches, not examples; AUTOTUNE lets tf.data size the buffer at
# runtime instead of pinning BATCH_SIZE*2 whole batches.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# NOTE: both names are rebound from themselves, so re-running this cell without
# re-running the cell that creates the datasets batches them a second time.
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [9]:
# Pull a single batch out of the training pipeline to inspect it.
first_batch = iter(train_dataset.take(1))
sample, label = next(first_batch)
print(sample, label)

tf.Tensor(
[b'427973.0' b'478331.0' b'417521.0' b'73951.0' b'18784.0' b'27280.0'
 b'806339.0' b'651322.0' b'419850.0' b'56585.0' b'25182.0' b'512466.0'
 b'2606.0' b'106428.0' b'390431.0' b'124802.0' b'45026.0' b'745485.0'
 b'555457.0' b'673331.0' b'247627.0' b'453826.0' b'77916.0' b'560558.0'
 b'748156.0' b'413292.0' b'681367.0' b'830974.0' b'289767.0' b'659475.0'
 b'366807.0' b'192804.0' b'180115.0' b'774274.0' b'20257.0' b'504850.0'
 b'18695.0' b'821506.0' b'766839.0' b'560016.0' b'367098.0' b'40062.0'
 b'605977.0' b'320166.0' b'32677.0' b'823291.0' b'881591.0' b'159.0'
 b'812290.0' b'237645.0' b'243848.0' b'14616.0' b'982153.0' b'25481.0'
 b'921283.0' b'343517.0' b'295695.0' b'861066.0' b'797402.0' b'592360.0'
 b'866738.0' b'996210.0' b'972485.0' b'839795.0'], shape=(64,), dtype=string) tf.Tensor(
[427973 478331 417521  73951  18784  27280 806339 651322 419850  56585
  25182 512466   2606 106428 390431 124802  45026 745485 555457 673331
 247627 453826  77916 560558 748156 413292 681

In [10]:
def standardize(batch):
    '''
    Designed to separate the digits in a number so each digit becomes a token.

    Three regex passes are applied in order to every string in `batch`:
      1. drop everything from the first '.' or ',' that is directly followed
         by a digit through to the end of the string (the decimal part);
      2. strip the remaining punctuation characters;
      3. append a space after every digit.

    For example '427973.0' becomes '4 2 7 9 7 3 ' (note the trailing space).

    Args:
        batch: string tensor (or value convertible to one) accepted by
            tf.strings.regex_replace.

    Returns:
        The result of the three tf.strings.regex_replace passes applied to `batch`.
    '''
    DEFAULT_REGEX = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']'
    # Remove any pennies/cents. Inside a character class '|' is a literal
    # character, not alternation, so the class is just '.' and ','; with the
    # '|' present a string such as '12|34' was wrongly cut down to '12'.
    batch = tf.strings.regex_replace(batch, r'([.,][0-9].*)', '')
    # Normal punc strip
    batch = tf.strings.regex_replace(batch, DEFAULT_REGEX, "")
    # Spread out the values so we can get them frequent enough to appear in our vocab
    batch = tf.strings.regex_replace(batch, r'([0-9])', r'\1 ')
    return batch

# Upper bound on the vocabulary the vectorisation layer may learn.
VOCAB_SIZE = 10000

# Text -> integer-token layer: uses the custom `standardize` callable above and
# emits unigrams only.
TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization
encoder = TextVectorization(
    standardize=standardize,
    max_tokens=VOCAB_SIZE,
    ngrams=(1,),
)


In [11]:
# The vocabulary is learned from the text alone, so drop the label from each
# (text, label) pair before adapting.
text_only_dataset = train_dataset.map(lambda text, label: text)
encoder.adapt(text_only_dataset)

In [12]:
# Vectorise the sample batch and keep only the first three rows for inspection.
sample_token_ids = encoder(sample).numpy()
encoded_sample = sample_token_ids[:3]
print(encoded_sample)

[[3 2 5 9 5 6]
 [3 5 4 6 6 8]
 [3 8 5 7 2 8]]


In [13]:
# Map token ids back through the learned vocabulary to check that encoding
# preserves the digits of the first three samples.
vocab = np.array(encoder.get_vocabulary())
for original_text, token_ids in zip(sample.numpy()[:3], encoded_sample[:3]):
    decoded_tokens = vocab[token_ids]
    print("Original: ", original_text)
    print("Round-trip: ", " ".join(decoded_tokens))
    print()

Original:  b'427973.0'
Round-trip:  4 2 7 9 7 3

Original:  b'478331.0'
Round-trip:  4 7 8 3 3 1

Original:  b'417521.0'
Round-trip:  4 1 7 5 2 1



In [14]:
# Size the embedding table from the vocabulary the encoder actually learned.
vocabulary_size = len(encoder.get_vocabulary())

# Token ids -> 64-d vectors; mask_zero=True makes id 0 a masked value for the
# layers that follow.
embedding = tf.keras.layers.Embedding(
    input_dim=vocabulary_size, output_dim=64, mask_zero=True
)
# Read the digit sequence in both directions with a 64-unit LSTM each way.
sequence_reader = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
# Two hidden layers followed by a single linear output for regression.
regression_head = [
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1),
]

# The encoder is the first layer, so the model consumes raw strings.
model = tf.keras.Sequential([encoder, embedding, sequence_reader, *regression_head])

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
)

In [None]:
# Train for 100 epochs. The test split doubles as validation data, and
# validation_steps limits each validation pass to 30 batches.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    validation_steps=30,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

In [None]:
# Evaluate on the full batched test dataset; with the model compiled without
# extra metrics, the result is the MSE loss.
res = model.evaluate(test_dataset)

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a training metric and its validation counterpart per epoch.

    Draws on the current pyplot figure: one line for `metric`, one for
    "val_" + metric, with labelled axes and a legend.

    Args:
        history: object whose `.history` attribute maps metric names to
            per-epoch sequences of values.
        metric: key of the training metric to plot; the key "val_" + metric
            must also be present.
    """
    val_metric = "val_" + metric
    per_epoch = history.history
    plt.plot(per_epoch[metric])
    plt.plot(per_epoch[val_metric], "")
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

In [None]:
# Plot training loss against validation loss over the epochs recorded in `history`.
plot_graphs(history,'loss')

In [None]:
# Predict straight from the raw string arrays; the model's first layer does the
# text vectorisation.
test_predictions = model.predict(test_inputs)
train_predictions = model.predict(train_inputs)

# Truth vs. prediction for both splits; train is drawn first so the test
# points are drawn on top of it.
for split_name, truth, predicted in (
    ('train', train_labels, train_predictions),
    ('test', test_labels, test_predictions),
):
    plt.scatter(truth, predicted, label=split_name, s=2)

plt.legend()
plt.xlabel('Truth')
plt.ylabel('Prediction')
# Save before show(), while the figure is still the current one.
plt.savefig('digits_unigrams.png')
plt.show()