In [1]:
import tensorflow as tf
import numpy as np
import random

In [2]:
from datetime import datetime
# Show the current local date and time at the moment this cell is executed.
datetime.now()

datetime.datetime(2020, 12, 18, 2, 13, 57, 2962)

In [3]:
# Synthetic data set: one record per value produced by np.arange(10e5).
# The stop value is a float, so every value is a float and its text form keeps
# the trailing ".0" (for example "42.0"); the label field stores it as int32.
record_dtype = [('label', 'int32'), ('input', 'U10')]
records = []
for value in np.arange(10e5):
    records.append((value, str(value)))
data = np.array(records, dtype=record_dtype)
# The intermediate Python list is no longer needed once the array exists.
del records
# Shuffle the records in place so that a positional split of `data` is random.
np.random.shuffle(data)

In [4]:
# Fraction of the (already shuffled) records used for training; the remainder
# is held out for testing.
training_fraction = 0.85
train_end = int(len(data) * training_fraction)

# Read each field of the structured array directly. Unpacking every record
# through zip(*data) creates one Python object per record and per field, which
# is slow and memory-hungry for a million rows; field access yields the same
# values as NumPy arrays without leaving NumPy.
labels, inputs = data['label'], data['input']
# .copy() turns each strided field view into a compact array that no longer
# shares memory with `data`.
train_labels, train_inputs = labels[:train_end].copy(), inputs[:train_end].copy()
test_labels, test_inputs = labels[train_end:].copy(), inputs[train_end:].copy()

In [5]:
# Sanity check: the label 0 and the input text '0.0' should be found at the
# same position(s) of the training arrays.
zero_label_positions = np.where(train_labels == 0.0)
zero_input_positions = np.where(train_inputs == '0.0')
print(zero_label_positions)
print(zero_input_positions)

(array([127612]),)
(array([127612]),)


In [6]:
# Wrap each split as a tf.data dataset whose elements are (input text, label)
# pairs, one pair per record.
from_slices = tf.data.Dataset.from_tensor_slices
train_dataset = from_slices((train_inputs, train_labels))
test_dataset = from_slices((test_inputs, test_labels))

In [7]:
# Display the repr of a dataset limited to one element: this shows the element
# shapes and types, not the element's values.
train_dataset.take(1)

<TakeDataset shapes: ((), ()), types: (tf.string, tf.int32)>

In [8]:
# Size of the shuffle buffer: how many elements are held in memory to sample from.
BUFFER_SIZE = 100000
# Number of examples per batch (gradients are averaged over each batch).
BATCH_SIZE = 64
# Prefetching parallelises loading and execution (the data set is not huge, so
# it is not strictly necessary). prefetch() is applied after batch(), so this
# depth counts batches rather than single examples.
PREFETCH_DEPTH = BATCH_SIZE * 2

# Training data: shuffle, then batch, then prefetch.
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(PREFETCH_DEPTH)

# Test data: same batching and prefetching, but no shuffling.
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.prefetch(PREFETCH_DEPTH)

In [9]:
# Pull the first batch out of the training pipeline and print its inputs and labels.
first_batch_iter = iter(train_dataset.take(1))
sample, label = next(first_batch_iter)
print(sample, label)

tf.Tensor(
[b'635320.0' b'861467.0' b'556418.0' b'528254.0' b'280875.0' b'486079.0'
 b'886119.0' b'924523.0' b'238540.0' b'539371.0' b'328721.0' b'864181.0'
 b'463193.0' b'252465.0' b'92172.0' b'426401.0' b'184290.0' b'182992.0'
 b'60584.0' b'312699.0' b'93371.0' b'886704.0' b'161878.0' b'934094.0'
 b'949257.0' b'148043.0' b'497368.0' b'20916.0' b'553428.0' b'854921.0'
 b'544782.0' b'444468.0' b'788741.0' b'162833.0' b'906293.0' b'882122.0'
 b'164355.0' b'155497.0' b'441263.0' b'877413.0' b'955427.0' b'836715.0'
 b'730972.0' b'848987.0' b'398787.0' b'803066.0' b'652655.0' b'360773.0'
 b'682497.0' b'247572.0' b'171618.0' b'841396.0' b'658311.0' b'392022.0'
 b'555893.0' b'118304.0' b'951138.0' b'629005.0' b'20390.0' b'930356.0'
 b'368249.0' b'109488.0' b'541823.0' b'560843.0'], shape=(64,), dtype=string) tf.Tensor(
[635320 861467 556418 528254 280875 486079 886119 924523 238540 539371
 328721 864181 463193 252465  92172 426401 184290 182992  60584 312699
  93371 886704 161878 934094 9492

In [10]:
def standardize(batch):
    '''
    Rewrite number strings so that every digit becomes its own token.

    Three regex substitutions are applied to ``batch`` in order with
    ``tf.strings.regex_replace`` and the final result is returned:

    1. drop the fractional part: a '.', ',' or '|' that is followed by a
       digit, together with everything after it;
    2. strip the remaining punctuation characters;
    3. append a space to every digit, so "123" becomes "1 2 3 ".
    '''
    substitutions = (
        # Remove any pennies/cents
        (r'([\.|,][0-9].*)', ''),
        # Normal punctuation strip
        (r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']', ""),
        # Spread out the digits so each one is frequent enough to appear in the vocab
        (r'([0-9])', r'\1 '),
    )
    for pattern, rewrite in substitutions:
        batch = tf.strings.regex_replace(batch, pattern, rewrite)
    return batch

# Upper bound on the number of tokens kept in the encoder's vocabulary.
VOCAB_SIZE = 10000

# Text-to-integer layer: uses the custom `standardize` callable and ngrams=(1, 2).
TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization
encoder = TextVectorization(
    standardize=standardize,
    ngrams=(1, 2),
    max_tokens=VOCAB_SIZE,
)


In [11]:
# Fit the encoder's vocabulary on the training inputs only (labels are dropped).
train_text_only = train_dataset.map(lambda text, label: text)
encoder.adapt(train_text_only)

In [12]:
# Encode the sample batch and keep the first three rows for inspection.
encoded_batch = encoder(sample).numpy()
encoded_sample = encoded_batch[:3]
print(encoded_sample)

[[ 6  7  3  7  9 11 64 16 48 88 69]
 [10  6  5  2  6  8 92 67 17 26 37]
 [ 3  3  6  2  5 10 79 15 81 60 98]]


In [13]:
# Map the encoded ids back to vocabulary entries to see what the encoder kept
# of each original string.
vocab = np.array(encoder.get_vocabulary())
for row, token_ids in enumerate(encoded_sample):
    original_text = sample[row].numpy()
    decoded_text = " ".join(vocab[token_ids])
    print("Original: ", original_text)
    print("Round-trip: ", decoded_text)
    print()

Original:  b'635320.0'
Round-trip:  6 3 5 3 2 0 6 3 3 5 5 3 3 2 2 0

Original:  b'861467.0'
Round-trip:  8 6 1 4 6 7 8 6 6 1 1 4 4 6 6 7

Original:  b'556418.0'
Round-trip:  5 5 6 4 1 8 5 5 5 6 6 4 4 1 1 8



In [14]:
# Regression model: text encoder -> embedding -> bidirectional LSTM -> two
# ReLU dense layers -> a single linear output unit.
keras_layers = tf.keras.layers
layer_stack = [
    encoder,
    # One embedding row per vocabulary entry learned by the encoder.
    keras_layers.Embedding(
        input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True
    ),
    keras_layers.Bidirectional(keras_layers.LSTM(64)),
    keras_layers.Dense(64, activation="relu"),
    keras_layers.Dense(64, activation="relu"),
    keras_layers.Dense(1),
]
model = tf.keras.Sequential(layer_stack)

# Train with mean squared error between the prediction and the integer label.
mse_loss = tf.keras.losses.MeanSquaredError()
adam = tf.keras.optimizers.Adam()
model.compile(loss=mse_loss, optimizer=adam)

In [None]:
# Train for 100 epochs. After each epoch, validation runs on test_dataset and
# is capped at 30 steps (batches) via validation_steps.
fit_options = dict(epochs=100, validation_data=test_dataset, validation_steps=30)
history = model.fit(train_dataset, **fit_options)

Epoch 1/100

In [None]:
# Evaluate the trained model on the batched test dataset (no step limit is
# passed here) and keep the returned result in `res`.
res = model.evaluate(test_dataset)

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: object with a ``history`` mapping from metric name to a
            sequence of per-epoch values (as returned by ``model.fit``).
        metric: name of the training metric; the validation series is looked
            up under ``"val_" + metric``.
    """
    val_metric = "val_" + metric
    curves = history.history
    plt.plot(curves[metric])
    plt.plot(curves[val_metric], "")
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

In [None]:
# Plot the training loss and validation loss recorded in `history` against the epoch.
plot_graphs(history,'loss')

In [None]:
# Predict on the raw input strings of both splits (the encoder is the model's
# first layer, so no separate preprocessing is needed).
test_predictions = model.predict(test_inputs)
train_predictions = model.predict(train_inputs)

# Truth-vs-prediction scatter: train points are drawn first, test points on top.
scatter_series = (
    ('train', train_labels, train_predictions),
    ('test', test_labels, test_predictions),
)
for split_name, truth, predicted in scatter_series:
    plt.scatter(truth, predicted, label=split_name, s=2)
plt.legend()
plt.xlabel('Truth')
plt.ylabel('Prediction')
# Save before show() so the figure is written to disk while it is still current.
plt.savefig('digits_bigrams.png')
plt.show()