# Use a DNN to learn the mapping between a hex string and a digit string

The goal is to use a Keras Sequential model to learn the mapping between a hex string (e.g. "dbb9f") and a digit string (e.g. "899999"). The digit string is the base-10 (decimal) representation of the value of the hex string.

In [1]:
import os
import keras
from keras import models
from keras import layers
import numpy as np
import pandas as pd

import tensorflow as tf
from keras.layers import Input, Dense, Reshape, Flatten
from keras.activations import selu
from keras.models import Sequential
import numpy as np


Using TensorFlow backend.


## Data and Methods for One Hot Encoding

In [2]:
digits = "0123456789"
hex_digits = digits + "abcdef"

# Lookup tables between a symbol and its position in the alphabet.
# The position is the class index used when one hot encoding.
digits_int_to_char = dict(enumerate(digits))
digits_char_to_int = dict(zip(digits, range(len(digits))))

hex_int_to_char = dict(enumerate(hex_digits))
hex_char_to_int = dict(zip(hex_digits, range(len(hex_digits))))

# Encoding

def one_hot_encode_hex_string(str):
    """
    One hot encode a hex string.

    Every character is looked up in hex_char_to_int and the resulting
    class indices are handed to keras.utils.to_categorical with
    len(hex_digits) classes.
    A character that is not a key of hex_char_to_int raises KeyError.

    NOTE: the parameter name shadows the builtin `str`; it is kept
    because it is part of the function's signature.
    """
    class_indices = list(map(hex_char_to_int.__getitem__, str))
    return keras.utils.to_categorical(class_indices, num_classes=len(hex_digits))

def one_hot_encode_digits_string(str):
    """
    One hot encode a string of decimal digits.

    Every character is looked up in digits_char_to_int and the resulting
    class indices are handed to keras.utils.to_categorical with
    len(digits) classes.
    A character that is not a key of digits_char_to_int raises KeyError.

    NOTE: the parameter name shadows the builtin `str`; it is kept
    because it is part of the function's signature.
    """
    class_indices = []
    for symbol in str:
        class_indices.append(digits_char_to_int[symbol])
    return keras.utils.to_categorical(class_indices, num_classes=len(digits))

# Decoding

def one_hot_decode_hex_to_str(arr):
    """
    Turn a one hot encoded hex string back into its characters.

    For each row of arr, the position of the largest value (np.argmax)
    is translated to a character through hex_int_to_char; the characters
    are concatenated in row order.
    """
    return "".join(hex_int_to_char[np.argmax(row)] for row in arr)

def one_hot_decode_digits_to_str(arr):
    """
    Turn a one hot encoded digit string back into its characters.

    For each row of arr, the position of the largest value (np.argmax)
    is translated to a character through digits_int_to_char; the
    characters are concatenated in row order.
    """
    decoded_chars = [digits_int_to_char[np.argmax(row)] for row in arr]
    return "".join(decoded_chars)

## Example hex string
#### Model Input

In [3]:
# An input/hex string is *always* 5 characters
# Range: "186a0" - "dbb9f"

hex_str = "186a2"

# Round trip: encode, then decode back to the original characters
one_hot_encoded = one_hot_encode_hex_string(hex_str)
original_string = one_hot_decode_hex_to_str(one_hot_encoded)

# Show the encoded matrix, its shape and the decoded string (one per line)
print(one_hot_encoded, one_hot_encoded.shape, original_string, sep="\n")

[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
(5, 16)
186a2


## Example digit string
#### Training label

In [4]:
# A label/digit string is *always* 6 characters
# Range: "100000" - "899999"

digit_str = "100002"

# Round trip: encode, then decode back to the original characters
one_hot_encoded = one_hot_encode_digits_string(digit_str)
original_string = one_hot_decode_digits_to_str(one_hot_encoded)

# Show the encoded matrix, its shape and the decoded string (one per line)
print(one_hot_encoded, one_hot_encoded.shape, original_string, sep="\n")

[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]]
(6, 10)
100002


## Create Data

In [6]:
def create_csv(fname, start_num, count):
    """
    Create a csv named fname containing a header and count data rows.

    The header row is "string,hex". Each following row contains:
        * the decimal string of a number (starting at start_num)
            * incremented by 1 each row
        * the hex string of that same number
            * lowercase, without the leading "0x"

    An existing file named fname is overwritten. A count of 0 or less
    produces a file holding only the header row.
    """
    with open(fname, "w") as f:
        f.write("string,hex\n")
        # Stream the rows to the file instead of accumulating the whole
        # payload in one string first. The "x" format spec renders the
        # hex digits directly, which also keeps the sign of negative
        # numbers intact (slicing hex(i)[2:] would cut into "-0x...").
        f.writelines(
            "{},{:x}\n".format(i, i)
            for i in range(start_num, start_num + count)
        )
        
data_filename = 'data.csv'

# The file only has to be generated once; later runs reuse it.
data_file_exists = os.path.isfile(data_filename)

if not data_file_exists:
    # 800_000 rows: from 100_000 (0x186a0) to 899_999 (0xdbb9f)
    create_csv(data_filename, start_num=100000, count=800000)

## Load data

In [7]:
# Read both columns as strings so every value keeps its exact characters
df = pd.read_csv(data_filename, dtype={'string': str, 'hex': str})

# Used to test with less data
use_full_dataset = True

# Fixed seed so the shuffle below produces the same row order on every
# run, which keeps the train/validation/test membership reproducible
shuffle_seed = 42

if use_full_dataset:
    # randomize rows when using the full dataset
    # The default integer index is kept on purpose: the 'string' and
    # 'hex' values must both stay available as regular columns.
    df = df.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
else:
    df = df.iloc[:5]

df.head()

Unnamed: 0,string,hex
0,560510,88d7e
1,516793,7e2b9
2,753922,b8102
3,136933,216e5
4,325765,4f885


## One hot encode all hex and label strings

In [9]:
# Encode each column by name rather than by tuple position, so the result
# does not depend on how the frame's index/columns happen to be laid out.
# Row order is preserved: all_labels_encoded[i] and all_hexes_encoded[i]
# both describe row i of df.
all_labels_encoded = np.asarray(
    [one_hot_encode_digits_string(label_string) for label_string in df['string']]
)
all_hexes_encoded = np.asarray(
    [one_hot_encode_hex_string(hex_string) for hex_string in df['hex']]
)

print(all_labels_encoded.shape)
print(all_hexes_encoded.shape)

(800000, 6, 10)
(800000, 5, 16)


## Verify that encoding and decoding work

In [11]:
# Decode the first five examples; each printed line reads "digits -> hex"
for i in range(5):
    decoded_label = one_hot_decode_digits_to_str(all_labels_encoded[i])
    decoded_hex = one_hot_decode_hex_to_str(all_hexes_encoded[i])
    print("{} -> {}".format(decoded_label, decoded_hex))

560510 -> 88d7e
516793 -> 7e2b9
753922 -> b8102
136933 -> 216e5
325765 -> 4f885


## Split encoded arrays into training, validation, and test sets

In [12]:
example_cnt = len(df)

if use_full_dataset:
    # 70% training and 15% validation; the test set takes whatever is
    # left (~15%) so that truncation in int() can never leave examples
    # unassigned
    n_training_examples   = int(example_cnt * 0.7)
    n_validation_examples = int(example_cnt * 0.15)
    n_test_examples       = example_cnt - n_training_examples - n_validation_examples
else:
    n_training_examples   = 3
    n_validation_examples = 1
    n_test_examples       = 1

print("Total expected  :", len(df))
print("Total calculated:", n_training_examples + n_validation_examples + n_test_examples)
assert n_training_examples + n_validation_examples + n_test_examples == example_cnt, \
    "split sizes do not add up to the number of examples"

# The three slices are consecutive and therefore never overlap
validation_end = n_training_examples + n_validation_examples
test_end       = validation_end + n_test_examples

x_train = all_hexes_encoded[:n_training_examples]
y_train = all_labels_encoded[:n_training_examples]
print("Training examples:", len(x_train))

x_validation = all_hexes_encoded[n_training_examples  : validation_end]
y_validation = all_labels_encoded[n_training_examples : validation_end]
print("Validation examples:", len(x_validation))

x_test = all_hexes_encoded[validation_end  : test_end]
y_test = all_labels_encoded[validation_end : test_end]
print("Test examples", len(x_test))

Total expected  : 800000
Total calculated: 800000
Training examples: 560000
Validation examples: 120000
Test examples 120000


## Shape of training and label data

In [13]:
# Report the shape of the training inputs and of their labels
print("x_train.shape: {}".format(x_train.shape))
print("y_train.shape: {}".format(y_train.shape))

x_train.shape: (560000, 5, 16)
y_train.shape: (560000, 6, 10)


## Verify our dimensions match our expectations

In [14]:
#
# Dimensions of the input matrix: one hot encoded hex string (e.g. "186a2")
#

# fixed length (in chars) of a hex string, measured on the first example
hex_n = len(one_hot_decode_hex_to_str(all_hexes_encoded[0]))

# number of symbols in the "alphabet" of a hex string
hex_k = len(hex_digits)

expected_input_dims = (hex_n, hex_k)
print("Training dimensions match:", x_train.shape[1:3] == expected_input_dims)

#
# Dimensions of the label matrix: one hot encoded digits string (e.g. "100002")
#

# fixed length (in chars) of a digits string, measured on the first example
label_n = len(one_hot_decode_digits_to_str(all_labels_encoded[0]))

# number of symbols in the "alphabet" of a digits string
label_k = len(digits)

expected_label_dims = (label_n, label_k)
print("Label dimensions match:", y_train.shape[1:3] == expected_label_dims)

Training dimensions match: True
Label dimensions match: True


# Build and compile model

### Questions
* How do we determine the correct number of nodes in the initial layer?
* The shape of the input data is `(5, 16)` and the shape of our labels is `(6, 10)`. How do we define a model that can convert between these differing dimensions?
* Do we have to create our own [loss function implementation](https://stackoverflow.com/questions/43576922/keras-custom-metric-iteration) to make this work?

In [17]:
n = 5     # input size (characters per hex string)
m = 6     # output size (characters per digit string)
l = 16    # input alphabet size
k = 10    # output alphabet size
bs = 512  # batch size

hidden_layer_count = 6
hidden_units = 64

# Flatten the (n, l) one hot input, pass it through a stack of identical
# selu layers, then emit m * k raw scores (no activation) reshaped to
# (m, k): one row of k scores per output character.
model = Sequential()
model.add(Flatten(input_shape=(n, l)))
for layer_index in range(hidden_layer_count):
    model.add(Dense(hidden_units, activation=selu))
model.add(Dense(m * k, activation=None))
model.add(Reshape((m, k)))

def batch_accuracy(truth, pred):
    """
    Per-example fraction of output positions whose predicted symbol
    equals the labelled symbol.

    truth and pred are both reduced with argmax over axis 2, so they are
    expected to be laid out as (batch, positions, classes).
    Returns one value per example. This is a per-character score: an
    example only reaches 1.0 when every position is predicted correctly.
    """
    truth_decoded = tf.argmax(truth, axis = 2)
    pred_decoded = tf.argmax(pred, axis = 2)
    # correct is bs x positions, 1.0 where the symbols agree and 0.0 otherwise
    correct = tf.cast(tf.equal(truth_decoded, pred_decoded), tf.float32)
    # Average over the position axis rather than dividing by the global m,
    # so the metric cannot go stale if m is changed or the label length differs
    accuracy = tf.reduce_mean(correct, axis = 1)
    return accuracy

def batch_crossentropy(truth, pred):
    """
    Softmax cross entropy per output position, summed per example.

    truth holds the one hot labels and pred the raw (pre-softmax) scores;
    both are treated as B x M x K where B is the batch size and K is the
    global k (output alphabet size).
    Returns one loss value per example.
    """
    # Read the batch size before the tensors are flattened
    examples_in_batch = tf.shape(truth)[0]

    # Flatten to (B * M) x K so that every row is a separate
    # probability distribution over the k output symbols
    flat_truth = tf.reshape(truth, (-1, k))
    flat_pred = tf.reshape(pred, (-1, k))

    per_position = tf.nn.softmax_cross_entropy_with_logits(labels = flat_truth,
                                                           logits = flat_pred)

    # per_position has B * M entries; fold it back to B x M and
    # add up the positions belonging to each example
    per_example = tf.reshape(per_position, (examples_in_batch, -1))
    return tf.reduce_sum(per_example, axis = 1)

# Optimise the custom summed cross entropy and track batch_accuracy
# alongside it during training and evaluation
model.compile(
    loss=batch_crossentropy,
    optimizer='rmsprop',
    metrics=[batch_accuracy],
)


## Train

In [18]:
n_epochs = 10

# Keep the returned History object so the per-epoch loss/metric values
# stay available after training (history.history) instead of only
# showing the object's repr as the cell output
history = model.fit(x_train, y_train,
                    batch_size = bs, verbose = 1, epochs = n_epochs,
                    validation_data=(x_validation, y_validation))

Train on 560000 samples, validate on 120000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f38bc475d30>


# Evaluate the trained model on data it has never seen

In [24]:
# results[0] is the loss and results[1] the batch_accuracy metric
# (a fraction, shown here as a percentage)
results = model.evaluate(x_test, y_test)
print("\nloss    : {}\naccuracy: {}".format(results[0], results[1] * 100))

test_amount = 100
# Only predict the examples that are displayed below; slicing the input
# first avoids running inference over the entire test set
output = model.predict(x_test[:test_amount])
pred_digits = map(one_hot_decode_digits_to_str, output)
truth_digits = map(one_hot_decode_digits_to_str, y_test[:test_amount])

# Each line: predicted digit string, then the true digit string
for pred, truth in zip(pred_digits, truth_digits):
    print(pred, truth)

loss    : 4.290548793411255
accuracy: 69.20875106811523
485130 485164
859656 859720
384937 384963
192497 192507
232498 232472
810555 810523
524577 524557
838347 838359
339155 339155
195037 195101
841457 841439
627010 627070
502015 502053
650954 650990
809445 809489
720150 720114
397210 397322
142850 142904
104727 104765
782917 782915
519725 519743
758177 758133
378017 378099
378625 378695
669155 669149
747087 747095
358750 358766
150344 150366
328150 328170
430550 430582
725577 725541
188287 188279
439550 439582
840850 840894
723010 723072
252050 252074
309276 309252
364037 364087
798050 798106
436377 436341
499855 499855
362795 362789
580340 580358
763827 763875
227441 227467
348485 348483
652187 652181
466180 466108
250555 250581
725376 725330
405827 405879
785537 785607
254624 254648
577827 577889
540750 540710
654650 654654
389655 389617
563887 563833
390344 390384
212725 212799
248150 248120
653571 653491
290354 290388
441757 441727
212695 212621
611372 611378
887440 887444
554480