# Use a DNN to learn the mapping between a hex string and a digit string

The goal is to use a Keras Sequential model to learn the mapping between a hex string (e.g. "dbb9f") and a digit string (e.g. "899999"). The digit string is the base-10 (decimal) representation of the number that the hex string encodes.

In [1]:
import os
import keras
from keras import models
from keras import layers
import numpy as np
import pandas as pd

Using TensorFlow backend.


## Data and Methods for One Hot Encoding

In [2]:
# Alphabets: decimal digits, and decimal digits followed by a-f for hex.
digits = "0123456789"
hex_digits = digits + "abcdef"

# Lookup tables between a character and its position in the alphabet.
# The position doubles as the class index used for one hot encoding.
digits_char_to_int = {char: index for index, char in enumerate(digits)}
digits_int_to_char = dict(enumerate(digits))

hex_char_to_int = {char: index for index, char in enumerate(hex_digits)}
hex_int_to_char = dict(enumerate(hex_digits))

# Encoding

def one_hot_encode_hex_string(str):
    """
    One hot encode a hex string.

    Each character is mapped to its index via ``hex_char_to_int`` and the
    indices are handed to ``keras.utils.to_categorical`` with
    ``len(hex_digits)`` classes, giving one row per character.

    A character missing from ``hex_char_to_int`` (for example an uppercase
    letter) raises ``KeyError``.

    Note: the parameter name shadows the ``str`` builtin inside this
    function; it is kept so existing callers are unaffected.
    """
    class_indices = []
    for char in str:
        class_indices.append(hex_char_to_int[char])
    return keras.utils.to_categorical(class_indices, num_classes=len(hex_digits))

def one_hot_encode_digits_string(str):
    """
    One hot encode a string of decimal digits.

    Each character is mapped to its index via ``digits_char_to_int`` and the
    indices are handed to ``keras.utils.to_categorical`` with ``len(digits)``
    classes, giving one row per character.

    A character missing from ``digits_char_to_int`` raises ``KeyError``.

    Note: the parameter name shadows the ``str`` builtin inside this
    function; it is kept so existing callers are unaffected.
    """
    class_indices = []
    for char in str:
        class_indices.append(digits_char_to_int[char])
    return keras.utils.to_categorical(class_indices, num_classes=len(digits))

# Decoding

def one_hot_decode_hex_to_str(arr):
    """
    Turn a one hot encoded matrix back into a hex string.

    For every row of ``arr`` the index of the largest value is looked up in
    ``hex_int_to_char``; the resulting characters are concatenated in row
    order.
    """
    return "".join(hex_int_to_char[np.argmax(one_hot_row)] for one_hot_row in arr)

def one_hot_decode_digits_to_str(arr):
    """
    Turn a one hot encoded matrix back into a string of decimal digits.

    For every row of ``arr`` the index of the largest value is looked up in
    ``digits_int_to_char``; the resulting characters are concatenated in row
    order.
    """
    return "".join(digits_int_to_char[np.argmax(one_hot_row)] for one_hot_row in arr)

## Example hex string
#### Model Input

In [3]:
# An input (hex) string is *always* 5 characters long.
# Range: "186a0" - "dbb9f"

hex_str = "186a2"

# Encode the example, show the matrix and its shape (one row per character,
# one column per hex symbol), then decode it again to confirm the round trip
# returns the original text.
one_hot_encoded = one_hot_encode_hex_string(hex_str)
print(one_hot_encoded)
print(one_hot_encoded.shape)
original_string = one_hot_decode_hex_to_str(one_hot_encoded)
print(original_string)

[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
(5, 16)
186a2


## Example digit string
#### Training label

In [4]:
# A label (digit) string is *always* 6 characters long.
# Range: "100000" - "899999"

digit_str = "100002"

# Encode the example, show the matrix and its shape (one row per character,
# one column per decimal digit), then decode it again to confirm the round
# trip returns the original text.
one_hot_encoded = one_hot_encode_digits_string(digit_str)
print(one_hot_encoded)
print(one_hot_encoded.shape)
original_string = one_hot_decode_digits_to_str(one_hot_encoded)
print(original_string)

[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]]
(6, 10)
100002


## Create Data

In [5]:
def create_csv(fname, start_num, count):
    """
    Write a CSV file named ``fname`` holding a header and ``count`` data rows.

    The header line is ``string,hex``. Data row ``k`` (0-based) describes the
    integer ``start_num + k``:
        * ``string`` column: the number written in decimal
        * ``hex`` column: the same number in lowercase hex, without the
          leading "0x"

    An existing file with the same name is overwritten. A ``count`` of zero
    (or less) produces a file that contains only the header.

    Args:
        fname: path of the file to write.
        start_num: first integer to emit.
        count: number of consecutive integers to emit.
    """
    with open(fname, "w") as f:
        f.write("string,hex\n")
        # Stream the rows instead of concatenating them into one large string
        # first. format(i, "x") matches hex(i)[2:] for non-negative numbers
        # and, unlike the slice, keeps the sign intact for negative ones.
        f.writelines(
            "{},{}\n".format(i, format(i, "x"))
            for i in range(start_num, start_num + count)
        )
        
# File that holds the generated (decimal string, hex string) pairs.
data_filename = 'data.csv'

# Create it if it doesn't already exist, so re-running the notebook reuses
# the file instead of regenerating it.
if not os.path.isfile(data_filename):
    # 800_000 is the number of rows, not the end value:
    # from 100_000 (0x186a0) to 899_999 (0xdbb9f)
    create_csv(data_filename, 100_000, 800_000)

## Load data

In [6]:
# Read both columns as text so the digit strings and hex strings are kept
# exactly as written in the file.
df = pd.read_csv(data_filename, dtype={'string': str, 'hex': str})

# Used to test with less data
use_full_dataset = False

if use_full_dataset:
    # Randomize rows when using the full dataset. The fixed seed keeps the
    # shuffle (and therefore the later train/validation/test split)
    # reproducible across kernel restarts.
    # 'string' deliberately stays a regular column: the encoding step reads
    # both the 'string' and the 'hex' column of every row.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
else:
    # Small, unshuffled subset for quick experiments.
    df = df.iloc[:5]

df.head()

Unnamed: 0,string,hex
0,100000,186a0
1,100001,186a1
2,100002,186a2
3,100003,186a3
4,100004,186a4


## One hot encode all hex and label strings

In [7]:
# One hot encode every row of the frame. The two arrays are built in row
# order, so entry i of each array belongs to the same row of df.
all_labels_encoded = np.asarray(
    [one_hot_encode_digits_string(label_value) for label_value in df['string']]
)
all_hexes_encoded = np.asarray(
    [one_hot_encode_hex_string(hex_value) for hex_value in df['hex']]
)

print(all_labels_encoded.shape)
print(all_hexes_encoded.shape)

(5, 6, 10)
(5, 5, 16)


## Verify that encoding decoding works

In [8]:
# Decode the first few encoded rows and check that each one matches the text
# it was encoded from. Bounded by the number of rows so a shorter frame
# cannot cause an out-of-range index.
for i in range(min(5, len(df))):
    decoded_label = one_hot_decode_digits_to_str(all_labels_encoded[i])
    decoded_hex = one_hot_decode_hex_to_str(all_hexes_encoded[i])

    assert decoded_label == df['string'].iloc[i], "label round trip failed for row {}".format(i)
    assert decoded_hex == df['hex'].iloc[i], "hex round trip failed for row {}".format(i)

    print(decoded_label, "->", decoded_hex)

100000 -> 186a0
100001 -> 186a1
100002 -> 186a2
100003 -> 186a3
100004 -> 186a4


## Split encoded arrays into training, validation, and test sets

In [9]:
if use_full_dataset:
    # 70% / 15% / 15% split. The sizes use integer arithmetic and the training
    # set takes whatever remains, so the three parts always add up to the
    # number of examples.
    example_cnt = len(all_hexes_encoded)
    n_validation_examples = example_cnt * 15 // 100
    n_test_examples       = example_cnt * 15 // 100
    n_training_examples   = example_cnt - n_validation_examples - n_test_examples
else:
    # Fixed split for the 5 row subset used for quick experiments.
    n_training_examples   = 3
    n_validation_examples = 1
    n_test_examples       = 1

print("Total expected  :", len(df))
print("Total calculated:", n_training_examples + n_validation_examples + n_test_examples)

# Training set: the first n_training_examples rows.
x_train = all_hexes_encoded[:n_training_examples]
y_train = all_labels_encoded[:n_training_examples]
print("Training examples:", len(x_train))

# Validation set: the rows immediately after the training rows.
validation_end = n_training_examples + n_validation_examples
x_validation = all_hexes_encoded[n_training_examples:validation_end]
y_validation = all_labels_encoded[n_training_examples:validation_end]
print("Validation examples:", len(x_validation))

# Test set: the last n_test_examples rows.
x_test = all_hexes_encoded[len(all_hexes_encoded) - n_test_examples:]
y_test = all_labels_encoded[len(all_labels_encoded) - n_test_examples:]
print("Test examples", len(x_test))

Total expected  : 5
Total calculated: 5
Training examples: 3
Validation examples: 1
Test examples 1


## Shape of training and label data

In [10]:
# Shapes are (examples, characters per string, symbols in the alphabet).
print("x_train.shape:", x_train.shape)
print("y_train.shape:", y_train.shape)

x_train.shape: (3, 5, 16)
y_train.shape: (3, 6, 10)


## Verify our dimensions match our expectations

In [11]:
#
# Dimensions of the input matrix: one hot encoded hex string (e.g. "186a2")
#

# fixed length (in chars) of a hex string: the encoding has one row per char
hex_n = len(all_hexes_encoded[0])

# number of symbols in the "alphabet" of a hex string
hex_k = len(hex_digits)

#
# Dimensions of the label matrix: one hot encoded digits string (e.g. "100002")
#

# fixed length (in chars) of a digits string: the encoding has one row per char
label_n = len(all_labels_encoded[0])

# number of symbols in the "alphabet" of a digits string
label_k = len(digits)

# Compare the per-example shape of the training arrays with the expected
# (string length, alphabet size) pairs.
print("Training dimensions match:", (hex_n, hex_k) == x_train.shape[1:3])
print("Label dimensions match:", (label_n, label_k) == y_train.shape[1:3])

Training dimensions match: True
Label dimensions match: True


# Build and compile model

### Questions
* How do we determine the correct number of nodes in the initial layer?
* The shape of the input data is `(n, 5, 16)` and the shape of our labels is `(n, 6, 10)`. How do we define a model that can convert between these differing dimensions?
* Do we have to create our own [loss function implementation](https://stackoverflow.com/questions/43576922/keras-custom-metric-iteration) to make this work?

In [12]:
# Still disabled, like the training and evaluation cells: switch the guards
# to True together once the architecture is settled.
if False:
    # Width of the hidden layer; a starting point to tune, not a derived value.
    hidden_units = 128

    model = models.Sequential()

    # Each example is a (hex_n, hex_k) one hot matrix. Flatten it into a
    # single vector of hex_n * hex_k values so Dense layers see the whole
    # string at once.
    model.add(layers.Flatten(input_shape=(hex_n, hex_k)))
    model.add(layers.Dense(hidden_units, activation='relu'))

    # TODO add more dense layers

    # One raw score per (character position, digit) pair, reshaped to the
    # label layout (label_n, label_k).
    model.add(layers.Dense(label_n * label_k, activation=None))
    model.add(layers.Reshape((label_n, label_k)))

    # Softmax over the last axis: every character position gets its own
    # probability distribution over the label_k digits.
    model.add(layers.Activation('softmax'))

    # Each position is a single-label, multi-class prediction against a one
    # hot target, hence categorical (not binary) cross-entropy.
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

## Train

In [13]:
# Disabled until the model cell above is enabled; `model` does not exist
# otherwise.
if False:
    # Fit on the training split and report metrics on the validation split
    # after every epoch. The returned object is kept in `history`.
    history = model.fit(x_train,
                        y_train,
                        epochs=20,
                        batch_size=512,
                        validation_data=(x_validation, y_validation))

# Evaluate the trained model on data it has never seen

In [14]:
# Disabled until the model and training cells are enabled.
if False:
    # The model is compiled with a single metric, so evaluate() yields the
    # loss followed by the accuracy. The accuracy is scaled by 100 for display.
    test_loss, test_accuracy = model.evaluate(x_test, y_test)
    print("\nloss    : {}\naccuracy: {}".format(test_loss, test_accuracy * 100))