# Use a DNN to learn the mapping between a hex string and a digit string

The goal is to use a Keras Sequential model to learn the mapping between a hex string (e.g. "dbb9f") and a digit string (e.g. "899999"). The digit string is the base-10 (decimal) representation of the number written by the hex string.

In [None]:
import keras
from keras import models
from keras import layers
import numpy as np
import pandas as pd

## Data and Methods for One Hot Encoding

In [None]:
# Alphabets of the two string kinds. The position of a symbol inside its
# alphabet is used as that symbol's one-hot class index.
digits = "0123456789"
hex_digits = digits + "abcdef"

# Decimal digit strings: class index -> symbol, and the inverse mapping.
digits_int_to_char = dict(enumerate(digits))
digits_char_to_int = {symbol: index for index, symbol in digits_int_to_char.items()}

# Hex strings: class index -> symbol, and the inverse mapping.
hex_int_to_char = dict(enumerate(hex_digits))
hex_char_to_int = {symbol: index for index, symbol in hex_int_to_char.items()}

# Encoding

def one_hot_encode_hex_string(str):
    """One-hot encode a hex string, one row per character.

    Every character of ``str`` is looked up in ``hex_char_to_int`` and the
    resulting class indices are expanded by ``keras.utils.to_categorical``
    using ``len(hex_digits)`` classes. A character that is missing from the
    lookup table raises ``KeyError``.
    """
    class_ids = list(map(hex_char_to_int.__getitem__, str))
    return keras.utils.to_categorical(class_ids, num_classes=len(hex_digits))

def one_hot_encode_digits_string(str):
    """One-hot encode a decimal digit string, one row per character.

    Every character of ``str`` is looked up in ``digits_char_to_int`` and the
    resulting class indices are expanded by ``keras.utils.to_categorical``
    using ``len(digits)`` classes. A character that is missing from the
    lookup table raises ``KeyError``.
    """
    class_ids = list(map(digits_char_to_int.__getitem__, str))
    return keras.utils.to_categorical(class_ids, num_classes=len(digits))

# Decoding

def one_hot_decode_hex_to_str(arr):
    """Decode a matrix of per-character scores back into a hex string.

    ``arr`` is iterated row by row; for each row the index of its largest
    value (``np.argmax``) is translated to a character via
    ``hex_int_to_char``. Returns the characters joined in row order, or an
    empty string when ``arr`` has no rows.
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return "".join(hex_int_to_char[np.argmax(row)] for row in arr)

def one_hot_decode_digits_to_str(arr):
    """Decode a matrix of per-character scores back into a digit string.

    ``arr`` is iterated row by row; for each row the index of its largest
    value (``np.argmax``) is translated to a character via
    ``digits_int_to_char``. Returns the characters joined in row order, or
    an empty string when ``arr`` has no rows.
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return "".join(digits_int_to_char[np.argmax(row)] for row in arr)

## Example hex string
#### Model Input

In [None]:
# An input/hex string is *always* 5 characters
# Range: "186a0" - "dbb9f"

hex_str = "186a2"

# Round trip: encode, then decode again, and show the encoded matrix,
# its shape and the recovered string (one item per output line).
one_hot_encoded = one_hot_encode_hex_string(hex_str)
original_string = one_hot_decode_hex_to_str(one_hot_encoded)
print(one_hot_encoded, one_hot_encoded.shape, original_string, sep="\n")

## Example digit string
#### Training label

In [None]:
# A label/digit string is *always* 6 characters
# Range: "100000" - "899999"

digit_str = "100002"

# Round trip: encode, then decode again, and show the encoded matrix,
# its shape and the recovered string (one item per output line).
one_hot_encoded = one_hot_encode_digits_string(digit_str)
original_string = one_hot_decode_digits_to_str(one_hot_encoded)
print(one_hot_encoded, one_hot_encoded.shape, original_string, sep="\n")

## Load data

In [None]:
# Force both columns to str so that all-digit values are not parsed as numbers.
df = pd.read_csv('data.csv', dtype={'string': str, 'hex': str})

# Used to test with less data
use_full_dataset = False

if use_full_dataset:
    # randomize rows when using the full dataset
    df = df.sample(frac=1).reset_index(drop=True)
else:
    # keep only the first five rows for quick experiments
    df = df.head(5)

# 'string' deliberately stays a regular column (not the index): the encoding
# step below reads it as a column.

df.head()

## One hot encode all hex and label strings

In [None]:
# Encode every row. Columns are addressed by name (the same names used in the
# read_csv dtype mapping) so the result does not depend on the column order
# of the CSV file.
all_labels_encoded = np.asarray(
    [one_hot_encode_digits_string(label_string) for label_string in df['string']]
)
all_hexes_encoded = np.asarray(
    [one_hot_encode_hex_string(hex_string) for hex_string in df['hex']]
)

print(all_labels_encoded.shape)
print(all_hexes_encoded.shape)

## Verify that encoding and decoding work

In [None]:
# Decode up to the first five encoded examples and print them as
# "<digit string> -> <hex string>". Slicing (rather than indexing 0..4)
# keeps this working when fewer than five examples are loaded.
for label_rows, hex_rows in zip(all_labels_encoded[:5], all_hexes_encoded[:5]):
    print(one_hot_decode_digits_to_str(label_rows),
          "->",
          one_hot_decode_hex_to_str(hex_rows))

## Split encoded arrays into training, validation, and test sets

In [None]:
# Number of examples available for splitting.
example_cnt = len(df)

if use_full_dataset:
    # 70% training, 15% validation; the test set takes whatever is left so
    # that int() truncation never leaves examples unassigned.
    n_training_examples   = int(example_cnt * 0.7)
    n_validation_examples = int(example_cnt * 0.15)
    n_test_examples       = example_cnt - n_training_examples - n_validation_examples
else:
    # Fixed split for the reduced five-row dataset.
    n_training_examples   = 3
    n_validation_examples = 1
    n_test_examples       = 1

print("Total expected  :", len(df))
print("Total calculated:", n_training_examples + n_validation_examples + n_test_examples)

# Training set: the first n_training_examples rows.
x_train = all_hexes_encoded[:n_training_examples]
y_train = all_labels_encoded[:n_training_examples]
print("Training examples:", len(x_train))

# Validation set: the rows immediately after the training rows.
validation_end = n_training_examples + n_validation_examples
x_validation = all_hexes_encoded[n_training_examples:validation_end]
y_validation = all_labels_encoded[n_training_examples:validation_end]
print("Validation examples:", len(x_validation))

# Test set: the last n_test_examples rows.
x_test = all_hexes_encoded[len(all_hexes_encoded) - n_test_examples:]
y_test = all_labels_encoded[len(all_labels_encoded) - n_test_examples:]
print("Test examples", len(x_test))

## Shape of training and label data

In [None]:
# Report the shapes of the training inputs and of their labels.
for caption, array in (("x_train.shape:", x_train), ("y_train.shape:", y_train)):
    print(caption, array.shape)

## Verify our dimensions match our expectations

In [None]:
#
# Dimensions of the input matrix: one-hot encoded hex string (e.g. "186a2")
#

# fixed length (in chars) of a hex string: one row per character, measured
# on the first encoded example
hex_n = len(all_hexes_encoded[0])

# number of symbols in the "alphabet" of a hex string
hex_k = len(hex_digits)

expected_input_dims = (hex_n, hex_k)
print("Training dimensions match:", x_train.shape[1:3] == expected_input_dims)

#
# Dimensions of the label matrix: one-hot encoded digits string (e.g. "100002")
#

# fixed length (in chars) of a digits string: one row per character, measured
# on the first encoded example
label_n = len(all_labels_encoded[0])

# number of symbols in the "alphabet" of a digits string
label_k = len(digits)

expected_label_dims = (label_n, label_k)
print("Label dimensions match:", y_train.shape[1:3] == expected_label_dims)

In [None]:
# Build and compile model
#
# Input : one (input_n, input_k) one-hot matrix per hex string.
# Output: one (n, k) matrix M per example, where M[i][j] is the probability
#         that the i-th character of the digit string is symbol j.

input_n, input_k = x_train.shape[1:]
n, k = y_train.shape[1:]


def softmaxAxis1(x):
    """Apply softmax along axis 1 of ``x``.

    Not used by the model below (which normalises over the last axis); kept
    so that any existing reference to this helper keeps working.
    """
    return keras.activations.softmax(x, axis=1)


model = models.Sequential()

# Flatten each one-hot matrix into a single feature vector for the dense layers.
model.add(layers.Flatten(input_shape=(input_n, input_k)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(32, activation='relu'))

# One raw score per (character position, symbol) pair, reshaped into an
# n by k matrix so each character position can be normalised on its own.
model.add(layers.Dense(n * k, activation=None))
model.add(layers.Reshape((n, k)))

# Softmax along the k (symbol) axis: every row of M sums to 1.
model.add(layers.Softmax(axis=-1))

# Each character position is a k-way classification, hence categorical
# (not binary) cross-entropy.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the model. The held-out validation split is passed in so that
# validation loss/accuracy are evaluated after each epoch and recorded in
# `history`. Epoch count and batch size are left at the Keras defaults.
history = model.fit(x_train,
                    y_train,
                    validation_data=(x_validation, y_validation))