# LSTM Ciphertext Decryption Breakdown/Demonstration

### Relevant Imports

In [2]:
import tensorflow_text as tf_text
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from model_util.ciphers import *

### Preparing Data

In [5]:
def cipher(words, key = None):
    """Encipher every word in `words` with `railfence`.

    Parameters
    ----------
    words : array-like of str
        Words to encipher; `railfence` is applied to each element.
    key : int, optional
        Numeric key forwarded to `railfence` as its second argument. When
        omitted, the key is requested interactively (the original behaviour),
        so existing calls keep working. Passing it explicitly lets the
        notebook run unattended and reproducibly.

    Returns
    -------
    numpy.ndarray of str
        The enciphered words, same shape as `words`.

    Raises
    ------
    ValueError
        If the supplied or typed key cannot be converted to an int.
    """
    if key is None:
        key = input('What numeric key would you like to use for railfence ciphering? ')
    key = int(key)
    # otypes is given explicitly so that an empty `words` array is handled
    # (np.vectorize cannot infer the output type from zero elements) and the
    # first word is not enciphered an extra time just to probe the dtype.
    func = np.vectorize(railfence, otypes = [str])
    return func(words, key)

def one_hot_encoding(word, uniques):
    """One-hot encode a 1-D token array against the vocabulary `uniques`.

    Row i of the result has a 1.0 in every column j where
    `word[i] == uniques[j]` and 0.0 elsewhere, so a token that is absent
    from `uniques` yields an all-zero row. The result is a float64 array of
    shape (len(word), len(uniques)).
    """
    # Compare each token (as a column vector) with the whole vocabulary at once.
    matches = word[:, np.newaxis] == uniques
    return matches.astype(np.float64)

In [6]:
# Load words
# NOTE(review): the slice is applied before the shuffle below, so only the
# first 150000 entries of the file can ever be sampled. If the file is sorted
# alphabetically the dataset is biased toward early letters -- confirm this
# is intended.
words = np.loadtxt('words_alpha.txt', dtype = str)[:150000]

M = words.shape[0]
MAX_WORD_LENGTH = len(max(words, key = len))

# Shuffle word list (fixed seed so the permutation is repeatable)
np.random.seed(100)
idx = np.random.permutation(M)
words = words[idx]
# Encipher words
words_enc = cipher(words)

# Tokenize words for RNN input
tokenizer = tf_text.UnicodeCharTokenizer()
X_tokens = tokenizer.tokenize(words_enc).to_list()
y_tokens = tokenizer.tokenize(words).to_list()

# Pad tokens so inputs are all the same size
X_pad = pad_sequences(X_tokens, maxlen = MAX_WORD_LENGTH, padding = 'post', truncating = 'post')
y_pad = pad_sequences(y_tokens, maxlen = MAX_WORD_LENGTH, padding = 'post', truncating = 'post')

# Vocabulary is taken from the plaintext tokens (includes the padding value)
uniques = np.unique(y_pad)
NUM_UNIQUES = uniques.shape[0]

# Build dataset and labels
# Comparing every padded token with `uniques` in one broadcast produces the
# same one-hot values as encoding word by word, without a Python loop over
# all M words. float32 halves the memory of the two (M, MAX_WORD_LENGTH,
# NUM_UNIQUES) tensors compared with NumPy's default float64.
X = (X_pad[:, :, np.newaxis] == uniques).astype(np.float32)
y = (y_pad[:, :, np.newaxis] == uniques).astype(np.float32)

# Construct test and training sets (first 80% train, remaining 20% test)
split = int(M * 0.8)
train_x, test_x = X[:split], X[split:]
train_y, test_y = y[:split], y[split:]

In [7]:
# Summary of the prepared data: corpus size and a few raw/enciphered samples
print(f'Number of words: {M}')
print(f'Max word length: {MAX_WORD_LENGTH}')
print(f'Some words: {words[:3]}')
print(f'Some encryptions: {words_enc[:3]}', end = '\n\n')
# Padded code-point tokens of the plaintext words (0 is the padding value)
print(f'Some tokens:\n{y_pad[:3]}', end = '\n\n')
print(f'Number of unique tokens: {NUM_UNIQUES}')
print(f'All unique tokens:\n{uniques}', end = '\n\n')
# One-hot tensor shape and the first three character rows of the first word
print(f'Shape of one-hot encoding (by character): {X.shape}')
print(f'Example:\n{X[0][:3]}')

Number of words: 150000
Max word length: 28
Some words: ['imsonic' 'arminianism' 'duckers']
Some encryptions: ['imscoin' 'airnsmamiin' 'ducskre']

Some tokens:
[[105 109 115 111 110 105  99   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [ 97 114 109 105 110 105  97 110 105 115 109   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [100 117  99 107 101 114 115   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]

Number of unique tokens: 27
All unique tokens:
[  0  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 118 119 120 121 122]

Shape of one-hot encoding (by character): (150000, 28, 27)
Example:
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0.]]


### Build Model: Bidirectional LSTM

In [15]:
def build_model(X, y, learning_rate = 0.01, epochs = 3, validation_split = 0.3, units = 128, batch_size = 36):
    """Build, compile and fit a two-layer bidirectional LSTM character model.

    The network reads the module-level constants MAX_WORD_LENGTH and
    NUM_UNIQUES for its input shape and output width, so those must be
    defined before this function is called.

    Parameters
    ----------
    X, y : training inputs and one-hot targets passed straight to `model.fit`.
    learning_rate : learning rate given to the Adam optimizer.
    epochs : number of training epochs.
    validation_split : fraction of the data `fit` holds out for validation.
    units : width of the first LSTM layer (the second layer is fixed at 32).
    batch_size : mini-batch size used by `fit`.

    Returns
    -------
    (model, history) : the fitted model and the object returned by `model.fit`.
    """
    model = Sequential([
        Bidirectional(LSTM(units, return_sequences = True), input_shape = (MAX_WORD_LENGTH, NUM_UNIQUES)),
        Bidirectional(LSTM(32, return_sequences = True)),
        # Per-timestep distribution over the token vocabulary
        Dense(NUM_UNIQUES, activation = 'softmax')
    ])

    model.compile(optimizer = Adam(learning_rate = learning_rate), loss = 'categorical_crossentropy', metrics = ['accuracy'])
    # `use_multiprocessing` is deliberately not passed: it only applies to
    # generator / Sequence inputs (it has no effect on in-memory arrays) and
    # newer Keras releases reject it as an unexpected keyword in `fit`.
    history = model.fit(X, y, epochs = epochs, validation_split = validation_split, batch_size = batch_size)
    return model, history

def decode_preds(preds, uniques):
    """Convert per-character score arrays back into words.

    Parameters
    ----------
    preds : array of shape (num_words, word_length, num_tokens)
        Scores (or one-hot rows) for each character position; the highest
        scoring token along the last axis is selected.
    uniques : 1-D integer array
        Code point for each token index; 0 is treated as padding.

    Returns
    -------
    numpy.ndarray of str, shape (num_words,)
        Decoded words with trailing padding removed. An empty batch yields
        an empty array instead of raising.
    """
    preds = np.asarray(preds)
    uniques = np.asarray(uniques)
    # Token index per position -> code point per position
    codes = uniques[preds.argmax(axis = 2)]
    # Trailing chr(0) padding is stripped explicitly rather than relying on
    # NumPy's string dtype dropping trailing NULs; interior NULs are kept.
    words = [''.join(map(chr, row)).rstrip('\x00') for row in codes.tolist()]

    return np.array(words, dtype = str)

In [16]:
# Fit the network on the training split; 20% of it is held out for validation.
model, history = build_model(
    train_x,
    train_y,
    learning_rate = 0.001,
    epochs = 5,
    validation_split = 0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# Score the held-out test split with the fitted model
predictions = model.predict(test_x)



In [21]:
# Decode model output and the one-hot test labels back into strings
preds = decode_preds(predictions, uniques)
true = decode_preds(test_y, uniques)
# A word only counts as correct when the whole decoded string matches
acc = np.mean(preds == true)
print(f'The word-by-word accuracy of the model in deciphering the given cipher is {(acc * 100):.2f}%')

The word-by-word accuracy of the model in deciphering the given cipher is 89.19%


In [19]:
# First decoded prediction for the test split
preds[0]

'epiphyllous'

In [20]:
# Ground-truth word decoded from the first test label
true[0]

'epiphyllous'