In [1]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys


Using Theano backend.


In [2]:
# Load the corpus as one lowercase string. The context manager closes the
# file handle deterministically instead of leaving it open until the object
# happens to be garbage collected.
path = 'RNU_random.txt'
with open(path) as corpus_file:
    text = corpus_file.read().lower()
text = text.strip('\n')

# Hold out the first 20000 characters for evaluation and train on the rest.
# The split offset is even, so splitting either part into 2-character chunks
# yields the same chunk boundaries as splitting the full text would.
test = text[:20000]
text = text[20000:]
print('length:', len(text))

length: 634798


In [3]:
def chunkstring(string, length):
    """Split ``string`` into consecutive pieces of ``length`` characters.

    Pieces are taken left to right without overlap. The last piece is
    shorter than ``length`` when ``len(string)`` is not a multiple of it,
    and an empty ``string`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(string), length):
        pieces.append(string[start:start + length])
    return pieces

In [4]:
# Width, in characters, of one token; the training text is split into
# consecutive tokens of this width.
chunk_len = 2
chunk = chunkstring(string=text, length=chunk_len)

In [5]:
# Vocabulary: every distinct chunk seen in the training text, in sorted order.
hexs = sorted(set(chunk))
print(hexs)

print('total number of hex:', len(hexs))

# Lookup tables between a chunk and its position in the sorted vocabulary.
hex_indices = {token: position for position, token in enumerate(hexs)}
indices_hex = {position: token for position, token in enumerate(hexs)}

['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '0a', '0b', '0c', '0d', '0e', '0f', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '1a', '1b', '1c', '1d', '1e', '1f', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '2a', '2b', '2c', '2d', '2e', '2f', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '3a', '3b', '3c', '3d', '3e', '3f', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '4a', '4b', '4c', '4d', '4e', '4f', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '5a', '5b', '5c', '5d', '5e', '5f', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '6a', '6b', '6c', '6d', '6e', '6f', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '7a', '7b', '7c', '7d', '7e', '7f', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '8a', '8b', '8c', '8d', '8e', '8f', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '9a', '9b', '9c', '9d', '9e', '9f', 'a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6

In [6]:
# Cut the text into semi-redundant windows: each input is maxlen consecutive
# chunks and its target is the single chunk that follows. Windows start every
# `step` chunks, so neighbouring windows overlap.
maxlen = 11
step = 3
sequences = []
next_hex = []
for i in range(0, len(chunk) - maxlen, step):
    window_start = i * chunk_len
    window_end = (i + maxlen) * chunk_len
    sequences.append(text[window_start:window_end])
    next_hex.append(text[window_end:window_end + chunk_len])
print('number of sequences:', len(sequences))


print('Vectorization...')
# One-hot encode the inputs (X) and targets (y). The builtin `bool` is used as
# the dtype because the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in 1.24; both spell the same boolean dtype.
X = np.zeros((len(sequences), maxlen, len(hexs)), dtype=bool)
y = np.zeros((len(sequences), len(hexs)), dtype=bool)
for i, sequence in enumerate(sequences):
    for t, char in enumerate(chunkstring(sequence, chunk_len)):
        X[i, t, hex_indices[char]] = 1
    y[i, hex_indices[next_hex[i]]] = 1

number of sequences: 105796
Vectorization...


In [7]:
# Build the model: one LSTM layer that reads a window of one-hot chunks,
# followed by a dense layer and a softmax over the vocabulary.
print('Build model...')
model = Sequential([
    LSTM(256, return_sequences=False, input_shape=(maxlen, len(hexs))),
    Dense(len(hexs)),
    Activation('softmax'),
])

optimizer = RMSprop(lr=0.01)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)


def sample(preds, temperature=1.0):
    """Draw one index from ``preds`` after temperature rescaling.

    ``preds`` is a 1-D sequence of non-negative weights (it is renormalised
    here, so it does not have to sum to one). Each weight is raised to the
    power ``1 / temperature`` and the result is normalised into a probability
    vector from which a single index is drawn with ``np.random.multinomial``.

    ``temperature`` below 1 sharpens the distribution and above 1 flattens
    it. ``temperature == 0`` is treated as the greedy limit and returns the
    index of the largest weight without drawing a random number.

    Returns the sampled index as a NumPy integer.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the rescaled distribution as temperature -> 0+; dividing by
        # zero below would otherwise produce NaNs.
        return np.argmax(preds)
    # Zero weights legitimately map to -inf (probability zero after exp), so
    # the divide-by-zero warning from log(0) is silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. The normalised result is mathematically
    # unchanged, but exp() can no longer underflow to zero for every entry,
    # which used to yield 0/0 = NaN at low temperatures.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

Build model...


In [8]:
# Train the model: one outer iteration, each running a single epoch over the
# vectorized training windows. Add more values to iterate to train longer.
for iteration in (1,):
    print('Iteration', iteration)
    model.fit(
        X,
        y,
        batch_size=128,
        nb_epoch=1,
        verbose=1,
    )



Iteration 1
Epoch 1/1


In [9]:
# Switch to the held-out test text. NOTE: this rebinds `text` and `chunk`, so
# re-running the earlier chunking/vocabulary cells after this one would
# operate on the test text instead of the training text.
text = test
print('corpus length:', len(text))
# Use chunk_len rather than a literal 2 so the test tokens always have the
# same width as the tokens the vocabulary was built from.
chunk = chunkstring(text, chunk_len)


corpus length: 20000


In [10]:

# Build the test windows the same way as the training windows, but with a
# stride of one chunk so every position in the test text is evaluated.
step = 1
sequences = []
next_hex = []
for i in range(0, len(chunk) - maxlen, step):
    # chunk_len replaces the literal 2 so the slicing stays consistent with
    # the chunk width used to build the vocabulary.
    window_start = i * chunk_len
    window_end = (i + maxlen) * chunk_len
    sequences.append(text[window_start:window_end])
    next_hex.append(text[window_end:window_end + chunk_len])
print('sequences:', len(sequences))


print('Vectorization...')
# One-hot encode the inputs (X) and targets (y). This rebinds X and y, which
# held the training data until now. The builtin `bool` is used as the dtype
# because the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
X = np.zeros((len(sequences), maxlen, len(hexs)), dtype=bool)
y = np.zeros((len(sequences), len(hexs)), dtype=bool)
for i, sequence in enumerate(sequences):
    # A chunk that never occurred in the training text has no entry in
    # hex_indices and raises KeyError here.
    for t, char in enumerate(chunkstring(sequence, chunk_len)):
        X[i, t, hex_indices[char]] = 1
    y[i, hex_indices[next_hex[i]]] = 1

sequences: 9989
Vectorization...


In [24]:
# Count how often a token sampled from the model's predicted distribution
# equals the true next token of each test window. The prediction is sampled
# (not arg-maxed), so the count varies between runs unless NumPy's global RNG
# is seeded beforehand.
n_true = 0
diversity = 1
# Predict every window in one batched call; calling predict() once per window
# pays the per-call overhead thousands of times for the same result.
all_preds = model.predict(X, verbose=0)
for i, preds in enumerate(all_preds):
    if i % 500 == 0:
        print ("Processed %d %d" % (i,n_true))

    next_index = sample(preds, diversity)
    # Compare vocabulary indices directly. indices_hex is one-to-one, so this
    # is equivalent to comparing the mapped strings, and it no longer rebinds
    # the `next_hex` list of targets to a single string.
    if next_index == np.argmax(y[i]):
        n_true += 1
print ("%d %.2f%%" % (n_true,100.0*n_true/len(sequences)))

Processed 0 0
Processed 500 1
Processed 1000 1
Processed 1500 4
Processed 2000 7
Processed 2500 8
Processed 3000 11
Processed 3500 13
Processed 4000 13
Processed 4500 14
Processed 5000 16
Processed 5500 19
Processed 6000 19
Processed 6500 21
Processed 7000 22
Processed 7500 22
Processed 8000 25
Processed 8500 28
Processed 9000 29
Processed 9500 32
32 0.32%
