In [56]:
from __future__ import print_function
import sys
import os
import numpy as np
import random

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [57]:
# Load the raw bit data as a list of lower-cased lines.
path = './data/bitcount.txt'
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
with open(path) as bit_file:
    text = bit_file.read().lower().splitlines()
# Drop the final entry of the line list (the original note says the trailing
# piece of the file is just a space -- TODO confirm against the data file).
text = text[:-1]

In [58]:
# Flatten the list of lines into one continuous string of bits.
text = ''.join(text)
# Hold out the 20000 characters that precede the final character as the test
# set; everything before them is training data. The very last character ends
# up in neither split.
text, test = text[:-20001], text[-20001:-1]
print('Training length:', len(text))
print('Test length', len(test))

Training length: 334824
Test length 20000


In [59]:
def chunkstring(string, length):
    '''
    Split "string" into consecutive pieces of "length" items each.

    The pieces are returned in order as a list. The last piece is shorter
    when len(string) is not a multiple of "length", and an empty "string"
    gives an empty list.
    '''
    pieces = []
    for start in range(0, len(string), length):
        pieces.append(string[start:start + length])
    return pieces

In [60]:
# Split the training string into pieces of "chunk_len" characters;
# with chunk_len = 1 every piece is a single bit.
chunk_len = 1
chunk = chunkstring(string=text, length=chunk_len)

In [62]:
# Every distinct chunk (here: a single bit) is treated as one "word".
chars = sorted(set(chunk))
print(chars)
print('Total words:', len(chars))

# Lookup tables in both directions: word -> index and index -> word.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

['0', '1']
Total words: 2


In [63]:
# Build the training set. Each sample is a window of "maxlen" consecutive
# chunks (bits) and its label is the chunk that immediately follows the window.
# "step" is the stride between window starts, so step < maxlen makes
# neighbouring samples overlap.
maxlen = 32
step = 3
sentences = []
next_chars = []
# Floor division keeps the range bound an int: plain "/" produces a float on
# Python 3 (or under "from __future__ import division"), which range() rejects.
for i in range(0, len(text) // chunk_len - maxlen, step):
    sentences.append(text[i*chunk_len: (i + maxlen)*chunk_len])
    next_chars.append(text[(i + maxlen)*chunk_len:(i + maxlen + 1)*chunk_len])

# One-hot encode windows and labels. The builtin bool is used as the dtype
# because the np.bool alias was deprecated in NumPy 1.20 and later removed.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(chunkstring(sentence, chunk_len)):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

print ('Training set', X.shape, y.shape)

Training set (111598, 32, 2) (111598, 2)


In [79]:
# Activation passed to both LSTM layers.
LSTM_activation = 'relu'

# Network: LSTM(256, returning the full sequence) -> Dropout(0.5) ->
# LSTM(128, returning only the last output) -> Dense(128, sigmoid) ->
# Dense(len(chars)) -> softmax over the possible next chunks.
model = Sequential()
model.add(LSTM(256, return_sequences=True, activation=LSTM_activation,
               input_shape=(maxlen, len(chars))))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=False, activation=LSTM_activation))
model.add(Dense(128, activation='sigmoid'))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.0005)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_23 (LSTM)               (None, 32, 256)           265216    
_________________________________________________________________
dropout_12 (Dropout)         (None, 32, 256)           0         
_________________________________________________________________
lstm_24 (LSTM)               (None, 128)               197120    
_________________________________________________________________
dense_21 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_22 (Dense)             (None, 2)                 258       
_________________________________________________________________
activation_11 (Activation)   (None, 2)                 0         
Total params: 479,106
Trainable params: 479,106
Non-trainable params: 0
_________________________________________________________________
None

In [80]:
# Both callbacks watch the validation loss: one stops training early
# (patience of 4), the other writes the best weights seen so far to disk.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    verbose=1
)
monitoring = ModelCheckpoint(
    'weights_LSTM_v1.hdf5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True
)
# The last 20% of the samples are held out for validation.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping, monitoring]
)

Train on 89278 samples, validate on 22320 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 00006: early stopping


<keras.callbacks.History at 0x7ff5bdbd3e50>

In [81]:
# Build the test set the same way as the training set: windows of "maxlen"
# chunks with the following chunk as label. A stride of 1 means every
# possible window of the held-out string is evaluated.
step = 1
sentences = []
next_chars = []
# Floor division keeps the range bound an int: plain "/" produces a float on
# Python 3 (or under "from __future__ import division"), which range() rejects.
for i in range(0, len(test) // chunk_len - maxlen, step):
    sentences.append(test[i*chunk_len: (i + maxlen)*chunk_len])
    next_chars.append(test[(i + maxlen)*chunk_len:(i + maxlen + 1)*chunk_len])

# One-hot encode windows and labels. The builtin bool is used as the dtype
# because the np.bool alias was deprecated in NumPy 1.20 and later removed.
Xt = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
yt = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    # Split with chunk_len (not a hard-coded 1) so the encoding stays in step
    # with the chunk_len-based slicing above.
    for t, char in enumerate(chunkstring(sentence, chunk_len)):
        Xt[i, t, char_indices[char]] = 1
    yt[i, char_indices[next_chars[i]]] = 1

print ('Test set', Xt.shape, yt.shape)

Test set (19968, 32, 2) (19968, 2)


In [82]:
# Evaluate next-bit prediction on the test set, one batch at a time.
n_true = 0   # running count of correct predictions
y_true = []  # true next chunks, in test-set order
y_pred = []  # predicted next chunks, in test-set order

batch_size = 1000
# Ceiling division: the result stays an int on Python 3 as well, and no empty
# extra batch is produced when the test size is an exact multiple of
# batch_size.
nb_batch = (Xt.shape[0] + batch_size - 1) // batch_size

for i in range(nb_batch):
    if i % 2 == 0:
        print ("Predicted %d next bits, %d correct" % (i*batch_size,n_true))
    x = Xt[i*batch_size:(i+1)*batch_size]
    preds = model.predict(x, verbose=0)
    # The most probable class index per sample, mapped back to its chunk.
    pred_next_indexes = list(np.argmax(preds,axis=1))
    pred_next_chars = [indices_char[next_index] for next_index in pred_next_indexes]
    y_pred += pred_next_chars

    true_next_indexes = list(np.argmax(yt[i*batch_size:(i+1)*batch_size],axis=1))
    true_next_chars = [indices_char[next_index] for next_index in true_next_indexes]
    y_true += true_next_chars

    n_true += np.sum(np.array(pred_next_chars)==np.array(true_next_chars))

# Convert the collected '0'/'1' strings to integer arrays. List comprehensions
# are used because map() returns a lazy iterator on Python 3, which
# np.asarray would wrap as a 0-d object array.
y_true = np.asarray([int(bit) for bit in y_true])
y_pred = np.asarray([int(bit) for bit in y_pred])
print ("Predicted %d next bits in total, %d correct. Accuracy is %.5f."
       % (yt.shape[0],n_true,(float(n_true)/yt.shape[0])))

Predicted 0 next bits, 0 correct
Predicted 2000 next bits, 1006 correct
Predicted 4000 next bits, 1994 correct
Predicted 6000 next bits, 2991 correct
Predicted 8000 next bits, 3951 correct
Predicted 10000 next bits, 4933 correct
Predicted 12000 next bits, 5971 correct
Predicted 14000 next bits, 6982 correct
Predicted 16000 next bits, 7996 correct
Predicted 18000 next bits, 9029 correct
Predicted 19968 next bits in total, 10012 correct. Accuracy is 0.50140.
