In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, MaxPooling1D, Flatten
from keras.layers import LSTM,Convolution1D
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils.data_utils import get_file
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import random
import sys
import os
import glob

In [None]:
# Load the raw QRNG samples and compare their distribution with a Gaussian
# that has the same mean and standard deviation.
filename = './QRNG Raw/170517/170517_RW_E--ch0.dat'

# The file is read as big-endian signed 16-bit integers.
data = np.fromfile(filename, dtype='>i2')
data = data[2:]   # skip the first two values -- presumably a header; TODO confirm
data = data >> 3  # drop the three least-significant bits of every sample

mu, sigma = np.mean(data), np.std(data)
print('%s %s' % (mu, sigma))

# Histogram of the data, normalised so that it integrates to 1.
# `density=True` replaces the `normed` argument, which newer matplotlib rejects.
n, bins, patches = plt.hist(data, 1024, density=True, facecolor='green', alpha=0.75)

# Add a 'best fit' line: the normal pdf evaluated at the bin edges.
# Computed directly with numpy because `matplotlib.mlab.normpdf` no longer exists
# in current matplotlib releases.
gaussian_pdf = np.exp(-0.5 * ((bins - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
plt.plot(bins, gaussian_pdf, 'r--', linewidth=1)

plt.xlabel('Values')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ QRNG\ raw\ data:}\ $')
plt.grid(True)

plt.show()
    


In [None]:
# Split the sample stream into one training segment followed by five test
# segments, and turn every number into a string so it can be treated as a "word".
TRAIN_SIZE = 5000000   # samples used for training
TEST_SIZE = 1000000    # samples per test segment (the last segment takes the remainder)


def _as_words(samples):
    """Return the samples as a list of strings, one "word" per number."""
    # A real list (not a lazy `map` object) is required: later cells call
    # len() on it and slice it, which only works on Python 3 with a list.
    return [str(value) for value in samples]


# Training set
text = _as_words(data[:TRAIN_SIZE])

# Test sets: four fixed-size segments, then everything that is left.
test1 = _as_words(data[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE])
test2 = _as_words(data[TRAIN_SIZE + TEST_SIZE:TRAIN_SIZE + 2 * TEST_SIZE])
test3 = _as_words(data[TRAIN_SIZE + 2 * TEST_SIZE:TRAIN_SIZE + 3 * TEST_SIZE])
test4 = _as_words(data[TRAIN_SIZE + 3 * TEST_SIZE:TRAIN_SIZE + 4 * TEST_SIZE])
test5 = _as_words(data[TRAIN_SIZE + 4 * TEST_SIZE:])

print('%d %d %d' % (len(text), len(test1), len(test5)))
print(text[:10])


In [None]:
# Treating each number as a "word". Creating a dictionary.
# The distinct values are found on the integer array first (np.unique), and only
# those few values are converted to strings; the strings are then sorted
# lexicographically, which fixes the index assigned to every word.
# `data` itself is neither overwritten nor deleted, so this cell and the
# train/test split cell can be re-run in any order.
chars = sorted(str(value) for value in np.unique(data))
print(chars)
print('Total words: %d' % len(chars))
# word -> one-hot index, and the inverse mapping
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [None]:
# Length of input. Treating each input that consists of 100 "words" as a "sentence".
maxlen = 100
# Distance between 2 consecutive "sentences"
step = 13

# Every window start; the word right after each window is its prediction target.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start:start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('Number of sentences: %d' % len(sentences))


print('Vectorization...')
# Translate the whole training text to vocabulary indices once, so the one-hot
# fill below needs a single array assignment per sentence instead of one
# dictionary lookup per word.
encoded = np.array([char_indices[word] for word in text], dtype=np.int64)

# One-hot inputs (sentence, position, word) and one-hot targets (sentence, word).
# The builtin `bool` is used as dtype; the `np.bool` alias is gone from recent NumPy.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
positions = np.arange(maxlen)
for row, start in enumerate(window_starts):
    X[row, positions, encoded[start:start + maxlen]] = True
    y[row, encoded[start + maxlen]] = True
print('Done vectorization!')

In [None]:
# Build the model: two Conv1D + max-pooling stages followed by a single LSTM
# and a softmax layer over the vocabulary.
print('Build model...')
model = Sequential()
model.add(Convolution1D(filters=64, kernel_size=9, padding='same', activation='relu', input_shape=(maxlen, len(chars))))
model.add(MaxPooling1D(pool_size=2))
model.add(Convolution1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
# One output unit per "word"; softmax turns them into next-word probabilities.
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.0005)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print.
model.summary()
# Stop once val_loss has not improved for 4 epochs, and keep only the weights of
# the epoch with the best val_loss on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1)
monitoring = ModelCheckpoint('weights_E1_ch0_ver5.hdf5', monitor='val_loss', verbose=1, save_best_only=True)


In [None]:
# Train for at most 50 epochs, holding out 20% of the samples for validation.
# `epochs` is the Keras 2 argument name, matching the Keras 2 layer arguments
# (filters / kernel_size / padding) used when the model was built.
model.fit(X, y, epochs=50, batch_size=128, validation_split=0.2, verbose=1, callbacks=[early_stopping, monitoring])
# Restore the checkpoint written by ModelCheckpoint (best val_loss), which is not
# necessarily the state the model is in after the final epoch.
model.load_weights('weights_E1_ch0_ver5.hdf5')

In [None]:
# Evaluate the trained model on every test segment: for each window of `maxlen`
# consecutive "words" predict the next word, then report the mean squared error
# between predicted and true values and the fraction of exact hits.
# The test lists are left in place so that this cell can be re-run.
tests = [test1, test2, test3, test4, test5]

maxlen = 100        # window length; has to equal the input length the model was built with
step = 1            # evaluate every possible window
batch_size = 1000   # windows that are one-hot encoded and predicted per call

# Numeric value behind every vocabulary index (needed for the MSE).
index_values = np.array([int(indices_char[k]) for k in range(len(indices_char))],
                        dtype=np.int64)
offsets = np.arange(maxlen)

for test in tests:
    # Translate the segment to vocabulary indices once.
    encoded = np.array([char_indices[word] for word in test], dtype=np.int64)
    starts = np.arange(0, len(test) - maxlen, step)
    n_samples = len(starts)
    if n_samples == 0:
        # Nothing to predict; also avoids dividing by zero in the accuracy below.
        print("Skipping test segment with %d samples (need more than %d)" % (len(test), maxlen))
        continue

    true_idx = encoded[starts + maxlen]             # index of the word following each window
    pred_idx = np.empty(n_samples, dtype=np.int64)  # filled batch by batch
    n_true = 0

    # Ceiling division: the last, possibly shorter, batch is evaluated as well,
    # so the hit count and the sample count used for the accuracy agree.
    nb_batch = (n_samples + batch_size - 1) // batch_size

    for i in range(nb_batch):
        if i % 100 == 0:
            print("Processed %d, %d" % (i * batch_size, n_true))
        lo = i * batch_size
        hi = min(lo + batch_size, n_samples)

        # One-hot encode only the current batch, so the full
        # (windows, maxlen, vocabulary) tensor never has to be held in memory.
        windows = encoded[starts[lo:hi][:, None] + offsets]
        x = np.zeros((hi - lo, maxlen, len(chars)), dtype=bool)
        x[np.arange(hi - lo)[:, None], offsets, windows] = True

        preds = model.predict(x, verbose=0)
        pred_idx[lo:hi] = np.argmax(preds, axis=1)
        n_true += int(np.sum(pred_idx[lo:hi] == true_idx[lo:hi]))

    # Back from vocabulary indices to the numeric sample values.
    y_true = index_values[true_idx]
    y_pred = index_values[pred_idx]
    mse = np.mean(np.square(y_true - y_pred))
    print('mse %s' % mse)
    # hits _ evaluated windows _ accuracy
    print("%d_%d_%.5f" % (n_true, n_samples, float(n_true) / n_samples))