In [None]:
import json
import codecs
import glob
import numpy as np
import os

from nlppln.utils import remove_ext

data_files = glob.glob('/home/jvdzwaan/data/ocr/*[0-9].json')

print 'Number of texts', len(data_files)

np.random.seed(4)
np.random.shuffle(data_files)

n = len(data_files) / 10 / 2
print 'Number of texts put in validation and test set', n
#n = 1

validation_texts = data_files[0:n]
test_texts = data_files[n:n+n]
train_texts = data_files[n+n:]

# save to json file, so it can be reused on DAS5
division = {'train': [remove_ext(os.path.basename(t)) for t in train_texts], 
            'val': [remove_ext(os.path.basename(t)) for t in validation_texts],
            'test': [remove_ext(os.path.basename(t)) for t in test_texts]}
with codecs.open('/home/jvdzwaan/data/ocr/datadivision2.json', 'wb', encoding='utf-8') as f:
    json.dump(division, f, indent=4)

In [None]:
with codecs.open('/home/jvdzwaan/data/ocr/datadivision.json', encoding='utf-8') as f:
    division = json.load(f)
print division

In [None]:
import json
import codecs
import glob
import numpy as np
import os

def read_texts(data_files, data_dir):
    """Read aligned OCR / gold standard texts from JSON files.

    Every file must hold a JSON object with the keys 'ocr' and 'gs', each
    containing a list of strings.

    Args:
        data_files: iterable of file names, relative to data_dir. A name that
            does not exist as given is retried with '.json' appended, so that
            both full file names and extension-less names are accepted.
        data_dir: directory that contains the files.

    Returns:
        A tuple (raw_text, gs, ocr): raw_text is a single string consisting of
        the joined 'ocr' and 'gs' strings of all files, separated by spaces;
        gs and ocr are lists with one entry (the list read from the file) per
        input file, in input order.
    """
    raw_text = []
    gs = []
    ocr = []

    for df in data_files:
        path = os.path.join(data_dir, df)
        # Fall back to '<name>.json' when only the extension-less name is known.
        if not os.path.exists(path) and os.path.exists(path + '.json'):
            path += '.json'

        with codecs.open(path, encoding='utf-8') as f:
            aligned = json.load(f)

        ocr.append(aligned['ocr'])
        gs.append(aligned['gs'])

        raw_text.append(''.join(aligned['ocr']))
        raw_text.append(''.join(aligned['gs']))
    return ' '.join(raw_text), gs, ocr
    
seq_length = 25

raw_val, gs_val, ocr_val = read_texts(division['val'], '/home/jvdzwaan/data/ocr')
raw_test, gs_test, ocr_test = read_texts(division['test'], '/home/jvdzwaan/data/ocr')
raw_train, gs_train, ocr_train = read_texts(division['train'], '/home/jvdzwaan/data/ocr')

raw_text = ''.join([raw_val, raw_test, raw_train])

chars = sorted(list(set(raw_text)))
chars.append(u'\n')
char_to_int = dict((c, i) for i, c in enumerate(chars))    

n_chars = len(raw_text)
n_vocab = len(chars)
print "Total Characters: ", n_chars
print "Total Vocab: ", n_vocab

In [None]:
# Display the vocabulary: the sorted distinct characters plus the appended newline.
chars

In [None]:
def _encode_padded(sequences, seq_length, mapping, vocab_size, pad_char=u'\n'):
    """One-hot encode strings into a (len(sequences), seq_length, vocab_size) bool array.

    Positions after the end of a string are filled with the one-hot vector of
    pad_char, so every time step has exactly one active class.
    """
    encoded = np.zeros((len(sequences), seq_length, vocab_size), dtype=bool)
    for i, sequence in enumerate(sequences):
        for j, c in enumerate(sequence):
            encoded[i, j, mapping[c]] = 1
        # Only look up the padding character when padding is actually needed.
        if len(sequence) < seq_length:
            encoded[i, len(sequence):, mapping[pad_char]] = 1
    return encoded


def create_data(ocr_texts, gs_texts, seq_length=25, mapping=None, vocab_size=None):
    """Create padded one-hot encoded data sets from text.

    A sample consists of seq_length characters from texts from ocr_texts
    (includes empty characters) (input), and seq_length characters from
    gs_texts (includes empty characters) (output).
    ocr_texts and gs_texts contain aligned arrays of characters.
    Because of the empty characters ('' in the character arrays), the input
    and output sequences may not have equal length. Therefore input and
    output are padded with a padding character (newline).

    Args:
        ocr_texts: list of character arrays (one per text) with the OCR text.
        gs_texts: list of character arrays (one per text) with the gold
            standard text, aligned with ocr_texts.
        seq_length: number of aligned positions per sample (window size). A
            sliding window with step 1 is moved over every text; texts shorter
            than seq_length yield no samples.
        mapping: dict from character to class index; must contain the newline
            if any sample needs padding. Defaults to the notebook-level
            char_to_int.
        vocab_size: size of the one-hot dimension. Defaults to the
            notebook-level n_vocab when mapping is not given, otherwise to the
            largest index in mapping plus one.

    Returns:
        A tuple (X, Y) of boolean arrays of shape
        (number of samples, seq_length, vocab_size).

    Raises:
        KeyError: if a character is missing from the mapping.
    """
    if mapping is None:
        mapping = char_to_int
        if vocab_size is None:
            vocab_size = n_vocab
    elif vocab_size is None:
        vocab_size = max(mapping.values()) + 1

    dataX = []
    dataY = []
    for ocr, gs in zip(ocr_texts, gs_texts):
        for i in range(0, len(ocr) - seq_length + 1):
            # Joining drops the '' alignment placeholders, so the strings can
            # be shorter than seq_length; they are padded during encoding.
            dataX.append(''.join(ocr[i:i + seq_length]))
            dataY.append(''.join(gs[i:i + seq_length]))

    # The builtin bool is used as dtype: the np.bool alias has been removed
    # from recent NumPy releases.
    X = _encode_padded(dataX, seq_length, mapping, vocab_size)
    Y = _encode_padded(dataY, seq_length, mapping, vocab_size)
    return X, Y

In [None]:
def check_data(data):
    """Check that every time step of a one-hot encoded data set has exactly one active class.

    Args:
        data: array whose third axis (axis=2) is the one-hot class dimension,
            e.g. of shape (samples, seq_length, n_vocab).

    Returns:
        True if the sum over the class axis equals 1 for every sample and time
        step, False otherwise.
    """
    # Sum over the class axis of the array that was passed in (not of a
    # notebook-level variable), so each data set is checked on its own.
    res = data.sum(axis=2)

    return (res == 1).all()

In [None]:
xTrain, yTrain = create_data(ocr_train, gs_train)
xTest, yTest = create_data(ocr_test, gs_test)
xVal, yVal = create_data(ocr_val, gs_val)

print 'Train data OK?:', check_data(xTrain), check_data(yTrain)
print 'Test data OK?:', check_data(xTest), check_data(yTest)
print 'Val data OK?:', check_data(xVal), check_data(yVal)

n_patterns = len(xTrain)
print "Total Patterns: ", n_patterns
print 'val + test + train', len(xTrain) + len(xVal) + len(xTest)

In [None]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

LAYERS = 1  # not used by the active code in this cell
NODES = 256

# Two stacked LSTM layers that return the full sequence, each followed by
# dropout. The input is a sequence of seq_length one-hot vectors over the
# vocabulary. (For variable length input sequences, use
# input_shape=(None, nb_feature).)
# For each step of the output sequence, the time-distributed softmax layer
# decides which character should be chosen.
model = Sequential([
    LSTM(NODES, input_shape=(seq_length, len(chars)), return_sequences=True),
    Dropout(0.5),
    LSTM(NODES, return_sequences=True),
    Dropout(0.5),
    TimeDistributed(Dense(len(chars), activation='softmax')),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Checkpoint callback: the epoch number and the loss are filled into the file
# name. The monitored quantity is 'loss' (not the validation loss) and, with
# save_best_only and mode 'min', lower values count as better.
filepath = "/home/jvdzwaan/data/tmp/dncvu-ad-aligned/padded-256-seed4-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min')
callbacks_list = [checkpoint]

In [None]:
BATCH_SIZE = 100

# Train on the training split for 50 epochs, evaluating on the validation
# split and running the checkpoint callback.
model.fit(
    xTrain,
    yTrain,
    batch_size=BATCH_SIZE,
    epochs=50,
    validation_data=(xVal, yVal),
    callbacks=callbacks_list)

In [None]:
# Restore the weights of a stored checkpoint and compile the model again.
# NOTE(review): this file name lacks the 'seed4' part used in the checkpoint
# pattern of this notebook -- confirm it is the intended checkpoint.
filename = "/home/jvdzwaan/data/tmp/dncvu-ad-aligned/padded-256-02-0.3721.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
int_to_char = dict((i, c) for i, c in enumerate(chars))
print int_to_char

In [None]:
for vector in yVal[0:1,:,:]:
    indices = np.where(vector==True)[1]
    for i in indices:
        print int_to_char[i]

In [None]:
predicted = model.predict(xVal)
print predicted.shape
#for i in range(len(xVal)):   
#    r=model.predict(xVal[0:1,:,:])
#    for vector in r:
#        for p in vector:
#            i = np.random.choice(n_vocab, p=p)
#            print int_to_char[i]

In [None]:
match = 0
no_match = 0
in_is_out = 0
for i, sequence in enumerate(predicted):
    predicted_indices = [np.random.choice(n_vocab, p=p) for p in sequence]
    indices = np.where(yVal[i:i+1,:,:]==True)[2]
    if predicted_indices != list(indices):
        no_match += 1
        pred_str = u''.join([int_to_char[j] for j in predicted_indices])
        pred_str = pred_str.replace(u'\n', u'')
        
        gs = u''.join([int_to_char[j] for j in indices])
        gs = gs.replace(u'\n', u'')
        #print pred_str
        #print gs
        print u'"{}"\t"{}"'.format(gs, pred_str)
    else:
        match += 1
    indices2 = np.where(xVal[i:i+1,:,:]==True)[2]
    if list(indices) == list(indices2):
        in_is_out += 1
        
print 'Match', match
print 'No match', no_match
print 'Input == output', in_is_out

In [None]:
# Scratch check: np.where on a condition that is False everywhere
# (a 2x5 array of zeros compared with 1).
a = np.zeros((2, 5), dtype=int)
np.where(a == 1)