In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import json
import codecs
import glob
import numpy as np
import os

# Load the JSON file that assigns the data files to the 'train', 'val' and
# 'test' splits, then show the size and first entry of the training split.
with codecs.open('/home/jvdzwaan/data/ocr/datadivision.json', encoding='utf-8') as f:
    division = json.load(f)
# Parenthesised single-argument print is valid both as a Python 2 statement
# and as a Python 3 function call.
print(len(division.get('train')))
print(division.get('train')[0])

In [None]:
def read_texts(data_files, data_dir):
    """Read character-aligned OCR / gold standard files and concatenate them.

    Every data file is a JSON document with an 'ocr' and a 'gs' entry, each a
    sequence of characters.

    Args:
      data_files: iterable of file names, relative to data_dir.
      data_dir: directory that contains the data files.

    Returns:
      tuple: (raw, gs_text, ocr_text). raw is one string made of the joined
        OCR text and the joined GS text of every file, separated by spaces.
        gs_text and ocr_text are flat lists holding the aligned characters of
        all files, with a single ' ' element after each file.
    """
    raw_parts = []
    gs_text = []
    ocr_text = []

    for name in data_files:
        path = os.path.join(data_dir, name)
        with codecs.open(path, encoding='utf-8') as fh:
            doc = json.load(fh)

        ocr_chars = doc['ocr']
        gs_chars = doc['gs']

        # Grow the flat character lists directly; the lone space keeps
        # consecutive texts apart.
        ocr_text.extend(ocr_chars)
        ocr_text.append(' ')
        gs_text.extend(gs_chars)
        gs_text.append(' ')

        raw_parts.append(''.join(ocr_chars))
        raw_parts.append(''.join(gs_chars))

    return ' '.join(raw_parts), gs_text, ocr_text

# Read the validation, test and training splits named in the division file.
data_dir = '/home/jvdzwaan/data/ocr/'
raw_val, gs_val, ocr_val = read_texts(
    data_files=division.get('val'), data_dir=data_dir)
raw_test, gs_test, ocr_test = read_texts(
    data_files=division.get('test'), data_dir=data_dir)
raw_train, gs_train, ocr_train = read_texts(
    data_files=division.get('train'), data_dir=data_dir)

In [None]:
def get_char_to_int(chars):
    """Map every character in chars to its position in the sequence.

    When a character occurs more than once, the position of its last
    occurrence is kept.
    """
    return {char: index for index, char in enumerate(chars)}

# Build the character vocabulary from the lowercased text of all three splits.
raw_text = ''.join([raw_val, raw_test, raw_train])
raw_text = raw_text.lower()
chars = sorted(list(set(raw_text)))
# The newline serves as padding character. Add it only when the texts do not
# already contain one: a second entry would enlarge n_vocab with an index
# that char_to_int never maps to.
if u'\n' not in chars:
    chars.append(u'\n')
char_to_int = get_char_to_int(chars)

n_chars = len(raw_text)
n_vocab = len(chars)

# Parenthesised form works on both Python 2 and Python 3.
print(n_vocab)

In [None]:
def to_string(char_list, lowercase):
    """Concatenate a sequence of characters into one unicode string.

    Args:
      char_list: iterable of strings; empty strings contribute nothing.
      lowercase: when true, the joined string is lowercased.

    Returns:
      The joined (and optionally lowercased) string.
    """
    text = u''.join(char_list)
    return text.lower() if lowercase else text


def create_synced_data(ocr_text, gs_text, char_to_int, n_vocab, seq_length=25,
                       batch_size=100, padding_char=u'\n', lowercase=False, step=1):
    """Cut aligned OCR and gold standard text into padded training samples.

    ocr_text and gs_text are aligned arrays of characters. A sample pairs a
    window of seq_length entries from ocr_text (input) with the window at the
    same position in gs_text (output). Entries can be empty characters (''),
    so the joined strings may be shorter than seq_length; the generator pads
    them with padding_char.

    Args:
      ocr_text: aligned array of OCR characters; its length determines the
        window positions.
      gs_text: aligned array of gold standard characters.
      char_to_int: mapping from character to integer index.
      n_vocab: size of the one-hot dimension of the output.
      seq_length: number of aligned entries per window.
      batch_size: number of samples per generated batch.
      padding_char: character used to fill up short sequences.
      lowercase: lowercase the samples when true.
      step: distance between the start positions of consecutive windows.

    Returns:
      int: the number of samples in the dataset
      generator: generator for the encoded batches (so the encoded data
        doesn't have to fit in memory)
    """
    window_starts = range(0, len(ocr_text) - seq_length + 1, step)
    dataX = [to_string(ocr_text[start:start + seq_length], lowercase)
             for start in window_starts]
    dataY = [to_string(gs_text[start:start + seq_length], lowercase)
             for start in window_starts]

    batches = synced_data_gen(dataX, dataY, seq_length, n_vocab, char_to_int,
                              batch_size, padding_char)
    return len(dataX), batches


def synced_data_gen(dataX, dataY, seq_length, n_vocab, char_to_int, batch_size,
                    padding_char):
    """Endlessly yield batches of encoded input/output sequences.

    Args:
      dataX: list of input strings, each at most seq_length characters.
      dataY: list of output strings, parallel to dataX.
      seq_length: length every sequence is padded to.
      n_vocab: size of the one-hot dimension of the output.
      char_to_int: mapping from character to integer index.
      batch_size: number of rows in every yielded batch.
      padding_char: character used to fill sequences up to seq_length.

    Yields:
      tuple: (X, Y). X is an integer array of shape (batch_size, seq_length)
        holding character indices. Y is a boolean array of shape
        (batch_size, seq_length, n_vocab) holding the one-hot encoded output
        characters.

    Raises:
      ValueError: if dataX is empty (raised when the first batch is
        requested).

    NOTE: every batch has batch_size rows. When the number of samples is not
    a multiple of batch_size, the unused rows of the last batch of each pass
    stay all-zero in both X and Y.
    """
    if len(dataX) == 0:
        # Without samples the loops below would spin forever and never yield.
        raise ValueError('synced_data_gen needs at least one sample to yield batches')
    while 1:
        for batch_idx in range(0, len(dataX), batch_size):
            # Builtin int/bool are what the np.int/np.bool aliases referred
            # to; the aliases themselves were removed in NumPy 1.24.
            X = np.zeros((batch_size, seq_length), dtype=int)
            Y = np.zeros((batch_size, seq_length, n_vocab), dtype=bool)
            sliceX = dataX[batch_idx:batch_idx+batch_size]
            sliceY = dataY[batch_idx:batch_idx+batch_size]
            for i, (sentenceX, sentenceY) in enumerate(zip(sliceX, sliceY)):
                for j, c in enumerate(sentenceX):
                    X[i, j] = char_to_int[c]
                # Only look up the padding index when padding is needed.
                if len(sentenceX) < seq_length:
                    X[i, len(sentenceX):] = char_to_int[padding_char]
                for j, c in enumerate(sentenceY):
                    Y[i, j, char_to_int[c]] = 1
                if len(sentenceY) < seq_length:
                    Y[i, len(sentenceY):, char_to_int[padding_char]] = 1
            yield X, Y

# Settings shared by all data sets.
lowercase = True
batch_size = 100
seq_length = 25

# One sample per character position (step=1) for each split.
numTrainSamples, trainDataGen = create_synced_data(
    ocr_train, gs_train, char_to_int, n_vocab,
    seq_length=seq_length, batch_size=batch_size, lowercase=lowercase, step=1)
numTestSamples, testDataGen = create_synced_data(
    ocr_test, gs_test, char_to_int, n_vocab,
    seq_length=seq_length, batch_size=batch_size, lowercase=lowercase, step=1)
numValSamples, valDataGen = create_synced_data(
    ocr_val, gs_val, char_to_int, n_vocab,
    seq_length=seq_length, batch_size=batch_size, lowercase=lowercase, step=1)

n_patterns = numTrainSamples
print("Train Patterns: {}".format(n_patterns))
print("Validation Patterns: {}".format(numValSamples))
print("Test Patterns: {}".format(numTestSamples))
print('Total: {}'.format(sum((numTrainSamples, numTestSamples, numValSamples))))

In [None]:
# Rebuild the data sets with a window start every third position (step=3);
# this replaces the sample counts and generators created with step=1.
numTrainSamples, trainDataGen = create_synced_data(
    ocr_train, gs_train, char_to_int, n_vocab,
    seq_length=seq_length, batch_size=batch_size, lowercase=lowercase, step=3)
numTestSamples, testDataGen = create_synced_data(
    ocr_test, gs_test, char_to_int, n_vocab,
    seq_length=seq_length, batch_size=batch_size, lowercase=lowercase, step=3)
numValSamples, valDataGen = create_synced_data(
    ocr_val, gs_val, char_to_int, n_vocab,
    seq_length=seq_length, batch_size=batch_size, lowercase=lowercase, step=3)

n_patterns = numTrainSamples
print("Train Patterns: {}".format(n_patterns))
print("Validation Patterns: {}".format(numValSamples))
print("Test Patterns: {}".format(numTestSamples))
print('Total: {}'.format(sum((numTrainSamples, numTestSamples, numValSamples))))

In [None]:
# With embedding layer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.layers import RepeatVector
from keras.layers import Embedding

# Per-character model: embedding -> bidirectional LSTM -> softmax over the
# vocabulary for every position of the input sequence.
model = Sequential()
# Use seq_length rather than a literal 25 so the model input always matches
# the sequences produced by the data generators.
model.add(Embedding(n_vocab, n_vocab, input_length=seq_length))

model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.5))

model.add(TimeDistributed(Dense(n_vocab, activation='softmax')))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Vocabulary size; it sets the width of the embedding and of the softmax
# output layer of the model.
n_vocab

In [None]:
# Train for 40 epochs; an epoch consists of the number of full batches that
# fit in the training set, validation likewise.
model.fit_generator(
    trainDataGen,
    steps_per_epoch=numTrainSamples // batch_size,
    epochs=40,
    validation_data=valDataGen,
    validation_steps=numValSamples // batch_size)

Een convolutional layer kan alleen bij seq2seq gebruikt worden, omdat anders de lengte van de output van de convolutie niet meer gelijk is aan het aantal output characters; dan kun je nooit een sequentie van die lengte voorspellen.

In [None]:
from keras.layers import Conv1D, MaxPooling1D

# Kernel size and number of filters for each convolutional layer.
filter_length = [5, 3, 3]
nb_filter = [196, 196, 256]
pool_size = 2

model = Sequential()
# Use seq_length rather than a literal 25 so the model input always matches
# the sequences produced by the data generators.
model.add(Embedding(n_vocab, n_vocab, input_length=seq_length))

# Encoder: stacked convolution + pooling blocks. With 'valid' padding and
# pooling every block shortens the sequence (for seq_length = 25 this should
# be 25 -> 21 -> 10 -> 8 -> 4 -> 2 -> 1).
for n_filters, kernel_size in zip(nb_filter, filter_length):
    model.add(Conv1D(filters=n_filters,
                     kernel_size=kernel_size,
                     padding='valid',
                     activation='relu',
                     kernel_initializer='glorot_normal'))

    model.add(Dropout(0.1))
    # Honour the pool_size setting defined above instead of a second literal.
    model.add(MaxPooling1D(pool_size=pool_size))

# Summarise the convolved sequence in a single vector ...
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.5))
# ... and repeat it seq_length times so the decoder LSTM can emit one
# character distribution per output position.
model.add(RepeatVector(seq_length))
model.add(LSTM(256, return_sequences=True))

model.add(TimeDistributed(Dense(n_vocab, activation='softmax')))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 40 epochs; an epoch consists of the number of full batches that
# fit in the training set, validation likewise.
model.fit_generator(
    trainDataGen,
    steps_per_epoch=numTrainSamples // batch_size,
    epochs=40,
    validation_data=valDataGen,
    validation_steps=numValSamples // batch_size)