In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import json

from ochre.utils import read_texts

# Location of the JSON file describing the data split and of the aligned texts.
datasets = '/home/jvdzwaan/data/icdar2017st/eng_monograph/datadivision.json'
data_dir = '/home/jvdzwaan/data/icdar2017st/eng_monograph/aligned/'

with open(datasets) as division_file:
    division = json.load(division_file)

# Report how many entries each split contains (train, test, val).
for split_name in ('train', 'test', 'val'):
    print(len(division[split_name]))

In [None]:
from ochre.utils import get_chars, get_sequences

# Window length handed to get_sequences (presumably counted in characters).
seq_length = 53

# read_texts returns three values per split; they are unpacked here as the
# raw text, the gs text (presumably the gold standard) and the OCR text.
raw_val, gs_val, ocr_val = read_texts(division.get('val'), data_dir)
raw_test, gs_test, ocr_test = read_texts(division.get('test'), data_dir)
raw_train, gs_train, ocr_train = read_texts(division.get('train'), data_dir)

# Character inventory built from the raw text of all three splits; `ci` is
# used further down as the char_to_int mapping of the data generators.
chars, num_chars, ci = get_chars(raw_val, raw_test, raw_train, False)

# Paired gs/OCR sequences for every split.
gs_seqs_val, ocr_seqs_val = get_sequences(gs_val, ocr_val, seq_length)
gs_seqs_test, ocr_seqs_test = get_sequences(gs_test, ocr_test, seq_length)
gs_seqs_train, ocr_seqs_train = get_sequences(gs_train, ocr_train, seq_length)

for label, seqs in (('n samples val', gs_seqs_val),
                    ('n samples test', gs_seqs_test),
                    ('n samples train', gs_seqs_train)):
    print(label, len(seqs))

In [None]:
def filter_ocr_space(ocr_text, gs_seqs, ocr_seqs):
    """Keep only the sequence pairs located right after a run of OCR spaces.

    For every index ``i`` where ``ocr_text[i]`` is a space and
    ``ocr_text[i + 1]`` is not, the items at index ``i + 1`` of ``gs_seqs``
    and ``ocr_seqs`` are selected.

    Args:
        ocr_text: indexable text (e.g. a ``str``) that is scanned for spaces.
        gs_seqs: indexable collection, addressed by position in ``ocr_text``.
        ocr_seqs: indexable collection, addressed by position in ``ocr_text``.

    Returns:
        A tuple ``(gs_selected, ocr_selected)`` of two lists that always have
        the same length; element ``k`` of both lists comes from the same index.
    """
    # Only indices that exist in the text *and* in both sequence collections
    # can produce a pair. Bounding the scan up front keeps the two result
    # lists aligned even when gs_seqs and ocr_seqs differ in length (appending
    # one item and then failing on the other would leave them out of step).
    n_usable = min(len(ocr_text), len(gs_seqs), len(ocr_seqs))

    gs_selected = []
    ocr_selected = []
    # `start` is the index of the first non-space character after a space.
    for start in range(1, n_usable):
        if ocr_text[start - 1] == ' ' and ocr_text[start] != ' ':
            gs_selected.append(gs_seqs[start])
            ocr_selected.append(ocr_seqs[start])
    return gs_selected, ocr_selected
# Apply the filter to the test split (the same call is repeated further down,
# together with the train and val splits).
gs_selected_test, ocr_selected_test = filter_ocr_space(ocr_test, gs_seqs_test, ocr_seqs_test)

In [None]:
# Length of the OCR test text next to the number of selected test samples.
# (`ocr_selected` only exists inside filter_ocr_space; at this point the
# notebook has defined the test selection only.)
print(len(ocr_test), len(ocr_selected_test))

In [None]:
# Number of selected gs sequences in the test split (`gs_selected` is a local
# of filter_ocr_space and is not defined at notebook level).
len(gs_selected_test)

In [None]:
# Last selected gs sequence of the test split (`gs_selected` is a local of
# filter_ocr_space and is not defined at notebook level).
gs_selected_test[-1]

In [None]:
# Last selected OCR sequence of the test split (`ocr_selected` is a local of
# filter_ocr_space and is not defined at notebook level).
ocr_selected_test[-1]

In [None]:
# Reduce every split to the sequence pairs picked by filter_ocr_space.
gs_selected_test, ocr_selected_test = filter_ocr_space(ocr_test, gs_seqs_test, ocr_seqs_test)
gs_selected_train, ocr_selected_train = filter_ocr_space(ocr_train, gs_seqs_train, ocr_seqs_train)
gs_selected_val, ocr_selected_val = filter_ocr_space(ocr_val, gs_seqs_val, ocr_seqs_val)

# Sample counts after filtering.
for label, selected in (('n samples val', gs_selected_val),
                        ('n samples test', gs_selected_test),
                        ('n samples train', gs_selected_train)):
    print(label, len(selected))

In [None]:
import pickle

# Files written to the working directory: the selected (gs, ocr) pairs of the
# train and val splits, and the `ci` mapping.
pickle_targets = (
    ('train.pkl', (gs_selected_train, ocr_selected_train)),
    ('val.pkl', (gs_selected_val, ocr_selected_val)),
    ('ci.pkl', ci),
)

for pkl_name, payload in pickle_targets:
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Show the first OCR sequence of the test split, taken from the unfiltered
# get_sequences output.
print(ocr_seqs_test[0])

In [None]:
from ochre.datagen import DataGenerator

# Keyword arguments shared by the three generators; only the x/y data differ.
generator_options = dict(char_to_int=ci, seq_length=seq_length,
                         padding_char='\n', oov_char='@',
                         batch_size=100, shuffle=False)

# x is the OCR selection, y the corresponding gs selection.
dg_val = DataGenerator(xData=ocr_selected_val, yData=gs_selected_val,
                       **generator_options)
dg_test = DataGenerator(xData=ocr_selected_test, yData=gs_selected_test,
                        **generator_options)
dg_train = DataGenerator(xData=ocr_selected_train, yData=gs_selected_train,
                         **generator_options)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.layers import RepeatVector
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint

# Hyper-parameters of the encoder-decoder network.
n_nodes = 1000     # units in each of the two LSTM layers
dropout = 0.2      # NOTE(review): defined but not used by any layer below
n_embed = 256      # output size of the Embedding layer
n_vocab = len(ci)  # Embedding input size and width of the softmax output

# NOTE(review): categorical cross-entropy presumably relies on the generators
# yielding one-hot encoded targets -- confirm against DataGenerator.
loss = 'categorical_crossentropy'
optimizer = 'adam'
metrics = ['accuracy']

model = Sequential()

# encoder
model.add(Embedding(n_vocab, n_embed, input_length=seq_length))
# No input_shape on this layer: in a Sequential model only the first layer's
# shape arguments are used, and the LSTM receives (seq_length, n_embed) from
# the Embedding above rather than n_vocab-wide vectors.
model.add(LSTM(n_nodes))
# For the decoder's input, we repeat the encoded input for each time step
model.add(RepeatVector(seq_length))
model.add(LSTM(n_nodes, return_sequences=True))

# For each step of the output sequence, decide which character should be
# chosen
model.add(TimeDistributed(Dense(n_vocab, activation='softmax')))
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

In [None]:
# Checkpointing: the weights file name contains the loss and the epoch number;
# with save_best_only a file is written only when the monitored 'loss' reaches
# a new minimum.
filepath = '{loss:.4f}-{epoch:02d}.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

# Train for 10 epochs, one full pass over each generator per epoch, saving
# weights through the checkpoint callback.
model.fit_generator(
    dg_train,
    steps_per_epoch=len(dg_train),
    epochs=10,
    validation_data=dg_val,
    validation_steps=len(dg_val),
    callbacks=callbacks_list,
    use_multiprocessing=True,
    workers=3,
)