In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# https://offbit.github.io/how-to-read/
# idea: chunk characters into words using convolutional layers
# (OCR mistakes are recognized, because they don't fit into the standard word models),
# then run an LSTM on the chunks
# input = representation of document (characters and sentences)
# output = OCR quality (high/low or high/medium/low)
# do we use WER or CER as the quality measure?

In [None]:
import json
import codecs
import glob
import numpy as np
import os

# Load the JSON file that describes how the documents are divided over the
# data splits; its 'train' and 'val' entries are used further down.
division_path = '/home/jvdzwaan/data/ocr/datadivision.json'
with codecs.open(division_path, encoding='utf-8') as division_file:
    division = json.load(division_file)

# Sanity check: number of entries in the training split.
print(len(division.get('train')))

In [None]:
from nlppln.commands.pattern_nl import parse
from pattern.nl import parsetree
from nltk.tokenize import sent_tokenize

def parse_text2(text):
    """Split ``text`` into sentences of word tokens.

    Tokens come from ``parse`` (imported from ``nlppln.commands.pattern_nl``).
    Each token is read through its ``'sentence'`` and ``'word'`` entries, and
    consecutive tokens sharing the same ``'sentence'`` value are grouped.

    Returns a list of sentences, each a list of words. Empty input gives an
    empty list, and no empty leading sentence is produced when the first
    sentence id differs from 0.
    """
    sentences = []
    current = []
    current_id = None

    for token in parse(text):
        # A change in sentence id closes the sentence collected so far.
        # The ``current`` check skips this for the very first token.
        if current and token['sentence'] != current_id:
            sentences.append(current)
            current = []
        current_id = token['sentence']
        current.append(token['word'])

    # Flush the last sentence; skipped when there were no tokens at all.
    if current:
        sentences.append(current)
    return sentences

def parse_text3(text):
    """Print every sentence that ``parsetree`` (pattern.nl) finds in ``text``.

    This function only prints; it does not return anything.
    """
    parser_options = dict(
        tokenize=True,     # Split punctuation marks from words?
        tags=True,         # Parse part-of-speech tags? (NN, JJ, ...)
        chunks=False,      # Parse chunks? (NP, VP, PNP, ...)
        relations=False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
        lemmata=True,      # Parse lemmata? (ate => eat)
        encoding='utf-8',  # Input string encoding.
        tagset=None,       # Penn Treebank II (default) or UNIVERSAL.
    )
    tree = parsetree(text, **parser_options)
    for sentence in tree:
        print(sentence.string)
        
def parse_text(text, language='english', lower=True):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    Parameters
    ----------
    text : str
        The document text.
    language : str
        Name of the Punkt model passed to ``sent_tokenize``. The default
        'english' is NLTK's own default and keeps the previous behaviour;
        pass e.g. 'dutch' for Dutch documents (that model must be installed).
    lower : bool
        When True (the default, as before) every sentence is lowercased.
        Set to False to keep the original casing.

    Returns
    -------
    list of str
        One string per detected sentence.
    """
    sentences = sent_tokenize(text, language=language)
    if lower:
        return [s.lower() for s in sentences]
    return sentences

# Quick check on a two-sentence Dutch example.
print(parse_text("Dit is een test. Er zijn twee zinnen."))

In [None]:
import codecs
import os

def doc_name(doc_id):
    """Return the OCR text file name that belongs to ``doc_id``."""
    template = '{}-ds.ocr.txt'
    return template.format(doc_id)

# Directory the per-document OCR text files are read from.
data_dir = '/home/jvdzwaan/data/dncvu/ocr/'

# Statistics that are plotted below to choose the network input size.
num_sentences = []      # number of sentences per document
sentence_lengths = []   # number of characters per sentence
doc_ids = []
docs = []

for division_entry in division.get('train'):
    # The document id is the part of the entry before the first '.'.
    doc_id = division_entry.split('.')[0]
    doc_ids.append(doc_id)

    ocr_path = os.path.join(data_dir, doc_name(doc_id))
    with codecs.open(ocr_path, encoding='utf-8') as ocr_file:
        sentences = parse_text(ocr_file.read())

    docs.append(sentences)
    num_sentences.append(len(sentences))
    sentence_lengths.extend(len(sentence) for sentence in sentences)

print(max(num_sentences))
print(max(sentence_lengths))

In [None]:
# Distribution of the number of sentences per training document.
fig, ax = plt.subplots()
ax.hist(num_sentences, bins=100)
ax.set(title='Sentences per document',
       xlabel='number of sentences',
       ylabel='number of documents');

In [None]:
# Distribution of the sentence lengths (in characters) over all training documents.
fig, ax = plt.subplots()
ax.hist(sentence_lengths, bins=50)
ax.set(title='Characters per sentence',
       xlabel='sentence length (characters)',
       ylabel='number of sentences');

In [None]:
# Based on the two histograms above, we pick the input size of the network:
# maxlen is the number of characters kept per sentence and
# max_sentences is the number of sentences kept per document.
maxlen = 256
max_sentences = 25

In [None]:
# Build the character vocabulary over all training sentences.
# Joining once avoids the quadratic cost of repeated string concatenation.
txt = u''.join(s for doc in docs for s in doc)

# Sort the characters so that every run assigns the same index to the same
# character (the iteration order of a set is arbitrary). The padding
# character '*' comes first, so index 0 stands for padding.
# If the text itself contains '*', it keeps its own later index, because
# the later entry wins when char_indices is built.
chars = [u'*'] + sorted(set(txt))
print(u''.join(chars))
print('total chars: {}'.format(len(chars)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [None]:
import pandas as pd

# Table with per-document scores (presumably OCR quality measures); the
# first CSV column becomes the index. Later cells look up rows named
# '<doc_id>-ds.gs_out' and read the 'CER' column.
q = pd.read_csv('/home/jvdzwaan/data/ocr/merged.csv', index_col=0)
q

In [None]:
# Derive a binary label per document from its score in ``q``:
# 0 when the metric exceeds the threshold, 1 otherwise.
m = 'CER'
threshold = 1.0

labels = [
    0 if q.loc['{}-ds.gs_out'.format(doc_id)][m] > threshold else 1
    for doc_id in doc_ids
]

# Total number of documents and number of documents labelled 1.
print(len(labels))
print(sum(labels))

In [None]:
# make a more balanced data set

def balanced_set(performance_df, doc_ids, m='CER', threshold=1.0, seed=4):
    """Divide documents into high quality, low quality and gs groups.

    A document is low quality when its score ``m`` in ``performance_df``
    (row ``'<doc_id>-ds.gs_out'``) is above ``threshold``; otherwise it is
    high quality. To move towards a 50/50 balance, a random selection of
    the low quality documents is moved to the ``gs`` group, so that
    ``high`` and ``gs`` together cover half of ``doc_ids`` (or as close as
    the number of low quality documents allows).

    Parameters
    ----------
    performance_df : pandas.DataFrame
        Indexed by ``'<doc_id>-ds.gs_out'`` with a column named ``m``.
    doc_ids : list
        Document ids to divide.
    m : str
        Column holding the quality measure.
    threshold : float
        Scores strictly above this value count as low quality.
    seed : int
        Seed for the shuffle that selects the ``gs`` documents. Note that
        this reseeds numpy's global random state.

    Returns
    -------
    (high, low, gs) : tuple of lists of document ids.
    """
    high = []
    low = []

    for doc_id in doc_ids:
        if performance_df.loc['{}-ds.gs_out'.format(doc_id), m] > threshold:
            # low quality
            low.append(doc_id)
        else:
            # high quality
            high.append(doc_id)
    print('high quality {}'.format(len(high)))
    print('low quality {}'.format(len(low)))

    # Number of low quality documents to move to the gs group.
    # Floor division keeps this an int on Python 3 as well. The clamp at 0
    # matters when the high quality documents are already the majority: a
    # negative count would otherwise slice ``low`` from the wrong end.
    num_gs = max(0, len(doc_ids) // 2 - len(high))
    print('num gs {}'.format(num_gs))

    # Sort before shuffling so the selection depends only on the seed and
    # not on the order in which the ids were passed in.
    low.sort()

    np.random.seed(seed)
    np.random.shuffle(low)

    gs = low[:num_gs]
    low = low[num_gs:]

    print('high quality {}'.format(len(high)))
    print('low quality {}'.format(len(low)))
    print('gs {}'.format(len(gs)))

    return high, low, gs

# Divide the document ids collected so far, using balanced_set's defaults
# (metric 'CER', threshold 1.0, seed 4).
high, low, gs = balanced_set(q, doc_ids)

In [None]:
def read_texts(file_prefixes, high, low, gs, data_dir):
    """Read aligned documents and label each one by its quality group.

    For every prefix the file ``<data_dir>/<prefix>.json`` is loaded. Its
    'ocr' or 'gs' entry (a sequence of strings) is joined into one text:

    * prefix in ``high``: the 'ocr' text, label 1
    * prefix in ``low``: the 'ocr' text, label 0
    * any other prefix: the 'gs' text (presumably the gold standard), label 1

    ``gs`` is accepted for symmetry with ``balanced_set`` but is not
    consulted: every prefix that is in neither ``high`` nor ``low`` is
    treated as a gs document.

    Returns
    -------
    (raw, docs, labels)
        ``raw`` is all selected texts concatenated into one string,
        ``docs`` holds the ``parse_text`` result per document and
        ``labels`` the label per document, both in ``file_prefixes`` order.
    """
    # Sets make the per-document membership tests constant time.
    high = set(high)
    low = set(low)

    raw = []
    docs = []
    labels = []

    for p in file_prefixes:
        path = os.path.join(data_dir, '{}.json'.format(p))
        with codecs.open(path, encoding='utf-8') as f:
            aligned = json.load(f)

        if p in high:
            source, label = 'ocr', 1
        elif p in low:
            source, label = 'ocr', 0
        else:
            source, label = 'gs', 1
        text = ''.join(aligned[source])

        raw.append(text)
        docs.append(parse_text(text))
        labels.append(label)

    return ''.join(raw), docs, labels

data_dir = '/home/jvdzwaan/data/dncvu/aligned/'

# Training set: balance the training documents and read their texts.
high, low, gs = balanced_set(q, doc_ids)
raw_train, docs_train, labels_train = read_texts(doc_ids, high, low, gs, data_dir)

# Validation set: collect the validation ids in a list of their own.
# Appending them to doc_ids would put every training document into the
# validation data as well, and would grow doc_ids on every re-run of this cell.
val_doc_ids = [j.split('.')[0] for j in division.get('val')]
high, low, gs = balanced_set(q, val_doc_ids)
raw_val, docs_val, labels_val = read_texts(val_doc_ids, high, low, gs, data_dir)

In [None]:
# Inspect the first training example: its id, its sentences and its label.
print(doc_ids[0])
print(docs_train[0])
# docs_train is paired with labels_train, not with the unbalanced ``labels``.
print(labels_train[0])
# parse_text lowercases the text. We may not want that.

In [None]:
# Encode the documents as integer character indices.
# X has shape (documents, max_sentences, maxlen); unfilled positions stay 0.
X = np.zeros((len(docs), max_sentences, maxlen), dtype=np.int64)
y = np.array(labels)

for doc_idx, doc in enumerate(docs):
    # Sentences beyond the first max_sentences are dropped.
    for sent_idx, sentence in enumerate(doc[:max_sentences]):
        # Keep at most the last maxlen characters and store them reversed
        # and right-aligned: the first kept character lands in the last slot.
        codes = [char_indices[char] for char in sentence[-maxlen:]]
        if codes:
            X[doc_idx, sent_idx, maxlen - len(codes):] = codes[::-1]

In [None]:
# Encoded first sentence of the first document.
first_sentence_codes = X[0, 0, :]
print(first_sentence_codes)

In [None]:
from keras.layers import Input, Lambda
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Sequential, Model
from keras.layers.core import Dropout, Dense
from keras.layers.recurrent import LSTM
from keras.layers import merge
from keras.layers.wrappers import TimeDistributed

from keras.layers import concatenate  # Keras 2 replacement for merge(mode='concat')

# Hyper-parameters of the convolution stack: one entry per Conv1D layer.
filter_length = [5, 3, 3]
nb_filter = [196, 196, 256]
pool_length = 2

# Sentence encoder: maps one sentence of maxlen character indices to a vector.
in_sentence = Input(shape=(maxlen,), dtype='int64')
# Embedding takes (input_dim, output_dim). input_dim must be the vocabulary
# size, otherwise character indices >= input_dim fall outside the table.
# output_dim is kept at len(chars), as in the original layer.
embedded = Embedding(input_dim=len(chars), output_dim=len(chars))(in_sentence)

for n_filters, kernel_size in zip(nb_filter, filter_length):
    embedded = Conv1D(filters=n_filters,
                      kernel_size=kernel_size,
                      padding='valid',
                      activation='relu',
                      kernel_initializer='glorot_normal',
                      strides=1)(embedded)

    embedded = Dropout(0.1)(embedded)
    embedded = MaxPooling1D(pool_size=pool_length)(embedded)

# Two LSTMs read the convolved sequence in opposite directions; their final
# states are concatenated into the sentence representation.
forward_sent = LSTM(128, return_sequences=False, dropout=0.2,
                    recurrent_dropout=0.2, implementation=2)(embedded)
backward_sent = LSTM(128, return_sequences=False, dropout=0.2,
                     recurrent_dropout=0.2, implementation=2,
                     go_backwards=True)(embedded)

sent_encode = concatenate([forward_sent, backward_sent], axis=-1)
sent_encode = Dropout(0.3)(sent_encode)

encoder = Model(inputs=in_sentence, outputs=sent_encode)

In [None]:
from keras.layers import concatenate  # Keras 2 replacement for merge(mode='concat')

# Document model: the sentence encoder is applied to each of the
# max_sentences sentences, and the resulting sequence of sentence vectors
# is read by a forward and a backward LSTM.
sequence = Input(shape=(max_sentences, maxlen), dtype='int64')
encoded = TimeDistributed(encoder)(sequence)
forwards = LSTM(80, return_sequences=False, dropout=0.2,
                recurrent_dropout=0.2, implementation=2)(encoded)
backwards = LSTM(80, return_sequences=False, dropout=0.2,
                 recurrent_dropout=0.2, implementation=2,
                 go_backwards=True)(encoded)

merged = concatenate([forwards, backwards], axis=-1)
output = Dropout(0.3)(merged)
output = Dense(128, activation='relu')(output)
output = Dropout(0.3)(output)
# Single sigmoid unit: one value between 0 and 1 per document.
output = Dense(1, activation='sigmoid')(output)

model = Model(inputs=sequence, outputs=output)

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit of the model.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Train on the numpy target array ``y``; a plain Python list of labels is
# interpreted by Keras as a list of separate target arrays.
model.fit(X, y, batch_size=64, epochs=10)

In [None]:
# todo:
# - make data sets (more) balanced
# - add validation set

In [None]:
# https://machinelearningmastery.com/timedistributed-layer-for-long-short-term-memory-networks-in-python/
# http://minimaxir.com/2017/04/char-embeddings/