In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
from nlppln.utils import create_dirs, out_file_name

from ochre.utils import to_space_tokenized

def space_tokenize(gs_data, ocr_data, seq_length=53):
    """Yield aligned (gold standard, OCR) sequence pairs starting at word boundaries.

    The two inputs are walked in parallel. A new sequence is started at
    position 0 and directly after every position where *both* inputs contain
    a space. Each sequence consists of at most ``seq_length`` items of the
    input, joined into a string and passed through ``to_space_tokenized``.

    After the generator has been exhausted, the number of sequences and the
    number/percentage of sequences where gold standard and OCR text are
    identical are printed.

    Args:
        gs_data: aligned gold standard characters; must support slicing and
            contain strings (e.g. a str or a list of single characters).
        ocr_data: aligned OCR characters, same requirements as ``gs_data``.
        seq_length (int): maximum number of items taken per sequence.

    Yields:
        tuple: ``(to_space_tokenized(gs), to_space_tokenized(ocr))``.
    """
    num_seqs = 0
    equal = 0

    for i, (c1, c2) in enumerate(zip(gs_data, ocr_data)):
        if i == 0 or (c1 == ' ' and c2 == ' '):
            # The first sequence starts at position 0; all other sequences
            # start right after the space shared by both texts.
            start = i + 1 if i > 0 else 0
            end = start + seq_length
            gs = ''.join(gs_data[start:end])
            ocr = ''.join(ocr_data[start:end])

            yield to_space_tokenized(gs), to_space_tokenized(ocr)

            if gs == ocr:
                equal += 1
            num_seqs += 1

    print('num seqs:', num_seqs)
    print('equal:', equal)
    # Empty input produces no sequences; report 0% instead of raising
    # ZeroDivisionError when the generator is exhausted.
    percentage = equal / num_seqs * 100 if num_seqs else 0.0
    print('% equal:', percentage)

def write_gs_and_ocr_data(gs_fname, ocr_fname, generator):
    """Write paired gold standard and OCR sequences to two parallel text files.

    Every ``(gs_seq, ocr_seq)`` pair from ``generator`` is written as one line
    to ``gs_fname`` and one line to ``ocr_fname``, so line ``n`` of both files
    belongs together. Both file names are printed before writing.

    Args:
        gs_fname (str): path of the gold standard output file.
        ocr_fname (str): path of the OCR output file.
        generator: iterable of ``(gs_seq, ocr_seq)`` string pairs.

    Returns:
        int: the number of pairs written.
    """
    # Ensure the output location exists for *both* files; they are not
    # required to live in the same directory.
    create_dirs(gs_fname, is_file=True)
    create_dirs(ocr_fname, is_file=True)

    print(gs_fname)
    print(ocr_fname)

    total = 0
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (the texts may contain non-ASCII characters).
    with open(gs_fname, 'w', encoding='utf-8') as gs, \
            open(ocr_fname, 'w', encoding='utf-8') as ocr:
        for gs_seq, ocr_seq in generator:
            gs.write(gs_seq)
            gs.write('\n')

            ocr.write(ocr_seq)
            ocr.write('\n')

            total += 1
    return total

def get_file_names(out_dir, div):
    """Return the ``(gs, ocr)`` output file names for a data division.

    Both names are built with ``out_file_name`` from the same ``out_dir`` and
    ``div``; they only differ in the extension passed ('gs' vs. 'ocr').
    """
    return tuple(
        out_file_name(out_dir, div, ext=extension)
        for extension in ('gs', 'ocr')
    )

In [None]:
import json

# Locations of the data division file, the aligned input texts and the
# output directory. The commented-out path selects the random division
# instead of the A8P1 one.
#datasets = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/datadivision-random.json'
datasets = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/datadivision-A8P1.json'
data_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/aligned/'
out_dir = '/home/jvdzwaan/data/kb-ocr/A8P1'

with open(datasets) as d:
    division = json.load(d)

# Show how many entries each part of the division contains.
for part in ('train', 'test', 'val'):
    print(len(division[part]))

In [None]:
%%time
# Read the texts of the validation, test and training parts of the division.
from ochre.utils import read_texts

# Sequence length setting for this notebook (53 is also the default of
# `space_tokenize`).
seq_length = 53

# `read_texts` is called with the list of files of one part of the division
# and the data directory, and returns three values per call. The gs_* and
# ocr_* results are the inputs for `space_tokenize` in the cells below.
raw_val, gs_val, ocr_val = read_texts(division.get('val'), data_dir)
raw_test, gs_test, ocr_test = read_texts(division.get('test'), data_dir)
raw_train, gs_train, ocr_train = read_texts(division.get('train'), data_dir)

In [None]:
%%time
# Generate the training data: all training texts end up in a single pair of
# .gs/.ocr files.
# The sequence length comes from the notebook's `seq_length` setting rather
# than a repeated literal, so changing the setting actually takes effect here.
gs_fname, ocr_fname = get_file_names(out_dir, 'train')
train_sequences = space_tokenize(gs_train, ocr_train, seq_length=seq_length)
num = write_gs_and_ocr_data(gs_fname, ocr_fname, train_sequences)
print('# train:', num)

In [None]:
%%time
# Generate the validation data: all validation texts end up in a single pair
# of .gs/.ocr files.
# The sequence length comes from the notebook's `seq_length` setting rather
# than a repeated literal, so changing the setting actually takes effect here.
gs_fname, ocr_fname = get_file_names(out_dir, 'val')
val_sequences = space_tokenize(gs_val, ocr_val, seq_length=seq_length)
num = write_gs_and_ocr_data(gs_fname, ocr_fname, val_sequences)
print('# val:', num)

In [None]:
%%time
# Generate the test data: one pair of .gs/.ocr files per input file
import os

from nlppln.utils import out_file_name, create_dirs

out_dir_test = os.path.join(out_dir, 'test')

total = 0

for txt_file in division.get('test'):
    # Same naming scheme as for the train/val files, via the shared helper.
    gs_out_name, ocr_out_name = get_file_names(out_dir_test, txt_file)

    # Per-file variables: do not overwrite raw_test/gs_test/ocr_test, which
    # hold the texts of the complete test set.
    raw_doc, gs_doc, ocr_doc = read_texts([txt_file], data_dir)

    # Use the notebook's `seq_length` setting so the test sequences are
    # generated with the same length as the train/val sequences.
    n = write_gs_and_ocr_data(gs_out_name, ocr_out_name,
                              space_tokenize(gs_doc, ocr_doc,
                                             seq_length=seq_length))
    total += n
print('# test:', total)

In [None]:
# Check the % of equal sequences in the test data.
# `space_tokenize` prints its statistics once the generator is exhausted, so
# the sequences are consumed here without being used. The notebook's
# `seq_length` setting is passed so the statistics describe the same
# sequences that were written to disk.
raw_test, gs_test, ocr_test = read_texts(division.get('test'), data_dir)
for data in space_tokenize(gs_test, ocr_test, seq_length=seq_length):
    pass

In [None]:
# Visually inspect the alignment of the characters of a single file: one
# line per aligned position, gold standard character first, OCR character
# second. An empty string on either side is shown as '#'.
from ochre.utils import read_texts

data_file = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/aligned/DDD_010977145_002.json'
raw, gs, ocr = read_texts([data_file], None)

for c1, c2 in zip(gs, ocr):
    if c1 == '' and c2 == '':
        # Both sides empty at the same position is flagged explicitly.
        print('#', '#', 'STRANGE')
    elif c1 == '':
        print('#', c2)
    elif c2 == '':
        print(c1, '#')
    else:
        print(c1, c2)