In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#%matplotlib inline

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt

In [None]:
import os

from collections import Counter

from ochre.utils import align_characters, read_texts, to_space_tokenized

def get_sequences(fname):
    """Read character sequences from a file with one sequence per line.

    Every line is split on whitespace into tokens. The special token
    ``'<SPACE>'`` is replaced by a single space character; all other tokens
    are kept as they are. A blank line yields an empty list.

    Args:
        fname: Path of the text file to read.

    Returns:
        A list with one list of tokens per line of the file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # of the machine the notebook runs on.
    with open(fname, encoding='utf-8') as f:
        return [
            [' ' if token == '<SPACE>' else token for token in line.split()]
            for line in f
        ]

def space_tokenize(gs_data, ocr_data, seq_length=53):
    """Yield windows of the two inputs that start at shared space positions.

    A window is produced for position 0 and for every later position at
    which both ``gs_data`` and ``ocr_data`` contain a space; in the latter
    case the window starts directly after that space. Each window holds at
    most ``seq_length`` items. When the last aligned space is the final item,
    the window that follows it is empty.

    After the generator is exhausted, the number of windows and the share of
    windows whose two sides are identical are printed.

    Args:
        gs_data: Sequence of strings (sliceable), iterated in lockstep with
            ``ocr_data``.
        ocr_data: Sequence of strings (sliceable).
        seq_length: Maximum number of items per window.

    Yields:
        Tuples ``(gs_window, ocr_window)`` of slices of the two inputs.
    """
    num_seqs = 0
    equal = 0

    for i, (c1, c2) in enumerate(zip(gs_data, ocr_data)):
        if i != 0 and not (c1 == ' ' and c2 == ' '):
            continue

        # The first window starts at 0; later ones start after the space.
        start = i + 1 if i > 0 else 0
        gs_seq = gs_data[start:start+seq_length]
        ocr_seq = ocr_data[start:start+seq_length]
        same = ''.join(gs_seq) == ''.join(ocr_seq)

        yield gs_seq, ocr_seq

        if same:
            equal += 1
        num_seqs += 1

    print('num seqs:', num_seqs)
    print('equal:', equal)
    # Empty input produces no windows; avoid dividing by zero in that case.
    pct_equal = equal/num_seqs * 100 if num_seqs else 0.0
    print('% equal:', pct_equal)


def _fill_empty(chars, empty_char):
    """Return a list copy of ``chars`` with every '' replaced by ``empty_char``."""
    return [empty_char if c == '' else c for c in chars]


def _offset_to_next_sequence(gs_seq, ocr_seq):
    """Return how far the position index moves after processing this window.

    This is the index just past the first position at which both sequences
    contain a space. If the window has no such position, the length of the
    OCR window is returned so the caller always gets a defined value.
    """
    for i, (c1, c2) in enumerate(zip(gs_seq, ocr_seq)):
        if c1 == ' ' and c2 == ' ':
            return i + 1
    return len(ocr_seq)


def convert_to_text(aligned_file, pred_file, empty_char='#'):
    """Merge overlapping per-window predictions into one corrected text.

    The windows produced by ``space_tokenize`` for the aligned file are
    paired, in order, with the sequences read from ``pred_file``. Every
    prediction is aligned with its OCR window using ``align_characters`` and
    each aligned character casts a vote for its position in the text. For
    every position the character with the most votes is selected; when the
    two most common characters are tied, the most common OCR input character
    for that position is used instead (or nothing if no input was recorded
    for the position).

    Two summary lines about the voting are printed.

    Args:
        aligned_file: Path passed (in a one-element list) to ``read_texts``.
        pred_file: Path of the file with predicted sequences, read with
            ``get_sequences``.
        empty_char: Placeholder substituted for empty strings in the gold
            standard and OCR windows. Every occurrence of this character is
            removed from the result, including occurrences that were part of
            the predictions.

    Returns:
        The corrected text as a single string.
    """
    pred_seqs = get_sequences(pred_file)

    _raw, gs_test, ocr_test = read_texts([aligned_file], None)
    gs_seqs = []
    ocr_seqs = []
    for gs_seq, ocr_seq in space_tokenize(gs_test, ocr_test):
        gs_seqs.append(_fill_empty(gs_seq, empty_char))
        ocr_seqs.append(_fill_empty(ocr_seq, empty_char))

    idx = 0
    counters = {}        # position -> votes for predicted characters
    input_counters = {}  # position -> votes for OCR input characters

    for s1, s2, s3 in zip(gs_seqs, ocr_seqs, pred_seqs):
        # The final sequences are empty, edlib can't deal with this
        if not s3 or not s2:
            continue

        # Computed for every sequence so that a window without an aligned
        # space can neither raise on an unbound name nor silently reuse the
        # offset of the previous window.
        skip = _offset_to_next_sequence(s1, s2)

        # align pred with ocr (input)
        _ocr, pred, _match = align_characters(s3, s2, empty_char='')

        for i, c in enumerate(pred):
            counters.setdefault(idx + i, Counter())[c] += 1

        for i, inp in enumerate(s2):
            input_counters.setdefault(idx + i, Counter())[inp] += 1

        idx += skip

    agg_out = []
    multiple_choices = 0
    take_ocr_char = 0
    # Positions are inserted in ascending order, so iterating the dict walks
    # the text from start to end.
    for pos, votes in counters.items():
        if len(votes) > 1:
            multiple_choices += 1
            (best, best_n), (_second, second_n) = votes.most_common(2)
            if best_n == second_n:
                # Tie between predictions: fall back to the OCR input.
                take_ocr_char += 1
                ocr_votes = input_counters.get(pos)
                if ocr_votes is None:
                    selected = ''
                else:
                    selected = ocr_votes.most_common(1)[0][0]
                    if selected == empty_char:
                        selected = ''
            else:
                selected = best
        else:
            selected = votes.most_common(1)[0][0]

        agg_out.append(selected)

    corrected_text = ''.join(agg_out)
    corrected_text = corrected_text.replace(empty_char, '')

    print(f'{multiple_choices} of {len(counters)} characters have multiple choices')
    print(f'{take_ocr_char} of {multiple_choices} have equal most common')

    return corrected_text

# Example run on a single document. Only aligned_file and pred_file are
# passed to convert_to_text; gs_file and ocr_file are not used by the call
# below.
gs_file = '/home/jvdzwaan/data/kb-ocr/A8P1/test/DDD_010265237_001.gs'
ocr_file = '/home/jvdzwaan/data/kb-ocr/A8P1/test/DDD_010265237_001.ocr'
pred_file = '/home/jvdzwaan/data/kb-ocr/A8P1/pred/DDD_010265237_001.pred'
aligned_file = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/aligned/DDD_010265237_001.json'

# The return value (the corrected text) is not stored; as the last expression
# of the cell it is presumably shown as the cell output.
convert_to_text(aligned_file, pred_file)


In [None]:
import glob

from nlppln.utils import create_dirs, out_file_name

def convert_dir_to_text(aligned_dir, pred_dir, out_dir):
    """Convert every ``*.pred`` file in a directory to a corrected text file.

    For each prediction file (processed in sorted order) the aligned file
    with the same base name and a ``.json`` extension is looked up in
    ``aligned_dir``, ``convert_to_text`` is run on the pair, and the result
    is written to a ``.txt`` file whose path is produced by
    ``out_file_name``. The path of each prediction file and a blank line are
    printed as progress output.

    Args:
        aligned_dir: Directory that contains the ``<name>.json`` files.
        pred_dir: Directory that contains the ``<name>.pred`` files.
        out_dir: Directory for the output; passed to ``create_dirs`` first.
    """
    create_dirs(out_dir)

    pred_files = sorted(glob.glob(f'{pred_dir}/*.pred'))

    for pred in pred_files:
        print(pred)

        bn = os.path.splitext(os.path.basename(pred))[0]
        aligned = os.path.join(aligned_dir, f'{bn}.json')

        corrected_text = convert_to_text(aligned, pred)

        # The ocrevaluation tool wants *.txt
        out_file = out_file_name(out_dir, pred, ext='txt')
        # Write UTF-8 explicitly so non-ASCII characters in the corrected
        # text are stored the same way regardless of the locale.
        with open(out_file, 'w', encoding='utf-8') as f:
            f.write(corrected_text)
        print()

# Batch run: convert all predictions in pred_dir, using the aligned files in
# aligned_dir, and write the resulting *.txt files to out_dir.
aligned_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/aligned/'
pred_dir = '/home/jvdzwaan/data/kb-ocr/A8P1/pred/'

out_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/pred-A8P1/'

convert_dir_to_text(aligned_dir, pred_dir, out_dir)