In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
from collections import OrderedDict

from lxml import etree

from ochre.matchlines import get_ns

def get_textblocks(fname):
    """Collect the words of every TextBlock in an ALTO XML file.

    The file is parsed incrementally. For each ``TextBlock`` element, the
    ``String`` children of its ``TextLine`` children are visited in document
    order:

    * ``SUBS_TYPE == 'HypPart1'``: the ``SUBS_CONTENT`` attribute is stored
      (presumably the full, unhyphenated word -- confirm against the ALTO
      files used),
    * ``SUBS_TYPE == 'HypPart2'``: the string is skipped,
    * anything else: the ``CONTENT`` attribute is stored.

    Args:
        fname: the ALTO XML file; handed to ``get_ns`` and
            ``etree.iterparse``.

    Returns:
        OrderedDict mapping each TextBlock ``ID`` to the list of words
        collected for that block, in parsing order.

    Raises:
        KeyError: if a TextBlock has no ``ID`` attribute, or a String lacks
            the attribute that has to be read from it.
    """
    alto_ns = get_ns(fname)
    textline_tag = alto_ns + 'TextLine'
    string_tag = alto_ns + 'String'
    blocks = OrderedDict()

    context = etree.iterparse(fname, events=('end', ), tag=(alto_ns+'TextBlock'))
    for event, elem in context:
        # A repeated ID starts over with an empty list.
        words = []
        blocks[elem.attrib['ID']] = words

        # Iterate over the elements directly; getchildren() is deprecated.
        for line in elem:
            if line.tag != textline_tag:
                continue
            for string in line:
                if string.tag != string_tag:
                    continue
                subs_type = string.attrib.get('SUBS_TYPE')
                if subs_type == 'HypPart1':
                    words.append(string.attrib['SUBS_CONTENT'])
                elif subs_type != 'HypPart2':
                    words.append(string.attrib['CONTENT'])

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    return blocks

# Example document: blocks equal length
# (presumably GS and OCR contain the same number of text blocks); disabled
#in_file_ocr = '/home/jvdzwaan/ownCloud/Shared/OCR/Originele ALTOs/DDD_010001946_002_alto.xml'
#in_file_gt = '/home/jvdzwaan/ownCloud/Shared/OCR/Ground-truth/DDD_010001946_002_GT.xml'

# Example document: blocks not equal length; this is the one loaded below
in_file_ocr = '/home/jvdzwaan/ownCloud/Shared/OCR/Originele ALTOs/DDD_010007697_002_alto.xml'
in_file_gt = '/home/jvdzwaan/ownCloud/Shared/OCR/Ground-truth/DDD_010007697_002_GT.xml'


# Words per text block for the ground truth (gs) and the OCR output
blocks_gs = get_textblocks(in_file_gt)
blocks_ocr = get_textblocks(in_file_ocr)

# Number of text blocks found in each of the two files
print(len(blocks_gs), len(blocks_ocr))

In [None]:
import edlib

def match_textblocks(blocks_gs, blocks_ocr):
    """Greedily match ground-truth text blocks to OCR text blocks.

    The words of every block are joined with single spaces and the edit
    distance between every GS/OCR pair is computed with ``edlib.align``.
    When either text is empty, edlib is not called and the distance is the
    length of the other text.

    Matching happens in two passes:

    1. Pairs with edit distance 0 are matched first, skipping a pair when
       either side has already been assigned.
    2. For every GS block that is still unmatched, the unused OCR blocks are
       tried in order of increasing edit distance. A candidate is accepted
       only if the GS block's distance is also the minimum of that OCR
       block's column, i.e. no other GS block is closer to it.

    Args:
        blocks_gs: mapping of GS block label to list of words.
        blocks_ocr: mapping of OCR block label to list of words.

    Returns:
        Tuple ``(matches, used)``. ``matches`` maps every GS label to the
        matched OCR label, or ``None`` if no match was found. ``used`` is the
        list of assigned OCR labels in the order they were assigned.
    """
    gs_labels = list(blocks_gs.keys())
    ocr_labels = list(blocks_ocr.keys())

    # Join the words of each block once instead of once per GS/OCR pair.
    gs_texts = [' '.join(blocks_gs[label]) for label in gs_labels]
    ocr_texts = [' '.join(blocks_ocr[label]) for label in ocr_labels]

    # calculate edit distances
    # np.int was removed in NumPy 1.24; it was an alias of the builtin int.
    eds = np.zeros((len(gs_labels), len(ocr_labels)), dtype=int)

    for gs, gs_text in enumerate(gs_texts):
        for ocr, ocr_text in enumerate(ocr_texts):
            if gs_text and ocr_text:
                ed = edlib.align(gs_text, ocr_text)['editDistance']
            else:
                ed = max(len(gs_text), len(ocr_text))
            eds[gs, ocr] = ed

    # initialize matches
    matches = {gs_label: None for gs_label in gs_labels}
    used = []           # assignment order, returned to the caller
    used_lookup = set()  # same labels, for constant-time membership tests

    def assign(gs_label, ocr_label):
        matches[gs_label] = ocr_label
        used.append(ocr_label)
        used_lookup.add(ocr_label)

    # match blocks with ed=0
    for x, y in zip(*np.where(eds == 0)):
        gs_label = gs_labels[x]
        ocr_label = ocr_labels[y]

        if matches[gs_label] is None and ocr_label not in used_lookup:
            assign(gs_label, ocr_label)

    # sort rows on minimal edit distance
    for x, row in enumerate(eds):
        gs_label = gs_labels[x]
        if matches[gs_label] is not None:
            continue

        for y in np.argsort(row):
            ocr_label = ocr_labels[y]
            if ocr_label in used_lookup:
                continue

            # get column y from eds. is eds[x, y] the minimal value?
            if eds[x, y] <= eds[:, y].min():
                assign(gs_label, ocr_label)
                break

    return matches, used

In [None]:
# Build the GS x OCR edit-distance matrix at notebook level. The cells below
# read `eds`, but it only ever existed as a local variable inside
# match_textblocks, so on a fresh kernel they failed with a NameError.
gs_texts = [' '.join(words) for words in blocks_gs.values()]
ocr_texts = [' '.join(words) for words in blocks_ocr.values()]

eds = np.zeros((len(gs_texts), len(ocr_texts)), dtype=int)
for x, gs_text in enumerate(gs_texts):
    for y, ocr_text in enumerate(ocr_texts):
        if gs_text and ocr_text:
            eds[x, y] = edlib.align(gs_text, ocr_text)['editDistance']
        else:
            # edlib is skipped for empty text; use the other text's length
            eds[x, y] = max(len(gs_text), len(ocr_text))

# (row, column) indices of the GS/OCR pairs with edit distance 0
np.where(eds == 0)

In [None]:
# Start from an empty assignment: no OCR block has been used yet and every
# ground-truth label maps to None.
used = list()
matches = dict.fromkeys(blocks_gs.keys())

In [None]:
def count_matches(matches):
    """Count the ground-truth blocks that have an OCR block assigned.

    Args:
        matches: dict mapping GS labels to an OCR label, or to ``None`` when
            the GS block is unmatched.

    Returns:
        int: the number of entries whose value is not ``None``.
    """
    # The builtin sum always yields an int. np.sum over an empty list returns
    # the float 0.0, which is what a mapping without any match produced.
    return sum(1 for ocr_label in matches.values() if ocr_label is not None)
# Number of GS blocks matched so far (display the current count)
count_matches(matches)

In [None]:
# First the blocks with ed=0
gs_labels = list(blocks_gs.keys())
ocr_labels = list(blocks_ocr.keys())

for x, y in zip(*np.where(eds == 0)):
    gs_label = gs_labels[x]
    ocr_label = ocr_labels[y]

    # Skip the pair when either side has already been assigned
    if matches[gs_label] is not None or ocr_label in used:
        continue

    matches[gs_label] = ocr_label
    used.append(ocr_label)

    print(' GS:', ' '.join(blocks_gs[gs_label]))
    print()
    print('OCR:', ' '.join(blocks_ocr[ocr_label]))
    print('---')

In [None]:
# Then go through the rows, trying OCR blocks in order of increasing edit distance
gs_labels = list(blocks_gs.keys())
ocr_labels = list(blocks_ocr.keys())

for x, row in enumerate(eds):
    gs_label = gs_labels[x]
    if matches[gs_label] is not None:
        # this ground-truth block already has a match
        continue

    for y in np.argsort(row):
        ocr_label = ocr_labels[y]
        if ocr_label in used:
            continue

        print(ocr_label)
        print(x, y, eds[x, y])

        # take column y from eds. is eds[x, y] the minimum value of it?
        col = eds[:, y]
        min_value = col.min()
        print(min_value)
        print(' GS:', ' '.join(blocks_gs[gs_label]))
        print()
        print('OCR:', ' '.join(blocks_ocr[ocr_label]))
        print('---')

        if eds[x, y] <= min_value:
            matches[gs_label] = ocr_label
            used.append(ocr_label)
            print('Taking this one')
            print()
            break

In [None]:
# Number of OCR blocks that have been assigned to a ground-truth block
print(len(used))

In [None]:
# OCR block labels that were not assigned to any ground-truth block
set(blocks_ocr) - set(used)

In [None]:
# Text of one OCR block, looked up by a hard-coded block ID
' '.join(blocks_ocr['P2_TB00130'])

In [None]:
# Print label and text of every ground-truth block without an OCR match
unmatched_gs = [label for label, target in matches.items() if target is None]
for gs_label in unmatched_gs:
    print(gs_label)
    print(' '.join(blocks_gs[gs_label]))

In [None]:
# Alignment result for one hard-coded pair of GS and OCR block IDs
edlib.align(' '.join(blocks_gs['P2_TB00004']), ' '.join(blocks_ocr['P2_TB00130']))

Opslaan:
    
* Mapping van gs_labels naar ocr_labels (yaml of json)
* num text blocks in gs
* num text blocks matched

In [None]:
import os

from nlppln.utils import get_files
from ochre.matchlines import gt_fname2ocr_fname

#gs_dir = '/home/jvdzwaan/ownCloud/Shared/OCR/Ground-truth/'
gs_dir = '/home/jvdzwaan/ownCloud/Shared/OCR/Reordered Ground-truth/'
ocr_dir = '/home/jvdzwaan/ownCloud/Shared/OCR/Originele ALTOs/'
#ocr_dir = '/home/jvdzwaan/ownCloud/Shared/OCR/Opnieuw geOCRd/'

# remove file with "extra" in the name, this one is the same as the file without "extra" in the name
# (checked on the file name only, so a directory name can never trigger it)
all_gs_files = [f for f in get_files(gs_dir) if 'extra' not in os.path.basename(f)]

# Keep gs_files and ocr_files index-aligned. The matching cell further down
# pairs them with zip(), so a ground-truth file without an OCR counterpart has
# to be left out of both lists; otherwise every later pair shifts by one.
gs_files = []
ocr_files = []
for gs_file in all_gs_files:
    ocr_bn = gt_fname2ocr_fname(gs_file)
    # the 'opnieuw' alto files have a different file name
    #ocr_bn = ocr_bn.replace('alto.xml', 'altoFR11.xml')
    ocr_bn = ocr_bn.replace('alto_reordered.xml', 'alto.xml')
    ocr_file = os.path.join(ocr_dir, ocr_bn)
    if os.path.isfile(ocr_file):
        gs_files.append(gs_file)
        ocr_files.append(ocr_file)
    else:
        print('File not found:', ocr_file)
        print('GS file:', gs_file)
print(len(gs_files), len(ocr_files))

In [None]:
import os
import json

from collections import Counter

from ochre.utils import get_temp_file
from ochre.matchlines import replace_entities

from nlppln.utils import create_dirs, out_file_name

def doc_id(fname):
    """Return the document id contained in a file name.

    The id is the base name of ``fname`` with the last underscore and
    everything after it removed. A base name without an underscore is
    returned unchanged.
    """
    return os.path.basename(fname).rsplit('_', 1)[0]

out_dir = '/home/jvdzwaan/data/kb-ocr/textblock_matches-reordered_gt/'
create_dirs(out_dir)

# zip() pairs the files by position and silently stops at the shorter list.
# Lists of unequal length therefore mean that ground-truth files would be
# compared against the wrong OCR files, so refuse to continue.
if len(gs_files) != len(ocr_files):
    raise ValueError('gs_files and ocr_files differ in length ({} vs {}); '
                     'they must be aligned pairwise'.format(len(gs_files),
                                                            len(ocr_files)))

# One record per document: block counts and number of matched blocks
metadata = []

# Number of documents skipped because GS and OCR block counts differ
num_not_equal = 0

for gs_file, ocr_file in tqdm(zip(gs_files, ocr_files), total=len(gs_files)):
    # The ground truth is parsed from a temporary copy in which the entities
    # have been replaced.
    gs_tmp = get_temp_file()
    try:
        with open(gs_tmp, 'w') as f:
            f.write(replace_entities(gs_file))

        blocks_gs = get_textblocks(gs_tmp)
    finally:
        # Remove the temporary copy even when writing or parsing fails
        if os.path.exists(gs_tmp):
            os.remove(gs_tmp)

    blocks_ocr = get_textblocks(ocr_file)

    if len(blocks_gs) != len(blocks_ocr):
        # Documents with differing block counts are not matched at all
        num_not_equal += 1
        used = []
    else:
        matches, used = match_textblocks(blocks_gs, blocks_ocr)
        out_file = out_file_name(out_dir, doc_id(ocr_file), ext='json')
        with open(out_file, 'w') as f:
            json.dump(matches, f, indent=4)

    # NOTE(review): the JSON file is named after doc_id(ocr_file) while this
    # record uses doc_id(gs_file); confirm both give the same id for the
    # reordered ground-truth file names.
    metadata.append({'gs_blocks': len(blocks_gs),
                     'ocr_blocks': len(blocks_ocr),
                     'matched': len(used),
                     'doc_id': doc_id(gs_file)})
df = pd.DataFrame(metadata)
df

In [None]:
# Number of documents whose GS and OCR block counts differ (these were skipped)
num_not_equal

In [None]:
# Use doc_id as the index. The guard makes the cell safe to re-run: after the
# first run 'doc_id' is no longer a column, and calling set_index again would
# raise a KeyError.
if df.index.name != 'doc_id':
    df = df.set_index('doc_id')
df

In [None]:
# Write the per-document block counts and match counts to a CSV file
df.to_csv('/home/jvdzwaan/data/kb-ocr/textblock_matches-reordered_GT.csv')

In [None]:
# Documents for which not a single block was matched
df[df['matched'] == 0]