In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
import os

from lxml import etree

from nlppln.utils import get_files
from ochre.matchlines import gt_fname2ocr_fname

# Input directories: ground-truth ALTO files and the corresponding OCR ALTO files.
gs_dir = '/home/jvdzwaan/ownCloud/Shared/OCR/Ground-truth/'
ocr_dir = '/home/jvdzwaan/ownCloud/Shared/OCR/Originele ALTOs/'
#ocr_dir = '/home/jvdzwaan/ownCloud/Shared/OCR/Opnieuw geOCRd/'

# remove file with "extra" in the name, this one is the same as the file without "extra" in the name
gs_candidates = [f for f in get_files(gs_dir) if 'extra' not in f]

# Keep a ground-truth file only if its OCR counterpart exists. gs_files and
# ocr_files are zipped together further down, so they must stay index-aligned;
# appending to only one of them would pair up unrelated documents.
gs_files = []
ocr_files = []
for gs_file in gs_candidates:
    ocr_bn = gt_fname2ocr_fname(gs_file)
    # the 'opnieuw' alto files have a different file name
    #ocr_bn = ocr_bn.replace('alto.xml', 'altoFR11.xml')
    ocr_file = os.path.join(ocr_dir, ocr_bn)
    if os.path.isfile(ocr_file):
        gs_files.append(gs_file)
        ocr_files.append(ocr_file)
    else:
        print('File not found:', ocr_file)
        print('GS file:', gs_file)
# number of ground-truth candidates vs. number of OCR files that were found
print(len(gs_candidates), len(ocr_files))

In [None]:
%%time

from nlppln.utils import create_dirs

from ochre.utils import get_temp_file
from ochre.matchlines import get_ns, replace_entities

def get_lines(fname, alto_ns):
    """Extract the text of every TextLine element in an XML file.

    The file is parsed incrementally with ``etree.iterparse``; each completed
    TextLine element is converted to a string and then discarded to keep
    memory usage low.

    Args:
        fname: path of the XML file to parse.
        alto_ns: string that is prepended to the tag names 'TextLine' and
            'String' (presumably the namespace in lxml's '{uri}' notation --
            verify against ``get_ns``).

    Returns:
        A list with one string per TextLine element, in document order. Each
        string consists of the line's words joined by single spaces. For a
        String child with SUBS_TYPE 'HypPart1' the SUBS_CONTENT attribute is
        used, a child with SUBS_TYPE 'HypPart2' is skipped, and for every
        other String child the CONTENT attribute is used.

    Raises:
        KeyError: if a String child lacks the attribute that is read from it.
        etree.XMLSyntaxError: if the file is not well-formed XML.
    """
    string_tag = alto_ns + 'String'
    lines = []
    context = etree.iterparse(fname, events=('end', ), tag=(alto_ns+'TextLine'))
    for event, elem in context:
        words = []
        # Iterate over the element directly; getchildren() is deprecated in lxml.
        for a in elem:
            if a.tag != string_tag:
                continue
            subs_type = a.attrib.get('SUBS_TYPE')
            if subs_type == 'HypPart1':
                # first half of a hyphenated word: take the substitution text
                words.append(a.attrib['SUBS_CONTENT'])
            elif subs_type != 'HypPart2':
                words.append(a.attrib['CONTENT'])
            # 'HypPart2' is skipped: nothing is appended for it

        lines.append(' '.join(words))

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    return lines

def doc_id(fname):
    """Return the document identifier encoded in a file name.

    The identifier is the base name of `fname` (directories removed) up to,
    but not including, its last underscore. A base name without an
    underscore is returned unchanged.
    """
    base = os.path.basename(fname)
    head, underscore, _tail = base.rpartition('_')
    if not underscore:
        # no underscore present: the whole base name is the identifier
        return base
    return head


out_dir = '/home/jvdzwaan/data/kb-ocr/text-not-aligned/'

create_dirs(out_dir)

# NOTE: from here on gs_dir and ocr_dir refer to the *output* directories.
gs_dir = os.path.join(out_dir, 'gs')
create_dirs(gs_dir)

ocr_dir = os.path.join(out_dir, 'ocr')
create_dirs(ocr_dir)

# Write the plain text of every ground-truth/OCR pair to <out_dir>/gs and
# <out_dir>/ocr, one .txt file per document.
for gs_file, ocr_file in tqdm(zip(gs_files, ocr_files), total=len(gs_files)):
    # Fail fast, before any parsing, if the two lists are not aligned.
    assert doc_id(gs_file) == doc_id(ocr_file)

    try:
        # The ground-truth file is parsed from a temporary copy that holds
        # the output of replace_entities; the OCR file is parsed directly.
        gs_tmp = get_temp_file()
        try:
            with open(gs_tmp, 'w') as f:
                f.write(replace_entities(gs_file))

            gs_lines = get_lines(gs_tmp, get_ns(gs_file))
            ocr_lines = get_lines(ocr_file, get_ns(ocr_file))
        finally:
            # Remove the temporary file even when parsing raises, so that
            # malformed inputs do not leave temp files behind.
            if os.path.isfile(gs_tmp):
                os.remove(gs_tmp)

        gs_out = os.path.join(gs_dir, '{}.txt'.format(doc_id(gs_file)))
        ocr_out = os.path.join(ocr_dir, '{}.txt'.format(doc_id(ocr_file)))

        # All lines of a document are joined into a single space-separated text.
        with open(gs_out, 'w') as f:
            f.write(' '.join(gs_lines))
        with open(ocr_out, 'w') as f:
            f.write(' '.join(ocr_lines))
    except etree.XMLSyntaxError as e:
        # Report the document that could not be parsed and continue.
        print(gs_file)
        print(e)
        print()