In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import os
from tqdm import tqdm_notebook as tqdm
#from codecs import open
from lxml import etree

from nlppln.utils import out_file_name, create_dirs

In [None]:
%%time
import re
import edlib
import copy
import pickle

from py_stringmatching.similarity_measure.partial_ratio import PartialRatio

from ochre.dbnl import Note, to_fragment, complete, get_repeated, extend_lines

def _score_line(pr, scores, row, col, line, note_text, threshold=90):
    """Compute the partial-ratio score of ``line`` against ``note_text``.

    The score is stored in ``scores[row, col]``. If the stored score is below
    ``threshold`` and the line starts with a digit, the line is scored a
    second time with its leading digits stripped; that second score replaces
    the first one when it is above ``threshold``.
    """
    scores[row, col] = pr.get_raw_score(line, note_text)

    # If the line starts with a number, sometimes the score is calculated
    # incorrectly, so retry without the leading digits.
    if scores[row, col] < threshold and line[0].isdigit():
        stripped_score = pr.get_raw_score(re.sub(r'^\d+', '', line), note_text)
        if stripped_score > threshold:
            scores[row, col] = stripped_score


def remove_notes(ocr_file, notes_file, out_dir, pickle_dir=None):
    """Find the lines of an OCR text that match notes and write the text without them.

    The function reads ``ocr_file`` and ``notes_file`` (one note per line),
    matches notes to non-empty OCR lines in several passes (candidate
    selection by edit distance, best-first matching, matching of remaining
    notes, and extension with neighbouring lines) and writes the OCR text
    without the matched lines to ``out_file_name(out_dir, ocr_file)``.

    Args:
        ocr_file: path of the OCR text file.
        notes_file: path of the file containing the notes, one per line.
        out_dir: directory the cleaned text is written to.
        pickle_dir: optional directory; when given, the list of ``Note``
            objects is pickled to ``out_file_name(pickle_dir, ocr_file)``.

    Returns:
        List of indices of the removed lines. The indices refer to the list
        of non-empty lines of ``ocr_file`` (empty lines are not counted).
    """
    # Scores strictly above this value count as a match.
    threshold = 90

    with open(ocr_file) as f:
        raw_lines = f.readlines()

    with open(notes_file) as f:
        notes = f.readlines()

    print('There are {} notes to be found.'.format(len(notes)))

    # Remove empty lines, but remember where every kept line sits in the raw
    # text so that lines can later be removed by position.
    lines = []
    raw_index = []
    for k, line in enumerate(raw_lines):
        if line.strip() != '':
            lines.append(line)
            raw_index.append(k)
    print('The text contains {} lines.'.format(len(lines)))

    # Guard against a text without any non-empty line (np.mean([]) is nan).
    avg_line_length = np.mean([len(line) for line in lines]) if lines else 0.0

    # get repeated notes
    repeated = get_repeated(notes)
    print('There are {} repeated notes.'.format(len(repeated)))

    pr = PartialRatio()

    print('Getting initial edit distances per line')
    # One column per note, one row per line. Built in a single step, because
    # inserting columns one at a time fragments the DataFrame.
    line_data = pd.DataFrame({
        i: [edlib.align(n, line)['editDistance'] for line in lines]
        for i, n in enumerate(tqdm(notes))
    })

    # Score cache (note x line); -1 means "not computed yet".
    # The builtin int replaces np.int, which was removed from NumPy.
    scores = np.full((len(notes), len(lines)), -1, dtype=int)

    # initialize note objects
    prev_note = None
    ns = []
    for i, n in enumerate(notes):
        note = Note(n, i, len(lines))
        note.repeated = n in repeated
        note.previous = prev_note
        prev_note = note
        note.candidates = []
        ns.append(note)

    # Link every note to its successor; the last note is not assigned a
    # ``next`` here.
    for note, following in zip(ns, ns[1:]):
        note.next = following

    print('Getting candidate lines')
    for i, n in enumerate(tqdm(ns)):
        if not n.repeated:
            # Only the ten lines with the smallest edit distance are scored.
            for idx in line_data[i].sort_values().index[:10]:
                _score_line(pr, scores, i, idx, lines[idx], n.text, threshold)
                if scores[i, idx] > threshold:
                    n.candidates.append(idx)
            n.candidates.sort()

    print('Finding complete notes based on candidates.')
    c = 0
    for n in tqdm(ns):
        if n.candidates != []:
            done = complete(n.text, to_fragment(lines, n.candidates))
            if done:
                # NOTE(review): n.lines and n.candidates are the same list
                # object from here on -- confirm that this is intended.
                n.lines = n.candidates
                n.complete(lines)
                c += 1
    print('Found {} complete notes.'.format(c))

    print('Matching best first')
    found = 0
    added_lines = 0
    for i, n in enumerate(tqdm(ns)):
        if not n.done:
            for idx in n.get_search_indices():
                if idx not in n.lines:
                    if scores[i, idx] == -1:
                        _score_line(pr, scores, i, idx, lines[idx], n.text,
                                    threshold)

                    if scores[i, idx] > threshold:
                        added_lines += 1
                        n.lines.append(idx)
                        # Count completed notes (this counter was never
                        # updated before, so the report always said 0).
                        if n.complete(lines):
                            found += 1
                        break
    print('Added {} lines.'.format(added_lines))
    print('Found {} "complete" notes.'.format(found))

    print('Matching lines incomplete notes/notes without candidates')
    found = 0
    added_lines = 0
    for i, n in enumerate(tqdm(ns)):
        if not n.done:
            for idx in n.get_search_indices():
                if idx not in n.lines:
                    if scores[i, idx] == -1:
                        scores[i, idx] = pr.get_raw_score(lines[idx], n.text)

                    if scores[i, idx] > threshold:
                        added_lines += 1
                        n.lines.append(idx)
                        if n.complete(lines):
                            found += 1
                            break

                        # A note shorter than the average line is marked as
                        # short and gets no further lines in this pass.
                        if len(n.text) < avg_line_length:
                            n.short = True
                            print('short', n)
                            break
    print('Added {} lines.'.format(added_lines))
    print('Found {} "complete" notes.'.format(found))

    print('Extending lines')
    added = 0
    for n in tqdm(ns):
        if n.lines != []:
            trial_lines = copy.copy(n.lines)
            extended = list(n.get_search_indices())
            before = [idx for idx in extended if idx < n.lines[0]]
            before.reverse()
            inside = [idx for idx in extended
                      if idx > n.lines[0] and idx < n.lines[-1]]
            after = [idx for idx in extended if idx > n.lines[-1]]

            for idx in inside + before + after:
                if idx not in n.lines:
                    # Keep the extra line only if it brings the fragment
                    # closer (in edit distance) to the note text.
                    ed = n.ed(lines)
                    trial_lines.append(idx)
                    trial_lines.sort()
                    new_ed = edlib.align(to_fragment(lines, trial_lines),
                                         n.text)['editDistance']
                    if new_ed < ed:
                        added += 1
                        n.lines.append(idx)
                        n.lines.sort()
                        n.added.append(idx)
                        n.complete(lines)
                    else:
                        trial_lines.remove(idx)
    print('Added {} lines.'.format(added))

    # Summary of the matching result.
    c = 0
    nf = 0
    s = 0
    other = []
    for n in ns:
        if n.done:
            c += 1
        if n.lines == []:
            nf += 1
        if n.short:
            s += 1
        if not n.done and n.lines != []:
            other.append(n.index)
    print('Complete', c)
    print('Not found', nf)
    print('Short', s)
    print('Incomplete', len(other))
    print('Total', np.sum([c, nf, len(other)]))

    if pickle_dir is not None:
        out = out_file_name(pickle_dir, ocr_file)
        print('Pickling to {}.'.format(out))

        create_dirs(out, is_file=True)

        with open(out, 'wb') as f:
            pickle.dump(ns, f)

    print('Removing notes')
    removed = list({idx for n in ns for idx in n.lines})

    # Drop the matched lines by position. Replacing the line *text* would
    # also delete identical text elsewhere (including the tail of a longer
    # line), so the output would no longer agree with the returned indices.
    removed_raw = {raw_index[idx] for idx in removed}
    text = ''.join(line for k, line in enumerate(raw_lines)
                   if k not in removed_raw)

    # save result
    create_dirs(out_dir)
    out = out_file_name(out_dir, ocr_file)
    with open(out, 'w') as f:
        f.write(text)

    return removed

# Single-file run: strip the notes from one OCR text and keep the pickled
# Note objects for inspection.
r = remove_notes(
    ocr_file='/home/jvdzwaan/data/dbnl_ocr/raw/ocr-with-title-page/_aio001jver01_01.txt',
    notes_file='/home/jvdzwaan/data/dbnl_ocr/raw/notes/_aio001jver01_01.txt',
    out_dir='/home/jvdzwaan/data/dbnl_ocr/raw/ocr',
    pickle_dir='/home/jvdzwaan/data/dbnl_ocr/raw/pickled',
)
#print(r)

In [None]:
import os

from tqdm import tqdm_notebook as tqdm

from nlppln.utils import get_files, out_file_name, create_dirs

# Batch run: remove the notes from a fixed selection of OCR files and record,
# per file, which line indices were removed.
in_dir = '/home/jvdzwaan/data/dbnl_ocr/raw/ocr-without-title-page/'
notes_dir = '/home/jvdzwaan/data/dbnl_ocr/raw/notes/'
out_dir = '/home/jvdzwaan/data/dbnl_ocr/raw/ocr-1pass'

create_dirs(out_dir)

in_files = [
    os.path.join(in_dir, name)
    for name in (
        'rade001gera01_01.txt', '_zev001198901_01.txt', '_tir001196201_01.txt',
        'looy001wond03_01.txt', 'potg001jczi10_01.txt', 'berg050jaro01_01.txt',
        '_tsj002195001_01.txt', '_jaa006199901_01.txt', '_taa006189101_01.txt',
        '_sep001197201_01.txt', 'oltm003memo05_01.txt', '_noo001189201_01.txt',
        'koni057heil01_01.txt', '_vla016197401_01.txt', '_bij005195501_01.txt',
    )
]

# One report line per processed file: <output file name> TAB <comma-separated
# indices of the removed lines>.
with open('lines_removed_best_first.txt', 'w') as report:
    for in_file in tqdm(in_files):
        out = out_file_name(out_dir, in_file)

        # Files without a matching notes file are skipped.
        notes_file = os.path.join(notes_dir, os.path.basename(in_file))
        if os.path.isfile(notes_file):
            print('processing', in_file)

            removed = remove_notes(in_file, notes_file, out_dir)
            removed = [str(idx) for idx in sorted(removed)]
            report.write('{}\t{}\n'.format(os.path.basename(out),
                                           ','.join(removed)))
        print('---')