In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import os
import pickle
#from codecs import open

from lxml import etree
from tqdm import tqdm_notebook as tqdm

from nlppln.utils import out_file_name, create_dirs

In [None]:
%%time
from ochre.dbnl import Note, to_fragment, complete, get_repeated, extend_lines

# Minimum PartialRatio raw score (compared with ">") for a line or a whole
# note to count as a match.
_MIN_SCORE = 90


def _match_in_search_window(ns, lines, pr, selected):
    """Match the selected notes line by line inside their search window.

    Every line index returned by ``n.get_search_indices()`` whose score
    against the note text exceeds ``_MIN_SCORE`` is appended to ``n.lines``
    and marks the note as found; the search for a note stops as soon as
    ``n.complete(lines)`` is true.

    Returns a tuple ``(number of lines added, number of complete notes)``.
    """
    added = 0
    completed = 0
    for n in tqdm(ns):
        if not selected(n):
            continue
        for idx in n.get_search_indices():
            if pr.get_raw_score(lines[idx], n.text) > _MIN_SCORE:
                added += 1
                n.lines.append(idx)
                n.found = True
                if n.complete(lines):
                    completed += 1
                    break
    return added, completed


def remove_notes(ocr_file, notes_file, out_dir, pickle_dir=None):
    """Locate the lines of an OCR text that belong to notes and remove them.

    The notes are matched against the non-empty lines of ``ocr_file`` in
    several passes (whole notes, found/unrepeated notes in two passes,
    unfound/unrepeated notes, repeated notes), after which the matched line
    sets are extended with neighbouring lines that do not increase the edit
    distance to the note text. A copy of the text without the matched lines
    is written to ``out_file_name(out_dir, ocr_file)``.

    Parameters
    ----------
    ocr_file : str
        Path of the text file to clean.
    notes_file : str
        Path of a text file containing one note per line.
    out_dir : str
        Directory for the cleaned text (created if necessary).
    pickle_dir : str, optional
        If given, the list of ``Note`` objects is pickled to
        ``out_file_name(pickle_dir, ocr_file)``.

    Returns
    -------
    list of int
        Sorted indices of the removed lines. The indices refer to the list
        of non-empty lines of ``ocr_file``, not to raw line numbers.
    """
    # NOTE(review): `PartialRatio` and `edlib` are used below but are not
    # imported anywhere in the visible notebook; presumably
    # `from py_stringmatching import PartialRatio` and `import edlib` are
    # needed in the imports cell -- confirm.
    with open(ocr_file) as f:
        raw_lines = f.readlines()

    with open(notes_file) as f:
        notes = f.readlines()

    print('There are {} notes to be found.'.format(len(notes)))

    # Matching is done on the non-empty lines only; every line index used
    # below (and returned) refers to this list.
    lines = [line for line in raw_lines if line.strip() != '']
    print('The text contains {} lines.'.format(len(lines)))

    # get repeated notes
    repeated = get_repeated(notes)
    print('There are {} repeated notes.'.format(len(repeated)))

    pr = PartialRatio()

    print('Matching entire notes')

    n_found = 0
    n_found_repeated = 0
    prev_note = None
    ns = []
    text = ''.join(lines)
    for i, n in enumerate(tqdm(notes)):
        note = Note(n, i, len(lines))
        # A note counts as "found" when it matches somewhere in the full text.
        note.found = bool(pr.get_raw_score(note.text, text) > _MIN_SCORE)
        if note.found:
            n_found += 1
        note.repeated = n in repeated
        if note.repeated and note.found:
            n_found_repeated += 1

        note.previous = prev_note
        prev_note = note

        ns.append(note)

    # Link every note to its successor (the last note gets no `next`).
    for current, following in zip(ns, ns[1:]):
        current.next = following

    print('Found {} notes.'.format(n_found))
    print('Found and repeated: {} notes.'.format(n_found_repeated))
    n_found_unrepeated = n_found - n_found_repeated

    # roughly match found, unrepeated notes
    print('Matching lines of found, unrepeated notes, pass 1')

    # Cache of line/note scores shared by pass 1 and pass 2 (inf = not computed).
    scores = np.full((len(notes), len(lines)), np.inf)

    n_complete = 0
    n_lines = 0
    for i, n in enumerate(tqdm(ns)):
        if n.found and not n.repeated:
            for idx in range(len(lines)):
                if idx > n.get_search_start():
                    if scores[i, idx] == np.inf:
                        scores[i, idx] = pr.get_raw_score(lines[idx], n.text)

                    if scores[i, idx] > _MIN_SCORE:
                        # Pass 1 only records the first matching line.
                        n_lines += 1
                        n.lines.append(idx)
                        break
    print('Added {} lines.'.format(n_lines))
    if n_lines != n_found_unrepeated:
        print('Did not add a line for all found unrepeated notes '
              '(lines added: {}, found unrepeated: {}).'.format(n_lines, n_found_unrepeated))
    else:
        print('Added a line for all found unrepeated notes.')
    # Pass 1 never checks completeness, so this always reports 0.
    print('Found {} "complete" notes.'.format(n_complete))

    print('Matching lines of found, unrepeated notes, pass 2')
    n_complete = 0
    n_lines = 0
    for i, n in enumerate(tqdm(ns)):
        if n.found and not n.repeated:
            for idx in range(len(lines)):
                # The search indices are re-queried for every line, as n.lines
                # changes while lines are being added.
                if idx in n.get_search_indices() and idx not in n.lines:
                    if scores[i, idx] == np.inf:
                        scores[i, idx] = pr.get_raw_score(lines[idx], n.text)

                    if scores[i, idx] > _MIN_SCORE:
                        n_lines += 1
                        n.lines.append(idx)
                        if n.complete(lines):
                            n_complete += 1
                            break
    print('Added {} lines.'.format(n_lines))
    print('Found {} "complete" notes.'.format(n_complete))

    # roughly match unrepeated notes that didn't have an initial match
    print('Matching lines of unfound, unrepeated notes')
    n_lines, n_complete = _match_in_search_window(
        ns, lines, pr, lambda n: not n.found and not n.repeated)
    print('Added {} lines.'.format(n_lines))
    print('Found {} "complete" notes.'.format(n_complete))

    # match repeated notes
    print('Matching repeated notes')
    n_lines, n_complete = _match_in_search_window(
        ns, lines, pr, lambda n: n.repeated)
    print('Added {} lines.'.format(n_lines))
    print('Found {} "complete" notes.'.format(n_complete))

    print('Extending lines')
    added = 0

    for n in tqdm(ns):
        candidate = list(n.lines)
        extended = extend_lines(n.lines, len(lines), 5)
        for idx in extended:
            if idx not in n.lines:
                # Keep the extra line only if it does not increase the edit
                # distance between the fragment and the note text.
                ed = n.ed(lines)
                candidate.append(idx)
                candidate.sort()
                new_ed = edlib.align(to_fragment(lines, candidate), n.text)['editDistance']
                if new_ed <= ed:
                    added += 1
                    n.lines.append(idx)
                    n.lines.sort()
                else:
                    candidate.remove(idx)
    print('Added {} lines.'.format(added))

    n_found = 0
    n_complete = 0
    for n in ns:
        if n.found:
            n_found += 1
        if n.complete(lines):
            n_complete += 1
    print('Found {} of {} notes (complete: {})'.format(n_found, len(notes), n_complete))

    if pickle_dir is not None:
        out = out_file_name(pickle_dir, ocr_file)
        print('Pickling to {}.'.format(out))

        create_dirs(out, is_file=True)

        with open(out, 'wb') as f:
            pickle.dump(ns, f)

    print('Removing notes')
    removed = sorted({idx for n in ns for idx in n.lines})

    # Rebuild the text from the raw lines and drop exactly the matched lines.
    # Replacing the line *text* in the full document would also delete every
    # other occurrence of that text, including partial matches inside longer
    # lines. Empty lines are kept; they are not counted in the line indices.
    to_remove = set(removed)
    kept = []
    line_idx = -1
    for raw in raw_lines:
        if raw.strip() != '':
            line_idx += 1
            if line_idx in to_remove:
                continue
        kept.append(raw)
    text = ''.join(kept)

    # save result
    create_dirs(out_dir)
    out = out_file_name(out_dir, ocr_file)
    with open(out, 'w') as f:
        f.write(text)

    return removed

# Trial run on a single text; the matched Note objects are pickled as well.
# The paths are specific to the original author's machine.
r = remove_notes(
    ocr_file='/home/jvdzwaan/data/dbnl_ocr/raw/ocr-with-title-page/_aio001jver01_01.txt',
    notes_file='/home/jvdzwaan/data/dbnl_ocr/raw/notes/_aio001jver01_01.txt',
    out_dir='/home/jvdzwaan/data/dbnl_ocr/raw/ocr',
    pickle_dir='/home/jvdzwaan/data/dbnl_ocr/raw/pickled',
)

In [None]:
import os

from tqdm import tqdm_notebook as tqdm

from nlppln.utils import get_files, out_file_name, create_dirs

# Batch run over a fixed selection of texts. The paths are specific to the
# original author's machine.
in_dir = '/home/jvdzwaan/data/dbnl_ocr/raw/ocr-without-title-page/'
notes_dir = '/home/jvdzwaan/data/dbnl_ocr/raw/notes/'
out_dir = '/home/jvdzwaan/data/dbnl_ocr/raw/ocr-2passes'
pickle_dir = '/home/jvdzwaan/data/dbnl_ocr/raw/pickled'

create_dirs(out_dir)

in_files = [
    os.path.join(in_dir, name)
    for name in (
        'rade001gera01_01.txt', '_zev001198901_01.txt', '_tir001196201_01.txt',
        'looy001wond03_01.txt', 'potg001jczi10_01.txt', 'berg050jaro01_01.txt',
        '_tsj002195001_01.txt', '_jaa006199901_01.txt', '_taa006189101_01.txt',
        '_sep001197201_01.txt', 'oltm003memo05_01.txt', '_noo001189201_01.txt',
        'koni057heil01_01.txt', '_vla016197401_01.txt', '_bij005195501_01.txt',
    )
]

# One line is written per processed text:
# "<output file name>\t<comma-separated indices of the removed lines>".
with open('lines_removed_iteratively_2passes.txt', 'w') as f:
    for in_file in tqdm(in_files):
        out = out_file_name(out_dir, in_file)

        # Texts without a notes file are skipped; only the separator is printed.
        notes_file = os.path.join(notes_dir, os.path.basename(in_file))
        if not os.path.isfile(notes_file):
            print('---')
            continue

        print('processing', in_file)

        line_indices = remove_notes(in_file, notes_file, out_dir, pickle_dir=pickle_dir)
        removed = [str(idx) for idx in sorted(line_indices)]
        f.write('{}\t{}\n'.format(os.path.basename(out), ','.join(removed)))
        print('---')