In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
# non-word errors
# lexicon -> INL?
# word boundary: space in both gs and ocr; problem: punctuation

In [None]:
# Load the INL lexicon from its CSV file and preview the first rows.
lexicon = pd.read_csv('/home/jvdzwaan/code/ticclat/explore/notebooks/ingest/INL_lexicon.csv')
lexicon.head()

In [None]:
from pattern.nl import parsetree

def parse(text, parsetree):
    """Tokenize ``text`` and return one flat list of token records.

    Args:
        text: The text to parse.
        parsetree: Parser callable; it is invoked with the keyword options
            below (the notebook passes ``pattern.nl.parsetree``). Its result
            must be iterable as sentences, each iterable as words exposing
            ``string``, ``lemma`` and ``type`` attributes.

    Returns:
        A list of dicts, one per word in reading order, with the keys
        ``id`` (position of the word inside its sentence), ``word``,
        ``lemma``, ``sentence`` (index of the sentence) and ``pos``.
    """
    parser_options = dict(
        tokenize=True,     # split punctuation marks from words
        tags=True,         # part-of-speech tags (NN, JJ, ...)
        chunks=False,      # no chunking (NP, VP, PNP, ...)
        relations=False,   # no chunk relations (-SBJ, -OBJ, ...)
        lemmata=True,      # lemmatize (ate => eat)
        encoding='utf-8',  # input string encoding
        tagset=None,       # Penn Treebank II (default) or UNIVERSAL
    )
    parsed = parsetree(text, **parser_options)

    return [
        {'id': position,
         'word': token.string,
         'lemma': token.lemma,
         'sentence': sentence_no,
         'pos': token.type}
        for sentence_no, sentence in enumerate(parsed)
        for position, token in enumerate(sentence)
    ]

In [None]:
import json

# Paths for one example document: the gs text (presumably "gold standard"),
# the OCR text, and the JSON file holding their alignment.
gs_file = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/gs/DDD_000012234_001.txt'
ocr_file = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/ocr/DDD_000012234_001.txt'
aligned_file = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/aligned/DDD_000012234_001.json'

# The alignment JSON stores the aligned sequences under the keys 'gs' and 'ocr'.
with open(aligned_file) as f:
    alignment_data = json.load(f)
aligned_gs = alignment_data['gs']
aligned_ocr = alignment_data['ocr']

In [None]:
# Read the gs text of the example document and tokenize it with parse().
with open(gs_file) as f:
    text = f.read()
tokens = parse(text, parsetree)

In [None]:
from ochre.create_word_mappings import find_word_boundaries

from nlppln.utils import remove_ext, create_dirs, out_file_name

def create_word_mapping(tokens, aligned_gs, aligned_ocr):
    """Pair every gs word with the OCR text aligned to the same span.

    Args:
        tokens: Token records as produced by ``parse``; only the ``'word'``
            entry of each record is used.
        aligned_gs: Aligned gs sequence (sliced and joined into strings).
        aligned_ocr: Aligned OCR sequence, sliced with the same spans as
            ``aligned_gs``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``gs`` and ``ocr`` (both
        whitespace-stripped) and an index named ``word_index``.
    """
    # Word spans are located in the gs alignment and reused for the OCR side.
    boundaries = find_word_boundaries([token['word'] for token in tokens],
                                      aligned_gs)

    gs_words = []
    ocr_words = []
    for start, end in boundaries:
        gs_words.append(u''.join(aligned_gs[start:end]).strip())
        ocr_words.append(u''.join(aligned_ocr[start:end]).strip())

    # A DataFrame is returned so that a later CSV export escapes commas and
    # quotes properly.
    mapping = pd.DataFrame({'gs': gs_words, 'ocr': ocr_words})
    mapping.index.names = ['word_index']
    return mapping
    
# Build the word mapping for the example document and preview the first rows.
df = create_word_mapping(tokens, aligned_gs, aligned_ocr)
df.head()

In [None]:
import os

from nlppln.utils import get_files

def word_mappings_directory(in_dir, write_csv=False):
    """Create a gs/OCR word mapping for every aligned document in ``in_dir``.

    ``in_dir`` must contain the subdirectories ``gs``, ``ocr`` and
    ``aligned``. The file listings of the three directories are paired
    positionally (first with first, and so on), so the directories are
    assumed to hold matching files in the same listing order -- TODO confirm
    that ``get_files`` returns sorted results.

    For each document the alignment JSON (keys ``'gs'`` and ``'ocr'``) is
    loaded, the gs text is tokenized with ``parse`` and the word mapping is
    built with ``create_word_mapping``. The target path
    ``<in_dir>/word-mappings/<doc_id>.csv`` is printed and its directory is
    created.

    Args:
        in_dir: Directory containing the ``gs``, ``ocr`` and ``aligned``
            subdirectories.
        write_csv: When true, write the ``gs`` and ``ocr`` columns of each
            mapping to the printed CSV path. Defaults to ``False``, which
            keeps the previous behaviour where the mapping was computed but
            never saved.

    Returns:
        None.
    """
    out_dir = os.path.join(in_dir, 'word-mappings')

    gs_files = get_files(os.path.join(in_dir, 'gs'))
    ocr_files = get_files(os.path.join(in_dir, 'ocr'))
    aligned_files = get_files(os.path.join(in_dir, 'aligned'))

    # The OCR file itself is not read (the OCR side comes from the alignment
    # JSON); its listing only takes part in the positional pairing.
    triples = zip(gs_files, ocr_files, aligned_files)
    for gs_file, _ocr_file, aligned_file in tqdm(triples, total=len(gs_files)):
        with open(aligned_file) as f:
            alignment_data = json.load(f)

        aligned_gs = alignment_data['gs']
        aligned_ocr = alignment_data['ocr']

        with open(gs_file) as f:
            text = f.read()
        tokens = parse(text, parsetree)

        df = create_word_mapping(tokens, aligned_gs, aligned_ocr)

        doc_id = remove_ext(aligned_file)

        out_file = out_file_name(out_dir, doc_id, ext='csv')
        print(out_file)
        create_dirs(out_file, is_file=True)
        if write_csv:
            df[['gs', 'ocr']].to_csv(out_file)

# Process the kb-ocr data directory; the commented-out line points to the
# icdar2019pocr-nl directory as an alternative input.
in_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs'
#in_dir = '/home/jvdzwaan/data/icdar2019pocr-nl/'

word_mappings_directory(in_dir)

In [None]:
import os
import glob

from nlppln.utils import create_dirs, get_files

from nlppln.commands.normalize_whitespace_punctuation import normalize_whitespace, normalize_punctuation

from ochre.utils import align_characters

def _read_normalized(path):
    """Read the text file at ``path`` and normalize whitespace, then punctuation."""
    with open(path) as f:
        content = f.read()
    return normalize_punctuation(normalize_whitespace(content))


def align_and_word_mappings_directory(in_dir, out_dir, pred, gs='gs'):
    """Align predicted texts with their gs texts and write word-mapping CSVs.

    For every file in ``<in_dir>/<pred>`` the file with the same base name is
    read from ``<in_dir>/<gs>``. Both texts are whitespace- and
    punctuation-normalized and aligned with ``align_characters``; the gs text
    is tokenized with ``parse`` and the resulting word mapping is written to
    ``<out_dir>/<doc_id>.csv``. The path of each written file is printed.

    Args:
        in_dir: Directory that contains the ``pred`` and ``gs``
            subdirectories.
        out_dir: Directory the CSV files are written to; created if needed.
        pred: Name of the subdirectory holding the predicted texts.
        gs: Name of the subdirectory holding the gs texts. This argument was
            previously ignored in favour of a hard-coded ``'gs'``; the
            default keeps that behaviour.

    Returns:
        None.

    Note:
        The predicted words are stored in the CSV column named ``ocr``, the
        column name produced by ``create_word_mapping``.
    """
    create_dirs(out_dir)

    gs_dir = os.path.join(in_dir, gs)
    pred_dir = os.path.join(in_dir, pred)

    pred_files = get_files(pred_dir)

    for pred_file in tqdm(pred_files):
        # The gs counterpart is looked up by identical file name.
        gs_file = os.path.join(gs_dir, os.path.basename(pred_file))

        gs_text = _read_normalized(gs_file)
        pred_text = _read_normalized(pred_file)

        # The prediction is passed first and the gs text second; the result
        # is unpacked as (aligned gs, aligned prediction, match info).
        aligned_gs, aligned_pred, _match = align_characters(pred_text, gs_text)

        tokens = parse(gs_text, parsetree)

        df = create_word_mapping(tokens, aligned_gs, aligned_pred)

        doc_id = remove_ext(gs_file)

        out_file = out_file_name(out_dir, doc_id, ext='csv')
        print(out_file)
        create_dirs(out_file, is_file=True)
        df[['gs', 'ocr']].to_csv(out_file)

# Align the texts in 'pred-A8P1' with the gs texts and write the word
# mappings to the A8P1-wm/ directory.
in_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs'
out_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/A8P1-wm/'

align_and_word_mappings_directory(in_dir, out_dir, pred='pred-A8P1')

In [None]:
# Align the texts in 'pred-A8P2' with the gs texts and write the word
# mappings to the A8P2-wm/ directory.
in_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs'
out_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/A8P2-wm/'

align_and_word_mappings_directory(in_dir, out_dir, pred='pred-A8P2')

In [None]:
# Align the texts in 'pred-A8P3' with the gs texts and write the word
# mappings to the A8P3-wm/ directory.
in_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs'
out_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/A8P3-wm/'

align_and_word_mappings_directory(in_dir, out_dir, pred='pred-A8P3')

In [None]:
# Read a word-mapping CSV for the example document back from the
# 'word-mappings' directory; this rebinds the notebook-level name `df`.
df = pd.read_csv('/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/word-mappings/DDD_000012234_001.csv')

In [None]:
# Preview the first rows of the current `df`.
df.head()