# Impact of OCR on linguistic processing

Tasks in this notebook:
- [ ] Part-of-speech tagging
- [ ] Named entity recognition
- [ ] Dependency parsing
- [ ] Semantic role labelling

Not considered here:
- Sentence splitting
- Tokenisation
- Lemmatisation

In [47]:
import sys
import argparse
from pathlib import Path
import spacy
import glob
import syntok.segmenter as segmenter
from spacy.tokens import Doc
import string
import pandas as pd
import os
import numpy as np
import difflib
from difflib import SequenceMatcher
import collections
import re
import ast
from tqdm import tnrange, tqdm_notebook

In [48]:
# Load the article table from a local pickle.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
trovedf = pd.read_pickle("db_trove.pkl")

In [50]:
# (rows, columns) of the table as loaded, before any filtering.
trovedf.shape

(30509, 9)

In [49]:
# Preview the first rows of the table as loaded.
trovedf.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,,0.847747,747
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",,0.964174,642
3,./trove_overproof/datasets/dataset1/rawTextAnd...,18366055,Article,1953,FIRST CHURCH I SERVICE 1 Presbyterian I ' Anni...,FIRST CHURCH SERVICE Presbyterian Anniversary ...,,0.739176,947
4,./trove_overproof/datasets/dataset1/rawTextAnd...,18386137,Article,1953,"""Bob"" Lulham's Fight Against Thallium District...","""Bob"" Lulham's Fight Against Thallium Arthur ...",,0.493898,2950
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...",,0.894262,1220


In [51]:
# Keep only articles whose OCR text and human text differ by at most 100
# characters in length. Rows where either length is missing compare as False
# and are dropped as well.
# The explicit .copy() makes the result an independent frame, so later cells
# can add or overwrite columns without pandas' chained-assignment warning.
length_difference = (trovedf['ocrText'].str.len() - trovedf['humanText'].str.len()).abs()
trovedf = trovedf[length_difference <= 100].copy()

In [52]:
# (rows, columns) remaining after the length-difference filter.
trovedf.shape

(28287, 9)

In [53]:
# Preview the first rows that survived the length-difference filter.
trovedf.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,,0.847747,747
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",,0.964174,642
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...",,0.894262,1220
7,./trove_overproof/datasets/dataset1/rawTextAnd...,18381450,Article,1953,I SCHOOL CHESS * Homebush Increased Ils lead o...,SCHOOL CHESS Homebush increased its lead over...,,0.918347,992
8,./trove_overproof/datasets/dataset1/rawTextAnd...,18383206,Article,1953,Architects' Contracts Architects have signed t...,Architects' Contracts Architects have signed t...,,0.897167,953


### Align OCRed and human-corrected text

In [54]:
def _wordSpans(text):
    """Return ``{token: [(start, end), ...]}`` for the space-delimited tokens of ``text``.

    Spans are half-open character offsets into ``text`` and are listed in order
    of appearance. Positions are tracked with a running offset, so every token
    (including ones made of '%' characters) gets its true position.
    """
    dSpans = dict()
    start = 0
    for token in text.split(" "):
        end = start + len(token)
        if token:  # consecutive spaces yield empty tokens, which can never be aligned
            dSpans.setdefault(token, []).append((start, end))
        start = end + 1  # step over the single separating space
    return dSpans


def _firstFreeOcrSpan(hum_start, candidates, ratio, allowed_distance, used_ocr_indices, lMatches):
    """Return the first OCR span that may be aligned with the human word at ``hum_start``.

    A span qualifies when its word is similar enough (``>= ratio``), it starts
    within ``allowed_distance`` characters of ``hum_start``, it has not been
    aligned yet, and pairing it would not cross an alignment already in
    ``lMatches``. Returns ``None`` when no candidate qualifies.
    """
    for ocr_spans, similarity in candidates:
        if similarity < ratio:
            continue
        for ocr_span in ocr_spans:
            ocr_start = ocr_span[0]
            if abs(hum_start - ocr_start) > allowed_distance:
                continue
            if ocr_start in used_ocr_indices:
                continue
            # Alignments must stay monotonic: a human word located before/after an
            # already aligned one must map to an OCR word before/after its partner.
            crosses = any(
                (m[2] > hum_start and m[0] <= ocr_start)
                or (m[2] < hum_start and m[0] >= ocr_start)
                for m in lMatches
            )
            if not crosses:
                return ocr_span
    return None


def findOcrHumanMatches(ocrText, humText):
    """Align words of an OCRed text with words of its human-corrected version.

    Both texts are lowercased and split on single spaces. Human words are then
    greedily paired with OCR words in successive passes that start strict and
    become more permissive: first only words longer than 5 characters, then
    longer than 3; within each, the required ``SequenceMatcher`` ratio drops
    from 1.0 to 0.7, and within each ratio the tolerated difference between
    start offsets grows from 10 to 90 characters. In every pass at most one
    not-yet-aligned occurrence of each distinct human word is aligned.

    Parameters
    ----------
    ocrText : str
        The OCR output.
    humText : str
        The human-corrected text.

    Returns
    -------
    list of tuple
        ``(ocr_start, ocr_end, hum_start, hum_end)`` tuples sorted by
        ``ocr_start``. Offsets are half-open and refer to the *lowercased*
        texts.
    """
    dOcrWordIndices = _wordSpans(ocrText.lower())
    dHumWordIndices = _wordSpans(humText.lower())

    ratio_decreasing = [1.0, 0.9, 0.8, 0.7]
    distance_limits = [10, 20, 30, 50, 90]
    word_length_list = [5, 3]

    # Similarity of every alignable human word to every OCR word, longest human
    # words first. Human words no pass would consider and OCR words below the
    # most permissive ratio are dropped up front; they could never be aligned.
    shortest_considered = min(word_length_list)
    lowest_ratio = min(ratio_decreasing)
    lPotentialMatches = []
    for hq in sorted(dHumWordIndices, key=len, reverse=True):
        if len(hq) <= shortest_considered:
            continue
        candidates = []
        for oq, ocr_spans in dOcrWordIndices.items():
            similarity = float(SequenceMatcher(None, hq, oq).ratio())
            if similarity >= lowest_ratio:
                candidates.append((ocr_spans, similarity))
        lPotentialMatches.append((hq, tuple(dHumWordIndices[hq]), candidates))

    lMatches = []
    used_hum_indices = set()
    used_ocr_indices = set()
    already_added = set()

    for word_length in word_length_list:
        for ratio in ratio_decreasing:
            for allowed_distance in distance_limits:
                for hum_word, hum_spans, candidates in lPotentialMatches:
                    if len(hum_word) <= word_length:
                        continue
                    for hum_span in hum_spans:
                        if (hum_word, hum_span) in already_added or hum_span[0] in used_hum_indices:
                            continue
                        ocr_span = _firstFreeOcrSpan(
                            hum_span[0], candidates, ratio, allowed_distance,
                            used_ocr_indices, lMatches)
                        if ocr_span is None:
                            continue
                        already_added.add((hum_word, hum_span))
                        lMatches.append((ocr_span[0], ocr_span[1], hum_span[0], hum_span[1]))
                        # Mark exactly the characters of the two aligned words as used.
                        used_hum_indices.update(range(hum_span[0], hum_span[1]))
                        used_ocr_indices.update(range(ocr_span[0], ocr_span[1]))
                        # Only one occurrence of a given human word is aligned per pass.
                        break

    sorted_matches = sorted(lMatches, key=lambda tup: tup[0])

    return sorted_matches

In [83]:
# Work on a copy so that `trovedf` is left untouched: this cell overwrites the
# 'corrected' column, and with a plain alias a second run would find no empty
# values any more and flag every row as use_corrected = 1.
sampledf = trovedf.copy()

# Rows without a separate corrected text fall back to the human text and are
# flagged with use_corrected = 0; all other rows keep use_corrected = 1.
corrected_cond = sampledf["corrected"] == ''
sampledf["use_corrected"] = 1
sampledf.loc[corrected_cond, 'corrected'] = sampledf.loc[corrected_cond, 'humanText']
sampledf.loc[corrected_cond, 'use_corrected'] = 0
sampledf['use_corrected'].value_counts()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length,alignment,processed,use_corrected
30504,./trove_overproof/datasets/dataset3/rawTextAnd...,3,Article,1921,SPARROWS ATTACK MEN; MAKE THEM QUIT WORil DANV...,SPARROWS ATTACK MEN; MAKE THEM QUIT WORK DANVI...,SPARROWS ATTACK MEN; MAKE THEM QUIT WORil DANV...,0.940526,723,"[(0, 8, 0, 8), (9, 15, 9, 15), (16, 20, 16, 20...",yes,1
30505,./trove_overproof/datasets/dataset3/rawTextAnd...,4,Article,1921,AIR FORCES SEND SUB TO BTTOM WITH BOMB. Explos...,AIR FORCES SEND SUB TO BOTTOM WITH BOMB. Explo...,AIR FORCES SEND SUB TO BTTOM WITH BOMB. Explos...,0.948481,3028,"[(4, 10, 4, 10), (11, 15, 11, 15), (23, 28, 23...",yes,1
30507,./trove_overproof/datasets/dataset3/rawTextAnd...,6,Article,1922,"TRBTY OR WAR, FEAR or LODGE Senator Warns of P...","TRBTY OR WAR, FEAR or LODGE Senator Warns of P...","TRBTY OR WAR, FEAR or LODGE Senator Warns of P...",0.950375,3204,"[(0, 5, 0, 5), (9, 13, 9, 13), (22, 27, 22, 27...",yes,1
30508,./trove_overproof/datasets/dataset3/rawTextAnd...,7,Article,1922,"RAIDERS IN FREDER IC K LAND HAUL OF $84,000 Wa...","RAIDERS IN FREDERICK LAND HAUL OF $84,000 Watc...","RAIDERS IN FREDERICK LAND HAUL OF $84,000 Watc...",0.957743,3668,"[(0, 7, 0, 7), (11, 17, 11, 20), (23, 27, 21, ...",yes,1
30509,./trove_overproof/datasets/dataset3/rawTextAnd...,8,Article,1922,GUARDED SINCE EATH THREATS British Ambassador'...,GUARDED SINCE DEATH THREATS British Ambassador...,GUARDED SINCE DEATH THREATS British Ambassador...,0.978701,937,"[(0, 7, 0, 7), (8, 13, 8, 13), (19, 26, 20, 27...",yes,1


In [75]:
# Alignment is slow, so progress is checkpointed to disk and resumed from there.
CHECKPOINT_PATH = "trove_artidigh.pkl"

if 'alignment' not in sampledf:
    if os.path.exists(CHECKPOINT_PATH):
        # Resume: the checkpoint already carries 'alignment' and 'processed'.
        # NOTE(review): unpickling can execute arbitrary code; only load a
        # checkpoint this notebook wrote itself.
        sampledf = pd.read_pickle(CHECKPOINT_PATH)
    else:
        # First run: nothing has been aligned yet.
        sampledf['alignment'] = ""
        sampledf['processed'] = "no"

# Iterate over the labels of the rows still to do instead of mutating the frame
# while iterating over its rows; the known length also gives the progress bar a total.
# NOTE(review): assumes the index labels are unique — confirm.
pending = sampledf.index[sampledf['processed'] == "no"]
for counter, index in enumerate(tqdm_notebook(pending), start=1):
    ocrText = sampledf.at[index, 'ocrText'].strip(" ")
    humanText = sampledf.at[index, 'corrected'].strip(" ")
    sorted_matches = findOcrHumanMatches(ocrText, humanText)
    # Stored as a string; parse it back into a list of tuples when needed.
    sampledf.at[index, 'alignment'] = str(sorted_matches)
    sampledf.at[index, 'processed'] = 'yes'
    if counter % 100 == 0:
        sampledf.to_pickle(CHECKPOINT_PATH)

sampledf.to_pickle(CHECKPOINT_PATH)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))