In [1]:
from Levenshtein import editops
import pandas as pd
import numpy as np
from itertools import groupby
from operator import itemgetter
from collections import Counter
import unicodedata
import random

In [2]:
ocrdf = pd.read_csv("ocrTokens.tsv", sep="\t", names=["ocrtext", "humtext"])

In [3]:
ocrdf.head()

Unnamed: 0,ocrtext,humtext
0,Customlinc,Customline
1,Hunlber,Humber
2,Gurdon,Gordon
3,rort,Fort
4,Sydnej,Sydney


In [4]:
ocrdf.shape

(140313, 2)

In [5]:
def find_consecutive_ranges(ltrans):
    transchunks = []
    for k, g in groupby(enumerate(ltrans), lambda ix : ix[0] - ix[1]):
        transchunks.append(list(map(itemgetter(1), g)))
    return transchunks

In [6]:
def capture_transformations(humtext, ocrtext, transformations):
    observed_transformations = []
    
    othtransf = False # False if all transformations are 'replace'
    for t in transformations:
        if not t[0] == 'replace':
            othtransf = True

    # Dictionary of human2ocr transformations, and of ocr2hum transformations
    dHum2OcrTrans = dict()
    dOcr2HumTrans = dict()
    for t in transformations:
        ttype = t[0]
        tcharhum = t[1]
        tcharocr = t[2]
        if tcharhum in dHum2OcrTrans:
            dHum2OcrTrans[tcharhum].append((tcharocr, ttype))
        else:
            dHum2OcrTrans[tcharhum] = [(tcharocr, ttype)]
        if tcharocr in dOcr2HumTrans:
            dOcr2HumTrans[tcharocr].append((tcharhum, ttype))
        else:
            dOcr2HumTrans[tcharocr] = [(tcharhum, ttype)]
        
    # Only 'replace' transformations:
    if othtransf == False:
        humrepls = [t[1] for t in transformations] # Indices replaced in humtext
        ocrrepls = [t[2] for t in transformations] # Indices replaced in ocrtext
        humranges = find_consecutive_ranges(humrepls)
        ocrranges = find_consecutive_ranges(ocrrepls)
        
        for i in range(len(humranges)):
            newhum = ""
            newocr = ""
            for j in range(len(humranges[i])):
                newhum += humtext[humranges[i][j]]
                newocr += ocrtext[ocrranges[i][j]]
            if newhum or newocr:
                observed_transformations.append((newhum, newocr))
    
    # Other transformations too:
    else:
        # If there is a 'delete', we capture it from the ocr2hum dictionary
        for ocr in dOcr2HumTrans:
            newhum = ""
            newocr = ""
            try:
                newocr = ocrtext[ocr]
                if len(dOcr2HumTrans[ocr]) > 1 and any(tr_item[1] == 'delete' for tr_item in dOcr2HumTrans[ocr]):
                    for ch_trans in dOcr2HumTrans[ocr]:
                        newhum += humtext[ch_trans[0]]
                if newhum and newocr:
                    observed_transformations.append((newhum, newocr))
            except IndexError:
                pass
            
        # If there is an 'insert', we capture it from the hum2ocr dictionary    
        for hum in dHum2OcrTrans:
            newhum = ""
            newocr = ""
            try:
                newocr = humtext[hum]
                if len(dHum2OcrTrans[hum]) > 1 and any(tr_item[1] == 'insert' for tr_item in dHum2OcrTrans[hum]):
                    for ch_trans in dHum2OcrTrans[hum]:
                        newhum += ocrtext[ch_trans[0]]
                if newhum and newocr:
                    observed_transformations.append((newhum, newocr))
            except IndexError:
                pass

    return list(set(observed_transformations))

In [7]:
dTransformations = dict()
for i, row in ocrdf.iterrows():
    ocrtext = str(row['ocrtext'])
    humtext = str(row['humtext'])
    for t in capture_transformations(humtext, ocrtext, editops(humtext, ocrtext)):
        if t[0] in dTransformations:
            dTransformations[t[0]].append(t[1])
        else:
            dTransformations[t[0]] = [t[1]]

def sort_by_values_len(dict):
    dict_len= {key: len(value) for key, value in dict.items()}
    import operator
    sorted_key_list = sorted(dict_len.items(), key=operator.itemgetter(1), reverse=True)
    sorted_dict = [(item[0], dict[item [0]]) for item in sorted_key_list]
    return sorted_dict

sortedTransformations = sort_by_values_len(dTransformations)

In [8]:
for t in sortedTransformations:
    print(t[0], Counter(t[1]))

e Counter({'o': 24145, 'c': 5172, 'a': 598, 'i': 512, 't': 468, 'u': 422, 'r': 229, 's': 207, 'p': 191, 'n': 180, 'é': 91, 'f': 54, '»': 49, 'ó': 40, '.': 29, '«': 23, '"': 23, 'ö': 20, 'l': 18, 'B': 16, 'q': 15, '^': 14, 'è': 14, 'b': 14, 'ê': 12, 'd': 11, '-': 9, 'ç': 9, 'P': 9, 'j': 9, 'í': 8, 'L': 8, "'": 7, '*': 7, 'O': 7, 'v': 6, 'g': 5, 'k': 5, 'h': 5, 'C': 4, 'V': 4, 'ë': 3, 'm': 3, '3': 2, ';': 2, 'y': 2, 'G': 2, 'î': 2, 'á': 2, 'x': 2, 'J': 2, '?': 1, '!': 1, '6': 1, 'ß': 1, 'H': 1, 'ú': 1, 'K': 1, 'â': 1, 'û': 1, '\\': 1, '9': 1, '(': 1, 'D': 1, 'U': 1, 'F': 1, 'M': 1, 'ü': 1, 'ô': 1})
r Counter({'i': 12553, 'l': 956, 't': 907, 'n': 60, '.': 22, 'í': 18, 'v': 17, "'": 16, 'T': 14, 'f': 13, 'u': 13, 'c': 13, '!': 12, '-': 9, 'j': 8, 'x': 8, 'o': 7, 'a': 6, 'd': 6, '»': 5, 'V': 4, 's': 3, '"': 3, 'y': 3, 'h': 3, 'e': 3, '¡': 2, 'Y': 2, '^': 2, 'ï': 2, 'O': 1, 'm': 1, '*': 1, 'w': 1, 'b': 1, 'p': 1, 'k': 1, 'B': 1, 'S': 1, ']': 1})
a Counter({'n': 4301, 'u': 2929, 'i': 2885, 'o

hs Counter({'i*': 1, 'a': 1, 'li': 1})
o- Counter({'g': 2, 'e': 1})
FF Counter({'W': 1, 'IT': 1, 'EE': 1})
nh Counter({'ub': 3})
sb Counter({'l': 1, 'bh': 1, 'li': 1})
_, Counter({'g': 2, 'N': 1})
sz Counter({'.i': 1, 'w': 1, 'as': 1})
Man Counter({'m': 3})
Jc Counter({'F': 1, 'N': 1, 'E': 1})
su Counter({'G': 1, 'EU': 1, 'ra': 1})
.H Counter({'R': 3})
Vic Counter({'\\li': 1, 'Mci': 1, 'u': 1})
Äl Counter({'M': 3})
Ru Counter({'Ha': 2, 'i': 1})
Gov Counter({'e': 2, 'l': 1})
cas Counter({'ost': 2, 't': 1})
Nv Counter({'w': 3})
UE Counter({'VF': 1, 'DI': 1, 'L1': 1})
che Counter({'obc': 1, 'onc': 1, 'üic': 1})
AM Counter({'UI': 1, 'W': 1, 'U': 1})
br Counter({'lu': 2, 'ln': 1})
ies Counter({'f': 1, 'y': 1, 'IcB': 1})
mai Counter({'ino': 1, 'rml': 1, 'inu': 1})
per Counter({'t': 1, 'fPi': 1, 'j': 1})
-b Counter({'B': 2, 'F': 1})
razo Counter({'i': 3})
nnd Counter({'O': 1, 'M': 1, 'C': 1})
LE Counter({'TT': 1, 'XH': 1, 'I': 1})
J_ Counter({'E': 1, 'K': 1, 'L': 1})
rz Counter({'m': 1, 'iT':

URV Counter({'unv': 1})
Saw Counter({'E': 1})
Melb Counter({'f': 1})
hunter Counter({'l': 1})
ANNING Counter({'annino': 1})
HITTALL Counter({'hittalt': 1})
ROTT Counter({'kott': 1})
ss' Counter({'w': 1})
andw Counter({'i': 1})
War Counter({'a': 1})
i*5 Counter({'S': 1})
kas Counter({'bla': 1})
ii'a Counter({'n': 1})
rds Counter({'uli': 1})
MASON Counter({'V': 1})
LOCOMBE Counter({'locombs': 1})
LUNT Counter({'luit': 1})
W.L Counter({'l': 1})
uni Counter({'iim': 1})
abella Counter({'i': 1})
AIRLAND Counter({'airlaxd': 1})
a'a Counter({'s': 1})
-tí Counter({'B': 1})
rtillery. Counter({'R': 1})
Pal Counter({'e': 1})
Evan Counter({'.': 1})
Xf Counter({'I': 1})
ULLETIN Counter({'ulleiin': 1})
rtiller Counter({'R': 1})
GWOO Counter({'owoo': 1})
ROUND Counter({'kound': 1})
arlton Counter({'A': 1})
cibl Counter({'e': 1})
EWINGTON Counter({'P': 1})
îy Counter({'N': 1})
T'On Counter({'r': 1})
umea Counter({'r': 1})
sell Counter({'i': 1})
.= Counter({'s': 1})
0\ Counter({'W': 1})
ö Counter({'o': 

LEBRAT Counter({'I': 1})
aund Counter({'s': 1})
Shak Counter({'e': 1})
phe Counter({'j': 1})
Mari Counter({'Alan': 1})
2v Counter({'N': 1})
mie Counter({'imo': 1})
Dà Counter({'d': 1})
hursday Counter({'mrESDAY': 1})
nshire Counter({'m': 1})
Devon Counter({'s': 1})
Dev Counter({'o': 1})
nev Counter({'uLV': 1})
2s Counter({'N': 1})
COVE Counter({'covi': 1})
ÏT Counter({'N': 1})
Td Counter({'M': 1})
squ Counter({'H': 1})
dro Counter({'l': 1})
3,nnd Counter({'y': 1})
erativ Counter({'C': 1})
Green Counter({'w': 1})
Mee Counter({'r': 1})
.* Counter({'c': 1})
tf* Counter({'W': 1})
«! Counter({'e': 1})
rabourne Counter({'i': 1})
Nei Counter({'M': 1})
zetti Counter({'n': 1})
RROL Counter({'B': 1})
AY' Counter({'W': 1})
R.K.T Counter({'t': 1})
rgus Counter({'n': 1})
Mul Counter({'U': 1})
APE Counter({'apí': 1})
ong Counter({'o': 1})
tah Counter({'ult': 1})
say Counter({'anv': 1})
Tre Counter({'n': 1})
t,/list Counter({'r': 1})
anson Counter({'t': 1})
itchell Counter({'r': 1})
'C Counter({'c': 

herris Counter({'HLRRIS': 1})
hausso Counter({'l': 1})
lassia Counter({'t': 1})
hau Counter({'l': 1})
FORCES Counter({'ronces': 1})
pf Counter({'g': 1})
I*. Counter({'R': 1})
_-Wayda Counter({'z': 1})
rí Counter({'m': 1})
ish Counter({'l': 1})
14 Counter({'B': 1})
.ff Counter({'W': 1})
servatori Counter({'R': 1})
Constantin Counter({'T': 1})
i» Counter({'w': 1})
urran Counter({'m': 1})
LLI Counter({'Ual': 1})
GRIFFITHS Counter({'grittiths': 1})
Executi Counter({'T': 1})
Lk Counter({'E': 1})
at( Counter({'G': 1})
/B Counter({'R': 1})
xwe Counter({'wic': 1})
Îv Counter({'N': 1})
zn Counter({'m': 1})
inc Counter({'mit': 1})
riet Counter({'y': 1})
Hr Counter({'E': 1})
Conci Counter({'I': 1})
LK Counter({'U': 1})
VAt Counter({'W': 1})
ZI Counter({'R': 1})
hristch Counter({'l': 1})
rch Counter({'i': 1})
Ban Counter({'m': 1})
nell Counter({'u': 1})
íf Counter({'N': 1})
Ford Counter({'y': 1})
Fd Counter({'tl': 1})
Fev Counter({'e': 1})
JVC Counter({'M': 1})
Fath Counter({'theR': 1})
i? Counter

In [9]:
true_variations = {k: list(set(g["ocrtext"].tolist())) for k,g in ocrdf.groupby("humtext")}

In [10]:
def create_false_variations(hum_token, dTransformations, allTransfKeys, allTransfValues):
    num_changes = random.choice([1, 1, 1, 1, 2])
    random_subst_len = random.choice([1, 2, 3])
    parts = [hum_token[i : i + random_subst_len] for i in range(0, len(hum_token), random_subst_len)]
    random.shuffle(allTransfValues)
    random.shuffle(parts)
    keep_false_variation = ""
    for n in range(num_changes):
        if parts:
            for p in parts:
                if p in dTransformations:
                    for tv in allTransfValues:
                        if len(tv) <= 3 and not tv in dTransformations[p]:
                            keep_false_variation = hum_token.replace(p, tv, 1)
                            if keep_false_variation.strip() != "":
                                break
    return keep_false_variation

In [11]:
transformations_keys = [transf for transf in list(dTransformations.keys()) if len(transf) <= 3]
transformations_values = [transf for transf in list(dTransformations.values())]
transformations_values = list(set([item for sublist in transformations_values for item in sublist]))

with open('ocr_posneg.tsv', mode='w') as fw:
    for hum_token in true_variations:
        for i in range(len(true_variations[hum_token])):
            truevar = true_variations[hum_token][i]
            falsevar = create_false_variations(hum_token, dTransformations, transformations_keys, transformations_values).strip()
            if truevar and falsevar:
                fw.write(hum_token.strip() + "\t" + truevar  + "\tTRUE\n")
                fw.write(hum_token.strip() + "\t" + falsevar + "\tFALSE\n")

In [12]:
# Since we're learning transformations at a token level, we're missing the white space in our vocabulary.

transformations_keys = [transf for transf in list(dTransformations.keys()) if len(transf) <= 3]
transformations_values = [transf for transf in list(dTransformations.values())]
transformations_values = list(set([item for sublist in transformations_values for item in sublist]))

with open('ocr_posneg_whitespaces.tsv', mode='w') as fw:
    for hum_token in true_variations:
        for i in range(len(true_variations[hum_token])):
            truevar = true_variations[hum_token][i]
            falsevar = create_false_variations(hum_token, dTransformations, transformations_keys, transformations_values).strip()
            if truevar and falsevar:
                fw.write(hum_token.strip() + "\t" + truevar  + "\tTRUE\n")
                fw.write(hum_token.strip() + "\t" + falsevar + "\tFALSE\n")
                if random.choices(population=[1, 0], weights=[0.01, 0.99])[0] == 1:
                    randomCharTrue = random.choice(truevar)
                    randomCharFalse = random.choice(falsevar)
                    truevar = truevar.replace(randomCharTrue, randomCharTrue + " ", 1)
                    falsevar = falsevar.replace(randomCharFalse, randomCharFalse + " ", 1)
                    fw.write(hum_token.strip() + "\t" + truevar  + "\tTRUE\n")
                    fw.write(hum_token.strip() + "\t" + falsevar + "\tFALSE\n")