# [!] Preliminary note

You can skip this notebook and instead download the `.tsv` required for the experiments directly from [here](https://zenodo.org/record/4034819). Store the file `ocrTokens.tsv` in the `processing/resources/` folder.

# Create a dataset for OCR'd text

In [1]:
import pandas as pd
import spacy
import ast
import csv

### Load alignments

Load the dataframe of alignments generated by [this notebook](https://github.com/Living-with-machines/lwm_ARTIDIGH_2020_OCR_impact_downstream_NLP_tasks/blob/master/aligning_trove.ipynb), which also contains spaCy's processing of both the OCR'd and the human-corrected text (columns `spacyOcr` and `spacyHum`).

In [2]:
# Load the aligned articles together with their spaCy-processed versions.
# NOTE: unpickling can execute arbitrary code, so only load a pickle you produced
# yourself or obtained from a trusted source.
df = pd.read_pickle("../../resources/aligned_df_with_spacy.pkl")

In [3]:
# Preview the first rows to check the columns used below are present.
df.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length,alignment,processed,spacyOcr,spacyHum
1,./trove_overproof/datasets/dataset1/rawTextAnd...,18378453,Article ILLUSTRATED,1953,"FROM RIVER CROSSING TO END OF TRIÄÜ I ^PI A^H""...",FROM RIVER CROSSING TO END OF TRIAL SPLASH: Pe...,,0.847747,747,"[(0, 4, 0, 4), (5, 10, 5, 10), (11, 19, 11, 19...",yes,"(FROM, RIVER, CROSSING, TO, END, OF, TRIÄÜ, I,...","(FROM, RIVER, CROSSING, TO, END, OF, TRIAL, SP..."
2,./trove_overproof/datasets/dataset1/rawTextAnd...,18363627,Article,1953,"Natural Childbirth Sir,-We nurses have seen fa...","Natural Childbirth Sir,-We nurses have seen fa...",,0.964174,642,"[(0, 7, 0, 7), (8, 18, 8, 18), (19, 26, 19, 26...",yes,"(Natural, Childbirth, Sir,-We, nurses, have, s...","(Natural, Childbirth, Sir,-We, nurses, have, s..."
5,./trove_overproof/datasets/dataset1/rawTextAnd...,18368961,Article,1953,"DIVORCE Before The Judge In Divorce, Mr Justic...","DIVORCE Before The Judge In Divorce, Mr. Justi...",,0.894262,1220,"[(0, 7, 0, 7), (8, 14, 8, 14), (19, 24, 19, 24...",yes,"(DIVORCE, Before, The, Judge, In, Divorce, ,, ...","(DIVORCE, Before, The, Judge, In, Divorce, ,, ..."
7,./trove_overproof/datasets/dataset1/rawTextAnd...,18381450,Article,1953,I SCHOOL CHESS * Homebush Increased Ils lead o...,SCHOOL CHESS Homebush increased its lead over...,,0.918347,992,"[(2, 8, 0, 6), (9, 14, 7, 12), (17, 25, 14, 22...",yes,"(I, SCHOOL, CHESS, *, Homebush, Increased, Ils...","(SCHOOL, CHESS, , Homebush, increased, its, l..."
8,./trove_overproof/datasets/dataset1/rawTextAnd...,18383206,Article,1953,Architects' Contracts Architects have signed t...,Architects' Contracts Architects have signed t...,,0.897167,953,"[(0, 11, 0, 11), (12, 21, 12, 21), (22, 32, 22...",yes,"(Architects, ', Contracts, Architects, have, s...","(Architects, ', Contracts, Architects, have, s..."


In [4]:
# (number of rows, number of columns) of the loaded dataframe.
df.shape

(28287, 13)

### Find tokens belonging to certain entities that contain OCR errors

**Notes:**
* The `target` argument expects one of these strings:
    * `correct`: if the OCR'd token and the human-corrected token are identical (ignoring case).
    * `incorrect`: if the OCR'd token is different from the human-corrected one (ignoring case), i.e. there is an OCR error in the token.
    * `all`: all token alignments, regardless of whether the OCR contains errors.
* The `targetLabels` argument expects a list of spaCy NER entity types; an aligned pair is kept only if the human-corrected token is tagged with one of them.

In [5]:
def evaluateNer(dirtyText, cleanText, spacyDirtyText, spacyCleanText, absoluteIndexLinking, target, targetLabels):
    """Collect aligned (OCR token, human-corrected token) pairs for selected entity types.

    Args:
        dirtyText: OCR'd text. Not used by this function; kept so existing
            callers keep working.
        cleanText: Human-corrected text. Not used by this function; kept so
            existing callers keep working.
        spacyDirtyText: Iterable of tokens of the OCR'd text; each token must
            expose ``idx`` and ``text``.
        spacyCleanText: Iterable of tokens of the human-corrected text; each
            token must expose ``idx``, ``text`` and ``ent_type_``.
        absoluteIndexLinking: Iterable of alignment tuples. Element 0 is looked
            up among the OCR token offsets and element 2 among the
            human-corrected token offsets (presumably
            ``(ocrStart, ocrEnd, humStart, humEnd)`` -- confirm against the
            alignment notebook). Alignments that do not start exactly on a
            token offset in both texts are ignored.
        target: ``"correct"`` keeps pairs whose tokens are equal ignoring case,
            ``"incorrect"`` keeps pairs whose tokens differ ignoring case,
            ``"all"`` keeps both.
        targetLabels: Container of entity labels; a pair is kept only if the
            human-corrected token's entity type is in it.

    Returns:
        List of ``(ocrWord, humWord)`` tuples, in alignment order.

    Raises:
        ValueError: If ``target`` is not one of the three supported strings.
            (Previously a misspelt target silently produced an empty list.)
    """
    validTargets = ("correct", "incorrect", "all")
    if target not in validTargets:
        raise ValueError(
            "target must be one of %r, got %r" % (validTargets, target)
        )

    # Index tokens by their character offset so alignments can be resolved directly.
    ocrTextByOffset = {token.idx: token.text for token in spacyDirtyText}
    humTokenByOffset = {
        token.idx: (token.ent_type_, token.text) for token in spacyCleanText
    }

    allAlignedPairs = []
    for matched_word in absoluteIndexLinking:
        ocrOffset, humOffset = matched_word[0], matched_word[2]
        if ocrOffset not in ocrTextByOffset or humOffset not in humTokenByOffset:
            continue

        ocrWord = ocrTextByOffset[ocrOffset]
        labelHum, humWord = humTokenByOffset[humOffset]
        if labelHum.strip() not in targetLabels:
            continue

        # The OCR-error test is case-insensitive.
        sameIgnoringCase = ocrWord.lower() == humWord.lower()
        if target == "correct" and not sameIgnoringCase:
            continue
        if target == "incorrect" and sameIgnoringCase:
            continue

        # We are not interested in errors due to bad tokenisation, so drop the
        # pair when one token is a substring of the other. This check also
        # short-circuits before humWord[0] could be evaluated on an empty string.
        if ocrWord in humWord or humWord in ocrWord:
            continue
        # Drop one-character OCR tokens.
        if len(ocrWord) <= 1:
            continue
        # Drop hyphenated human-corrected tokens.
        if '-' in humWord:
            continue
        # The human-corrected token must start with a capital letter ...
        if not humWord[0].isupper():
            continue
        # ... and must not contain any digit.
        if any(map(str.isdigit, humWord)):
            continue

        allAlignedPairs.append((ocrWord, humWord))

    return allAlignedPairs

In [6]:
# Target token:
# * "correct" if the OCR'd token and its human-corrected counterpart are identical (ignoring case),
# * "incorrect" if the OCR'd token is different from the human-corrected one,
# * "all" if we don't care.
target = "incorrect"

# Entity types to keep (matched against the label of the human-corrected token).
labels = ['GPE', 'LOC', 'NORP', 'FAC', 'ORG', 'PERSON', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LANGUAGE']

alignedEntities = []
for index, row in df.iterrows():
    # Skip rows whose alignment is an empty string.
    if row['alignment'] == "":
        continue

    ocrText = row['ocrText'].strip(" ")
    # The human-corrected text is stored in `humanText`; the `corrected` column
    # is blank in the dataframe preview, so it is not the right source here.
    humanText = row['humanText'].strip(" ")
    spacyOcr = row['spacyOcr']
    spacyHum = row['spacyHum']
    # The alignment is stored as a string and parsed back into a Python
    # literal (a list of tuples, going by the dataframe preview).
    alignment = ast.literal_eval(row['alignment'])

    alignedEntities.extend(
        evaluateNer(ocrText, humanText, spacyOcr, spacyHum, alignment, target, labels)
    )

In [7]:
# Number of (OCR token, human-corrected token) pairs collected.
print(len(alignedEntities))

140313


In [8]:
# Write one (OCR token, human-corrected token) pair per line, tab-separated.
# * newline='' lets the csv module control line endings itself (otherwise
#   Windows output gets an extra carriage return on every row).
# * encoding='utf-8' is explicit because OCR tokens contain non-ASCII
#   characters and the platform default encoding may not be able to encode them.
with open('../../resources/ocrTokens.tsv', mode='w', newline='', encoding='utf-8') as ocrToponyms:
    ocrToponyms_writer = csv.writer(ocrToponyms, delimiter='\t')
    ocrToponyms_writer.writerows((ent[0], ent[1]) for ent in alignedEntities)