# Create a DataFrame out of selected directories

In [1]:
import argparse
from glob import glob
import numpy as np
import os
from pathlib import Path
import pandas as pd
import re
import spacy
import syntok.segmenter as segmenter
import string
import sys

In [2]:
# Name of the spaCy pipeline to load.
SPACY_MODEL_NAME = "en_core_web_lg"

# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook - confirm it is still needed before keeping this (slow) load.
nlp = spacy.load(SPACY_MODEL_NAME)

**Impacts of OCR quality on indexing, accessing and retrievability of digital documents**

The corpus comprises 22M OCRed characters (2019 dataset only), along with the corresponding Gold Standard (GS), which has been aligned with the OCR output at the character level.

<img src="./images/ICDAR_symbols.png">

The aligned OCRed/GS texts are provided for training and test purposes. The alignment was made at the character level using "@" symbols. "#" symbols correspond to the absence of GS either related to alignment uncertainties or related to unreadable characters in the source document. 

The error rate and the quality of the alignment vary according to the nature and the state of degradation of the source documents. Periodicals (mostly historical newspapers), for example, have been reported to be especially challenging because of their complex layout and their original fonts. In addition, **it should be mentioned that the quality of the Gold Standard also varies, as the dataset aggregates resources from different projects that each have their own annotation procedure, and it obviously contains some errors.**


Important notice: in the archive,

* Even ids (document numbers) were exclusively used for Task 1) Detection
* Odd ids (document numbers) were exclusively used for Task 2) Correction

In [3]:
# Corpus directories to load, given as (label, directory) pairs.
#   label:     an arbitrary, freely chosen name; it can be used to filter the
#              DB in later steps
#   directory: path to the folder that contains the corpus' .txt files
_corpus_locations = [
    ("eng_monograph_2017", './ICDAR2017_datasetPostOCR_v1.2/ICDAR2017_datasetPostOCR_Full_12M_v1.2/eng_monograph'),
    ("eng_periodical_2017", './ICDAR2017_datasetPostOCR_v1.2/ICDAR2017_datasetPostOCR_Full_12M_v1.2/eng_periodical'),
    ("en1_2019", './ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_full_22M_without_Finnish/EN/EN1'),
]

# Dictionary with the structure  label -> directory  (insertion order kept).
dict_dirnames = {label: directory for label, directory in _corpus_locations}


In [4]:
# Document-level table (one row per source file); it starts out empty and
# only fixes the column layout.
db_columns = ["index", "OCR_toInput", "OCR_aligned", "GS_aligned", "dirname"]
db = pd.DataFrame(columns=db_columns)

In [6]:
# Load every .txt file of every configured corpus directory into `db`,
# one row per file.
#
# Each file is read as three tagged lines, in this order:
#   line 0: "[OCR_toInput] <text>"
#   line 1: "[OCR_aligned] <text>"
#   line 2: "[ GS_aligned] <text>"
# Only the text after the tag is kept (including whatever whitespace follows
# the tag and the trailing line break). The numeric file name becomes the
# "index" column and the corpus label the "dirname" column.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: growing a DataFrame with `db.loc[n] = ...` copies the whole frame on
# every insertion.
document_records = []

for ldir, corpus_dir in dict_dirnames.items():
    for lfile in glob(os.path.join(corpus_dir, "*.txt")):
        # `with` guarantees the handle is closed even if parsing fails below.
        # The corpus contains non-ASCII characters (e.g. "Æ", "ſ"), so decode
        # explicitly instead of relying on the platform default encoding.
        # NOTE(review): UTF-8 is assumed for the ICDAR files - confirm.
        with open(lfile, "r", encoding="utf-8") as fio:
            fread = fio.readlines()

        document_records.append([
            int(os.path.basename(lfile).split(".txt")[0]),
            fread[0].split("[OCR_toInput]")[1],
            fread[1].split("[OCR_aligned]")[1],
            fread[2].split("[ GS_aligned]")[1],
            ldir,
        ])

db = pd.DataFrame(
    document_records,
    columns=["index", "OCR_toInput", "OCR_aligned", "GS_aligned", "dirname"],
)

# Number of loaded documents (kept under its original name).
row_counter = len(document_records)

In [7]:
# Report the table size, then display the rows flagged as duplicates.
n_rows_before = len(db)
print("Length before removing duplicates: {}".format(n_rows_before))
print("=" * 10)
print("Duplicate rows:")

# True for every row whose "OCR_toInput" text already occurred in an earlier
# row (the first occurrence itself is not flagged).
is_repeated = db.duplicated(subset="OCR_toInput")
db[is_repeated]

Length before removing duplicates: 963
Duplicate rows:


Unnamed: 0,index,OCR_toInput,OCR_aligned,GS_aligned,dirname
889,141,"10* THE CHEATS OF SCAPIN. Scapin. Well,. Sir,...","10* THE CHEATS OF SCAPIN. Scapin. Well,. Sir,...","@@@@@@@@@@@@@@@@@@@@@@@@@ Scapin. Well,@ Sir,...",en1_2019
893,82,The PREFACE. Is the Souls of all Mankind be c...,@The PREFACE. Is the Souls of all Mankind be ...,The PREFACE. If the Souls of all Mankind be ...,en1_2019
894,140,"betwixt our condition, and that of the Heathe...","@@@@@@@@betwixt our condition, and that of th...","( 99 ) betwixt our condition, and that of th...",en1_2019
918,147,(toy jkilfull mattthe learning of the Ægyptia...,@(toy jkilfull @@ma@ttthe learning of the Ægy...,(10) skilfull in all the learning of the Ægy...,en1_2019
919,146,"huisstm head, for, the, cross is left v in hi...","@@@@@huisstm head, for, the, cross is left v ...","86 A Mison head, for@ the@ croſs is left@@ ...",en1_2019
927,135,PREFACE. VU Abp. Trench could tell us nothing...,@PREFACE. @VU Abp. Trench could tell us nothi...,PREFACE. vii Abp. Trench could tell us nothi...,en1_2019
929,109,■\n,■\n,@\n,en1_2019
932,134,[8*3 shutting out and Retraction of the fen- ...,@[@8@@@@@*3 shutting out and Retraction of th...,[ 89 ] out shutting out and Retraction of th...,en1_2019
933,21,Jcadmu Sckntkrum. v 19 Secttio Tertia. Aftrol...,Jcadmu Sckntkrum. v 19 Secttio Tertia. Aftrol...,@@@@@@@@@@@@@@@@@@@@@@ Se@tio Tertia.@@@@@@@...,en1_2019
938,136,24 CBA%ACTE\S An vnvortbie Knight. i$« 9 1 4 ...,@24 CBA%ACTE\S An vnvortbie Knight@@@@.@@@@@@...,24 CHARACTERS An vnworthie Knight. 15. A wor...,en1_2019


### Remove duplicate rows

In [8]:
# Keep only the first row for each distinct "OCR_toInput" text.
db = db.drop_duplicates(subset="OCR_toInput", keep="first")

n_rows_after = len(db)
print("Length after removing duplicates: {}".format(n_rows_after))
db.head()

Length after removing duplicates: 951


Unnamed: 0,index,OCR_toInput,OCR_aligned,GS_aligned,dirname
0,289,BK. III. 364 AN ENGLISH ANTHOLOGY. Over many ...,BK. III. 364 AN ENGLISH ANTHOLOGY. Over many ...,####################################### many ...,eng_monograph_2017
1,504,47 K0BEET THE DETJYLL. How God sent an aunged...,47 K0BEET THE DETJYLL. How God sent an aunge@...,########################## God sent an aungel...,eng_monograph_2017
2,262,bk. Ill 322 AN ENGLISH ANTHOLOGY. The and The...,bk. Ill 322 AN ENGLISH ANTHOLOGY. The and The...,#############################################...,eng_monograph_2017
3,276,SECT. IV. SWEET AUBURN. 341 E'en children fol...,SECT. IV. SWEET AUBURN. 341 E'en children fol...,################################ children fol...,eng_monograph_2017
4,510,ROBERT THE DETJYLE. 53 shewed hym of this bef...,ROBERT THE DETJYLE. 53 shewed hym of this bef...,############################# hym of this bef...,eng_monograph_2017


## Sentencizer

In [9]:
# Split every document of `db` into sentences and store them in `db_sentence`.
#
# Sentence starts are detected on the Gold Standard text with syntok; the same
# character positions are then used to cut both "GS_aligned" and
# "OCR_aligned", which relies on the two strings being aligned character by
# character.
#
# Resulting columns:
#   doc_index   "<dirname>_<index>" of the source document
#   sent_index  "<dirname>_<index>_<offset of the sentence's first token>"
#   OCR_aligned / GS_aligned  the sentence cut out of the respective text
#   begin_index position in the document at which the cut starts
#
# Rows are collected in a list and converted to a DataFrame once at the end;
# inserting them one by one with `db_sentence.loc[n] = ...` copies the whole
# frame on every insertion.
sentence_columns = ["doc_index", "sent_index", "OCR_aligned", "GS_aligned", "begin_index"]
progress_every = 100  # print one "X" per this many documents
sentence_records = []

for n_docs_done, (i_row, myrow) in enumerate(db.iterrows()):
    # Progress marker keyed on the number of documents processed, so that the
    # markers appear at a regular interval.
    if n_docs_done % progress_every == 0:
        print("X", end="")

    gs_text = myrow["GS_aligned"]
    ocr_text = myrow["OCR_aligned"]
    doc_label = "{}_{}".format(myrow["dirname"], myrow["index"])

    # Offset of the first token of every sentence found in the GS text.
    list_sentence_offsets = [
        sentence[0].offset
        for paragraph in segmenter.process(gs_text)
        for sentence in paragraph
    ]
    # A sentence ends where the next one begins; the last one runs to the end
    # of the text.
    list_sentence_ends = list_sentence_offsets[1:] + [len(gs_text)]

    for begSentence, endSentence in zip(list_sentence_offsets, list_sentence_ends):
        # Both bounds are shifted back by one character: the cut starts on the
        # character preceding the sentence's first token and stops one
        # character before the next sentence's first token. For the last
        # sentence this leaves out the final character of the text.
        # NOTE(review): presumably this keeps the separating blank in front of
        # each sentence and drops the trailing line break - confirm.
        #
        # Clamp at 0: for a token at offset 0 the start would be -1, which
        # Python interprets as "last character" and yields a wrong slice.
        begin_index = max(begSentence - 1, 0)
        end_index = endSentence - 1

        sentence_records.append((
            doc_label,
            "{}_{}".format(doc_label, begSentence),
            ocr_text[begin_index:end_index],
            gs_text[begin_index:end_index],
            begin_index,
        ))

# Total number of sentences (kept under its original name).
counter = len(sentence_records)

db_sentence = pd.DataFrame(sentence_records, columns=sentence_columns)
# Row labels run from 1 to `counter`, as they did with per-row insertion.
db_sentence.index = pd.RangeIndex(start=1, stop=counter + 1)

XXXXXXXXXXXX

In [10]:
db_sentence.head()

Unnamed: 0,doc_index,sent_index,OCR_aligned,GS_aligned,begin_index
1,eng_monograph_2017_289,eng_monograph_2017_289_1,BK. III. 364 AN ENGLISH ANTHOLOGY. Over many ...,####################################### many ...,0
2,eng_monograph_2017_289,eng_monograph_2017_289_161,"Now they look abroad to see, Now return and w...","Now they look abroad to see, Now return and w...",160
3,eng_monograph_2017_289,eng_monograph_2017_289_219,"Pitying, I dropped a tear But I saw a glow-wo...","Pitying, I dropped a tear But I saw a glow-wo...",218
4,eng_monograph_2017_289,eng_monograph_2017_289_341,"I am set to light the ground, While the beetl...","I am set to light the ground, While the beetl...",340
5,eng_monograph_2017_289,eng_monograph_2017_289_464,' -Blake.,' -Blake.,463


## Save DataFrame

In [11]:
# Only the sentence-level table is written to disk; saving the document-level
# table `db` is left disabled.
#db.to_pickle("db_ICDAR.pkl")
sentence_pickle_path = "db_sentence_ICDAR_new.pkl"
db_sentence.to_pickle(sentence_pickle_path)