# Create a DataFrame out of selected directories

In [1]:
from glob import glob
import os
import pandas as pd
import re

In [2]:
# Dictionary with the following structure:
# arbitrary_dirname: path_to_txt_files
# dirname is an arbitrary name which can be used to filter the DB in later steps

dict_dirnames = {
    "eng_monograph_2017": './ICDAR2017_datasetPostOCR_v1.2/ICDAR2017_datasetPostOCR_Full_12M_v1.2/eng_monograph',
    "eng_periodical_2017": './ICDAR2017_datasetPostOCR_v1.2/ICDAR2017_datasetPostOCR_Full_12M_v1.2/eng_periodical',
    "en1_2019": './ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_full_22M_without_Finnish/EN/EN1'
}


In [3]:
db = pd.DataFrame(columns=["index", "OCR_toInput", "GS", "dirname"])

In [4]:
row_counter = 0
for ldir in dict_dirnames.keys():
    list_files = glob(os.path.join(dict_dirnames[ldir], "*.txt"))
    for lfile in list_files:   
        fio = open(lfile, "r")
        fread = fio.readlines()
        db.loc[row_counter] = [int(os.path.basename(lfile).split(".txt")[0]), 
                               fread[0].split("[OCR_toInput] ")[1],
                               fread[2].split("[ GS_aligned] ")[1],
                               ldir
                              ]    
        row_counter += 1

In [7]:
print("Length before removing duplicates: {}".format(len(db)))
print(10*"=")
print("Duplicate rows:")
db[db.duplicated("OCR_toInput")]

Length before removing duplicates: 963
Duplicate rows:


Unnamed: 0,index,OCR_toInput,GS,dirname
889,141,"10* THE CHEATS OF SCAPIN. Scapin. Well,. Sir, ...","@@@@@@@@@@@@@@@@@@@@@@@@@ Scapin. Well,@ Sir, ...",en1_2019
893,82,The PREFACE. Is the Souls of all Mankind be co...,The PREFACE. If the Souls of all Mankind be c...,en1_2019
894,140,"betwixt our condition, and that of the Heathen...","( 99 ) betwixt our condition, and that of the...",en1_2019
918,147,(toy jkilfull mattthe learning of the Ægyptian...,(10) skilfull in all the learning of the Ægyp...,en1_2019
919,146,"huisstm head, for, the, cross is left v in his...","86 A Mison head, for@ the@ croſs is left@@ i...",en1_2019
927,135,PREFACE. VU Abp. Trench could tell us nothing ...,PREFACE. vii Abp. Trench could tell us nothin...,en1_2019
929,109,■\n,@\n,en1_2019
932,134,[8*3 shutting out and Retraction of the fen- s...,[ 89 ] out shutting out and Retraction of the...,en1_2019
933,21,Jcadmu Sckntkrum. v 19 Secttio Tertia. Aftrolo...,@@@@@@@@@@@@@@@@@@@@@@ Se@tio Tertia.@@@@@@@@...,en1_2019
938,136,24 CBA%ACTE\S An vnvortbie Knight. i$« 9 1 4 H...,24 CHARACTERS An vnworthie Knight. 15. A wor‑...,en1_2019


### Remove duplicate rows

In [8]:
db = db.drop_duplicates("OCR_toInput")
print("Length after removing duplicates: {}".format(len(db)))
db.head()

Length after removing duplicates: 951


Unnamed: 0,index,OCR_toInput,GS,dirname
0,289,BK. III. 364 AN ENGLISH ANTHOLOGY. Over many a...,####################################### many a...,eng_monograph_2017
1,504,47 K0BEET THE DETJYLL. How God sent an aunged ...,########################## God sent an aungell...,eng_monograph_2017
2,262,bk. Ill 322 AN ENGLISH ANTHOLOGY. The and The ...,############################################# ...,eng_monograph_2017
3,276,SECT. IV. SWEET AUBURN. 341 E'en children foll...,################################ children foll...,eng_monograph_2017
4,510,ROBERT THE DETJYLE. 53 shewed hym of this befo...,############################# hym of this befo...,eng_monograph_2017


## Preprocess

In [9]:
def cleanup(corpus):
    # remove all # and @
    corpus = [re.sub(r'#', '', element, flags=re.IGNORECASE) for element in corpus]
    corpus = [re.sub(r'@', '', element, flags=re.IGNORECASE) for element in corpus]
    # remove multiple spaces
    corpus = [re.sub(r'\s+', ' ', element, flags=re.IGNORECASE) for element in corpus]
    corpus = [element.strip() for element in corpus]
    #corpus = [element.lower() for element in corpus]
    return corpus

In [10]:
db["GS_cleaned"] = cleanup(db["GS"])
db["OCR_cleaned"] = cleanup(db["OCR_toInput"])

In [11]:
db.head()

Unnamed: 0,index,OCR_toInput,GS,dirname,GS_cleaned,OCR_cleaned
0,289,BK. III. 364 AN ENGLISH ANTHOLOGY. Over many a...,####################################### many a...,eng_monograph_2017,"many a tangled spray, All heart-broke, I heard...",BK. III. 364 AN ENGLISH ANTHOLOGY. Over many a...
1,504,47 K0BEET THE DETJYLL. How God sent an aunged ...,########################## God sent an aungell...,eng_monograph_2017,God sent an aungell to the heremyte that he sh...,47 K0BEET THE DETJYLL. How God sent an aunged ...
2,262,bk. Ill 322 AN ENGLISH ANTHOLOGY. The and The ...,############################################# ...,eng_monograph_2017,laughing flowers that round them blow Drink li...,bk. Ill 322 AN ENGLISH ANTHOLOGY. The and The ...
3,276,SECT. IV. SWEET AUBURN. 341 E'en children foll...,################################ children foll...,eng_monograph_2017,"children followed, with endearing wile, And pl...",SECT. IV. SWEET AUBURN. 341 E'en children foll...
4,510,ROBERT THE DETJYLE. 53 shewed hym of this befo...,############################# hym of this befo...,eng_monograph_2017,hym of this before sayd knyght he hadde many t...,ROBERT THE DETJYLE. 53 shewed hym of this befo...


## Remove not-processed columns?

In [12]:
# remove columns
list_col2remove = ["OCR_toInput", "GS"]
db = db.drop(columns=list_col2remove)
db.head()

Unnamed: 0,index,dirname,GS_cleaned,OCR_cleaned
0,289,eng_monograph_2017,"many a tangled spray, All heart-broke, I heard...",BK. III. 364 AN ENGLISH ANTHOLOGY. Over many a...
1,504,eng_monograph_2017,God sent an aungell to the heremyte that he sh...,47 K0BEET THE DETJYLL. How God sent an aunged ...
2,262,eng_monograph_2017,laughing flowers that round them blow Drink li...,bk. Ill 322 AN ENGLISH ANTHOLOGY. The and The ...
3,276,eng_monograph_2017,"children followed, with endearing wile, And pl...",SECT. IV. SWEET AUBURN. 341 E'en children foll...
4,510,eng_monograph_2017,hym of this before sayd knyght he hadde many t...,ROBERT THE DETJYLE. 53 shewed hym of this befo...


In [13]:
db.to_pickle("db_ICDAR.pkl")