In [1]:
from pathlib import Path

# Folder layout used by the whole notebook:
#   dataset/dirty -- raw text files to be cleaned
#   dataset/clean -- cleaned paragraphs, one CSV per input file
DATA = Path('./dataset')
CLEAN = DATA / 'clean'
DIRTY = DATA / 'dirty'

# Path.mkdir() returns None, so chaining the calls with `and` would stop after
# the first one; create each folder with its own call instead.
for folder in (DATA, CLEAN, DIRTY):
    folder.mkdir(parents=True, exist_ok=True)

In [2]:
from ipyfilechooser import FileChooser
from ipywidgets import Layout

# Interactive picker, rooted at the DIRTY folder and restricted to .txt files,
# used to choose which raw text file the rest of the notebook processes.
chooser_layout = Layout(width="900px")
fc = FileChooser(
    DIRTY,
    title="Select TXT file to extract",
    filter_pattern="*.txt",
    layout=chooser_layout,
)
display(fc)

FileChooser(path='/home/bdrad/Documents/Vogel/pdf-text-cleanup/pdf_text_cleanup/dataset/dirty', filename='', t…

* [30181](https://www.gutenberg.org/ebooks/30181) -- _Fungi: Their Nature and Uses_ by M. C. Cooke


In [4]:
# fc.selected stays None until a file has actually been picked in the widget,
# and Path(None) would fail with an unhelpful TypeError -- fail with a clear message instead.
if fc.selected is None:
    raise RuntimeError(
        "No file selected: choose a .txt file in the FileChooser above, then re-run this cell."
    )

fname = Path(fc.selected)
assert fname.is_file(), f"Selected path is not a file: {fname}"
print(f"File selected: [{fname.name}]({fname.as_uri()})")

File selected: [30181.txt](file:///home/bdrad/Documents/Vogel/pdf-text-cleanup/pdf_text_cleanup/dataset/dirty/30181.txt)


# Clean text file

Note that I have manually pruned the text, clipping the beginning and end and removing various tables and indented portions.

In [27]:
import re
import csv
from pqdm.processes import pqdm
# from pqdm.threads import pqdm

In [95]:
# Cleaned paragraphs of the selected file are stored as <CLEAN>/<file stem>.csv
CLEAN_CSV = CLEAN / f"{fname.stem}.csv"
# Folder that receives the augmented CSV
READY = DATA / 'ready'

In [32]:
# Load the whole (manually pruned) text file into memory.
with fname.open('r') as fin:
    data = fin.read()

# Collapse runs of blank lines so paragraphs are separated by exactly one blank line.
data = re.sub(r'\n\n+', '\n\n', data)

# Keep only paragraphs that have more than 7 space-separated tokens and that
# do not contain the abbreviation '&c'.
bodies = []
for paragraph in data.split('\n\n'):
    if '&c' in paragraph:
        continue
    if len(paragraph.strip().split(' ')) <= 7:
        continue
    bodies.append(paragraph)

def parse(txt):
    """Turn one raw paragraph into a single clean line of text.

    Args:
        txt: A paragraph of the source text, possibly spanning several lines.

    Returns:
        The cleaned paragraph on one line, or '' if the paragraph is an
        illustration caption (starts with '[Illustration:', optionally preceded
        by a double quote).
    """
    if re.match(r'"?\[Illustration:', txt):
        return ''
    txt = re.sub(r'\n', ' ', txt)   # Remove newlines; make one line
    txt = re.sub(r'\[[a-zA-Z]+\]', r'', txt)  # Remove citations such as [A]
    txt = re.sub(r'\*\*(.*?)\*\*', r'\1', txt)  # Remove bolds enclosed within **
    txt = re.sub(r'_(.*?)_', r'\1', txt)    # Remove italics enclosed within _
    # Drop a period that directly precedes a dash pair ("1.--A" -> "1--A").
    # The dot must be escaped: an unescaped '.' matches ANY character and would
    # eat the last letter of words such as "well--known".
    txt = re.sub(r'\.--', r'--', txt)
    return txt

# Clean every paragraph with parse(), spread over 4 worker processes.
result = pqdm(bodies, parse, n_jobs=4)

# Make sure the destination folder exists before writing into it.
CLEAN_CSV.parent.mkdir(parents=True, exist_ok=True)

# The csv module expects its files to be opened with newline='' so that it can
# control line endings itself (otherwise extra '\r' may be written on some platforms).
with open(CLEAN_CSV, 'w', newline='') as fout:
    writer = csv.writer(fout)
    # One single-column row per paragraph; paragraphs that parse() blanked out are skipped.
    out = [[r] for r in result if r]
    writer.writerows(out)
    print(f"Wrote {len(out)} rows to {CLEAN_CSV}")

QUEUEING TASKS | :   0%|          | 0/513 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/513 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/513 [00:00<?, ?it/s]

Wrote 474 rows to dataset/clean/30181.csv


# Augment Text

In [None]:
# Output folder for the augmented dataset. This has to be READY (a sub-folder
# of DATA), because that is where the augmented CSV is written; a bare
# Path('ready') would create an unrelated ./ready folder instead.
out = READY
out.mkdir(parents=True, exist_ok=True)

In [101]:
# Load the cleaned paragraphs back from disk. newline='' is what the csv
# module expects of files handed to csv.reader.
with open(CLEAN_CSV, 'r', newline='') as fin:
    reader = csv.reader(fin)
    # Every row holds the fields of one CSV line; flatten them into one list of strings.
    data = [field for row in reader for field in row]

['The most casual observer of Nature recognizes in almost every instance that comes under his notice in every-day life, without the aid of logical definition, the broad distinctions between an animal, a plant, and a stone. To him, the old definition that an animal is possessed of life and locomotion, a plant of life without locomotion, and a mineral deficient in both, seems to be sufficient, until some day he travels beyond the circuit of diurnal routine, and encounters a sponge or a zoophyte, which possesses only one of his supposed attributes of animal life, but which he is assured is nevertheless a member of the animal kingdom. Such an encounter usually perplexes the neophyte at first, but rather than confess his generalizations to have been too gross, he will tenaciously contend that the sponge must be a plant, until the evidence produced is so strong that he is compelled to desert his position, and seek refuge in the declaration that one kingdom runs into the other so imperceptibl

In [105]:
import random


# Probability that any single space of the input is removed by aug_text
SPACE_DROP = 0.20
# Probability that aug_text inserts a space after a non-space character that is
# directly followed by another non-space character
SPACE_ADD = 0.15
# NOTE(review): the two HYPHEN_* settings are not referenced by any code visible
# in this notebook -- presumably meant for a hyphenation augmentation; confirm or remove
HYPHEN_ADD = 0.70
HYPHEN_STRIDE = 40

def aug_text(text, space_drop=None, space_add=None, rng=None):
    """Return a noisy copy of ``text`` with spaces randomly removed and inserted.

    Every space of the input is dropped with probability ``space_drop``. After
    every non-space character that is neither the first nor the last character
    of ``text`` and that is directly followed by another non-space character,
    a space is inserted with probability ``space_add``.

    Args:
        text: The clean string to corrupt.
        space_drop: Probability of removing a space. Defaults to the
            module-level SPACE_DROP when None.
        space_add: Probability of inserting a space. Defaults to the
            module-level SPACE_ADD when None.
        rng: Optional object with a ``random()`` method (e.g. a seeded
            ``random.Random``) for reproducible output. Defaults to the
            global ``random`` module.

    Returns:
        The augmented string ('' for empty input).
    """
    # Resolve defaults at call time so a later change of the constants is picked up.
    if space_drop is None:
        space_drop = SPACE_DROP
    if space_add is None:
        space_add = SPACE_ADD
    draw = random.random if rng is None else rng.random

    last = len(text) - 1
    # Collect the pieces in a list and join once instead of growing a string.
    pieces = []
    for i, char in enumerate(text):
        if char == ' ':
            if draw() >= space_drop:
                pieces.append(char)
            continue
        pieces.append(char)
        # Only split inside a word: never after the first or last character,
        # and never right before an existing space.
        if 0 < i < last and text[i + 1] != ' ' and draw() < space_add:
            pieces.append(' ')
    return ''.join(pieces)


# Corrupt every clean paragraph with aug_text(), spread over 4 worker processes.
aug_ds = pqdm(data, aug_text, n_jobs=4)

READY_CSV = READY / CLEAN_CSV.name
# The target folder is not guaranteed to exist yet; create it rather than fail in open().
READY.mkdir(parents=True, exist_ok=True)

# newline='' is what the csv module expects of files handed to csv.writer.
with open(READY_CSV, 'w', newline='') as fout:
    csv_writer = csv.writer(fout)
    csv_writer.writerow(['text', 'label'])
    # 'text' is the augmented paragraph, 'label' the clean paragraph it was made from
    csv_writer.writerows(zip(aug_ds, data))
    print(f"Wrote to {READY_CSV}")

QUEUEING TASKS | :   0%|          | 0/474 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/474 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/474 [00:00<?, ?it/s]

Wrote to dataset/ready/30181.csv
