In [1]:
import os
import json

In [2]:
# Root folder of the QS-OCR-Large data. The split list files are looked up directly
# inside it, and the document paths listed in those files are resolved relative to it.
BASE_PATH = "../../data/finetune/base/QS-OCR-Large"

# Folder that receives one "<split file name>.jsonl" file per split during preprocessing.
PREPROCESS_SAVE_PATH = "../../data/finetune/base/QS-OCR-Large-save"

In [3]:
# Collect every regular file that sits directly under BASE_PATH (sub-directories are
# skipped). Each of these is later read as a split list of "<document path> <label>" lines.
split_files = [
    candidate
    for candidate in (os.path.join(BASE_PATH, entry) for entry in os.listdir(BASE_PATH))
    if os.path.isfile(candidate)
]

split_files

['../../data/finetune/base/QS-OCR-Large\\text_test.txt',
 '../../data/finetune/base/QS-OCR-Large\\text_train.txt',
 '../../data/finetune/base/QS-OCR-Large\\text_val.txt']

In [4]:
# Class id (as the string token found in the split files) -> human-readable label name.
# Insertion order matters: list(mappings.values()) is also used to define the
# ClassLabel names, so position in this dict fixes the integer id of each class.
# NOTE(review): these look like the 16 RVL-CDIP document classes — confirm against the dataset docs.
mappings = {
    "0": "letter",
    "1": "form",
    "2": "email",
    "3": "handwritten",
    "4": "advertisement",
    "5": "scientific report",
    "6": "scientific publication",
    "7": "specification",
    "8": "file folder",
    "9": "news article",
    "10": "budget",
    "11": "invoice",
    "12": "presentation",
    "13": "questionnaire",
    "14": "resume",
    "15": "memo"
}

In [8]:
import re

# Matches any run of three or more consecutive newline characters.
newline_spam = re.compile("\n{3,}")


def normalize_text(text: str):
    """Return `text` with every run of 3+ newlines squeezed down to exactly two.

    Runs of one or two newlines, and all other characters, are left untouched.
    """
    return newline_spam.sub("\n\n", text)

def is_removed(text: str):
    """Tell whether a document should be dropped because it has no visible content.

    Returns True when `text` is empty or consists solely of whitespace
    (spaces, tabs, newlines, ...), and False otherwise.
    """
    # str.strip() with no argument already removes tabs and newlines at both ends,
    # so a non-empty remainder means there is at least one non-whitespace character.
    return not text.strip()

In [6]:
from tqdm import tqdm

In [9]:
# Convert every split list file into a JSON-lines file under PREPROCESS_SAVE_PATH.
# Each input line is "<document path relative to BASE_PATH> <label id>"; each output
# line is {"text": <normalized document text>, "label": <label name>}.
# Documents whose text is empty / whitespace-only are skipped.

# Create the output folder (and any missing parents) once, up front.
os.makedirs(PREPROCESS_SAVE_PATH, exist_ok=True)

for split_file_path in split_files:
    save_path = os.path.join(PREPROCESS_SAVE_PATH, os.path.basename(split_file_path) + ".jsonl")

    with open(save_path, mode="w", encoding="UTF-8") as save_file:
        with open(split_file_path, mode="r", encoding="UTF-8") as split_file:

            # First pass only counts lines so the progress bar has a total; then rewind.
            total_lines = sum(1 for _ in split_file)
            split_file.seek(0)

            with tqdm(total=total_lines, unit="lines", unit_scale=True, desc=split_file_path) as pbar:
                for document_label_path in split_file:
                    pbar.update(1)

                    entry = document_label_path.strip()
                    if not entry:
                        # Tolerate blank lines (e.g. a trailing empty line in the split file).
                        continue

                    # Split on the LAST whitespace run so a document path that itself
                    # contains spaces is still separated correctly from its label.
                    partial_document_path, label = entry.rsplit(maxsplit=1)

                    document_path = os.path.join(BASE_PATH, partial_document_path)

                    with open(document_path, mode="r", encoding="UTF-8") as document_file:
                        # read() keeps the text exactly as stored. Joining readlines()
                        # with "\n" would double every line break, because each line
                        # returned by readlines() already ends with its own newline.
                        document_text = normalize_text(document_file.read())

                    if not is_removed(document_text):
                        save_file.write(json.dumps({
                            "text": document_text,
                            "label": mappings[label]
                        }) + "\n")

../../data/finetune/base/QS-OCR-Large\text_test.txt: 100%|██████████| 40.0k/40.0k [07:01<00:00, 94.9lines/s] 
../../data/finetune/base/QS-OCR-Large\text_train.txt: 100%|██████████| 320k/320k [1:12:24<00:00, 73.7lines/s] 
../../data/finetune/base/QS-OCR-Large\text_val.txt: 100%|██████████| 40.0k/40.0k [09:18<00:00, 71.7lines/s]


In [10]:
from datasets import load_dataset, Features, Value, ClassLabel, DatasetDict, Dataset
import pandas as pd

In [11]:
# Locations of the JSON-lines files written by the preprocessing step, one per split.
# Built from PREPROCESS_SAVE_PATH with os.path.join so the paths work on any OS
# (hard-coded backslash paths only resolve on Windows) and stay in sync with the
# folder the preprocessing loop writes to.
data_files = {
    "train": os.path.abspath(os.path.join(PREPROCESS_SAVE_PATH, "text_train.txt.jsonl")),
    "validation": os.path.abspath(os.path.join(PREPROCESS_SAVE_PATH, "text_val.txt.jsonl")),
    "test": os.path.abspath(os.path.join(PREPROCESS_SAVE_PATH, "text_test.txt.jsonl"))
}

# Target schema: free text plus a class label whose integer ids follow the order of
# the values in `mappings`.
features = Features({
    "text": Value("string"),
    "label": ClassLabel(names=list(mappings.values()))
})

# Load each split through pandas, then cast to the schema above so that the string
# label names are encoded as ClassLabel values.
dataset = DatasetDict({
    key: Dataset.from_pandas(pd.read_json(value, lines=True)).cast(features)
    for key, value in data_files.items()
})

dataset

Casting the dataset:   0%|          | 0/308026 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/38498 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/38520 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 308026
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 38498
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 38520
    })
})

In [12]:
# Persist the whole DatasetDict (train / validation / test) to the folder "<BASE_PATH>-raw".
dataset.save_to_disk(BASE_PATH + "-raw")

Saving the dataset (0/1 shards):   0%|          | 0/308026 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/38498 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/38520 [00:00<?, ? examples/s]