### **Document Colorizer**

#### **Import necessary libraries**

In [3]:
import os
import random
import shutil

import cv2
import numpy as np

#### **Parse a set of documents as background images to paste checkboxes onto**

##### **RVL-CDIP**

The dataset contains 16 types of documents:
0 letter
1 form
2 email
3 handwritten
4 advertisement
5 scientific report
6 scientific publication
7 specification
8 file folder
9 news article
10 budget
11 invoice
12 presentation
13 questionnaire
14 resume
15 memo

To reduce noise in the training set as much as possible, we might look at categories 0, 2, 3, 6, 9, and 12.

In [1]:
# Map each RVL-CDIP label id (0-15) to a file-name-friendly category name.
# The position of a name in the list below is its label id.
DOC_CATEGORIES = dict(enumerate([
    "letter",
    "form",
    "email",
    "handwritten",
    "advertisement",
    "scientific_report",
    "scientific_publication",
    "specification",
    "file_folder",
    "news_article",
    "budget",
    "invoice",
    "presentation",
    "questionnaire",
    "resume",
    "memo",
]))

# Label ids of the document types used as source backgrounds.
# NOTE(review): the accompanying markdown also mentions category 12
# (presentation), which is not selected here -- confirm which is intended.
categories = [0, 2, 3, 6, 9]

In [4]:
# Root directory that the relative image paths in the label file are joined onto.
train_dir = "datasets/rvl-cdip/images"
# Label file read line by line; each line is split on spaces into a relative
# image path (first field) and an integer category id (second field).
train_txt = "datasets/rvl-cdip/labels/train.txt"

In [None]:
# Copy up to MAX_SOURCE_IMAGES documents from the selected categories into the
# training image folder; they serve as backgrounds to paste checkboxes onto.
MAX_SOURCE_IMAGES = 10000
count = 0

# Make sure the destination exists so the first copy does not fail.
os.makedirs("datasets/train_val/rvl_boxes/train/images", exist_ok=True)

with open(train_txt, 'r') as train_folder:
    for line in train_folder:
        # Stop reading the label file as soon as enough images were copied.
        if count >= MAX_SOURCE_IMAGES:
            break
        line_array = line.split()
        # Skip blank or malformed lines instead of raising IndexError.
        if len(line_array) < 2:
            continue
        cat_num = int(line_array[1])
        if cat_num not in categories:
            continue
        source_path = os.path.join(train_dir, line_array[0])
        # Destination name: "<running index>_<category name>.jpg".
        # NOTE(review): shutil.copy copies the bytes unchanged, so the file
        # keeps its original encoding even though it is named .jpg -- confirm
        # that downstream readers detect the format from content.
        new_path = f"datasets/train_val/rvl_boxes/train/images/{str(count) + '_' + DOC_CATEGORIES[cat_num] + '.jpg'}"
        shutil.copy(source_path, new_path) # Copy the image
        count += 1

##### **Colorize**

The existing documents are entirely grayscale, so they differ greatly from real-life documents, which can appear in a variety of colors and settings. We therefore apply a set of colorization processes to a portion of the data.

Define helper functions and constants

In [6]:
# Candidate color schemes for recoloring documents. Each entry is a
# (text color, background color) pair of "#RRGGBB" hex strings: index 0 is
# applied to the darker pixels and index 1 to the lighter pixels.
COLOR_COMBINATIONS = [ # Text, Background
    ("#2F3C7E", "#FBEAEB"),
    ("#317773", "#E2D1F9"),
    ("#990011", "#FCF6F5"),
    ("#FFFFFF", "#8AAAE5"), 
    ("#EE4E34", "#FCEDDA"),
    ("#3A6B35", "#CBD18F"),
    ("#00008B", "#ADD8E6"),
    ("#101820", "#FBF8BE"),
    ("#000000", "#E7E8D1"),
    ("#101820", "#FCF6F5"),
    ("#2C5F2D", "#FCF6F5"),
    ("#000000", "#FCEDDA"),
    ("#A41681", "#B2FAE6"),
    ("#B2FAE6", "#A41681"),
    ("#081B26", "#1E90FF"),
    ("#008080", "#FDFEFD")
]

In [7]:
def convertHexToRGB(value):
    """Convert a hex color string into a list of its three channel values.

    Args:
        value: Color such as "#2F3C7E" or "2F3C7E". The 3-digit shorthand
            ("#FFF") is expanded to its 6-digit form ("#FFFFFF") first.

    Returns:
        A list ``[red, green, blue]`` of ints, in that order. Libraries that
        store pixels as BGR (e.g. OpenCV) need the list reversed.

    Raises:
        ValueError: If the string is empty, its digit count is not a multiple
            of three, or it contains non-hexadecimal characters.
    """
    value = value.lstrip('#')
    # Expand shorthand so "FFF" means 255 per channel rather than 15.
    if len(value) == 3:
        value = ''.join(digit * 2 for digit in value)
    lv = len(value)
    if lv == 0 or lv % 3 != 0:
        raise ValueError(f"Invalid hex color: {value!r}")
    step = lv // 3
    return [int(value[i:i + step], 16) for i in range(0, lv, step)]

In [5]:
# File names (not sub-directories) directly inside the training image folder.
document_image_paths = next(os.walk("datasets/train_val/rvl_boxes/train/images/"))[2]
# Get all unedited images
complete_document_image_paths = [
    os.path.join("datasets/train_val/rvl_boxes/train/images/", x)
    for x in document_image_paths
    if ".DS_Store" not in x
]
# Get a random group to apply colorization.
# Sample WITHOUT replacement: random.choices can return the same path more than
# once, which would recolor an already-colorized image. The size is capped at
# the number of available images so small datasets do not raise ValueError.
transform_group = random.sample(
    complete_document_image_paths,
    k=min(4000, len(complete_document_image_paths)),
)

Randomly change background, foreground color of images

In [65]:
# Recolor every selected document in place: dark pixels take the text color of
# a randomly chosen scheme and light pixels take its background color.
for image_path in transform_group:
    text_hex, background_hex = random.choice(COLOR_COMBINATIONS)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for unreadable files; skip them.
        continue

    # cv2.imread yields pixels in BGR order while convertHexToRGB returns
    # [R, G, B], so reverse the channels before assigning them.
    text_bgr = convertHexToRGB(text_hex)[::-1]
    background_bgr = convertHexToRGB(background_hex)[::-1]

    # Build both masks before modifying the image so that the second mask is
    # not affected by the first assignment. A pixel is "dark" when every
    # channel is <= 128 and "light" when every channel is > 128; pixels with
    # mixed channels are left untouched.
    darker_pixels = (img <= 128).all(axis=2)
    lighter_pixels = (img > 128).all(axis=2)

    img[darker_pixels] = text_bgr
    img[lighter_pixels] = background_bgr
    cv2.imwrite(image_path, img) # Overwrite the original black-and-white image

In [66]:
# Re-read the training image folder: keep only the file names found directly
# inside it (third element of the first os.walk() result).
document_image_paths = next(os.walk("datasets/train_val/rvl_boxes/train/images/"))[2]
# Build the full path of every image, leaving out macOS .DS_Store entries.
complete_document_image_paths = [
    os.path.join("datasets/train_val/rvl_boxes/train/images/", file_name)
    for file_name in document_image_paths
    if ".DS_Store" not in file_name
]