# **Creating Synthetic Dataset for Document Segmentation**

In [1]:
import os
import sys 
sys.path.append('..' + os.sep)

import torch
import torchvision
import numpy as np 

from src.models.unet import UNet
from PIL import Image
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
import matplotlib.pyplot as plt 
import requests
from bs4 import BeautifulSoup
import shutil 
import random 
import cv2

## **Assembling Document Images**

In my research, I could not more/ reliable data sources for documents in context. or real world images of documents with a wide variety of different perspectives / lighting / backgrounds. So we assemble a synthetic dataset comprised of document images from OCR / denoising / binarization datasets, superimposed and augmented onto backgrounds aggregated from google images. 

In total, we will have 4,363 images compiled from the following datasets: 
| Dataset Name                                                                 | Number of Images | Format | Size      | 
|------------------------------------------------------------------------------|------------------|--------|-----------|
| [LRDE Document Binarization Dataset](https://www.lrde.epita.fr/wiki/Olena/DatasetDBD#Data) | 125 | png    | 2516x3712 |
| [FUNSD: Form Understanding in Noisy Scanned Docs](https://guillaumejaume.github.io/FUNSD/)  | 199 | png    | 754x1000 |
| [IAM Handwritten Forms Dataset](https://www.kaggle.com/datasets/naderabdalghani/iam-handwritten-forms-dataset?select=data) | 1,539 | png | 2479x3542 |
| [DocVA: Document Collection VQA](https://www.docvqa.org/datasets/doccvqa) | 12,768 (only using 1,000) | png | 1682x2159 | 
| [The DocBank Dataset (Part 1)](https://doc-analysis.github.io/docbank-page/index.html) | 54,984 (only using 1,500) | jpg | 773x1000 | 

TODO SAMPLES FROM EACH DATASET 

TODO AGG$REGATION

In [7]:
# First 
QUERIES = [
    "table images top view",
    "wooden table texture",
    "marble texture",
    "carpet texture",
    "concrete wall texture",
    "whiteboard background",
    "brick wall texture",
    "office desk top view",
    "linen tablecloth texture background",
    "leather texture",
    "metal surface texture",
    "table images top view",
    "sidewalk top view",
    "sand texture",
    "grass field top view",
    "stone pavement texture",
    "wooden deck texture",
    "forest floor texture",
    "rock surface texture",
    "colorful backgrounds",
    "colorful desk background",
    "dark desk background",
    "light desk background",
    "floor texture"
    ]

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)
HEADERS = {
    "User-Agent": USER_AGENT
}


def fetch_image_urls(query: str, max_links: int) -> list:
    search_url = f"https://www.google.com/search?site=&tbm=isch&q={query}"
    response = requests.get(search_url, headers=HEADERS)
    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Trying an alternate way to fetch the image urls
    img_tags = soup.select('.rg_i.Q4LuWd')
    img_urls = [img.get('data-src') or img.get('data-iurl') for img in img_tags]
    img_urls = [url for url in img_urls if url is not None and url.startswith(('http:', 'https:'))]

    return img_urls[:max_links]


def download_images_from_query(queries: list, max_images_per_query: int):
    for query in queries:
        img_urls = fetch_image_urls(query, max_images_per_query)
        for i, url in enumerate(img_urls):
            response = requests.get(url)
            filename = os.path.join("..","data","documents","google", f"{query.replace(' ', '_')}_{i + 1}.jpg")
            with open(filename, 'wb') as f:
                f.write(response.content)

download_images_from_query(QUERIES, 100)  


In [6]:



source = os.path.join("..","data","documents","raw","DocBank_500K_ori_img.zip","images")
dest =  os.path.join("..","data","documents","raw","DocBank_500K_ori_img.zip","sample")


if not os.path.exists(source):
    raise ValueError(f"Source directory {source} does not exist.")

# Get a list of all images in the source directory
all_images = [f for f in os.listdir(source) if os.path.isfile(os.path.join(source, f))]


# Randomly sample images
sampled_images = random.sample(all_images, 1500)

# Create destination directory if it doesn't exist
if not os.path.exists(dest):
    os.makedirs(dest)

# Copy the sampled images to the destination directory
for image in sampled_images:
    shutil.copy2(os.path.join(source, image), os.path.join(dest, image))
print(f"Copied {1000} images from {source} to {dest}.")


Copied 1000 images from ..\data\documents\raw\DocBank_500K_ori_img.zip\images to ..\data\documents\raw\DocBank_500K_ori_img.zip\sample.


In [12]:
dirPath = "E:\\GitHub\\docUNET-Pytorch\\data\\document_dataset_resized\\valid\\images"
files = os.listdir(dirPath)
SIZE = (312,312)


for f in tqdm(files): 
    fpath = os.path.join(dirPath,f)
    img = cv2.imread(fpath)
    resized = cv2.resize(img,SIZE,cv2.INTER_NEAREST)
    #plt.imshow(resized)
    cv2.imwrite(fpath,resized)

100%|██████████| 1343/1343 [00:16<00:00, 79.81it/s]


: 