# Used datasets are:
- [Russian handwriting](https://github.com/ZackPashkin/Cyrillic-Handwriting-Dataset?tab=readme-ov-file)
- [math formulas](https://researchdata.edu.au/crohme-competition-recognition-expressions-png/639782)

### First, we need to separate the photos with a white background from the others

In [2]:
import os
import shutil
import argparse
import numpy as np
from numpy import typing as np_typing

from PIL import Image

In [16]:
# Minimum value every channel of a pixel must reach for it to count as white.
WHITENESS_THRESH = 220
# Minimum fraction of white pixels for an image to be classified as white.
WHITE_PERCENT = 0.87


def count_white_pixels(pixels: np_typing.NDArray) -> int:
    """Return how many pixels have every channel >= WHITENESS_THRESH.

    Args:
        pixels: 3-D array whose last axis (axis 2) holds the channel values
            of one pixel; the reduction runs over that axis.

    Returns:
        The number of white pixels as a built-in ``int``.
    """
    white_mask = np.all(pixels >= WHITENESS_THRESH, axis=2)
    # np.sum yields a NumPy scalar; convert so the result matches the
    # declared ``int`` return type.
    return int(np.sum(white_mask))
    


def is_white(image_path: str) -> bool:
    """Tell whether the image at ``image_path`` is predominantly white.

    The image is converted to RGB, and the share of pixels accepted by
    ``count_white_pixels`` is compared against ``WHITE_PERCENT``.
    """
    with Image.open(image_path) as source:
        rgb = np.array(source.convert("RGB"))
    height, width = rgb.shape[0], rgb.shape[1]
    white_fraction = count_white_pixels(rgb) / (height * width)
    return white_fraction >= WHITE_PERCENT

# Lower-case file extensions (with leading dot) that are treated as images.
# The tuple form is what str.endswith accepts in filter(); the set form is
# used for membership tests in collect_images().
EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".gif")
EXTENSIONS_S = set(EXTENSIONS)


def _flat_name(fname: str, root: str, from_dir: str, taken: set) -> str:
    """Pick a file name for the flat output directory that is not in ``taken``.

    The first file with a given name keeps it. Later files with the same name
    get the sub-directory path (relative to ``from_dir``) as a prefix, plus a
    numeric suffix if that is still not unique.
    """
    if os.path.normcase(fname) not in taken:
        return fname
    rel = os.path.relpath(root, from_dir)
    prefix = "" if rel == os.curdir else rel.replace(os.sep, "_") + "_"
    stem, ext = os.path.splitext(fname)
    candidate = f"{prefix}{fname}"
    suffix = 1
    while os.path.normcase(candidate) in taken:
        candidate = f"{prefix}{stem}_{suffix}{ext}"
        suffix += 1
    return candidate


def filter(from_dir: str, to_dir: str) -> None:
    """Copy every mostly-white image found under ``from_dir`` into ``to_dir``.

    ``from_dir`` is walked recursively; files whose extension is in
    ``EXTENSIONS`` and for which ``is_white`` returns true are copied (with
    metadata, via ``shutil.copy2``) into the flat directory ``to_dir``, which
    is created if missing. Files that cannot be read or copied are reported
    on stdout and skipped.

    Note: the name shadows the built-in ``filter``; it is kept because
    existing callers use it.

    Args:
        from_dir: Root directory to scan.
        to_dir: Destination directory for the selected images.
    """
    os.makedirs(to_dir, exist_ok=True)
    out_abs = os.path.abspath(to_dir)
    # Names already written during this run, so that equally named files from
    # different sub-directories do not silently overwrite each other.
    taken = set()

    for root, dirs, files in os.walk(from_dir):
        # Do not descend into the output directory if it lives under from_dir.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != out_abs
        ]
        for fname in files:
            if not fname.lower().endswith(EXTENSIONS):
                continue
            src = os.path.join(root, fname)
            try:
                if not is_white(src):
                    continue
                name = _flat_name(fname, root, from_dir, taken)
                shutil.copy2(src, os.path.join(to_dir, name))
                taken.add(os.path.normcase(name))
            except Exception as e:
                # Best-effort: one unreadable file must not abort the scan.
                print(f"Skipping {src}: {e}")
    

In [11]:
# Copy the mostly-white images found under data/math into data/math_out.
filter('data/math', 'data/math_out')

In [None]:
# Copy the mostly-white images found under data/russian into data/russian_out.
filter('data/russian', 'data/russian_out')

### Then combine the filtered images into composite images (and also generate labels)

In [29]:
import random

# (width, height) in pixels of each generated composite image.
CANVAS_SIZE = (1920, 1080)
# Number of source images drawn per class for each composite.
COUNT_PER_SAMPLE = 6
# Bounds of the uniformly drawn per-image scale multiplier.
MIN_RANDOM_SCALE = 0.5
MAX_RANDOM_SCALE = 1.0

# Maximum number of random positions tried per image before it is dropped.
MAX_ATTEMPTS = 10

# Extra per-class scale factors passed to place_list in generate_composite:
# RUSSIAN_SCALE for class 0 (imgs_A), MATH_SCALE for class 1 (imgs_B).
RUSSIAN_SCALE = 1.4
MATH_SCALE = 0.25

def generate_composite(
    imgs_A, imgs_B, out_dir, idx,
):
    """Build one composite image and its label file.

    Up to ``COUNT_PER_SAMPLE`` images from ``imgs_A`` (class 0) and from
    ``imgs_B`` (class 1) are randomly scaled and pasted at random,
    non-overlapping positions on a white canvas of ``CANVAS_SIZE``. An image
    that finds no free spot within ``MAX_ATTEMPTS`` tries is dropped.

    Writes ``img_{idx:05d}.jpg`` and ``img_{idx:05d}.txt`` into ``out_dir``.
    Each label line is ``class x_center y_center width height`` with the four
    coordinates divided by the canvas width/height.

    Args:
        imgs_A: Paths of class-0 images (scaled by ``RUSSIAN_SCALE``).
        imgs_B: Paths of class-1 images (scaled by ``MATH_SCALE``).
        out_dir: Existing directory that receives both output files.
        idx: Integer used to number the output files.
    """
    canvas_w, canvas_h = CANVAS_SIZE

    canvas = Image.new("RGB", CANVAS_SIZE, (255, 255, 255))

    bboxes = []
    placements = []

    def overlaps(rect):
        """Return True if ``rect`` intersects any rectangle placed so far."""
        x1, y1, x2, y2 = rect
        return any(
            x1 < ox2 and x2 > ox1 and y1 < oy2 and y2 > oy1
            for (ox1, oy1, ox2, oy2) in placements
        )

    def place_list(img_paths, class_idx, scale_factor=1.0):
        """Sample images from ``img_paths`` and paste them onto the canvas."""
        if not img_paths:
            # Nothing to sample from; random.choices would raise on an
            # empty population.
            return
        samples = (random.sample(img_paths, COUNT_PER_SAMPLE)
                   if COUNT_PER_SAMPLE <= len(img_paths)
                   else random.choices(img_paths, k=COUNT_PER_SAMPLE))
        for img_path in samples:
            # convert() returns an independent image, so the file handle can
            # be released right away.
            with Image.open(img_path) as opened:
                img = opened.convert("RGB")
            iw, ih = img.size
            max_fit = min(canvas_w/iw, canvas_h/ih, 1.0)
            scale = random.uniform(MIN_RANDOM_SCALE, MAX_RANDOM_SCALE) * max_fit
            nw, nh = int(iw*scale*scale_factor), int(ih*scale*scale_factor)
            if nw < 1 or nh < 1:
                continue
            # scale_factor is applied after max_fit, so a factor above 1 can
            # push the image past the canvas; random.randint would then be
            # given an empty range. Shrink such images to fit.
            if nw > canvas_w or nh > canvas_h:
                shrink = min(canvas_w / nw, canvas_h / nh)
                nw = min(canvas_w, int(nw * shrink))
                nh = min(canvas_h, int(nh * shrink))
                if nw < 1 or nh < 1:
                    continue
            img_resized = img.resize((nw, nh))
            for _ in range(MAX_ATTEMPTS):
                x = random.randint(0, canvas_w-nw)
                y = random.randint(0, canvas_h-nh)
                rect = (x, y, x+nw, y+nh)
                if overlaps(rect):
                    continue
                canvas.paste(img_resized, (x, y))
                placements.append(rect)
                xc = (x+nw/2)/canvas_w
                yc = (y+nh/2)/canvas_h
                bboxes.append((class_idx, xc, yc, nw/canvas_w, nh/canvas_h))
                break

    place_list(imgs_A, class_idx=0, scale_factor=RUSSIAN_SCALE)
    place_list(imgs_B, class_idx=1, scale_factor=MATH_SCALE)

    img_name = f"img_{idx:05d}.jpg"
    label_name = f"img_{idx:05d}.txt"
    canvas.save(os.path.join(out_dir, img_name), quality=95)
    with open(os.path.join(out_dir, label_name), 'w') as lf:
        for cls, xc, yc, w, h in bboxes:
            lf.write(f"{cls} {xc:.6f} {yc:.6f} {w:.6f} {h:.6f}\n")

In [20]:
def collect_images(folder: str):
    """List the paths of the entries directly in ``folder`` with an image extension.

    An entry qualifies when its lower-cased extension is in ``EXTENSIONS_S``.
    Sub-directories are not descended into. Paths are returned joined onto
    ``folder``, in the order ``os.listdir`` yields them.
    """
    matches = []
    for entry in os.listdir(folder):
        _, ext = os.path.splitext(entry.lower())
        if ext in EXTENSIONS_S:
            matches.append(os.path.join(folder, entry))
    return matches

In [35]:
def generate(russian, math, out, n_samples=4):
    """Write ``n_samples`` composite images (and label files) into ``out``.

    Args:
        russian: Folder with the class-0 source images.
        math: Folder with the class-1 source images.
        out: Output folder; created if it does not exist.
        n_samples: Number of composites to produce, numbered from 0.
    """
    os.makedirs(out, exist_ok=True)

    russian_paths = collect_images(russian)
    # The math pool is capped at the size of the Russian pool.
    math_paths = collect_images(math)[:len(russian_paths)]

    for sample_idx in range(n_samples):
        generate_composite(russian_paths, math_paths, out, sample_idx)

In [36]:
# Build the composites from the two filtered folders into data/out_out
# (n_samples is left at its default).
generate('data/russian_out', 'data/math_out', "data/out_out")