# Data Generator
This notebook is designed to generate data for training an AI model.
## Documents Generator
To ensure the model does not become dependent on specific contexts, we generate documents with highly randomized content. This is achieved using **LibreOffice Calc**, where text from a predefined file is randomly distributed across sheets.

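Make sure the source text file (`text.txt`) contains at least 7,000 words. Additionally, keep the folder structure used for data processing in place (`../data/documents`, `../data/input`, `../data/output`), because the cells below read from and write to these directories.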
### Sheets generator

In [2]:
from odf.opendocument import OpenDocumentSpreadsheet
from odf.style import Style, TextProperties
from odf.table import Table, TableRow, TableCell
from odf.text import P
import random
from numpy.random import choice
from pathlib import Path
from typing import List

# Locations used by the document generator.
# TEXT_FILE_PATH: text file whose whitespace-separated words are the raw material for documents.
TEXT_FILE_PATH = "text.txt"
# OUTPUT_DIRECTORY: folder where generated spreadsheets and the files converted from them are written.
OUTPUT_DIRECTORY = Path("../data/documents")

def load_words(file_path: str) -> List[str]:
    """Return every whitespace-separated token of a UTF-8 text file, in file order."""
    with open(file_path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents.split()
    
# Word pool for document generation, read once from TEXT_FILE_PATH when this cell runs.
words_from_file = load_words(TEXT_FILE_PATH)

def generate_random_font_style() -> Style:
    """Generate a random font style for table cells.

    The style is mostly plain: 12pt is chosen 80% of the time (the four larger
    sizes share the remaining 20% equally), and bold / italic are each enabled
    with a 5% probability, independently of one another.

    Returns:
        A new ``table-cell`` family ``Style`` with a unique name. The caller is
        responsible for registering it on the document.
    """
    # Function-scope import so this block does not depend on another cell's imports.
    import uuid

    # The name must be unique inside a document. A random integer in
    # 1..1_000_000 repeats regularly once a document holds thousands of cells,
    # which registers several different styles under one name; a UUID avoids that.
    style = Style(name=f"Style_{uuid.uuid4().hex}", family="table-cell")

    font_size = choice(["12pt", "14pt", "17pt", "21pt", "26pt"],
                       p=[0.8, 0.05, 0.05, 0.05, 0.05])
    bold = choice(["bold", "normal"], p=[0.05, 0.95])
    italic = choice(["italic", "normal"], p=[0.05, 0.95])

    style.addElement(TextProperties(fontstyle=italic, fontweight=bold, fontsize=font_size))
    return style

def _build_random_rows(words: List[str]) -> List[List[str]]:
    """Distribute ``words`` over rows of cells, occasionally adding decoration cells."""
    rows = []
    num_of_words = len(words)
    i = 0
    while i < num_of_words:
        line_length = random.randint(1, 10)
        # Each decoration is drawn independently, with a 5% chance per row.
        tab_adding = choice([True, False], p=[0.05, 0.95])
        dash_adding = choice([True, False], p=[0.05, 0.95])
        star_adding = choice([True, False], p=[0.05, 0.95])

        # An empty leading cell acts as an indent; it does not count towards line_length.
        row_data = [""] if tab_adding else []

        for _ in range(line_length):
            if i >= num_of_words:
                break

            # A marker cell uses up one slot of the row but consumes no word.
            if dash_adding:
                row_data.append("-")
                dash_adding = False
            elif star_adding:
                row_data.append("*")
                star_adding = False
            else:
                row_data.append(words[i])
                i += 1

        rows.append(row_data)
    return rows


def generate_random_file_content(file_name: str):
    """Generate an OpenDocument spreadsheet with random text formatting.

    A random slice of a shuffled copy of the module-level ``words_from_file``
    list is laid out in rows of 1-10 cells, every cell gets its own style from
    ``generate_random_font_style``, and the result is saved as
    ``OUTPUT_DIRECTORY / f"{file_name}.ods"``.

    Args:
        file_name: Base name of the output file (without extension); also used
            as the sheet name.

    Raises:
        ValueError: If ``words_from_file`` is empty.
    """
    total_words = len(words_from_file)
    if total_words == 0:
        raise ValueError("The word list is empty; cannot generate a document.")

    # Shuffle a copy so the shared word list is not reordered as a side effect.
    shuffled_words = random.sample(words_from_file, total_words)

    # Slice bounds are clamped so word lists shorter than 102 entries still work;
    # for longer lists the ranges are the usual 0..101 and 102..len.
    start = random.randint(0, min(101, total_words - 1))
    end = random.randint(min(102, total_words), total_words)
    words = shuffled_words[start:end]

    # Create a new spreadsheet document
    doc = OpenDocumentSpreadsheet()
    table = Table(name=file_name)

    # Generate table content
    file_content = _build_random_rows(words)

    # Add rows and cells to the table
    for row_data in file_content:
        row = TableRow()
        for cell_data in row_data:
            font_style = generate_random_font_style()
            # The style has to be registered on the document before a cell can use it.
            doc.automaticstyles.addElement(font_style)
            cell = TableCell(stylename=font_style)
            cell.addElement(P(text=cell_data))
            row.addElement(cell)
        table.addElement(row)

    # Add table to document
    doc.spreadsheet.addElement(table)

    # Ensure output directory exists
    OUTPUT_DIRECTORY.mkdir(parents=True, exist_ok=True)

    # Save the document
    output_path = OUTPUT_DIRECTORY / f"{file_name}.ods"
    doc.save(str(output_path))
    print(f"Spreadsheet '{file_name}.ods' created successfully!")

# Example usage: writes 'example.ods' into OUTPUT_DIRECTORY
generate_random_file_content("example")

Spreadsheet 'example.ods' created successfully!


### Converting sheets to PDF file.
This step is necessary because exporting a sheet to PDF automatically gives its content the appearance of a document.

In [3]:
import subprocess
import os

def convert_ods_to_pdf(file_name: str):
    """Convert an ODS file to PDF using LibreOffice CLI.

    Reads ``OUTPUT_DIRECTORY / f"{file_name}.ods"`` and writes the PDF next to
    it. The source spreadsheet is deleted only after the PDF is confirmed to
    exist. Every failure is reported with ``print`` and leaves the source file
    untouched.

    Args:
        file_name: Base name of the spreadsheet (without extension).
    """
    input_path = OUTPUT_DIRECTORY / f"{file_name}.ods"
    output_path = OUTPUT_DIRECTORY / f"{file_name}.pdf"

    if not input_path.exists():
        print(f"Error: File '{input_path}' not found.")
        return

    command = [
        "libreoffice", "--headless", "--convert-to", "pdf", str(input_path), "--outdir", str(OUTPUT_DIRECTORY)
    ]

    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised by subprocess when the executable itself cannot be started.
        print("Error during conversion: 'libreoffice' executable not found.")
        return
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
        return

    # A zero exit status alone is not treated as proof that the PDF was written;
    # deleting the spreadsheet without this check could lose the only copy.
    if not output_path.exists():
        print(f"Error during conversion: '{output_path}' was not created.")
        return

    print(f"PDF '{output_path}' created successfully!")
    os.remove(input_path)

# Example usage: converts 'example.ods' in OUTPUT_DIRECTORY to 'example.pdf'
convert_ods_to_pdf("example")



convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/example.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/example.pdf using filter : calc_pdf_Export
PDF '../data/documents/example.pdf' created successfully!


### Converting PDF file to JPEG file.

In [4]:
from pdf2image import convert_from_path

def convert_pdf_to_jpeg(file_name: str):
    """Render each page of ``<file_name>.pdf`` to its own JPEG, then delete the PDF.

    Pages are saved in OUTPUT_DIRECTORY as ``<file_name>_page_<n>.jpg`` with
    ``n`` starting at 1. Problems are reported with ``print`` rather than raised.
    """
    target_dir = OUTPUT_DIRECTORY
    target_dir.mkdir(parents=True, exist_ok=True)
    pdf_path = target_dir / f"{file_name}.pdf"

    if pdf_path.exists():
        try:
            pages = convert_from_path(str(pdf_path))
            for page_number, page in enumerate(pages, start=1):
                jpeg_path = target_dir / f"{file_name}_page_{page_number}.jpg"
                page.save(str(jpeg_path), "JPEG")
            print(f"All pages of '{file_name}.pdf' converted to JPEG successfully!")
            # The PDF is only an intermediate artifact; remove it once all pages are saved.
            os.remove(pdf_path)
        except Exception as e:
            print(f"Error during conversion: {e}")
    else:
        print(f"Error: File '{pdf_path}' not found.")

# Example usage: renders 'example.pdf' into 'example_page_<n>.jpg' files
convert_pdf_to_jpeg("example")

All pages of 'example.pdf' converted to JPEG successfully!


### Generating 10 000 documents and converting them to JPEG files.

In [5]:
# Full pipeline per document: spreadsheet -> PDF -> one JPEG per page.
# NOTE(review): range(1, 10) yields IMG_1 ... IMG_9 (nine documents), not the
# 10 000 named in the section heading; presumably a reduced demo run. Confirm,
# and widen the range for the full dataset.
for i in range(1, 10):
    name = f"IMG_{i}"
    generate_random_file_content(name)
    convert_ods_to_pdf(name)
    convert_pdf_to_jpeg(name)

Spreadsheet 'IMG_1.ods' created successfully!




convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_1.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_1.pdf using filter : calc_pdf_Export
PDF '../data/documents/IMG_1.pdf' created successfully!
All pages of 'IMG_1.pdf' converted to JPEG successfully!
Spreadsheet 'IMG_2.ods' created successfully!




convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_2.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_2.pdf using filter : calc_pdf_Export
PDF '../data/documents/IMG_2.pdf' created successfully!
All pages of 'IMG_2.pdf' converted to JPEG successfully!
Spreadsheet 'IMG_3.ods' created successfully!




convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_3.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_3.pdf using filter : calc_pdf_Export
PDF '../data/documents/IMG_3.pdf' created successfully!
All pages of 'IMG_3.pdf' converted to JPEG successfully!
Spreadsheet 'IMG_4.ods' created successfully!




convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_4.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_4.pdf using filter : calc_pdf_Export
PDF '../data/documents/IMG_4.pdf' created successfully!
All pages of 'IMG_4.pdf' converted to JPEG successfully!
Spreadsheet 'IMG_5.ods' created successfully!




convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_5.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_5.pdf using filter : calc_pdf_Export
PDF '../data/documents/IMG_5.pdf' created successfully!
All pages of 'IMG_5.pdf' converted to JPEG successfully!
Spreadsheet 'IMG_6.ods' created successfully!




convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_6.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_6.pdf using filter : calc_pdf_Export
PDF '../data/documents/IMG_6.pdf' created successfully!
All pages of 'IMG_6.pdf' converted to JPEG successfully!
Spreadsheet 'IMG_7.ods' created successfully!




convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_7.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_7.pdf using filter : calc_pdf_Export
PDF '../data/documents/IMG_7.pdf' created successfully!
All pages of 'IMG_7.pdf' converted to JPEG successfully!
Spreadsheet 'IMG_8.ods' created successfully!




convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_8.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_8.pdf using filter : calc_pdf_Export
PDF '../data/documents/IMG_8.pdf' created successfully!
All pages of 'IMG_8.pdf' converted to JPEG successfully!
Spreadsheet 'IMG_9.ods' created successfully!




convert /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_9.ods as a Calc document -> /home/kubis/Pulpit/BookScanner/BookScanner/data/documents/IMG_9.pdf using filter : calc_pdf_Export
PDF '../data/documents/IMG_9.pdf' created successfully!
All pages of 'IMG_9.pdf' converted to JPEG successfully!


## Adding document curvature

In [6]:
import cv2
import numpy as np
import math

def add_padding(image, padding):
    """Adds padding to an image to prevent cropping after warping.

    Args:
        image: Array whose first two axes are height and width. Both colour
            ``(H, W, C)`` and grayscale ``(H, W)`` layouts are accepted.
        padding: Number of zero-valued pixels added on every side (>= 0).

    Returns:
        A new ``uint8`` array of shape ``(H + 2*padding, W + 2*padding, ...)``
        with ``image`` centred in it and zeros everywhere else.

    Raises:
        ValueError: If ``padding`` is negative.
    """
    if padding < 0:
        raise ValueError("padding must be non-negative")

    # Only the first two axes are read, so images without a channel axis work too.
    h, w = image.shape[:2]
    padded_shape = (h + 2 * padding, w + 2 * padding) + tuple(image.shape[2:])
    padded_image = np.zeros(padded_shape, dtype=np.uint8)
    padded_image[padding:padding + h, padding:padding + w] = image
    return padded_image

In [7]:
def get_rotation_matrix(angle_x, angle_y, angle_z):
    """Build a 4x4 float32 matrix whose upper-left 3x3 block is Rz @ Ry @ Rx.

    All three angles are given in degrees; the remaining entries are those of
    the identity matrix.
    """
    ax = math.radians(angle_x)
    ay = math.radians(angle_y)
    az = math.radians(angle_z)

    cos_x, sin_x = math.cos(ax), math.sin(ax)
    cos_y, sin_y = math.cos(ay), math.sin(ay)
    cos_z, sin_z = math.cos(az), math.sin(az)

    rot_x = np.array(
        [[1, 0, 0],
         [0, cos_x, -sin_x],
         [0, sin_x, cos_x]],
        dtype=np.float32,
    )
    rot_y = np.array(
        [[cos_y, 0, sin_y],
         [0, 1, 0],
         [-sin_y, 0, cos_y]],
        dtype=np.float32,
    )
    rot_z = np.array(
        [[cos_z, -sin_z, 0],
         [sin_z, cos_z, 0],
         [0, 0, 1]],
        dtype=np.float32,
    )

    # Rotation about X is applied first, then Y, then Z.
    homogeneous = np.eye(4, dtype=np.float32)
    homogeneous[:3, :3] = rot_z @ rot_y @ rot_x
    return homogeneous

In [8]:
def generate_mesh_grid(width, height):
    """Return an (H, W, 4) float32 grid of homogeneous pixel coordinates (x, y, 0, 1)."""
    column_indices = np.arange(width, dtype=np.float32)
    row_indices = np.arange(height, dtype=np.float32)
    grid_x, grid_y = np.meshgrid(column_indices, row_indices)

    # z is zero everywhere (flat sheet); the last channel is the homogeneous 1.
    grid_z = np.zeros_like(grid_x)
    grid_w = np.ones_like(grid_x)
    return np.stack((grid_x, grid_y, grid_z, grid_w), axis=-1)

def apply_transformations(mesh, rotation_matrix, amplitude, frequency):
    """Apply both wavy distortion and 3D rotation to the transformation mesh.

    Args:
        mesh: ``(H, W, 4)`` array of homogeneous coordinates ``(x, y, z, 1)``.
            It is left unmodified.
        rotation_matrix: ``4x4`` matrix applied to every coordinate column.
        amplitude: Peak vertical displacement of the wave.
        frequency: Angular frequency of the wave along the x coordinate.

    Returns:
        ``(x_map, y_map)``: two ``(H, W)`` float32 arrays holding the
        transformed x and y coordinates; the z component is discarded.
    """
    h, w, _ = mesh.shape
    # reshape/.T only create views, so copy first: the in-place wave update
    # below would otherwise overwrite the caller's mesh.
    flat_mesh = mesh.reshape(-1, 4).copy().T  # Shape: (4, H*W)

    # Random envelope for the wave strength, with -1 <= low_val <= high_val <= 1.
    low_val = random.uniform(-1.0, 1.0)
    high_val = random.uniform(low_val, 1.0)

    # Interpolated over the flattened (row-major) pixel order, so the factor
    # moves from low_val at the first pixel to high_val at the last one.
    vector = np.linspace(low_val, high_val, flat_mesh.shape[1])

    # Apply wavy transformation
    flat_mesh[1] += amplitude * np.sin(frequency * flat_mesh[0]) * vector

    # Apply 3D rotation
    transformed_mesh = rotation_matrix @ flat_mesh
    x_new, y_new = transformed_mesh[0], transformed_mesh[1]  # Ignore Z-axis

    return x_new.reshape(h, w).astype(np.float32), y_new.reshape(h, w).astype(np.float32)

In [9]:
import os
import random
import pickle
import lzma

folder_path = "../data/documents"
input_images_dir = "../data/input"    # warped images
output_maps_dir = "../data/output"    # compressed remap grids

# Create both destinations up front so no write fails on a missing directory.
os.makedirs(input_images_dir, exist_ok=True)
os.makedirs(output_maps_dir, exist_ok=True)

# For every readable image in folder_path: pad, warp with a random wave plus
# rotation, save the warped image and its remap grids, then delete the source.
for filename in os.listdir(folder_path):
    img_path = os.path.join(folder_path, filename)
    img = cv2.imread(img_path)

    if img is None:
        print(f"Warning: Failed to load image '{filename}'.")
        continue

    # Add padding to prevent cropping
    padding = random.randint(400, 550)
    scaled_image = add_padding(img, padding)

    # Get new dimensions
    height, width = scaled_image.shape[:2]

    # Generate mesh grid
    mesh_3d = generate_mesh_grid(width, height)

    # Define transformations
    amplitude = random.randint(0, 100)  # Pixel displacement
    frequency = random.uniform(0.5, 3.0) * np.pi / width  # Frequency relative to width
    rotation_matrix = get_rotation_matrix(random.randint(-10, 10),
                                          random.randint(-10, 10),
                                          random.randint(-5, 5))

    # Apply combined transformations (the returned maps are already float32)
    x_map_final, y_map_final = apply_transformations(mesh_3d, rotation_matrix, amplitude, frequency)

    # Remap image using the transformed mesh
    final_image = cv2.remap(scaled_image, x_map_final, y_map_final, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT)

    # cv2.imwrite reports failure through its return value; keep the source
    # image when nothing was written instead of deleting the only copy.
    if not cv2.imwrite(f"{input_images_dir}/{filename}", final_image):
        print(f"Warning: Failed to write transformed image for '{filename}'; source file kept.")
        continue
    print(f"Combined wavy and 3D rotated transformation for {filename} applied successfully!")

    # splitext drops the extension whatever its length ('.jpg', '.jpeg', ...).
    stem = os.path.splitext(filename)[0]
    with lzma.open(f"{output_maps_dir}/{stem}.pkl.xz", "wb") as f:
        pickle.dump([x_map_final, y_map_final], f, protocol=pickle.HIGHEST_PROTOCOL)

    # Source image is removed only after both outputs were written.
    os.remove(img_path)


Combined wavy and 3D rotated transformation for IMG_5_page_3.jpg applied successfully!
Combined wavy and 3D rotated transformation for IMG_6_page_2.jpg applied successfully!
Combined wavy and 3D rotated transformation for IMG_2_page_2.jpg applied successfully!
Combined wavy and 3D rotated transformation for IMG_1_page_2.jpg applied successfully!
Combined wavy and 3D rotated transformation for IMG_9_page_1.jpg applied successfully!
Combined wavy and 3D rotated transformation for example_page_2.jpg applied successfully!
Combined wavy and 3D rotated transformation for IMG_1_page_1.jpg applied successfully!
Combined wavy and 3D rotated transformation for IMG_3_page_1.jpg applied successfully!
Combined wavy and 3D rotated transformation for IMG_1_page_3.jpg applied successfully!
Combined wavy and 3D rotated transformation for IMG_7_page_1.jpg applied successfully!
Combined wavy and 3D rotated transformation for IMG_5_page_1.jpg applied successfully!
Combined wavy and 3D rotated transformati

In [10]:
import cv2
import numpy as np
import math
import pickle

def add_padding(image, padding):
    """Adds padding to an image to prevent cropping after warping.

    Args:
        image: Array whose first two axes are height and width. Both colour
            ``(H, W, C)`` and grayscale ``(H, W)`` layouts are accepted.
        padding: Number of zero-valued pixels added on every side (>= 0).

    Returns:
        A new ``uint8`` array of shape ``(H + 2*padding, W + 2*padding, ...)``
        with ``image`` centred in it and zeros everywhere else.

    Raises:
        ValueError: If ``padding`` is negative.
    """
    if padding < 0:
        raise ValueError("padding must be non-negative")

    # Only the first two axes are read, so images without a channel axis work too.
    h, w = image.shape[:2]
    padded_shape = (h + 2 * padding, w + 2 * padding) + tuple(image.shape[2:])
    padded_image = np.zeros(padded_shape, dtype=np.uint8)
    padded_image[padding:padding + h, padding:padding + w] = image
    return padded_image

def generate_mesh_grid(width, height):
    """Return an (H, W, 4) float32 grid of homogeneous pixel coordinates (x, y, 0, 1)."""
    column_indices = np.arange(width, dtype=np.float32)
    row_indices = np.arange(height, dtype=np.float32)
    grid_x, grid_y = np.meshgrid(column_indices, row_indices)

    # z is zero everywhere (flat sheet); the last channel is the homogeneous 1.
    grid_z = np.zeros_like(grid_x)
    grid_w = np.ones_like(grid_x)
    return np.stack((grid_x, grid_y, grid_z, grid_w), axis=-1)

def get_rotation_matrix(angle_x, angle_y, angle_z):
    """Build a 4x4 float32 matrix whose upper-left 3x3 block is Rz @ Ry @ Rx.

    All three angles are given in degrees; the remaining entries are those of
    the identity matrix.
    """
    ax = math.radians(angle_x)
    ay = math.radians(angle_y)
    az = math.radians(angle_z)

    cos_x, sin_x = math.cos(ax), math.sin(ax)
    cos_y, sin_y = math.cos(ay), math.sin(ay)
    cos_z, sin_z = math.cos(az), math.sin(az)

    rot_x = np.array(
        [[1, 0, 0],
         [0, cos_x, -sin_x],
         [0, sin_x, cos_x]],
        dtype=np.float32,
    )
    rot_y = np.array(
        [[cos_y, 0, sin_y],
         [0, 1, 0],
         [-sin_y, 0, cos_y]],
        dtype=np.float32,
    )
    rot_z = np.array(
        [[cos_z, -sin_z, 0],
         [sin_z, cos_z, 0],
         [0, 0, 1]],
        dtype=np.float32,
    )

    # Rotation about X is applied first, then Y, then Z.
    homogeneous = np.eye(4, dtype=np.float32)
    homogeneous[:3, :3] = rot_z @ rot_y @ rot_x
    return homogeneous

def apply_transformations(mesh, rotation_matrix, amplitude, frequency):
    """Apply both wavy distortion and 3D rotation to the transformation mesh.

    Args:
        mesh: ``(H, W, 4)`` array of homogeneous coordinates ``(x, y, z, 1)``.
            It is left unmodified.
        rotation_matrix: ``4x4`` matrix applied to every coordinate column.
        amplitude: Peak vertical displacement of the wave.
        frequency: Angular frequency of the wave along the x coordinate.

    Returns:
        ``(x_map, y_map)``: two ``(H, W)`` float32 arrays holding the
        transformed x and y coordinates; the z component is discarded.
    """
    h, w, _ = mesh.shape
    # reshape/.T only create views, so copy first: the in-place wave update
    # below would otherwise overwrite the caller's mesh.
    flat_mesh = mesh.reshape(-1, 4).copy().T  # Shape: (4, H*W)

    # Wave strength factor running from -1 at the first pixel to 1 at the last
    # one, in flattened (row-major) pixel order.
    vector = np.linspace(-1.0, 1.0, flat_mesh.shape[1])

    # Apply wavy transformation
    flat_mesh[1] += amplitude * np.sin(frequency * flat_mesh[0]) * vector

    # Apply 3D rotation
    transformed_mesh = rotation_matrix @ flat_mesh
    x_new, y_new = transformed_mesh[0], transformed_mesh[1]  # Ignore Z-axis

    return x_new.reshape(h, w).astype(np.float32), y_new.reshape(h, w).astype(np.float32)

# Single-image demo of the warp pipeline with fixed (non-random) parameters.
import os  # needed below for os.makedirs; this cell's own import block does not bring it in

# Load the image
# NOTE: the batch cell earlier in this notebook deletes each image from
# ../data/documents after transforming it, so this file has to be generated
# again (or this cell run first) for the load to succeed.
image = cv2.imread("../data/documents/example_page_1.jpg")
if image is None:
    raise FileNotFoundError("Image not found!")

# Add padding to prevent cropping
padding = 400
scaled_image = add_padding(image, padding)

# Get new dimensions
height, width = scaled_image.shape[:2]

# Generate mesh grid
mesh_3d = generate_mesh_grid(width, height)

# Define transformations
amplitude = 100  # Pixel displacement
frequency = 0.1 * np.pi / width  # Frequency relative to width
rotation_matrix = get_rotation_matrix(-10, -10, -5)

# Apply combined transformations (the returned maps are already float32)
x_map_final, y_map_final = apply_transformations(mesh_3d, rotation_matrix, amplitude, frequency)

# Save both remap grids to a file in the current working directory
np.savez("deformation_grids.npz", x_map=x_map_final, y_map=y_map_final)

# Remap image using the transformed mesh
final_image = cv2.remap(scaled_image, x_map_final, y_map_final, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT)

# cv2.imwrite reports failure through its return value, so make sure the
# target folder exists and check the result before announcing success.
os.makedirs("../data/input", exist_ok=True)
output_image_path = "../data/input/example_page_1_transformed.jpg"
if not cv2.imwrite(output_image_path, final_image):
    raise OSError(f"Failed to write '{output_image_path}'.")
print("Combined wavy and 3D rotated transformation applied successfully!")


[ WARN:0@1083.127] global loadsave.cpp:268 findDecoder imread_('../data/documents/example_page_1.jpg'): can't open/read file: check file path/integrity


FileNotFoundError: Image not found!