# Data Generator
This notebook is designed to generate data for training an AI model.
## Documents Generator
To ensure the model does not become dependent on specific contexts, we generate documents with highly randomized content. This is achieved using **LibreOffice Calc**, where text from a predefined file is randomly distributed across sheets.

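When preparing this file (`text.txt`), make sure it contains at least 7,000 words. Also keep the expected folder structure in place, because the generated files are written to `../data/documents` for further data processing.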
### Sheets generator

In [None]:
from odf.opendocument import OpenDocumentSpreadsheet
from odf.style import Style, TextProperties
from odf.table import Table, TableRow, TableCell
from odf.text import P
import random
from numpy.random import choice
from pathlib import Path
from typing import List

# Path of the text file whose words are used for document generation
TEXT_FILE_PATH = "text.txt"
# Directory that receives the generated documents
OUTPUT_DIRECTORY = Path("../data/documents")

def load_words(file_path: str) -> List[str]:
    """Read a UTF-8 text file and return its whitespace-separated words.

    Args:
        file_path: Location of the text file to read.

    Returns:
        The words in file order; an empty list when the file holds only
        whitespace or nothing at all.
    """
    content = Path(file_path).read_text(encoding="utf-8")
    return content.split()
    
# Load the word pool once; generate_random_file_content draws its words from it.
words_from_file = load_words(TEXT_FILE_PATH)

def generate_random_font_style() -> Style:
    """Generate a randomly formatted, uniquely named table-cell style.

    The font size is "12pt" with probability 0.8 and one of "14pt", "17pt",
    "21pt" or "26pt" with probability 0.05 each. Bold and italic are each
    enabled independently with probability 0.05.

    Returns:
        A new ``table-cell`` family ``Style`` carrying the drawn text
        properties.
    """
    # One style is created per cell, so a document can hold thousands of
    # them. A 64-bit random suffix keeps the names effectively unique; a
    # range of only 1..1_000_000 makes name clashes likely at that scale.
    style_name = f"Style_{random.getrandbits(64):016x}"
    style = Style(name=style_name, family="table-cell")

    font_size = choice(["12pt", "14pt", "17pt", "21pt", "26pt"],
                       p=[0.8, 0.05, 0.05, 0.05, 0.05])
    bold = choice(["bold", "normal"], p=[0.05, 0.95])
    italic = choice(["italic", "normal"], p=[0.05, 0.95])

    style.addElement(TextProperties(fontstyle=italic, fontweight=bold, fontsize=font_size))
    return style

def _pick_random_words(pool: List[str]) -> List[str]:
    """Return a shuffled, randomly sized selection of words from *pool*."""
    if not pool:
        raise ValueError("Cannot generate a document from an empty word list.")

    # Shuffle a copy so the shared word list is not reordered as a side effect.
    shuffled = random.sample(pool, len(pool))

    # The random window below needs at least 102 words; smaller pools are
    # used in full instead of failing with an "empty range" error.
    if len(shuffled) < 102:
        return shuffled

    start = random.randint(0, 101)
    stop = random.randint(102, len(shuffled))
    return shuffled[start:stop]


def _build_random_rows(words: List[str]) -> List[List[str]]:
    """Split *words* into rows of random length with occasional markers."""
    rows = []
    word_count = len(words)
    index = 0
    while index < word_count:
        line_length = random.randint(1, 10)
        tab_adding = choice([True, False], p=[0.05, 0.95])
        dash_adding = choice([True, False], p=[0.05, 0.95])
        star_adding = choice([True, False], p=[0.05, 0.95])

        # An empty leading cell shifts the row one column to the right.
        row_data = [""] if tab_adding else []

        for _ in range(line_length):
            if index >= word_count:
                break

            # A "-" or "*" marker is emitted at most once per row and takes
            # up one of the row's slots without consuming a word.
            if dash_adding:
                row_data.append("-")
                dash_adding = False
            elif star_adding:
                row_data.append("*")
                star_adding = False
            else:
                row_data.append(words[index])
                index += 1

        rows.append(row_data)
    return rows


def generate_random_file_content(file_name: str) -> None:
    """Generate an OpenDocument spreadsheet with random text formatting.

    A random selection of the words in ``words_from_file`` is laid out in
    rows of 1 to 10 cells. Every cell gets its own style from
    ``generate_random_font_style``. The result is saved as
    ``<file_name>.ods`` inside ``OUTPUT_DIRECTORY``, which is created when
    missing.

    Args:
        file_name: Base name of the output file (without extension); it is
            also used as the name of the sheet.

    Raises:
        ValueError: If ``words_from_file`` is empty.
    """
    # Prepare data
    words = _pick_random_words(words_from_file)
    file_content = _build_random_rows(words)

    # Create a new spreadsheet document
    doc = OpenDocumentSpreadsheet()
    table = Table(name=file_name)

    # Add rows and cells to the table
    for row_data in file_content:
        row = TableRow()
        for cell_data in row_data:
            font_style = generate_random_font_style()
            doc.automaticstyles.addElement(font_style)
            cell = TableCell(stylename=font_style)
            cell.addElement(P(text=cell_data))
            row.addElement(cell)
        table.addElement(row)

    # Add table to document
    doc.spreadsheet.addElement(table)

    # Ensure output directory exists
    OUTPUT_DIRECTORY.mkdir(parents=True, exist_ok=True)

    # Save the document
    output_path = OUTPUT_DIRECTORY / f"{file_name}.ods"
    doc.save(str(output_path))
    print(f"Spreadsheet '{file_name}.ods' created successfully!")

# Example usage: writes "example.ods" into OUTPUT_DIRECTORY
generate_random_file_content("example")

### Converting sheets to PDF file.
This step is needed because exporting a sheet to PDF automatically gives its content the appearance of a document.

In [None]:
import subprocess
import os

def convert_ods_to_pdf(file_name: str) -> None:
    """Convert an ODS file to PDF using LibreOffice CLI.

    Reads ``<file_name>.ods`` from ``OUTPUT_DIRECTORY`` and writes
    ``<file_name>.pdf`` next to it. The ODS source is deleted only after the
    PDF has been confirmed to exist. Problems are reported with ``print``
    and the function returns without raising for them.

    Args:
        file_name: Base name (without extension) of the spreadsheet.
    """
    input_path = OUTPUT_DIRECTORY / f"{file_name}.ods"
    output_path = OUTPUT_DIRECTORY / f"{file_name}.pdf"

    if not input_path.exists():
        print(f"Error: File '{input_path}' not found.")
        return

    try:
        subprocess.run([
            "libreoffice", "--headless", "--convert-to", "pdf", str(input_path), "--outdir", str(OUTPUT_DIRECTORY)
        ], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
        return

    # A zero exit status alone does not prove that the PDF was written, so
    # check for it before reporting success and discarding the source file.
    if not output_path.exists():
        print(f"Error during conversion: '{output_path}' was not created.")
        return

    print(f"PDF '{output_path}' created successfully!")
    os.remove(input_path)

# Example usage: converts "example.ods" in OUTPUT_DIRECTORY to "example.pdf"
convert_ods_to_pdf("example")

### Converting PDF file to JPEG file.

In [None]:
from pdf2image import convert_from_path

def convert_pdf_to_jpeg(file_name: str):
    """Render every page of ``<file_name>.pdf`` to its own JPEG image.

    The PDF is looked up in ``OUTPUT_DIRECTORY`` (created when missing) and
    the images are stored there as ``<file_name>_page_<n>.jpg``, numbered
    from 1. After all pages are saved the PDF is deleted. A missing PDF or
    any error raised during rendering, saving or deletion is reported with
    ``print`` instead of being raised.

    Args:
        file_name: Base name (without extension) of the PDF.
    """
    target_dir = OUTPUT_DIRECTORY
    source_pdf = target_dir / f"{file_name}.pdf"
    target_dir.mkdir(parents=True, exist_ok=True)

    if source_pdf.exists():
        try:
            pages = convert_from_path(str(source_pdf))
            for page_number, page in enumerate(pages, start=1):
                jpeg_path = target_dir / f"{file_name}_page_{page_number}.jpg"
                page.save(str(jpeg_path), "JPEG")
            print(f"All pages of '{file_name}.pdf' converted to JPEG successfully!")
            # The PDF is only an intermediate artifact once the images exist.
            source_pdf.unlink()
        except Exception as error:
            print(f"Error during conversion: {error}")
    else:
        print(f"Error: File '{source_pdf}' not found.")

# Example usage: renders each page of "example.pdf" to "example_page_<n>.jpg"
convert_pdf_to_jpeg("example")

### Generating 500 documents and converting them to JPEG files.

In [None]:
# Run the whole pipeline (ODS -> PDF -> JPEG) for the documents IMG_1 .. IMG_500.
# Each conversion helper deletes its input file on success.
for i in range(1, 501):
    name = f"IMG_{i}"
    generate_random_file_content(name)
    convert_ods_to_pdf(name)
    convert_pdf_to_jpeg(name)

## Adding documents curvature