# Debugging model
---

In [9]:
import requests
import datetime
import os
from PIL import Image, ImageDraw, ImageFont


---
## Importing / Modifying the data

In [10]:
# We import all "all quiet on the western front" text from Project Gutenberg
url = "https://www.gutenberg.org/cache/epub/75011/pg75011.txt"
text = requests.get(url).text

with open("all_quiet_on_the_western_front.txt", "w", encoding="utf-8") as file:
    file.write(text)


In [11]:
print("The text is {} characters long.".format(len(text)))
print(f"Text: {text[:100]}...")

The text is 368551 characters long.
Text: ﻿The Project Gutenberg eBook of All quiet on the Western Front
    
This ebook is for the use of a...


In [12]:

with open("all_quiet_on_the_western_front.txt", "r", encoding="utf-8") as file:
    working_txt = file.read()
    # remove line breaks
    working_txt = working_txt.replace("\n", " ")
    # remove multiple spaces
    working_txt = " ".join(working_txt.split())
    

In [13]:
working_txt[:100]

'\ufeffThe Project Gutenberg eBook of All quiet on the Western Front This ebook is for the use of anyone a'

---
## Prepare Images
Create images of equal character length and put them into a directory.
By doing that we create images with an corresponding .txt-file. 

In [14]:
print("The text is {} characters long.".format(len(text)))

The text is 368551 characters long.


In [15]:
#insert a line break every 50 characters
def insert_line_breaks(text, line_length=50):
    return "\n".join(text[i:i + line_length] for i in range(0, len(text), line_length))

formatted_text = insert_line_breaks(working_txt, 50)
with open("formatted_all_quiet_on_the_western_front.txt", "w", encoding="utf-8") as file:
    file.write(formatted_text)
print("Formatted text saved to 'formatted_all_quiet_on_the_western_front.txt'.")
print(f"Content:\n{formatted_text[:100]}...")
print(f"number of rows: {formatted_text.count('\n') + 1}")


Formatted text saved to 'formatted_all_quiet_on_the_western_front.txt'.
Content:
﻿The Project Gutenberg eBook of All quiet on the W
estern Front This ebook is for the use of anyone ...
number of rows: 7109


---
## Create the dataset

In [None]:
# Parameters
output_dir = "dataset"
font_path = "DejaVuSans-Bold.ttf"
font_size = 60
img_width = 2048
img_height = 256
text_color = "black"

# Read and clean lines from file
with open("formatted_all_quiet_on_the_western_front.txt", "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

font = ImageFont.truetype(font_path, font_size)

for idx, line in enumerate(lines):
    # Create a new image
    img = Image.new('RGB', (img_width, img_height), color='white')
    draw = ImageDraw.Draw(img)

    # Get bounding box for text
    bbox = draw.textbbox((0, 0), line, font=font)
    text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]

    # Calculate centered position
    x = (img_width - text_width) // 2
    y = (img_height - text_height) // 2

    # Draw centered text
    draw.text((x, y), line, font=font, fill=text_color)

    # Generate unique name
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    base_name = f"{timestamp}_{idx}"

    # Save image and text
    img.save(os.path.join(output_dir, f"{base_name}.png"))
    with open(os.path.join(output_dir, f"{base_name}.txt"), "w", encoding="utf-8") as f:
        f.write(line)

    # Optional: comment this out if you want all lines processed
    #break  # Remove this line to process all lines
