## Cropped Image Saving once Bounding Boxes are Created

In [None]:
from PIL import Image
import os
import json
# Suppose you already have the page image from pdf2image
filename = "image.png"
img = Image.open(filename)
basename = os.path.splitext(filename)[0]

# ... whatever logic you currently use to pick the page (e.g. page 1) goes here ...

# Suppose you have calculated one line's coordinates and want to crop
# exactly that region.

# Example bounding box (x0, y0, x1, y1)
x0, y0, x1, y1 = 100, 150, 800, 190

# Crop that region (one line of text)
line_img = img.crop((x0, y0, x1, y1))

# Dynamic naming: use page and line number
page_num = 1
line_num = 3

# In the real pipeline this becomes a nested loop, roughly:
#
#   for page in pages:
#       for line in lines_on_page:
#           do the saving logic below
#           line_num += 1
#       page_num += 1

# Make sure the output folder exists; saving into a missing directory fails.
output_dir = "data/lines"
os.makedirs(output_dir, exist_ok=True)

# All three files share the same stem and differ only in their extension.
stem = f"{output_dir}/{basename}_page_{page_num:03d}_line_{line_num:03d}"
output_path = f"{stem}.png"
json_output_path = f"{stem}.json"
# Not written in this cell; presumably filled with the text label in a later step.
txt_output_path = f"{stem}.txt"

# ALSO save the coordinates for the line (or its words); they are needed
# later to match the cropped image against the text extracted from the PDF.
# Here we store one record for this single line. If you prefer one JSON file
# per document, nest the same kind of records, e.g. a list of pages where
# each page is a list of line records:
#
#   [
#       [ {line record}, {line record} ],   # page 1
#       [ {line record}, {line record} ],   # page 2
#   ]
data = {
    "page": page_num,
    "line": line_num,
    "bbox": [x0, y0, x1, y1],
}

# Save the cropped line image
line_img.save(output_path)

# Mode "w" creates the file if it does not exist yet and overwrites its
# contents if it does.
with open(json_output_path, "w", encoding="utf-8") as outfile:
    json.dump(data, outfile, indent=4)
print(f"Saved {output_path}")

### Designing a Path in Jupyter Lab or any project

In [None]:
from pathlib import Path

# Absolute path: spelled out from the filesystem root, so it is tied to
# this particular machine and user account.
abs_path = Path("/Users/jonathan/Documents/BeginnerProjectFallQuarter2025/data/lines")

# Relative path (recommended in Jupyter projects): interpreted against the
# current working directory. Create it, including any missing parents, and
# do nothing if it is already there.
data_dir = Path("data/lines")
data_dir.mkdir(parents=True, exist_ok=True)

# resolve() expands the relative path into its full absolute form.
_resolved = data_dir.resolve()

for _label, _shown in (
    ("Relative:", data_dir),
    ("Relative with 'absolute' header:", _resolved),
    ("Absolute:", abs_path),
):
    print(_label, _shown)


Relative: data/lines
Relative with 'absolute' header: /Users/homefolder/Documents/BeginnerProjectFallQuarter2025/python_notebooks/data/lines
Absolute: /Users/jonathan/Documents/BeginnerProjectFallQuarter2025/data/lines


### Extracting Text and Bounding Boxes from a PDF

In [None]:
import fitz  # PyMuPDF

# Open the PDF and grab the page at index 0.
doc = fitz.open("sample.pdf")
page = doc[0]

# "words" mode returns a list with one tuple per word:
#   (x0, y0, x1, y1, word, block_no, line_no, word_no)
words = page.get_text("words")

for w in words:
    # Only the bounding box and the word itself are needed here; the
    # trailing block/line/word numbers are ignored.
    x0, y0, x1, y1, text = w[:5]
    print(f"Word: '{text}' at box ({x0}, {y0}, {x1}, {y1})")

"""
if you call: page.get_text("dict")
{
  "blocks": [
    {
      "lines": [
        {
          "spans": [
            {
              "text": "Hello",
              "bbox": [x0, y0, x1, y1],
            },
            {
              "text": "world!",
              "bbox": [x0, y0, x1, y1],
            }
          ]
        }
      ]
    }
  ]
}
"""

### Building training dataset

# 🧩 **OCR Dataset Labeling Workflow**

---

## 🛠️ **Setup**

**Import:**
- `fitz` (**PyMuPDF**)  
- `json`, `os`, `PIL.Image`

**Initialize:**
- `BASE_PATH` = directory containing your `.png` line images  
- Open the original PDF document with:  
  `fitz.open(pdf_path)`

---

## 📄 **For each page in the PDF**

**Loop:**
```
For each PAGE_NUM, PAGE in PDF:
```

---

### 🧱 **Extract words or line structures**

```
WORDS = PAGE.get_text("words")
```
> Each **WORD** = `(x0, y0, x1, y1, text, block_no, line_no, word_no)`

---

### 🎯 **Convert PDF coordinates → image coordinates**

```
SCALE = 300 / 72   # 300 DPI rendered images
For each WORD in WORDS:
    Multiply (x0, y0, x1, y1) by SCALE
```

---

### 📦 **Load and match bounding boxes**

```
DETECTED_LINES = load coordinates you previously saved
                 from your image processing step
```

**For each** `LINE_INDEX`, `LINE_BOX` **in** `DETECTED_LINES`:
- Initialize `LINE_TEXT = ""`
- For each `WORD` in `WORDS`:
  - If **WORD’s bounding box overlaps** `LINE_BOX` within a small tolerance →  
    **Append** `WORD.text` **to** `LINE_TEXT` (with a space)

---

### 🏷️ **Construct file names**

```
BASENAME = filename stem from the page image (e.g., "document1")

OUTPUT_IMAGE_PATH = f"data/lines/{BASENAME}_page_{PAGE_NUM:03d}_line_{LINE_INDEX:03d}.png"
JSON_OUTPUT_PATH  = f"data/lines/{BASENAME}_page_{PAGE_NUM:03d}_line_{LINE_INDEX:03d}.json"
TXT_OUTPUT_PATH   = f"data/lines/{BASENAME}_page_{PAGE_NUM:03d}_line_{LINE_INDEX:03d}.txt"
```

---

### 💾 **Save extracted text**

```
LABEL_DATA = LINE_TEXT with leading/trailing whitespace stripped
Write LABEL_DATA to TXT_OUTPUT_PATH
```

**Optional:**  
Print a confirmation message for each saved line.

---

## ✅ **Output Summary**

You will have **3 files with the same base name**, but different endings:

| File Type | Example | Description |
|------------|----------|-------------|
| 🖼️ **Image** | `data/lines/demo_page_001_line_002.png` | Cropped line image |
| 📄 **JSON** | `data/lines/demo_page_001_line_002.json` | Coordinates / structured metadata |
| 📝 **TXT** | `data/lines/demo_page_001_line_002.txt` | Plain text label for OCR training |


# Setup
Import fitz (PyMuPDF)
Import json, os, PIL.Image
Set BASE_PATH = directory containing your .png line images
Open the original PDF document with fitz.open(pdf_path)

# For each page in the PDF
For each PAGE_NUM, PAGE in PDF:
    # Get all words (or use "dict" for per-line structures)
    WORDS = PAGE.get_text("words")  
        # Each WORD = (x0, y0, x1, y1, text, block_no, line_no, word_no)

    # Convert PDF coordinates (points) → image coordinates (pixels)
    SCALE = 300 / 72  # 300 DPI rendered images
    For each WORD in WORDS:
        Multiply (x0, y0, x1, y1) by SCALE

    # Load your list of pre-saved bounding boxes for this page
    DETECTED_LINES = load coordinates you previously saved from your image processing step

    For each LINE_INDEX, LINE_BOX in DETECTED_LINES:
        Initialize LINE_TEXT = empty string

        For each WORD in WORDS:
            If WORD's bounding box overlaps LINE_BOX within a small tolerance:
                Append WORD.text to LINE_TEXT (with a space)

        # Get the image name base (e.g., "document1")
        BASENAME = filename stem from the page image (e.g., "document1")

        # Construct filenames for image and its text label
        OUTPUT_IMAGE_PATH = f"data/lines/{BASENAME}_page_{PAGE_NUM:03d}_line_{LINE_INDEX:03d}.png"
        JSON_OUTPUT_PATH  = f"data/lines/{BASENAME}_page_{PAGE_NUM:03d}_line_{LINE_INDEX:03d}.json"
        TXT_OUTPUT_PATH = f"data/lines/{BASENAME}_page_{PAGE_NUM:03d}_line_{LINE_INDEX:03d}.txt"
        # Save the text label to a .txt file
        LABEL_DATA = LINE_TEXT with leading/trailing whitespace stripped
        Write LABEL_DATA to TXT_OUTPUT_PATH

        Print confirmation message (optional)


### Train test split

In [None]:
import os
import random
from pathlib import Path
import glob

# === CONFIG ===
IMG_DIR = Path("data/lines")   # directory containing all .png files
TRAIN_RATIO = 0.7              # 70% for training
SEED = 42                      # for reproducibility
random.seed(SEED)


def split_train_test(items, train_ratio):
    """Split ``items`` into a ``(train, test)`` pair of lists.

    The first ``int(len(items) * train_ratio)`` elements become the training
    set and the remainder becomes the test set. The input is not modified
    and is not shuffled here; shuffle it beforehand if a random split is
    wanted.
    """
    split_at = int(len(items) * train_ratio)
    return items[:split_at], items[split_at:]


# === 1. Collect all image files ===
# Glob ONLY the .png files and sort them by name, so the list starts from
# the same order on every run before it is shuffled with the fixed seed.
all_imgs = sorted(IMG_DIR.glob("*.png"))

total = len(all_imgs)
print(f"Found {total} image files.")

# === 2. Shuffle and split ===
random.shuffle(all_imgs)
# The first 70% after the shuffle is train, the rest is test.
train_imgs, test_imgs = split_train_test(all_imgs, TRAIN_RATIO)

print(f"Train set: {len(train_imgs)} images")
print(f"Test set:  {len(test_imgs)} images")

# === 3. Example: index-based access ===
# You can directly access by index like a list. The guards keep the demo
# from raising IndexError when the dataset is small or empty.
i = 5
if i < len(train_imgs):
    print(f"Train[{i}] = {train_imgs[i]}")
if test_imgs:
    print(f"Test[0]    = {test_imgs[0]}")

# === 4. Optional: save lists to text files ===
# This keeps your training list stable, so you can reproduce your training
# results repeatedly.
IMG_DIR.mkdir(parents=True, exist_ok=True)  # writing fails if the folder is missing
(train_list_path := IMG_DIR / "train_list.txt").write_text(
    "\n".join(str(p) for p in train_imgs)
)
(test_list_path := IMG_DIR / "test_list.txt").write_text(
    "\n".join(str(p) for p in test_imgs)
)
print("Saved train/test file lists.")
