## Installing prerequisites and importing libraries

In [None]:
%%capture
# Install the notebook's dependencies; the %%capture magic above swallows the installer output.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
!pip install opencv-python
!pip install matplotlib
!pip install numpy
!pip install ultralytics
!pip install easyocr
!pip install dill
!pip install pdf2image
# System package installed through apt rather than pip; presumably the PDF rendering
# backend that pdf2image relies on — confirm.
!apt-get install -y poppler-utils
!pip install -q transformers

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
%%capture
# Download the detector and classifier weight files from the Hugging Face Space into the
# current working directory (output suppressed by %%capture; the next cell asserts they exist).
!wget -O detector-model.pt "https://huggingface.co/spaces/linhdo/checkbox-detector/resolve/main/models/detector-model.pt"
!wget -O classifier-model.pt "https://huggingface.co/spaces/linhdo/checkbox-detector/resolve/main/models/classifier-model.pt"

In [2]:
import os
# Fail fast if either weight file is missing from the working directory.
for _weights_file, _role in (("detector-model.pt", "Detector"), ("classifier-model.pt", "Classifier")):
    assert os.path.exists(_weights_file), f"{_role} model not downloaded!"
print("YOLO Models downloaded successfully!")

YOLO Models downloaded successfully!


In [4]:
%%capture
# Clone the repository whose src/ directory is later put on sys.path for the image_align import.
# NOTE(review): the later sys.path entry is /content/Checkbox_Detection/src, so this assumes the
# working directory is /content — confirm.
!git clone https://github.com/jaswanth04/Checkbox_Detection.git

In [None]:
import cv2  # for reading images and drawing bounding boxes
from ultralytics import YOLO
import matplotlib.pyplot as plt
import numpy as np
import os
import easyocr
import sys
sys.path.insert(0, '/content/Checkbox_Detection/src')

from image_align import SIFTAligner, ORBAligner

# Load models
# Both weight files are read from the working directory (downloaded by an earlier cell).
DETECTION_MODEL = YOLO("detector-model.pt")
# NOTE(review): only DETECTION_MODEL is referenced by the cells visible in this notebook;
# the classifier is loaded but appears unused — confirm before removing.
CLASSIFICATION_MODEL = YOLO("classifier-model.pt")  # 0: block, 1: checked, 2: unchecked

## Converting PDFs to images

In [None]:
import os
from pdf2image import convert_from_path

# Set the directory containing PDF files
directory = "Scanned PDFs"

# Loop over each file in the directory. os.listdir returns entries in arbitrary order,
# so sort them to make the processing/log order reproducible across runs.
for filename in sorted(os.listdir(directory)):
    # Match the extension case-insensitively so scans saved as ".PDF" are not skipped.
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(directory, filename)
    page_stem = os.path.splitext(filename)[0]

    # Convert PDF to images (one image object per page)
    images = convert_from_path(pdf_path)

    # Save each page as "<pdf name>_page_<n>.png" next to the PDF (n starts at 1)
    for i, image in enumerate(images):
        image_filename = os.path.join(directory, f"{page_stem}_page_{i + 1}.png")
        image.save(image_filename, "PNG")

    # This cell is destructive: the source PDF is removed once its pages are written.
    # Keep the PDF when nothing was rendered so an empty conversion cannot lose the scan.
    if images:
        os.remove(pdf_path)
        print(f"Converted and deleted: {filename}")
    else:
        print(f"No pages rendered, keeping: {filename}")


## Iterating Images

In [None]:
import os

# Set the directory containing the images
directory = "Scanned PDFs"
index_file_path = "image iterator.txt" # Text File to store the current index

# Get the list of image files (sorted lexicographically by file name)
image_files = sorted(f for f in os.listdir(directory) if f.endswith(".png"))
if not image_files:
    # Without this guard the modulo below fails with an unhelpful ZeroDivisionError.
    raise FileNotFoundError(f"No .png images found in {directory!r}; run the PDF conversion cell first.")

# Read the current index from file; start at 0 if the file is missing or does not hold an integer
index = 0
if os.path.exists(index_file_path):
    with open(index_file_path, "r") as index_file:
        stored_index = index_file.read().strip()
    try:
        index = int(stored_index)
    except ValueError:
        print(f"Ignoring invalid index {stored_index!r} in {index_file_path}; starting from 0.")

# Ensure index wraps around if it exceeds the list length
index = index % len(image_files)

# Select the current image file path
image_path = os.path.join(directory, image_files[index])
print(f"Selected image: {image_path}")

# Update the index for the next run. Note this makes the cell intentionally non-idempotent:
# every execution advances to the next image.
index += 1
with open(index_file_path, "w") as index_file:
    index_file.write(str(index))


## Loading the template image for alignment

In [None]:
# Path to the image file
# NOTE(review): resolved relative to the working directory; no visible cell creates
# template.png, so it presumably has to be uploaded beforehand — confirm.
template_path = "template.png" # Path to the template image

# Build the aligner from the template; SIFTAligner comes from the cloned image_align module.
sift_aligner = SIFTAligner(template_path)

In [None]:
# Preview the template at large size; OpenCV loads BGR, matplotlib expects RGB.
template_rgb = cv2.cvtColor(cv2.imread(template_path), cv2.COLOR_BGR2RGB)
_, template_ax = plt.subplots(figsize=(20, 20))
template_ax.axis('off')
template_ax.imshow(template_rgb)

## Aligning the image and checking Similarity

In [None]:
# Load the scan selected by the iterator cell.
query = cv2.imread(image_path)
# cv2.imread reports failure by returning None instead of raising; stop here with a clear
# message rather than failing somewhere inside the aligner.
if query is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# Warp the scan onto the template using the aligner built from template_path.
aligned_sift = sift_aligner.align(query)

In [None]:
# Show the query scan and its aligned version side by side (two panels, so a 1x2 grid).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 10))

ax1.axis('off')
ax1.set_title("Query Image")
ax1.imshow(cv2.cvtColor(query, cv2.COLOR_BGR2RGB))

ax2.axis('off')
ax2.set_title("Sift Aligned Image")
ax2.imshow(cv2.cvtColor(aligned_sift, cv2.COLOR_BGR2RGB))
plt.show()

# Persist the aligned image. The directory that is created and the path that is written
# are derived from the same value, so the write cannot target a folder that was never made.
# The resulting path is the one the similarity check cell assigns to image_path.
aligned_dir = "/content/aligned"
aligned_path = os.path.join(aligned_dir, "sift-aligned.png")
os.makedirs(aligned_dir, exist_ok=True)
# cv2.imwrite returns False on failure instead of raising, so check it explicitly.
if not cv2.imwrite(aligned_path, aligned_sift):
    raise IOError(f"Could not write aligned image to {aligned_path}")

In [None]:
import cv2
def _color_hist_without_white(bgr_image):
    """Return the min-max normalised 256x256x256 colour histogram of an image.

    The bin for pure white (255, 255, 255) is zeroed before normalisation so that
    white pixels do not contribute to the comparison.
    """
    hist = cv2.calcHist([bgr_image], [0, 1, 2], None, [256, 256, 256], [0, 256, 0, 256, 0, 256])
    hist[255, 255, 255] = 0  # ignore all white pixels
    cv2.normalize(hist, hist, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX)
    return hist

# Compare the raw scan against its aligned version
image1 = query
image2 = aligned_sift
hist_img1 = _color_hist_without_white(image1)
hist_img2 = _color_hist_without_white(image2)

# Find the metric value (histogram correlation; higher means more alike)
metric_val = cv2.compareHist(hist_img1, hist_img2, cv2.HISTCMP_CORREL)
print("Similarity Score: ", round(metric_val, 2))
# NOTE(review): the output recorded in this notebook is 0.0, which fails the threshold
# check in the next cell — confirm that comparing query vs. aligned_sift is the intended pair.

Similarity Score:  0.0


In [None]:
# Switch to the aligned image only when the correlation lies strictly inside (0.89, 0.98).
# NOTE(review): scores of 0.98 and above also fall into the "not similar" branch — confirm
# the upper bound is intentional.
score_in_band = 0.89 < metric_val < 0.98
if not score_in_band:
    print("Images are not similar")
else:
    print("Images are similar")
    image_path = "/content/aligned/sift-aligned.png"

Images are not similar


## Using the Aligned Image with YOLO

In [None]:
# Define box colors and padding
# Key order is load-bearing: the detection code indexes list(BOX_COLORS) with the predicted
# class id, so positions 0/1/2 correspond to "unchecked"/"checked"/"block".
# NOTE(review): the tuples are handed directly to cv2 drawing calls on a cv2.imread image —
# confirm whether they were meant as RGB or BGR.
BOX_COLORS = {
    "unchecked": (242, 48, 48),
    "checked": (38, 115, 101),
    "block": (242, 159, 5)
}
# NOTE(review): BOX_PADDING is not referenced by any cell visible in this notebook.
BOX_PADDING = 2

def detect(image_path):
    """Detect checkboxes in an image file and return the image with annotations drawn on it.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        ``None`` when the image cannot be read; otherwise the loaded image carrying a
        coloured rectangle and a "<class> <confidence>" caption per detection (returned
        unmodified when the detector finds nothing).
    """
    canvas = cv2.imread(image_path)
    if canvas is None:
        print("Error loading image.")
        return None

    # A single image is predicted, so only the first result object is relevant.
    detections = DETECTION_MODEL.predict(source=canvas, conf=0.2, iou=0.8)[0].boxes
    if len(detections) == 0:
        return canvas

    # Class ids index into the key order of BOX_COLORS.
    class_names = list(BOX_COLORS)

    for det in detections:
        score = round(det.conf.item(), 2)
        class_name = class_names[int(det.cls)]
        corners = det.xyxy[0]
        top_left = (int(corners[0]), int(corners[1]))
        bottom_right = (int(corners[2]), int(corners[3]))

        canvas = cv2.rectangle(canvas, top_left, bottom_right, BOX_COLORS[class_name], 2)

        # The caption is anchored 5 px above the box's top-left corner.
        caption = f"{class_name} {score}"
        caption_origin = (top_left[0], top_left[1] - 5)
        cv2.putText(canvas, caption, caption_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    return canvas

# Run detection on the currently selected image
output_image = detect(image_path)

# Show the annotated result; OpenCV images are BGR while matplotlib expects RGB
if output_image is not None:
    output_image_rgb = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
    _, result_ax = plt.subplots(figsize=(10, 10))
    result_ax.imshow(output_image_rgb)
    result_ax.axis('off')
    plt.show()


In [None]:
# Define box colors and padding
# NOTE(review): identical to the definition in the previous cell; re-running just rebinds
# the same names. Key order matters because the class id indexes list(BOX_COLORS).
BOX_COLORS = {
    "unchecked": (242, 48, 48),
    "checked": (38, 115, 101),
    "block": (242, 159, 5)
}
# NOTE(review): BOX_PADDING is not referenced by any cell visible in this notebook.
BOX_PADDING = 2

def detect_and_display_rois(image_path):
    """Detect checkboxes, save and display each detection crop, and return the annotated image.

    Every detection is cropped to ``rois/roi_<idx>.png`` (directory created on demand,
    relative to the working directory) and shown with matplotlib. Crops are taken from an
    untouched copy of the image, so they never contain the rectangles or captions drawn
    for this or earlier detections.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        ``None`` when the image cannot be read; otherwise the loaded image with a rectangle
        and "<class> <confidence>" caption per detection (unmodified if nothing was detected).
    """
    image = cv2.imread(image_path)
    if image is None:
        print("Error loading image.")
        return None

    # Predict on image
    results = DETECTION_MODEL.predict(source=image, conf=0.2, iou=0.8)
    boxes = results[0].boxes  # Get bounding boxes

    if len(boxes) == 0:
        print("No boxes detected.")
        return image

    # Create a directory to save ROIs if it doesn't exist
    roi_dir = "rois"
    os.makedirs(roi_dir, exist_ok=True)

    # Annotations are drawn on `image`; crops come from this pristine copy.
    clean_image = image.copy()
    class_names = list(BOX_COLORS)  # class id -> name, by key order

    # Process each detected box
    for idx, box in enumerate(boxes):
        detection_class_conf = round(box.conf.item(), 2)
        detection_class = class_names[int(box.cls)]
        start_box = (int(box.xyxy[0][0]), int(box.xyxy[0][1]))
        end_box = (int(box.xyxy[0][2]), int(box.xyxy[0][3]))

        # Extract the Region of Interest (ROI) from the unannotated pixels
        roi = clean_image[start_box[1]:end_box[1], start_box[0]:end_box[0]]

        # Draw bounding box on the output image
        image = cv2.rectangle(image, start_box, end_box, BOX_COLORS[detection_class], 2)

        # Degenerate (zero-area) boxes produce an empty crop; skip saving those
        if roi.size > 0:
            roi_filename = os.path.join(roi_dir, f"roi_{idx}.png")
            cv2.imwrite(roi_filename, roi)
            print(f"Saved ROI: {roi_filename}")

            # Display the ROI using Matplotlib
            plt.figure(figsize=(5, 5))
            plt.imshow(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))
            plt.axis('off')
            plt.title(f"ROI {idx} - {detection_class} {detection_class_conf}")
            plt.show()

        # Label, anchored 5 px above the box's top-left corner
        label = f"{detection_class} {detection_class_conf}"
        cv2.putText(image, label, (start_box[0], start_box[1]-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    return image

# Detect boxes, save/show every crop, and keep the annotated page
output_image = detect_and_display_rois(image_path)

# Render the annotated page (converted from OpenCV's BGR to RGB first)
if output_image is not None:
    output_image_rgb = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
    _, page_ax = plt.subplots(figsize=(10, 10))
    page_ax.imshow(output_image_rgb)
    page_ax.set_axis_off()
    plt.show()


## Extracting Checkboxes and their confidence Scores

In [None]:
# Initialize EasyOCR Reader
# Created once at cell level and read as a global by the function defined below.
ocr_reader = easyocr.Reader(['en'])  # Specify the language for OCR

# Define box colors and padding
# Key order is load-bearing: the predicted class id indexes list(BOX_COLORS).
BOX_COLORS = {
    "unchecked": (242, 48, 48),
    "checked": (38, 115, 101),
    "block": (242, 159, 5)
}
# NOTE(review): BOX_PADDING is not referenced by any cell visible in this notebook.
BOX_PADDING = 2

def detect_and_display_checked_rois(image_path):
    """Annotate all detections and, for each "checked" box, save its crop and OCR the text to its right.

    For every detection classified as "checked" the crop is written to
    ``checked_rois/checked_roi_<idx>.png``, displayed, and a strip up to 200 px wide
    immediately right of the box is passed to the global ``ocr_reader``. Crops and OCR
    strips are taken from an untouched copy of the image so that drawn rectangles and
    captions never leak into them, and the strip is clamped to the image bounds.

    NOTE: a later cell in this notebook defines another function with the same name
    that returns a tuple; whichever cell ran last wins.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        ``None`` when the image cannot be read; otherwise the loaded image with a rectangle
        and "<class> <confidence>" caption per detection (unmodified if nothing was detected).
    """
    image = cv2.imread(image_path)
    if image is None:
        print("Error loading image.")
        return None

    # Predict on image
    results = DETECTION_MODEL.predict(source=image, conf=0.2, iou=0.8)
    boxes = results[0].boxes  # Get bounding boxes

    if len(boxes) == 0:
        print("No boxes detected.")
        return image

    # Create a directory to save ROIs if it doesn't exist
    roi_dir = "checked_rois"
    os.makedirs(roi_dir, exist_ok=True)

    # Annotations go on `image`; every crop is read from this pristine copy.
    clean_image = image.copy()
    img_height, img_width = clean_image.shape[:2]
    class_names = list(BOX_COLORS)  # class id -> name, by key order

    # Process each detected box
    for idx, box in enumerate(boxes):
        detection_class_conf = round(box.conf.item(), 2)
        detection_class = class_names[int(box.cls)]
        start_box = (int(box.xyxy[0][0]), int(box.xyxy[0][1]))
        end_box = (int(box.xyxy[0][2]), int(box.xyxy[0][3]))

        # Draw bounding box on the output image
        image = cv2.rectangle(image, start_box, end_box, BOX_COLORS[detection_class], 2)

        # Check if the detected box is classified as "checked"
        if detection_class == "checked":
            # Extract the Region of Interest (ROI) from the unannotated pixels and save it
            roi = clean_image[start_box[1]:end_box[1], start_box[0]:end_box[0]]
            roi_filename = os.path.join(roi_dir, f"checked_roi_{idx}.png")
            cv2.imwrite(roi_filename, roi)
            print(f"Saved Checked ROI: {roi_filename}")

            # Display the ROI using Matplotlib
            plt.figure(figsize=(5, 5))
            plt.imshow(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))
            plt.axis('off')
            plt.title(f"Checked ROI {idx} - {detection_class} {detection_class_conf}")
            plt.show()

            # Region right of the box: 5 px gap, up to 200 px wide, 5 px vertical margin.
            # Clamp to the image: a negative start would otherwise wrap around as a
            # numpy "from the end" index and select the wrong (usually empty) area.
            text_start_x = min(end_box[0] + 5, img_width)
            text_end_x = min(text_start_x + 200, img_width)
            text_start_y = max(start_box[1] - 5, 0)
            text_end_y = min(end_box[1] + 5, img_height)

            # Crop the area where the label is expected to be
            label_region = clean_image[text_start_y:text_end_y, text_start_x:text_end_x]

            # Use EasyOCR to read text from the cropped area; nothing to read if the
            # box touches the right edge and the strip is empty
            if label_region.size == 0:
                extracted_text = ""
            else:
                text_results = ocr_reader.readtext(label_region)
                # Each result's second element is the recognised string
                extracted_text = " ".join(result[1] for result in text_results)
            print(f"Extracted Text for Checked ROI {idx}: {extracted_text}")

        # Label, anchored 5 px above the box's top-left corner
        label = f"{detection_class} {detection_class_conf}"
        cv2.putText(image, label, (start_box[0], start_box[1]-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    return image

# Detect boxes, handle the checked ones, and keep the annotated page
output_image = detect_and_display_checked_rois(image_path)

# Render the annotated page; convert BGR (OpenCV) to RGB (matplotlib) first
if output_image is not None:
    output_image_rgb = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
    _, checked_ax = plt.subplots(figsize=(10, 10))
    checked_ax.imshow(output_image_rgb)
    checked_ax.axis('off')
    plt.show()


## Picking the highest-scoring checked box from each row

In [None]:
import cv2
import numpy as np
import easyocr
import os
import matplotlib.pyplot as plt

# Initialize EasyOCR Reader
# NOTE(review): an earlier cell also constructs a reader; this rebinds ocr_reader to a new instance.
ocr_reader = easyocr.Reader(['en'])

# Define box colors and padding
# In this cell BOX_COLORS is only used for its key order (class id -> class name); the
# rectangles drawn by the function below use a hard-coded colour instead.
BOX_COLORS = {
    "unchecked": (242, 48, 48),  # Red for unchecked
    "checked": (0, 255, 0),      # Green for checked
    "block": (242, 159, 5)       # Yellow for block
}
# NOTE(review): BOX_PADDING is not referenced by any cell visible in this notebook.
BOX_PADDING = 2
# Two checked boxes whose top edges are closer than this many pixels are treated as the same row.
ROW_THRESHOLD = 200  # Adjust as necessary based on row spacing in the images
EXPECTED_ROWS = 6  # Expected number of rows

def _best_box_per_row(checked_boxes, row_threshold):
    """Group y-sorted boxes into rows and return the most confident box of each row.

    A box joins the current row while its top edge is less than ``row_threshold`` pixels
    below the previous box's top edge; otherwise it opens a new row. On a confidence tie
    the topmost box of the row is kept.
    """
    rows = []
    last_y = None
    for box in checked_boxes:
        top = box["start"][1]
        if last_y is None or abs(top - last_y) >= row_threshold:
            rows.append([])
        rows[-1].append(box)
        last_y = top
    return [max(row, key=lambda b: b["confidence"]) for row in rows]


def _row_gap_info(selected_boxes, typical_gap=(200, 300)):
    """Describe the vertical gap between each pair of consecutive selected boxes.

    Gaps inside ``typical_gap`` (inclusive, pixels) define the average row spacing; a gap
    larger than twice that average is reported with an estimated number of skipped rows.
    When no gap is in the typical range the spacing is unknown and every estimate is 0.
    """
    gaps = [
        abs(lower["start"][1] - upper["start"][1])
        for upper, lower in zip(selected_boxes, selected_boxes[1:])
    ]
    low, high = typical_gap
    typical = [gap for gap in gaps if low <= gap <= high]
    avg_typical = sum(typical) / len(typical) if typical else 0

    gap_info = {}
    for i, gap in enumerate(gaps, start=1):
        # Guard avg_typical == 0: dividing by it used to raise ZeroDivisionError.
        if avg_typical > 0 and gap > avg_typical * 2:
            estimated_missing = round(gap / avg_typical) - 1
        else:
            estimated_missing = 0
        gap_info[f"Row {i} to Row {i+1}"] = {
            "gap": gap,
            "estimated_missing_rows": estimated_missing,
        }
    return gap_info


def detect_and_display_checked_rois(image_path):
    """Highlight the most confident checked box of every row and report the gaps between rows.

    Checked detections are grouped into rows by vertical position (global ``ROW_THRESHOLD``),
    the highest-confidence box of each row is outlined on an overlay, and the text to the
    right of it is read with the global ``ocr_reader``.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        Always a 2-tuple ``(image, gap_info)`` so callers can unpack unconditionally:
        ``(None, {})`` when the image cannot be read, ``(loaded image, {})`` when nothing
        was detected, otherwise the blended output image and a dict keyed
        "Row i to Row i+1" with ``gap`` and ``estimated_missing_rows`` entries.
    """
    image = cv2.imread(image_path)
    if image is None:
        print("Error loading image.")
        return None, {}

    # Predict on image
    results = DETECTION_MODEL.predict(source=image, conf=0.2, iou=0.8)
    boxes = results[0].boxes  # Get bounding boxes

    if len(boxes) == 0:
        print("No boxes detected.")
        return image, {}

    # Kept for parity with the earlier cell; this version does not write any crops.
    roi_dir = "checked_rois"
    os.makedirs(roi_dir, exist_ok=True)

    # Create an overlay image to highlight checked boxes
    overlay_image = np.zeros_like(image)
    img_height, img_width = image.shape[:2]
    class_names = list(BOX_COLORS)  # class id -> name, by key order

    # Collect the checked detections
    checked_boxes = []
    for idx, box in enumerate(boxes):
        if class_names[int(box.cls)] != "checked":
            continue
        checked_boxes.append({
            "confidence": round(box.conf.item(), 2),
            "start": (int(box.xyxy[0][0]), int(box.xyxy[0][1])),
            "end": (int(box.xyxy[0][2]), int(box.xyxy[0][3])),
            "idx": idx
        })

    # Sort boxes by their y-coordinate so rows can be found with a single pass
    checked_boxes.sort(key=lambda box: box["start"][1])
    selected_boxes = _best_box_per_row(checked_boxes, ROW_THRESHOLD)

    if len(selected_boxes) < EXPECTED_ROWS:
        print(f"Warning: found {len(selected_boxes)} of {EXPECTED_ROWS} expected rows with a checked box.")

    gap_info = _row_gap_info(selected_boxes)

    print("Gap Information Between Rows:")
    print(gap_info)

    # Draw and process the selected checkboxes. Rows are numbered from 1 in top-to-bottom
    # order, matching the numbering used in the gap_info keys.
    for row_number, box in enumerate(selected_boxes, start=1):
        start_box = box["start"]
        end_box = box["end"]
        confidence = box["confidence"]

        # Draw bounding box on the overlay image
        overlay_image = cv2.rectangle(overlay_image, start_box, end_box, (0, 0, 255), 5)

        # Strip right of the box: 5 px gap, up to 200 px wide, 5 px vertical margin,
        # clamped to the image so a negative start cannot wrap around as a numpy index.
        text_start_x = min(end_box[0] + 5, img_width)
        text_end_x = min(text_start_x + 200, img_width)
        text_start_y = max(start_box[1] - 5, 0)
        text_end_y = min(end_box[1] + 5, img_height)

        label_region = image[text_start_y:text_end_y, text_start_x:text_end_x]
        if label_region.size == 0:
            extracted_text = ""
        else:
            text_results = ocr_reader.readtext(label_region)
            # Each result's second element is the recognised string
            extracted_text = " ".join(result[1] for result in text_results)
        print(f"Extracted Text for Selected Checked ROI in Row {row_number}: {extracted_text}")

        label = f"checked {confidence}"
        cv2.putText(overlay_image, label, (start_box[0], start_box[1]-5), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), thickness=3)

    # Blend: dim the page slightly and add the overlay on top
    final_image = cv2.addWeighted(image, 0.6, overlay_image, 1.4, 0)

    return final_image, gap_info

# Select one checked box per row; also keep the row-gap summary for the next cell
output_image, gap_info = detect_and_display_checked_rois(image_path)

# Render the blended result (BGR -> RGB for matplotlib)
if output_image is not None:
    output_image_rgb = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
    _, rows_ax = plt.subplots(figsize=(10, 10))
    rows_ax.imshow(output_image_rgb)
    rows_ax.axis('off')
    rows_ax.set_title("Output Image with Selected Checked ROIs Highlighted")
    plt.show()


In [None]:
# Now you have the gap information between rows
# gap_info comes from the previous cell's call: one entry per consecutive pair of selected
# rows, holding the pixel gap and the estimated number of missing rows.
print("Detected Gaps Between Rows:", gap_info)

Detected Gaps Between Rows: {'Row 1 to Row 2': {'gap': 269, 'estimated_missing_rows': 0}, 'Row 2 to Row 3': {'gap': 263, 'estimated_missing_rows': 0}, 'Row 3 to Row 4': {'gap': 268, 'estimated_missing_rows': 0}, 'Row 4 to Row 5': {'gap': 266, 'estimated_missing_rows': 0}, 'Row 5 to Row 6': {'gap': 277, 'estimated_missing_rows': 0}}


In [None]:
def detect_and_display_checked_unchecked_rois(image_path):
    """Count and highlight checked and unchecked boxes after collapsing overlapping detections.

    Within each class, detections are sorted top-to-bottom and any later box whose
    rectangle intersects an already kept box is dropped. The survivors are outlined on an
    overlay (one colour per class) which is blended onto the page.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        ``None`` when the image cannot be read, the unmodified image when nothing is
        detected, otherwise the blended image with the highlighted boxes.
    """

    def is_intersecting(box1, box2):
        """True when the two rectangles share at least one point (touching edges count)."""
        ax1, ay1 = box1["start"]
        ax2, ay2 = box1["end"]
        bx1, by1 = box2["start"]
        bx2, by2 = box2["end"]
        return ax2 >= bx1 and bx2 >= ax1 and ay2 >= by1 and by2 >= ay1

    def filter_overlapping_boxes(boxes):
        """Greedy pass: keep a box, then suppress every later box that intersects it."""
        kept = []
        suppressed = set()
        for i, anchor in enumerate(boxes):
            if i in suppressed:
                continue
            kept.append(anchor)
            hits = [
                j for j in range(i + 1, len(boxes))
                if j not in suppressed and is_intersecting(anchor, boxes[j])
            ]
            suppressed.update(hits)
        return kept

    image = cv2.imread(image_path)
    if image is None:
        print("Error loading image.")
        return None

    # A single image is predicted, so only the first result object is relevant.
    boxes = DETECTION_MODEL.predict(source=image, conf=0.2, iou=0.8)[0].boxes
    if len(boxes) == 0:
        print("No boxes detected.")
        return image

    # Output folders are created up front even though this function writes no crops.
    for roi_dir in ("checked_rois", "unchecked_rois"):
        os.makedirs(roi_dir, exist_ok=True)

    # Rectangles are drawn on a black canvas and blended onto the page at the end.
    overlay_image = np.zeros_like(image)

    # Bucket detections by class; "block" detections are ignored here.
    class_names = list(BOX_COLORS)
    by_class = {"checked": [], "unchecked": []}
    for idx, box in enumerate(boxes):
        bucket = by_class.get(class_names[int(box.cls)])
        if bucket is None:
            continue
        corners = box.xyxy[0]
        bucket.append({
            "start": (int(corners[0]), int(corners[1])),
            "end": (int(corners[2]), int(corners[3])),
            "idx": idx,
        })

    # Top-to-bottom order decides which box of an overlapping group survives.
    for bucket in by_class.values():
        bucket.sort(key=lambda entry: entry["start"][1])

    selected_checked_boxes = filter_overlapping_boxes(by_class["checked"])
    selected_unchecked_boxes = filter_overlapping_boxes(by_class["unchecked"])

    print(f"Number of checked boxes: {len(selected_checked_boxes)}")
    print(f"Number of unchecked boxes: {len(selected_unchecked_boxes)}")

    # Checked boxes first, then unchecked, each class in its own colour.
    for survivors, colour in (
        (selected_checked_boxes, (0, 0, 255)),
        (selected_unchecked_boxes, (255, 0, 0)),
    ):
        for entry in survivors:
            overlay_image = cv2.rectangle(overlay_image, entry["start"], entry["end"], colour, 5)

    return cv2.addWeighted(image, 0.6, overlay_image, 1.4, 0)

# Count and highlight the checked / unchecked boxes of the selected image
output_image = detect_and_display_checked_unchecked_rois(image_path)

# Render the blended result (BGR -> RGB for matplotlib)
if output_image is not None:
    output_image_rgb = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
    _, summary_ax = plt.subplots(figsize=(10, 10))
    summary_ax.imshow(output_image_rgb)
    summary_ax.axis('off')
    summary_ax.set_title("Output Image with Selected Checked and Unchecked ROIs Highlighted")
    plt.show()


## Extracting Text Blocks for OCR

In [None]:
import easyocr
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Define box colors and padding
# Key order is load-bearing: the predicted class id indexes list(BOX_COLORS), and the
# "block" entry also supplies the rectangle colour used below.
BOX_COLORS = {
    "unchecked": (242, 48, 48),
    "checked": (38, 115, 101),
    "block": (242, 159, 5)
}
# NOTE(review): BOX_PADDING is not referenced by any cell visible in this notebook.
BOX_PADDING = 2

def iou(box1, box2):
    """Return the Intersection over Union of two ``(x1, y1, x2, y2)`` boxes.

    Widths and heights use the pixel-inclusive convention (``x2 - x1 + 1``), so two boxes
    that share only an edge column or row still have a non-zero intersection.
    """
    def _area(left, top, right, bottom):
        return (right - left + 1) * (bottom - top + 1)

    # Overlap extent along each axis, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]) + 1)
    overlap_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]) + 1)
    intersection = overlap_w * overlap_h

    union = _area(box1[0], box1[1], box1[2], box1[3]) + _area(box2[0], box2[1], box2[2], box2[3]) - intersection
    return intersection / float(union)

def detect_and_display_block_rois(image_path):
    """Save and display a crop for every de-duplicated "block" detection; return the annotated image.

    Block detections that overlap (IoU above 0.3) are reduced to the most confident one.
    Each survivor is cropped to ``/content/block_rois/block_roi_<idx>.png`` and shown.
    Crops are taken from an untouched copy of the image, so the files written for OCR
    contain no drawn rectangle borders or captions.

    NOTE: a later cell in this notebook defines another function with the same name that
    takes an extra ``save_dir`` argument; whichever cell ran last wins.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        ``None`` when the image cannot be read; otherwise the loaded image with a rectangle
        and "block <confidence>" caption per kept block (unmodified if nothing was detected).
    """
    image = cv2.imread(image_path)
    if image is None:
        print("Error loading image.")
        return None

    # Predict on image
    results = DETECTION_MODEL.predict(source=image, conf=0.2, iou=0.8)
    boxes = results[0].boxes  # Get bounding boxes

    if len(boxes) == 0:
        print("No boxes detected.")
        return image

    # Create a directory to save ROIs if it doesn't exist
    roi_dir = "/content/block_rois"
    os.makedirs(roi_dir, exist_ok=True)

    # Filter out only "block" detections and store their information
    class_names = list(BOX_COLORS)  # class id -> name, by key order
    block_boxes = []
    for box in boxes:
        if class_names[int(box.cls)] != "block":
            continue
        x1, y1, x2, y2 = int(box.xyxy[0][0]), int(box.xyxy[0][1]), int(box.xyxy[0][2]), int(box.xyxy[0][3])
        block_boxes.append({"coords": (x1, y1, x2, y2), "conf": box.conf.item()})

    # Filter overlapping blocks, keeping only the highest confidence box in each overlap
    filtered_blocks = []
    for i, candidate in enumerate(block_boxes):
        suppressed = False
        for j, other in enumerate(block_boxes):
            if i == j or iou(candidate["coords"], other["coords"]) <= 0.3:
                continue
            # `other` wins on higher confidence; on an exact tie the earlier detection
            # wins, so two equal-confidence overlapping boxes no longer both survive.
            if other["conf"] > candidate["conf"] or (other["conf"] == candidate["conf"] and j < i):
                suppressed = True
                break
        if not suppressed:
            filtered_blocks.append(candidate)

    # Annotations go on `image`; every crop is read from this pristine copy.
    clean_image = image.copy()

    # Process each filtered block
    for idx, box in enumerate(filtered_blocks):
        start_box = (box["coords"][0], box["coords"][1])
        end_box = (box["coords"][2], box["coords"][3])

        # Extract the Region of Interest (ROI) for the block from the unannotated pixels
        roi = clean_image[start_box[1]:end_box[1], start_box[0]:end_box[0]]

        # Draw bounding box on the output image
        image = cv2.rectangle(image, start_box, end_box, BOX_COLORS["block"], 2)

        # Save the ROI as a separate image
        roi_filename = os.path.join(roi_dir, f"block_roi_{idx}.png")
        cv2.imwrite(roi_filename, roi)
        print(f"Saved Block ROI: {roi_filename}")

        # Display the ROI using Matplotlib
        plt.figure(figsize=(5, 5))
        plt.imshow(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title(f"Block ROI {idx} - block {round(box['conf'], 2)}")
        plt.show()

        # Label, anchored 5 px above the box's top-left corner
        label = f"block {round(box['conf'], 2)}"
        cv2.putText(image, label, (start_box[0], start_box[1] - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    return image

# Extract, save and show the block crops of the selected image
output_image = detect_and_display_block_rois(image_path)

# Render the annotated page (BGR -> RGB for matplotlib)
if output_image is not None:
    output_image_rgb = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
    _, blocks_ax = plt.subplots(figsize=(10, 10))
    blocks_ax.imshow(output_image_rgb)
    blocks_ax.axis('off')
    plt.show()


In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Define box colors and padding
# NOTE(review): same values as the definitions in earlier cells. Key order matters because
# the predicted class id indexes list(BOX_COLORS); "block" also supplies the rectangle colour.
BOX_COLORS = {
    "unchecked": (242, 48, 48),
    "checked": (38, 115, 101),
    "block": (242, 159, 5)
}
# NOTE(review): BOX_PADDING is not referenced by any cell visible in this notebook.
BOX_PADDING = 2

def iou(box1, box2):
    """Compute Intersection over Union for two boxes given as ``(x1, y1, x2, y2)``.

    Uses pixel-inclusive extents (``+ 1`` on every width and height).
    """
    # Corners of the overlapping rectangle (may be inverted when the boxes are disjoint).
    left, top = max(box1[0], box2[0]), max(box1[1], box2[1])
    right, bottom = min(box1[2], box2[2]), min(box1[3], box2[3])
    shared = max(0, right - left + 1) * max(0, bottom - top + 1)

    # Individual areas, first box then second box.
    first_area, second_area = (
        (b[2] - b[0] + 1) * (b[3] - b[1] + 1) for b in (box1, box2)
    )
    return shared / float(first_area + second_area - shared)

def detect_and_display_block_rois(image_path, save_dir):
    """Save and display a crop for every de-duplicated "block" detection of one image.

    Block detections that overlap (IoU above 0.3) are reduced to the most confident one.
    Each survivor is cropped to ``<save_dir>/block_roi_<idx>.png`` and shown. Crops are
    taken from an untouched copy of the image, so the saved files contain no drawn
    rectangle borders or captions.

    Args:
        image_path: Path handed to ``cv2.imread``.
        save_dir: Directory that receives the crops; created if it does not exist.

    Returns:
        ``None`` when the image cannot be read or nothing is detected; otherwise the
        loaded image with a rectangle and "block <confidence>" caption per kept block.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error loading image: {image_path}")
        return

    # Predict on image
    results = DETECTION_MODEL.predict(source=image, conf=0.2, iou=0.8)
    boxes = results[0].boxes  # Get bounding boxes

    if len(boxes) == 0:
        print(f"No boxes detected in {image_path}.")
        return

    # Filter out only "block" detections and store their information
    class_names = list(BOX_COLORS)  # class id -> name, by key order
    block_boxes = []
    for box in boxes:
        if class_names[int(box.cls)] != "block":
            continue
        x1, y1, x2, y2 = int(box.xyxy[0][0]), int(box.xyxy[0][1]), int(box.xyxy[0][2]), int(box.xyxy[0][3])
        block_boxes.append({"coords": (x1, y1, x2, y2), "conf": box.conf.item()})

    # Filter overlapping blocks, keeping only the highest confidence box in each overlap
    filtered_blocks = []
    for i, candidate in enumerate(block_boxes):
        suppressed = False
        for j, other in enumerate(block_boxes):
            if i == j or iou(candidate["coords"], other["coords"]) <= 0.3:
                continue
            # `other` wins on higher confidence; on an exact tie the earlier detection
            # wins, so two equal-confidence overlapping boxes no longer both survive.
            if other["conf"] > candidate["conf"] or (other["conf"] == candidate["conf"] and j < i):
                suppressed = True
                break
        if not suppressed:
            filtered_blocks.append(candidate)

    # cv2.imwrite fails silently when the target folder is missing, so make sure it exists
    # even when the function is called directly rather than through process_directory.
    os.makedirs(save_dir, exist_ok=True)

    # Annotations go on `image`; every crop is read from this pristine copy.
    clean_image = image.copy()

    # Process each filtered block
    for idx, box in enumerate(filtered_blocks):
        start_box = (box["coords"][0], box["coords"][1])
        end_box = (box["coords"][2], box["coords"][3])

        # Extract the Region of Interest (ROI) for the block from the unannotated pixels
        roi = clean_image[start_box[1]:end_box[1], start_box[0]:end_box[0]]

        # Draw bounding box on the output image
        image = cv2.rectangle(image, start_box, end_box, BOX_COLORS["block"], 2)

        # Save the ROI as a separate image
        roi_filename = os.path.join(save_dir, f"block_roi_{idx}.png")
        cv2.imwrite(roi_filename, roi)
        print(f"Saved Block ROI: {roi_filename}")

        # Display the ROI using Matplotlib
        plt.figure(figsize=(5, 5))
        plt.imshow(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title(f"Block ROI {idx} - block {round(box['conf'], 2)}")
        plt.show()

        # Label, anchored 5 px above the box's top-left corner
        label = f"block {round(box['conf'], 2)}"
        cv2.putText(image, label, (start_box[0], start_box[1] - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    return image

def process_directory(input_dir, output_dir):
    """Run block-ROI extraction on every ``.png`` file found directly in ``input_dir``.

    For each image a sub-folder named after the file (extension stripped) is created
    under ``output_dir`` and passed to ``detect_and_display_block_rois`` as the place
    to store that image's crops.
    """
    png_names = (name for name in os.listdir(input_dir) if name.endswith(".png"))
    for png_name in png_names:
        source_path = os.path.join(input_dir, png_name)
        print(f"Processing: {source_path}")

        # One output folder per input image keeps the block_roi_<n>.png names from colliding.
        target_dir = os.path.join(output_dir, os.path.splitext(png_name)[0])
        os.makedirs(target_dir, exist_ok=True)

        detect_and_display_block_rois(source_path, target_dir)

# Input and output directories
# NOTE(review): both are absolute paths under the mounted Google Drive, so this cell only
# works after drive.mount() and with this exact folder layout.
input_directory = "/content/drive/MyDrive/GLKS/Completed Scans"
output_directory = "/content/drive/MyDrive/GLKS/block_rois"

# Process the directory
# Runs as soon as the cell executes; one sub-folder per input PNG is created under output_directory.
process_directory(input_directory, output_directory)


In [None]:
from transformers import TrOCRProcessor
import torch
from transformers import VisionEncoderDecoderModel

# Load the TrOCR handwritten checkpoint. The later cells use `processor` to turn images into
# pixel tensors and to decode generated ids, and `model` to generate those ids.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")
# Use the GPU when one is available; inputs are moved to the same device before generate().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
import os
import requests
from PIL import Image
import cv2
import numpy as np
from matplotlib import pyplot as plt
import torch

# Define the path to the directory containing the images
image_directory = "/content/block_rois"

# Iterate over all images in the directory (sorted so the output order is reproducible)
for filename in sorted(os.listdir(image_directory)):
    # Use a dedicated name: reusing `image_path` here would overwrite the notebook-level
    # scan path that the detection cells read, breaking any re-run of those cells.
    roi_path = os.path.join(image_directory, filename)

    # Read the image in grayscale; cv2.imread returns None for anything it cannot decode
    image = cv2.imread(roi_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Skipping unreadable file: {filename}")
        continue

    # Apply a binary threshold, then denoise the result
    _, binary_image = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    binary_image = cv2.fastNlMeansDenoising(binary_image, None, h=30)  # h is the filter strength; adjust as needed

    # Convert to a 3-channel RGB PIL image for the processor. This only changes the image
    # mode; any pixel scaling is presumably done inside the processor — confirm.
    binary_image = Image.fromarray(binary_image).convert('RGB')
    pixel_values = processor(binary_image, return_tensors="pt").pixel_values.to(device)

    # Generate text using the model (inference only, so gradients are disabled)
    with torch.no_grad():
        generated_ids = model.generate(pixel_values, num_beams=10, early_stopping=True)

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Display the image and the extracted generated text
    plt.imshow(binary_image)
    plt.axis('off')
    plt.show()
    print(f"Extracted text from {filename}: {generated_text}")

In [None]:
import os
import requests
from PIL import Image
import cv2
import numpy as np
from matplotlib import pyplot as plt
import torch

# Define the path to the directory containing the images
image_directory = "/content/block_rois"

# Iterate over all images in the directory (sorted so the output order is reproducible)
for filename in sorted(os.listdir(image_directory)):
    # Use a dedicated name: reusing `image_path` here would overwrite the notebook-level
    # scan path that the detection cells read, breaking any re-run of those cells.
    roi_path = os.path.join(image_directory, filename)

    # Read the image in grayscale; cv2.imread returns None for anything it cannot decode
    image = cv2.imread(roi_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Skipping unreadable file: {filename}")
        continue

    # Variant without thresholding: the grayscale crop is only denoised. The variable keeps
    # the name `binary_image` from the thresholded variant although it is not binarised here.
    binary_image = cv2.fastNlMeansDenoising(image, None, h=30)  # h is the filter strength; adjust as needed

    # Convert to a 3-channel RGB PIL image for the processor. This only changes the image
    # mode; any pixel scaling is presumably done inside the processor — confirm.
    binary_image = Image.fromarray(binary_image).convert('RGB')
    pixel_values = processor(binary_image, return_tensors="pt").pixel_values.to(device)

    # Generate text using the model (inference only, so gradients are disabled)
    with torch.no_grad():
        generated_ids = model.generate(pixel_values, num_beams=10, early_stopping=True)

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Display the image and the extracted generated text
    plt.imshow(binary_image)
    plt.axis('off')
    plt.show()
    print(f"Extracted text from {filename}: {generated_text}")