In [36]:
import cv2
import pytesseract
from bs4 import BeautifulSoup

In [38]:
def detect_headers_footers(image_path):
    """Split Tesseract character boxes into header and footer candidates.

    The image is read in grayscale and passed to ``pytesseract.image_to_boxes``.
    Each output line has the form ``<char> <left> <bottom> <right> <top> <page>``
    with coordinates measured from the *bottom-left* corner of the image, so the
    vertical position is converted to a distance from the top edge before it is
    compared against the page thirds.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A ``(headers, footers)`` tuple of lists. Every entry is the raw
        ``(left, bottom, right, top)`` box of one character whose top edge lies
        in the upper third (headers) or lower third (footers) of the image.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")
    page_height = image.shape[0]

    raw_boxes = pytesseract.image_to_boxes(image, config='--oem 1 --psm 6')

    headers = []
    footers = []
    for line in raw_boxes.splitlines():
        fields = line.split()
        if len(fields) < 6:
            continue  # not a complete "<char> l b r t page" record
        try:
            # Index from the end so the four coordinates are found regardless
            # of what the leading character field looks like.
            left, bottom, right, top = (int(value) for value in fields[-5:-1])
        except ValueError:
            continue
        box = (left, bottom, right, top)

        # Box coordinates grow upwards; flip to "distance from the top edge".
        distance_from_top = page_height - top
        if distance_from_top < page_height / 3:
            headers.append(box)
        elif distance_from_top > 2 * page_height / 3:
            footers.append(box)
    return headers, footers
# ---------------- (detecting headers and footers)

In [41]:
def perform_ocr_with_hocr(image_path):
    """Run Tesseract on ``image_path`` and return its hOCR output.

    The work is delegated to ``pytesseract.image_to_pdf_or_hocr`` with
    ``extension='hocr'`` and ``config='--oem 1'``; its result is returned
    unmodified.
    """
    ocr_options = {'extension': 'hocr', 'config': '--oem 1'}
    return pytesseract.image_to_pdf_or_hocr(image_path, **ocr_options)

In [44]:
def extract_header_footer_positions(hocr_output, page_height=None):
    """Collect the words of an hOCR document that sit in its top or bottom third.

    The hOCR markup is parsed with BeautifulSoup and every element carrying the
    ``ocrx_word`` class is inspected. A word's ``title`` attribute holds a
    ``bbox x0 y0 x1 y1`` property (top-left origin); the word is classified by
    the ``y0`` of that box.

    Args:
        hocr_output: hOCR markup (``str`` or ``bytes``).
        page_height: Page height in the same coordinates as the word boxes.
            When omitted it is taken from the ``bbox`` of the first
            ``ocr_page`` element in the document.

    Returns:
        A ``(headers, footers)`` tuple of dicts keyed by the word's hOCR ``id``.
        Each value is ``{'text': <word text>, 'position': (x0, y0, x1, y1)}``.
        Words without an ``id`` or without a parsable ``bbox`` are skipped.

    Raises:
        ValueError: If words were found but no page height was given and none
            could be read from the document.
    """
    soup = BeautifulSoup(hocr_output, 'html.parser')

    words = []
    for element in soup.find_all(class_='ocrx_word'):
        word_id = element.get('id')
        bbox = _parse_hocr_bbox(element.get('title', ''))
        if word_id is None or bbox is None:
            continue
        words.append((word_id, element.get_text(), bbox))

    headers = {}
    footers = {}
    if not words:
        return headers, footers

    if page_height is None:
        page = soup.find(class_='ocr_page')
        page_bbox = _parse_hocr_bbox(page.get('title', '')) if page is not None else None
        if page_bbox is None:
            raise ValueError(
                "Cannot determine page height: pass page_height or provide "
                "hOCR with an ocr_page bbox"
            )
        # The page box is "bbox 0 0 <width> <height>", so y1 is the height.
        page_height = page_bbox[3]

    for word_id, word_text, bbox in words:
        top_edge = bbox[1]
        entry = {'text': word_text, 'position': bbox}
        if top_edge < page_height / 3:
            headers[word_id] = entry
        elif top_edge > 2 * page_height / 3:
            footers[word_id] = entry
    return headers, footers


def _parse_hocr_bbox(title):
    """Return the ``bbox`` property of an hOCR ``title`` string as four ints, or None."""
    # A title is a ';'-separated property list, e.g.
    # 'image "page.png"; bbox 0 0 200 300; ppageno 0' -- bbox need not come first.
    for prop in title.split(';'):
        fields = prop.split()
        if len(fields) == 5 and fields[0] == 'bbox':
            try:
                return tuple(int(value) for value in fields[1:])
            except ValueError:
                return None
    return None

In [None]:
def extract_content_text(image_path, content_boxes):
    """Run OCR on each content region of an image.

    Args:
        image_path: Path of the image file to read (loaded once, in grayscale).
        content_boxes: Iterable of ``(x, y, w, h)`` boxes. A single bare
            ``(x, y, w, h)`` box is also accepted and treated as one region.

    Returns:
        A list of ``(box, text)`` tuples, one per box in input order, where
        ``text`` is the ``pytesseract.image_to_string`` result for the crop
        ``image[y:y + h, x:x + w]``.

    Raises:
        FileNotFoundError: If there are boxes to process and OpenCV cannot
            read ``image_path``.
    """
    boxes = list(content_boxes)
    # Four scalar values are one box, not four boxes: wrap it so the loop below
    # does not try to unpack an int.
    if len(boxes) == 4 and not any(hasattr(value, '__len__') for value in boxes):
        boxes = [content_boxes]
    if not boxes:
        return []

    # Load the page once instead of re-reading the file for every box.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    # --psm 6 makes Tesseract treat each crop as a single uniform block of text.
    custom_config = r'--oem 3 --psm 6'

    content_text = []
    for box in boxes:
        x, y, w, h = box
        region = image[y:y + h, x:x + w]
        content = pytesseract.image_to_string(region, config=custom_config)
        content_text.append((box, content))
    return content_text

In [45]:
def link_hocr_ids_to_content(content_text, headers, footers, threshold):
    """Attach nearby header and footer words to each content box.

    Args:
        content_text: Iterable of ``(box, text)`` pairs; ``box`` has four
            values, of which the first two are used as the x/y anchor.
        headers: Dict mapping an hOCR id to ``{'text': ..., 'position': ...}``,
            where ``position`` has four values starting with x and y.
        footers: Same structure as ``headers``.
        threshold: A word is linked when both its x and its y offset from the
            content anchor are strictly smaller than this value.

    Returns:
        A list of ``(box, text, linked_ids)`` tuples in input order.
        ``linked_ids`` holds ``(hocr_id, word_text)`` pairs, header matches
        first and footer matches after them.
    """
    def _close_to(anchor_x, anchor_y, regions):
        # Gather (id, text) for every region whose x/y lies within the threshold.
        matches = []
        for region_id, info in regions.items():
            pos_x, pos_y, _, _ = info['position']
            if abs(anchor_x - pos_x) < threshold and abs(anchor_y - pos_y) < threshold:
                matches.append((region_id, info['text']))
        return matches

    results = []
    for box, text in content_text:
        box_x, box_y, _, _ = box
        nearby = _close_to(box_x, box_y, headers) + _close_to(box_x, box_y, footers)
        results.append((box, text, nearby))
    return results

In [46]:
# image path
image_path = 'C:/Users/ukary/OneDrive/Masaüstü/page_171.png'

In [None]:
# Detect headers and footers
headers, footers = detect_headers_footers(image_path)

In [None]:
# Perform OCR and generate hOCR output
hocr_output = perform_ocr_with_hocr(image_path)

In [None]:
# Extract header and footer positions from hOCR output
headers, footers = extract_header_footer_positions(hocr_output)

In [49]:
# threshold for linking
threshold = 20

In [None]:
# Extract content boxes
content_boxes = (238, 22, 194,  320) #(x1, y1, x2, y2)

In [None]:
# Extract content text
content_text = extract_content_text(image_path, content_boxes)

In [None]:
# Link content with headers and footers based on IDs
linked_content = link_hocr_ids_to_content(content_text, headers, footers, threshold)

In [None]:
# Print the linked content
for content_box, content, linked_ids in linked_content:
    print("Content:", content)
    for hocr_id, linked_text in linked_ids:
        print("Linked ID:", hocr_id, "Linked Text:", linked_text)