In [None]:
import cv2
import pytesseract

def detect_headers_footers(image_path):
    """Split Tesseract character boxes into header and footer candidates.

    The image is loaded in grayscale and passed to
    ``pytesseract.image_to_boxes``. Each returned line has the box-file layout
    ``<char> <left> <bottom> <right> <top> <page>``, with coordinates measured
    from the BOTTOM-left corner of the image (y grows upwards).

    Args:
        image_path: Path of the page image to analyse.

    Returns:
        A ``(headers, footers)`` pair of lists. Every entry is a
        ``(left, bottom, right, top)`` tuple of ints in Tesseract's
        bottom-left-origin coordinates. Boxes whose bottom edge lies in the
        top third of the page are headers; boxes whose bottom edge lies in the
        bottom third are footers; the rest are dropped.

    Raises:
        OSError: If OpenCV cannot load ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise OSError("Could not read image: {!r}".format(image_path))
    page_height = image.shape[0]

    # --psm 6 tells Tesseract to assume a single uniform block of text.
    custom_config = r'--oem 3 --psm 6'
    raw_boxes = pytesseract.image_to_boxes(image, config=custom_config)

    headers = []
    footers = []
    for line in raw_boxes.splitlines():
        fields = line.split()
        if len(fields) < 5:
            # Not a complete "<char> <left> <bottom> <right> <top>" record.
            continue
        box = tuple(int(value) for value in fields[1:5])
        bottom = box[1]

        # y is measured from the bottom of the page, so LARGE values are near
        # the top edge (headers) and SMALL values are near the bottom (footers).
        if bottom > 2 * page_height / 3:
            headers.append(box)
        elif bottom < page_height / 3:
            footers.append(box)

    return headers, footers

# Raw string literal: in a normal string the "\U", "\u" and "\340" sequences in
# this Windows path are parsed as escape sequences ("\U" is a SyntaxError).
image_path = r'C:\Users\ukary\OneDrive\Masaüstü\header_footer_data\340.023.TAN.A.2023_1678965427_page_60.jpg'
headers, footers = detect_headers_footers(image_path)
print("Detected headers:", headers)
print("Detected footers:", footers)


In [None]:
def perform_ocr_with_hocr(image_path):
    """Run Tesseract on ``image_path`` and hand back its hOCR output.

    The work is delegated to ``pytesseract.image_to_pdf_or_hocr`` with the
    ``hocr`` extension and the ``--oem 1`` option; whatever that call returns
    is passed through unchanged.
    """
    ocr_options = {'extension': 'hocr', 'config': '--oem 1'}
    return pytesseract.image_to_pdf_or_hocr(image_path, **ocr_options)



In [None]:
from bs4 import BeautifulSoup

def _hocr_bbox(title):
    """Return the four ints of the ``bbox`` property in an hOCR title, or None."""
    for prop in title.split(';'):
        parts = prop.split()
        if len(parts) == 5 and parts[0] == 'bbox':
            try:
                return tuple(int(part) for part in parts[1:])
            except ValueError:
                return None
    return None


def extract_header_footer_positions(hocr_output, page_height=None):
    """Classify hOCR words as header or footer by their vertical position.

    Every element carrying the ``ocrx_word`` class is inspected (Tesseract
    emits these as ``<span>`` elements). A word's ``bbox x0 y0 x1 y1`` is read
    from its ``title`` attribute; hOCR coordinates have their origin at the
    top-left, so a small ``y0`` means "near the top of the page".

    Args:
        hocr_output: hOCR markup (``str`` or ``bytes``) to parse.
        page_height: Page height in pixels. When omitted it is taken from the
            ``bbox`` of the first ``ocr_page`` element in the markup.

    Returns:
        A ``(headers, footers)`` pair of dicts keyed by the word's ``id``.
        Each value is ``{'text': <word text>, 'position': (x0, y0, x1, y1)}``.
        Words whose top edge is in the top third of the page are headers,
        words whose top edge is in the bottom third are footers; other words
        and words without a parsable bbox are skipped.

    Raises:
        ValueError: If there are words to classify but no page height was
            given and none can be read from the markup.
    """
    soup = BeautifulSoup(hocr_output, 'html.parser')
    headers = {}
    footers = {}

    words = soup.find_all(class_='ocrx_word')
    if not words:
        return headers, footers

    if page_height is None:
        page = soup.find(class_='ocr_page')
        page_bbox = _hocr_bbox(page.get('title', '')) if page is not None else None
        if page_bbox is None:
            raise ValueError(
                "page_height was not given and the hOCR has no ocr_page bbox"
            )
        # The page bbox is "x0 y0 x1 y1"; y1 is the bottom edge of the page.
        page_height = page_bbox[3]

    for word in words:
        bbox = _hocr_bbox(word.get('title', ''))
        if bbox is None:
            continue
        top = bbox[1]
        entry = {'text': word.get_text(), 'position': bbox}

        if top < page_height / 3:
            headers[word['id']] = entry
        elif top > 2 * page_height / 3:
            footers[word['id']] = entry

    return headers, footers


In [None]:
def link_hocr_ids_to_content(content_text, headers, footers, threshold=20):
    """Attach nearby header/footer words to each content item.

    Args:
        content_text: Iterable of ``(content_box, content)`` pairs, where
            ``content_box`` is a 4-item sequence whose first two items are the
            x and y coordinates compared against the words.
        headers: Mapping of hOCR id to ``{'text': ..., 'position': ...}``;
            the first two items of ``position`` are its x and y coordinates.
        footers: Same structure as ``headers``.
        threshold: A word is linked when BOTH its x and y distances to the
            content box are strictly below this value. Defaults to 20.

    Returns:
        A list of ``(content_box, content, linked_ids)`` tuples in input
        order. ``linked_ids`` lists ``(hocr_id, text)`` pairs, header matches
        first, then footer matches.
    """
    linked_content = []

    for content_box, content in content_text:
        content_x, content_y, _, _ = content_box

        # Headers and footers are matched by the same rule, so scan them with
        # one comprehension (headers first to keep the output order stable).
        linked_ids = [
            (hocr_id, info['text'])
            for region in (headers, footers)
            for hocr_id, info in region.items()
            if abs(content_x - info['position'][0]) < threshold
            and abs(content_y - info['position'][1]) < threshold
        ]

        linked_content.append((content_box, content, linked_ids))

    return linked_content


In [None]:
# Run the hOCR pipeline on the page image, then attach nearby header/footer
# words to each content item.
hocr_output = perform_ocr_with_hocr(image_path)
headers, footers = extract_header_footer_positions(hocr_output)
# NOTE(review): `content_text` is not defined in the cells shown here; it is
# presumably a list of ((x, y, w, h), text) pairs built elsewhere — confirm.
# The distance threshold is passed explicitly so the call supplies every
# argument link_hocr_ids_to_content accepts.
linked_content = link_hocr_ids_to_content(content_text, headers, footers, threshold=20)

for content_box, content, linked_ids in linked_content:
    print("Content:", content)
    for hocr_id, linked_text in linked_ids:
        print("Linked ID:", hocr_id, "Linked Text:", linked_text)