In [None]:
import shutil
import os

# Source tree on the mounted Drive and the local working copy that the
# later cells modify in place.
src = '/content/drive/MyDrive/v4'
dst = '/content/folder'

# Verify the source BEFORE touching the destination: otherwise a missing
# source (e.g. Drive not mounted) would wipe the existing working copy and
# only then fail inside copytree.
if not os.path.isdir(src):
    raise FileNotFoundError(f"Source folder not found or not a directory: {src}")

# Start from a clean destination; copytree refuses to copy into an existing
# directory by default.
if os.path.exists(dst):
    shutil.rmtree(dst)

shutil.copytree(src, dst)

'/content/folder'

In [None]:
#@title Triple méthode : note en rouge + texte en bleu + longue et fine ligne noire horizontale connexe

import cv2
import numpy as np
import math
import os
import collections
import traceback

print(f"OpenCV version: {cv2.__version__}")
print(f"NumPy version: {np.__version__}")

# --- 1. Configuration ---
# Root folder walked by the main loop; only its subfolders are processed.
BASE_FOLDER = "/content/folder"
SAVE_DEBUG_IMAGES = False # If True, the detection helpers write masks / annotated overlays under /content

# --- Header Position/Crop Constraints ---
# All percentages are fractions of the image height, measured from the top.
# Max height % allowed for the *final crop point* (safety net)
GLOBAL_MAX_CROP_HEIGHT_PERCENT = 0.25 # Reject crop if final cut point > 25% height
# Expected max height % for Red/Line detections (stricter check on results)
EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT = 0.15 # Red/Line results lower on the page than 15% are treated as invalid
# Expected min height % for Blue detection.
# NOTE(review): find_highest_blue_object no longer applies this limit (its
# position check was removed); the value is only echoed in the startup banner.
EXPECTED_MIN_BLUE_HEIGHT_PERCENT = 0.05 # Currently not enforced by the blue detection

# --- Header Saving Config ---
SAVE_REMOVED_HEADERS = True # Master switch to save the "offcuts"
HEADER_SAVE_DIR = "/content" # Directory to save the removed headers
# Save header if Line method used AND consistency fails OR if final crop point > suspicious %
SAVE_HEADER_ON_LINE_CONSISTENCY_FAIL = True
SAVE_HEADER_ON_SUSPICIOUS_HEIGHT = True
SUSPICIOUS_CROP_HEIGHT_PERCENT = 0.18 # If final cut point > 18% height, save header

# --- Line Method Consistency Check ---
# Max allowed difference between a Line-method y and the first Line-method y
# recorded for the same group in the same subfolder.
CONSISTENCY_Y_TOLERANCE = 30 # Pixels, applies only if Line method is used

# --- Define Target Filenames ---
# Only files with these exact names are processed; the group decides which
# earlier result a Line detection is compared against.
GROUP1_FILES = {"page_01.png", "page_05.png", "page_09.png", "page_13.png", "page_17.png"}
GROUP2_FILES = {"page_02.png", "page_06.png", "page_10.png", "page_14.png", "page_18.png"}
TARGET_FILES = GROUP1_FILES.union(GROUP2_FILES)

# --- Method 1: Red Detection Parameters ---
# Two HSV ranges are used and OR-ed together, one at each end of the hue axis.
RED_LOWER1 = np.array([0, 100, 100])
RED_UPPER1 = np.array([10, 255, 255])
RED_LOWER2 = np.array([165, 100, 100])
RED_UPPER2 = np.array([180, 255, 255])
RED_MIN_CONTOUR_AREA = 100 # Contours with an area at or below this are ignored
CROP_PADDING_RED = 5 # Pixels added *below* the bottom of the detected red object

# --- Method 2: Blue Detection Parameters ---
BLUE_LOWER = np.array([95, 80, 50])
BLUE_UPPER = np.array([130, 255, 255])
BLUE_MIN_CONTOUR_AREA = 150 # Contours with an area at or below this are ignored
CROP_PADDING_BLUE = 10 # Pixels to keep *above* the detected blue object top

# --- Method 3: Line Detection Parameters ---
CANNY_LOW_THRESHOLD = 50
CANNY_HIGH_THRESHOLD = 150
HOUGH_RHO = 1
HOUGH_THETA = np.pi / 180
HOUGH_THRESHOLD = 50
# Floor for the Hough minimum line length; the effective value is
# max(40% of the image width, this base).
HOUGH_MIN_LINE_LENGTH_BASE = 1200
HOUGH_MAX_LINE_GAP = 15
LINE_HEADER_AREA_MIN_PERCENT = 0.002 # 0.2% Search zone start
LINE_HEADER_AREA_MAX_PERCENT = 0.18  # 18% Search zone end (result is additionally checked against EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT)
HORIZONTAL_ANGLE_TOLERANCE = 3 # Degrees of deviation from horizontal that are still accepted
POST_FILTER_MIN_WIDTH_PERCENT = 0.50 # A detected line must span at least this fraction of the image width
CROP_PADDING_LINE = 5 # Pixels added *below* the detected line

# --- Helper Function: Find Lowest Red Object (with Position Check) ---
def find_lowest_red_object(image, height, width, image_path_for_debug=""):
    """Find the lowest significant red object inside the expected header zone.

    The image is thresholded in HSV with the two red ranges
    (RED_LOWER1/RED_UPPER1 and RED_LOWER2/RED_UPPER2, OR-ed together). External
    contours whose area exceeds RED_MIN_CONTOUR_AREA are candidates. Among the
    candidates whose bottom edge lies at or above
    ``height * EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT`` the lowest one is chosen.

    Candidates lower on the page are ignored rather than vetoing the whole
    detection, so red marks in the page body no longer hide a valid red object
    in the header.

    Args:
        image: BGR image (as returned by cv2.imread).
        height: Image height in pixels.
        width: Image width in pixels (not used; kept so all detection helpers
            share the same signature).
        image_path_for_debug: Path used only to name the debug images.

    Returns:
        The bottom y-coordinate (y + h of the bounding box) of the chosen
        object, or None if no candidate lies inside the header zone or an
        error occurred.
    """
    try:
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        red_mask = cv2.bitwise_or(
            cv2.inRange(hsv, RED_LOWER1, RED_UPPER1),
            cv2.inRange(hsv, RED_LOWER2, RED_UPPER2),
        )

        if SAVE_DEBUG_IMAGES:
            debug_filename = os.path.join("/content", f"debug_mask_red_{os.path.basename(image_path_for_debug)}")
            cv2.imwrite(debug_filename, red_mask)

        contours, _ = cv2.findContours(red_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Compute each bounding box once: (x, y, w, h) of every significant contour.
        boxes = [
            cv2.boundingRect(cnt)
            for cnt in contours
            if cv2.contourArea(cnt) > RED_MIN_CONTOUR_AREA
        ]
        if not boxes:
            return None

        # --- POSITION CHECK ---
        # Filter BEFORE picking the lowest box, so that red elsewhere on the
        # page cannot mask a valid object in the header zone.
        max_allowed_y = height * EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT
        header_boxes = [box for box in boxes if box[1] + box[3] <= max_allowed_y]
        if not header_boxes:
            detected_y = max(box[1] + box[3] for box in boxes)
            print(f"  Red Detection: Found object ending at y={detected_y}, but REJECTED (>{max_allowed_y:.0f}px / {EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT:.0%})")
            return None

        x, y, w, h = max(header_boxes, key=lambda box: box[1] + box[3])
        lowest_y_max = y + h
        print(f"  Red Detection: Found lowest object ending at y={lowest_y_max} (within {EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT:.0%} limit)")

        if SAVE_DEBUG_IMAGES:
            img_copy = image.copy()
            cv2.rectangle(img_copy, (x, y), (x + w, y + h), (0, 255, 255), 2) # Yellow box
            cv2.putText(img_copy, f"Lowest Red y={lowest_y_max}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)
            debug_filename = os.path.join("/content", f"debug_found_red_{os.path.basename(image_path_for_debug)}")
            cv2.imwrite(debug_filename, img_copy)

        return lowest_y_max
    except Exception as e:
        # Best-effort helper: a failure here must not stop the other methods.
        print(f"  ERROR during Red Detection: {e}")
        traceback.print_exc()
        return None

# --- Helper Function: Find Highest Blue Object ---
def find_highest_blue_object(image, height, width, image_path_for_debug=""):
    """
    Locate the topmost significant blue region and report where it starts.

    The image is thresholded in HSV with BLUE_LOWER/BLUE_UPPER; external
    contours with an area above BLUE_MIN_CONTOUR_AREA are candidates and the
    one whose bounding box starts highest on the page wins. No minimum-height
    position check is applied: any significant blue region is accepted.

    Returns the top y-coordinate of that bounding box, or None when nothing
    significant is found or an error occurs.
    """
    try:
        blue_mask = cv2.inRange(cv2.cvtColor(image, cv2.COLOR_BGR2HSV), BLUE_LOWER, BLUE_UPPER)

        if SAVE_DEBUG_IMAGES:
            mask_path = os.path.join("/content", f"debug_mask_blue_{os.path.basename(image_path_for_debug)}")
            cv2.imwrite(mask_path, blue_mask)

        contours, _ = cv2.findContours(blue_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Keep only regions large enough to matter; bail out early otherwise.
        large_enough = [c for c in contours if cv2.contourArea(c) > BLUE_MIN_CONTOUR_AREA]
        if not large_enough:
            return None

        # Smallest top edge == highest on the page.
        topmost = min(large_enough, key=lambda c: cv2.boundingRect(c)[1])
        left, top, box_w, box_h = cv2.boundingRect(topmost)
        print(f"  Blue Detection: Found highest object starting at y={top}")

        if SAVE_DEBUG_IMAGES:
            annotated = image.copy()
            cv2.rectangle(annotated, (left, top), (left + box_w, top + box_h), (255, 255, 0), 2) # Cyan box
            cv2.putText(annotated, f"Highest Blue y={top}", (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)
            found_path = os.path.join("/content", f"debug_found_blue_{os.path.basename(image_path_for_debug)}")
            cv2.imwrite(found_path, annotated)

        # Basic sanity check: the result must not lie beyond the image height.
        if top > height:
            return None
        return top

    except Exception as e:
        print(f"  ERROR during Blue Detection: {e}")
        traceback.print_exc()
        return None

# --- Helper Function: Find Horizontal Line (with Position Check) ---
def find_horizontal_line(image, height, width, image_path_for_debug=""):
    """Detect the lowest long horizontal line inside the header zone.

    Edges are computed with Canny on a band around the search zone
    [LINE_HEADER_AREA_MIN_PERCENT, LINE_HEADER_AREA_MAX_PERCENT] of the image
    height, then cv2.HoughLinesP is run on the search zone itself. A segment
    is kept when it deviates from horizontal by at most
    HORIZONTAL_ANGLE_TOLERANCE degrees and spans at least
    POST_FILTER_MIN_WIDTH_PERCENT of the image width. Among the kept segments
    that also lie at or above ``height * EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT``
    the lowest one is returned.

    Segments below that limit are ignored instead of vetoing the detection,
    and the angle test does not depend on the order of the segment endpoints.

    Args:
        image: BGR image (as returned by cv2.imread).
        height: Image height in pixels.
        width: Image width in pixels.
        image_path_for_debug: Path used only to name the debug image.

    Returns:
        The y-coordinate (lower endpoint, as int) of the selected line, or
        None if no suitable line was found or an error occurred.
    """
    try:
        dynamic_min_line_length = max(int(width * 0.4), HOUGH_MIN_LINE_LENGTH_BASE)
        min_search_y = int(height * LINE_HEADER_AREA_MIN_PERCENT)
        max_search_y = int(height * LINE_HEADER_AREA_MAX_PERCENT)
        max_search_y = max(max_search_y, min_search_y + 5)
        if min_search_y >= max_search_y or max_search_y > height or min_search_y < 0:
            return None

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Run Canny on a band slightly larger than the search zone. The zone
        # was validated above (min < max <= height), so this band is never
        # empty.
        canny_roi_y_start = max(0, min_search_y - 20)
        canny_roi_y_end = min(height, max_search_y + 20)
        edges = np.zeros_like(gray)
        edges[canny_roi_y_start:canny_roi_y_end, :] = cv2.Canny(
            gray[canny_roi_y_start:canny_roi_y_end, :],
            CANNY_LOW_THRESHOLD,
            CANNY_HIGH_THRESHOLD,
            apertureSize=3,
        )

        hough_roi_edges = edges[min_search_y:max_search_y, :]
        if hough_roi_edges.shape[0] <= 0:
            return None

        lines = cv2.HoughLinesP(
            hough_roi_edges,
            HOUGH_RHO,
            HOUGH_THETA,
            HOUGH_THRESHOLD,
            minLineLength=dynamic_min_line_length,
            maxLineGap=HOUGH_MAX_LINE_GAP,
        )

        valid_horizontal_lines = []
        min_required_width = int(width * POST_FILTER_MIN_WIDTH_PERCENT)
        for line in (lines if lines is not None else ()):
            x1, y1_rel, x2, y2_rel = (int(v) for v in line[0])
            if x2 == x1:
                continue  # Vertical (or degenerate) segment, never horizontal
            # Hough coordinates are relative to the ROI; shift back to image space.
            y1 = y1_rel + min_search_y
            y2 = y2_rel + min_search_y
            angle = abs(math.degrees(math.atan2(y2 - y1, x2 - x1)))
            # A segment reported right-to-left has an angle near 180 degrees;
            # fold it so only the deviation from horizontal matters.
            deviation = min(angle, 180.0 - angle)
            if deviation <= HORIZONTAL_ANGLE_TOLERANCE and abs(x2 - x1) >= min_required_width:
                valid_horizontal_lines.append((x1, y1, x2, y2))

        if not valid_horizontal_lines:
            print(f"  Line Detection: No suitable line found in zone [{min_search_y}-{max_search_y}].")
            return None

        # --- POSITION CHECK ---
        # Filter BEFORE picking the lowest line, so that a line between the
        # expected limit and the end of the search zone cannot mask a valid
        # line above it.
        max_allowed_y = height * EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT
        lines_in_header = [ln for ln in valid_horizontal_lines if max(ln[1], ln[3]) <= max_allowed_y]
        if not lines_in_header:
            line_y_found = max(max(ln[1], ln[3]) for ln in valid_horizontal_lines)
            print(f"  Line Detection: Found line at y={line_y_found}, but REJECTED (>{max_allowed_y:.0f}px / {EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT:.0%})")
            return None

        lowest_line_in_zone = max(lines_in_header, key=lambda ln: max(ln[1], ln[3]))
        detected_y = max(lowest_line_in_zone[1], lowest_line_in_zone[3])
        print(f"  Line Detection: Found line at y={detected_y} (within {EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT:.0%} limit)")

        if SAVE_DEBUG_IMAGES:
            img_copy = image.copy()
            l_x1, _, l_x2, _ = lowest_line_in_zone
            cv2.line(img_copy, (l_x1, detected_y), (l_x2, detected_y), (0, 255, 0), 3)
            # Red guides mark the top and bottom of the search zone.
            cv2.line(img_copy, (0, min_search_y), (width, min_search_y), (0, 0, 255), 1)
            cv2.line(img_copy, (0, max_search_y), (width, max_search_y), (0, 0, 255), 1)
            debug_filename = os.path.join("/content", f"debug_found_line_{os.path.basename(image_path_for_debug)}")
            cv2.imwrite(debug_filename, img_copy)

        return detected_y
    except Exception as e:
        # Best-effort helper: a failure here is reported as "no line found".
        print(f"  ERROR during Line Detection: {e}")
        traceback.print_exc()
        return None


# --- Main Processing Function (with Global Crop Cap and refined saving) ---
def process_image_and_remove_header(image_path, trigger_header_save_consistency=False):
    """Detect the header of one image, crop it off and overwrite the file.

    Detection is tried in the order Red -> Blue -> Line; the first helper that
    returns a y-coordinate decides the cut row:
      * Red:  cut CROP_PADDING_RED pixels below the red object's bottom edge.
      * Blue: cut CROP_PADDING_BLUE pixels above the blue object's top edge.
      * Line: cut CROP_PADDING_LINE pixels below the detected line.
    The cut is refused if it lies below GLOBAL_MAX_CROP_HEIGHT_PERCENT of the
    image height or would leave fewer than 10 rows/columns. The removed strip
    is saved to HEADER_SAVE_DIR when SAVE_REMOVED_HEADERS is on and either the
    caller signals a Line consistency failure or the cut is suspiciously low.

    cv2.imwrite reports failure by returning False rather than raising, so its
    return value is checked for both the header save and the overwrite.

    Args:
        image_path: Path of the image; it is overwritten with the kept content.
        trigger_header_save_consistency: When True and the Line method is used,
            the removed header is saved (if enabled by the configuration).

    Returns:
        Tuple ``(detected_crop_y_unpadded, status_ok, method_used)``.
        ``status_ok`` is True only if the cropped image was actually written.
        ``(None, False, None)`` is returned when the image cannot be loaded or
        no method finds a crop point.
    """
    print(f"--- Processing image: {os.path.basename(image_path)}")
    method_used = None
    crop_y_unpadded = None
    crop_y_final = None # The actual pixel row where the cut happens
    status_ok = False
    save_header_reason = "" # Track why header is saved

    try:
        image = cv2.imread(image_path)
        if image is None:
            print(f"ERROR: Could not load image at {image_path}")
            return None, False, None

        height, width = image.shape[:2]
        print(f"  Image dimensions: {width}x{height}")

        # --- Detection cascade: first method that yields a y wins ---
        detectors = (
            ("Red", find_lowest_red_object, lambda y: min(y + CROP_PADDING_RED, height)),
            ("Blue", find_highest_blue_object, lambda y: max(0, y - CROP_PADDING_BLUE)),
            ("Line", find_horizontal_line, lambda y: min(y + CROP_PADDING_LINE, height)),
        )
        for method_name, detect, to_cut_row in detectors:
            found_y = detect(image, height, width, image_path)
            if found_y is not None:
                method_used = method_name
                crop_y_unpadded = found_y
                crop_y_final = to_cut_row(found_y)
                break

        # --- Process Result ---
        if method_used is None or crop_y_final is None:
            print(f"  FAILURE: No valid crop point found using any method for {os.path.basename(image_path)}.")
            return None, False, None

        print(f"  SUCCESS: Method '{method_used}' determined crop point y={crop_y_final} (unpadded y={crop_y_unpadded})")

        # --- GLOBAL CROP CAP ---
        max_allowed_crop_y = height * GLOBAL_MAX_CROP_HEIGHT_PERCENT
        if crop_y_final > max_allowed_crop_y:
            print(f"  ERROR: Final crop point y={crop_y_final} EXCEEDS global limit ({max_allowed_crop_y:.0f}px / {GLOBAL_MAX_CROP_HEIGHT_PERCENT:.0%}). Aborting crop.")
            return crop_y_unpadded, False, method_used # Return detected y, but status is fail

        # --- Check if Removed Header Should be Saved ---
        save_this_header_flag = False
        if SAVE_REMOVED_HEADERS:
            # Reason 1: Line method consistency failure (passed via flag)
            if method_used == "Line" and SAVE_HEADER_ON_LINE_CONSISTENCY_FAIL and trigger_header_save_consistency:
                save_this_header_flag = True
                save_header_reason = "Line Consistency"
            # Reason 2: Suspiciously large header height (based on final cut point)
            if SAVE_HEADER_ON_SUSPICIOUS_HEIGHT and (crop_y_final / height) > SUSPICIOUS_CROP_HEIGHT_PERCENT:
                if not save_this_header_flag:
                    save_this_header_flag = True
                    save_header_reason = f"Suspicious Height (>{SUSPICIOUS_CROP_HEIGHT_PERCENT:.0%})"
                else:
                    save_header_reason += " & Suspicious Height"

        # --- Save Removed Header ("Offcut") if Flagged ---
        if save_this_header_flag:
            try:
                header_image = image[0:crop_y_final, 0:width] # The part being removed
                if header_image.shape[0] > 0 and header_image.shape[1] > 0:
                    subfolder_name = os.path.basename(os.path.dirname(image_path))
                    original_filename = os.path.basename(image_path)
                    # Keep the subfolder name filesystem-safe inside the file name.
                    safe_subfolder_name = "".join([c if c.isalnum() or c in ('_','-') else '_' for c in subfolder_name])
                    header_save_filename = f"HEADER_{safe_subfolder_name}_{original_filename}"
                    header_save_path = os.path.join(HEADER_SAVE_DIR, header_save_filename)
                    os.makedirs(HEADER_SAVE_DIR, exist_ok=True)
                    # imwrite signals failure through its return value.
                    if cv2.imwrite(header_save_path, header_image):
                        print(f"  SAVED REMOVED HEADER ({save_header_reason}) to: {header_save_path}")
                    else:
                        print(f"  ERROR: Could not save removed header image for {image_path}: cv2.imwrite returned False")
                else:
                    print(f"  WARNING: Header portion (offcut) to save was empty for {image_path}.")
            except Exception as e:
                # Saving the offcut is best effort; the crop itself still proceeds.
                print(f"  ERROR: Could not save removed header image for {image_path}: {e}")
                traceback.print_exc()

        # --- Perform the actual crop to get the *kept* part ---
        image_kept_content = image[crop_y_final:height, 0:width]

        if image_kept_content.shape[0] < 10 or image_kept_content.shape[1] < 10:
            print(f"  ERROR: Kept content resulted in a very small image ({image_kept_content.shape[1]}x{image_kept_content.shape[0]}). Skipping overwrite for {image_path}.")
            return crop_y_unpadded, False, method_used

        # --- Save Kept Content (Overwrite Original) ---
        try:
            status_ok = bool(cv2.imwrite(image_path, image_kept_content))
        except Exception as e:
            print(f"  ERROR saving kept content (overwrite) for {image_path}: {e}")
            traceback.print_exc()
            status_ok = False
        else:
            if not status_ok:
                # No exception, but nothing was written: report it as a failure.
                print(f"  ERROR saving kept content (overwrite) for {image_path}: cv2.imwrite returned False")

        return crop_y_unpadded, status_ok, method_used

    except Exception as e:
        print(f"!! UNEXPECTED ERROR processing {image_path}: {e}")
        traceback.print_exc()
        return None, False, method_used


# --- 3. Main Execution Logic ---

# Startup banner: echo the effective configuration.
print("\nStarting header removal process...")
print(f"Base Folder: {BASE_FOLDER}")
print("Detection Order: Red -> Blue -> Line")
print(f"Global Max Crop Height: {GLOBAL_MAX_CROP_HEIGHT_PERCENT:.0%}")
print(f"Expected Max Red/Line Height: {EXPECTED_MAX_RED_LINE_HEIGHT_PERCENT:.0%}")
print(f"Expected Min Blue Height: {EXPECTED_MIN_BLUE_HEIGHT_PERCENT:.0%}")
print(f"Save Removed Headers: {SAVE_REMOVED_HEADERS}")
if SAVE_REMOVED_HEADERS:
    print(f"  Save Trigger: Line Consistency Fail = {SAVE_HEADER_ON_LINE_CONSISTENCY_FAIL}")
    print(f"  Save Trigger: Suspicious Height (Crop > {SUSPICIOUS_CROP_HEIGHT_PERCENT:.0%}) = {SAVE_HEADER_ON_SUSPICIOUS_HEIGHT}")
    print(f"  Headers Dir: {HEADER_SAVE_DIR}")
print(f"Line Method Consistency Tolerance: {CONSISTENCY_Y_TOLERANCE}px")
print("-" * 50)

if not os.path.isdir(BASE_FOLDER):
    print(f"ERROR: Base folder '{BASE_FOLDER}' not found or is not a directory.")
else:
    processed_files_count = 0
    skipped_no_crop_point = 0
    skipped_crop_too_low = 0
    error_files_count = 0
    consistency_warnings = 0
    headers_saved_count = 0 # Will count at the end
    method_counts = collections.defaultdict(int)
    # {subfolder_path: {group_number: first y found by the Line method}}
    subfolder_consistency_y_line = collections.defaultdict(dict)

    for root, dirs, files in os.walk(BASE_FOLDER):
        # Files directly in the base folder are not processed; only prune the
        # list of subfolders to descend into.
        if root == BASE_FOLDER:
            dirs[:] = [d for d in dirs if d != '.' and not d.startswith('__')]
            continue

        print(f"\n>>> Entering Subfolder: {os.path.basename(root)}")
        files_in_subfolder = 0
        files.sort()

        for filename in files:
            if filename not in TARGET_FILES:
                continue

            files_in_subfolder += 1
            image_path = os.path.join(root, filename)
            current_group = 1 if filename in GROUP1_FILES else 2

            # --- Run Unified Header Removal ---
            # Line consistency can only be evaluated after detection, so the
            # save trigger is always passed as False here.
            detected_y, status_ok, method_used = process_image_and_remove_header(image_path, trigger_header_save_consistency=False)

            # --- Post-processing Checks & Logging ---
            if method_used:
                method_counts[method_used] += 1

                # --- Line Consistency Check (against the first Line y of the group) ---
                if method_used == "Line" and detected_y is not None:
                    first_line_y_by_group = subfolder_consistency_y_line[root]
                    if current_group not in first_line_y_by_group:
                        first_line_y_by_group[current_group] = detected_y
                    else:
                        diff = abs(detected_y - first_line_y_by_group[current_group])
                        if diff > CONSISTENCY_Y_TOLERANCE:
                            print(f"  !! WARNING (Line Consistency): {filename} Diff={diff} > Tol={CONSISTENCY_Y_TOLERANCE}")
                            consistency_warnings += 1

                            # The removed header cannot be saved at this point:
                            # the file on disk has already been processed, so the
                            # original pixels are not available any more. Only
                            # report when no header was saved during the run.
                            subfolder_name = os.path.basename(os.path.dirname(image_path))
                            safe_subfolder_name = "".join([c if c.isalnum() or c in ('_','-') else '_' for c in subfolder_name])
                            expected_header_filename = f"HEADER_{safe_subfolder_name}_{os.path.basename(image_path)}"
                            expected_header_path = os.path.join(HEADER_SAVE_DIR, expected_header_filename)

                            if SAVE_REMOVED_HEADERS and SAVE_HEADER_ON_LINE_CONSISTENCY_FAIL and not os.path.exists(expected_header_path):
                                print(f"  NOTE: Manual header save skipped. Relying on initial trigger (e.g., suspicious height) during process run.")

            # --- Update Counters ---
            if status_ok:
                processed_files_count += 1
            elif method_used is None: # No valid crop point found by any method
                skipped_no_crop_point += 1
            elif detected_y is not None: # Crop point found, but failed later (e.g., global cap, save error)
                # Re-read the image height to decide whether the global cap was the cause.
                reloaded_image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
                if reloaded_image is None:
                    # File can no longer be read: count it as an error instead of crashing.
                    error_files_count += 1
                else:
                    temp_image_height_for_check = reloaded_image.shape[0]
                    crop_y_final_recalc = 0
                    if method_used == "Red":
                        crop_y_final_recalc = min(detected_y + CROP_PADDING_RED, temp_image_height_for_check)
                    elif method_used == "Blue":
                        crop_y_final_recalc = max(0, detected_y - CROP_PADDING_BLUE)
                    elif method_used == "Line":
                        crop_y_final_recalc = min(detected_y + CROP_PADDING_LINE, temp_image_height_for_check)

                    if crop_y_final_recalc > temp_image_height_for_check * GLOBAL_MAX_CROP_HEIGHT_PERCENT:
                        skipped_crop_too_low += 1
                    else:
                        error_files_count += 1 # Other error (save failed, etc.)
            else: # Load error or other unexpected early failure
                error_files_count += 1

        if files_in_subfolder == 0:
            print("   No target image files found in this subfolder.")

    # Count saved headers at the end. This counts every matching file present
    # in HEADER_SAVE_DIR, including files left over from earlier runs.
    try:
        if os.path.isdir(HEADER_SAVE_DIR) and SAVE_REMOVED_HEADERS:
            saved_header_files = [f for f in os.listdir(HEADER_SAVE_DIR) if f.startswith("HEADER_") and f.endswith(".png")]
            headers_saved_count = len(saved_header_files)
        else:
            headers_saved_count = 0
    except OSError:
        headers_saved_count = -1 # Indicate error counting

    print("-" * 50)
    print("Processing Summary:")
    print(f"  Successfully processed (cropped & overwritten): {processed_files_count}")
    print(f"  Skipped (no valid crop point found):          {skipped_no_crop_point}")
    print(f"  Skipped (crop rejected, too low):           {skipped_crop_too_low}")
    print(f"  Errors (load/save issues):                  {error_files_count}")
    print(f"  Method Used Counts: {dict(method_counts)}")
    print(f"  Line Method Consistency Warnings Issued:      {consistency_warnings}")
    if SAVE_REMOVED_HEADERS:
        count_str = str(headers_saved_count) if headers_saved_count >= 0 else "Error counting"
        print(f"  Removed Headers Saved (in {HEADER_SAVE_DIR}):     {count_str}")
    print("-" * 50)

# --- End of Script ---

OpenCV version: 4.11.0
NumPy version: 2.0.2

Starting header removal process...
Base Folder: /content/folder
Detection Order: Red -> Blue -> Line
Global Max Crop Height: 25%
Expected Max Red/Line Height: 15%
Expected Min Blue Height: 5%
Save Removed Headers: True
  Save Trigger: Line Consistency Fail = True
  Save Trigger: Suspicious Height (Crop > 18%) = True
  Headers Dir: /content
Line Method Consistency Tolerance: 30px
--------------------------------------------------

>>> Entering Subfolder: subfolder
--- Processing image: page_09.png
  Image dimensions: 2481x2676
  Blue Detection: Found highest object starting at y=115
  SUCCESS: Method 'Blue' determined crop point y=105 (unpadded y=115)
--------------------------------------------------
Processing Summary:
  Successfully processed (cropped & overwritten): 1
  Skipped (no valid crop point found):          0
  Skipped (crop rejected, too low):           0
  Errors (load/save issues):                  0
  Method Used Counts: {'Blu

In [None]:
import cv2
import numpy as np
import os
import collections
import math

print(f"OpenCV version: {cv2.__version__}")
print(f"NumPy version: {np.__version__}")

# --- 1. Configuration ---
BASE_FOLDER = "/content/folder"  # Main folder containing subfolders with processed images
HEIGHT_TOLERANCE = 15  # Maximum allowed pixel difference in height within a group/subfolder
                       # Adjust this value based on expected consistency (e.g., 5, 10, 20)

# --- Define Target Filenames (Should match the files processed previously) ---
GROUP1_FILES = {
    "page_01.png", "page_05.png", "page_09.png", "page_13.png", "page_17.png"
}
GROUP2_FILES = {
    "page_02.png", "page_06.png", "page_10.png", "page_14.png", "page_18.png"
}
TARGET_FILES = GROUP1_FILES.union(GROUP2_FILES)

# --- 2. Main Execution Logic ---
# Phase 1 walks every subfolder of BASE_FOLDER and records the pixel height of
# each target image, bucketed by subfolder and by filename group.
# Phase 2 reports min/max/mean/std per bucket and warns when (max - min)
# exceeds HEIGHT_TOLERANCE.

print("\nStarting image height consistency check...")
print(f"Base Folder: {BASE_FOLDER}")
print(f"Target Files: {len(TARGET_FILES)} specific names")
print(f"Height Tolerance within groups: {HEIGHT_TOLERANCE} pixels")
print("-" * 60)

if not os.path.isdir(BASE_FOLDER):
    print(f"ERROR: Base folder '{BASE_FOLDER}' not found or is not a directory.")
else:
    # Dictionary to store heights: {subfolder_path: {group_num: [(filename, height), ...], ...}}
    subfolder_heights = collections.defaultdict(lambda: {1: [], 2: []})
    read_errors = 0
    subfolders_with_warnings = 0
    subfolders_processed = 0

    # --- Step 1: Collect Heights ---
    print("Phase 1: Collecting image heights...")
    for root, dirs, files in os.walk(BASE_FOLDER):
        # Prune in place so os.walk never descends into hidden directories
        # (e.g. '.ipynb_checkpoints') or dunder directories, at any depth.
        # Sorting makes the traversal order deterministic.
        dirs[:] = sorted(d for d in dirs if not d.startswith(('.', '__')))

        # Process only subfolders, not the base folder itself
        if root == BASE_FOLDER:
            continue

        subfolders_processed += 1
        print(f"  Scanning Subfolder: {os.path.basename(root)}")
        files.sort()  # Process in a consistent order

        for filename in files:
            if filename not in TARGET_FILES:
                continue

            image_path = os.path.join(root, filename)
            current_group = 1 if filename in GROUP1_FILES else 2

            try:
                # cv2.imread signals an unreadable file by returning None
                # rather than raising, hence the explicit check below.
                img = cv2.imread(image_path)
                if img is None:
                    print(f"    ERROR: Could not read image: {filename}")
                    read_errors += 1
                    continue  # Skip to next file

                height = img.shape[0]
                subfolder_heights[root][current_group].append((filename, height))

            except Exception as e:
                print(f"    ERROR: Exception reading {filename}: {e}")
                read_errors += 1

    print("Phase 1: Finished collecting heights.")
    print("-" * 60)

    # --- Step 2: Analyze and Report Heights ---
    print("Phase 2: Analyzing height consistency...")
    if not subfolder_heights:
        print("No target files found in any subfolders.")

    overall_warnings = 0
    for subfolder_path, groups_data in subfolder_heights.items():
        subfolder_name = os.path.basename(subfolder_path)
        print(f"\n--- Subfolder: {subfolder_name} ---")
        has_warning_in_subfolder = False

        for group_num in sorted(groups_data.keys()):  # Process group 1 then group 2
            height_data = groups_data[group_num]  # List of (filename, height) tuples

            print(f"  Group {group_num}:")
            if not height_data:
                print("    No target files found for this group.")
                continue  # Skip to next group

            if len(height_data) == 1:
                filename, height = height_data[0]
                print(f"    Only one file found: '{filename}' (Height: {height}) - No comparison possible.")
                continue  # Skip to next group

            heights = np.array([item[1] for item in height_data])

            # Calculate statistics
            min_h = np.min(heights)
            max_h = np.max(heights)
            avg_h = np.mean(heights)
            std_dev = np.std(heights)
            diff = max_h - min_h

            print(f"    Files Found: {len(heights)}")
            print(f"    Min Height: {min_h}")
            print(f"    Max Height: {max_h}")
            print(f"    Avg Height: {avg_h:.2f}")
            print(f"    Std Dev:    {std_dev:.2f}")
            print(f"    Difference (Max - Min): {diff}")

            # Check against tolerance
            if diff > HEIGHT_TOLERANCE:
                print("    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                print(f"    !!! WARNING: Height difference ({diff}) exceeds tolerance ({HEIGHT_TOLERANCE})")
                print("    !!! Details:")
                for fname, h in height_data:
                    # Flag the extreme files so the outlier is easy to spot.
                    marker = ""
                    if h == min_h or h == max_h:
                        marker = f" <-- {'MIN' if h == min_h else 'MAX'}"
                    print(f"        '{fname}': {h}{marker}")
                print("    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

                has_warning_in_subfolder = True
                overall_warnings += 1
            else:
                print(f"    Consistency: OK (Difference <= {HEIGHT_TOLERANCE})")

        if has_warning_in_subfolder:
            subfolders_with_warnings += 1

    print("-" * 60)
    print("Analysis Summary:")
    print(f"  Subfolders Scanned: {subfolders_processed}")
    print(f"  Subfolders with Height Warnings: {subfolders_with_warnings}")
    print(f"  Total Height Warning Instances (across all groups/subfolders): {overall_warnings}")
    if read_errors > 0:
        print(f"  Image Read Errors Encountered: {read_errors}")
    print("-" * 60)

# --- End of Script ---

OpenCV version: 4.11.0
NumPy version: 2.0.2

Starting image height consistency check...
Base Folder: /content/folder
Target Files: 10 specific names
Height Tolerance within groups: 15 pixels
------------------------------------------------------------
Phase 1: Collecting image heights...
  Scanning Subfolder: Le bonheur est-il affaire de hasard ou de nécessité ? (AGREG interne 2022 - note : 8)
  Scanning Subfolder: Le vrai et le réel (AGREG 2020 - note : 13,5)
  Scanning Subfolder: Le bonheur est-il affaire de hasard ou de nécessité ? (AGREG interne 2022 - note : 14)
  Scanning Subfolder: Langage et réalité (AGREG ext. 2018 - note : 19)
  Scanning Subfolder: Dire, est-ce autre chose que vouloir dire ? (CAPES 2021 - note : 12)
  Scanning Subfolder: Peut-on vivre en paix avec son inconscient ? (AGREG interne 2021 -note : 14)
  Scanning Subfolder: .ipynb_checkpoints
  Scanning Subfolder: Le bonheur est-il affaire de hasard ou de nécessité ? (AGREG interne 2022 - note : 10,5)
  Sc

In [None]:
!sudo apt update -q
!sudo apt install tesseract-ocr tesseract-ocr-fra -q
!pip install pytesseract Pillow -q

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists...
Building dependency tree...
Reading state information...
36 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelea

In [None]:
from PIL import Image, ImageOps # ImageOps might be useful later, Pillow handles basic modes
import os
import numpy as np
import cv2 # Import OpenCV
import pytesseract # <<< --- ADD THIS LINE ---
# from google.colab.patches import cv2_imshow # Import commented out

# --- Configuration ---
MAIN_FOLDER_PATH = '/content/folder' # Path to the main folder containing one subfolder per document
TARGET_FILENAMES = [
    "page_02.png",
    "page_06.png",
    "page_10.png",
    "page_14.png",
    "page_18.png"
]
# Keywords searched in priority order. A keyword matches when it equals a whole
# OCR token, compared case-insensitively; the truncated entries at the end
# catch tokens where OCR dropped a leading or trailing character.
TARGET_KEYWORDS_ORDERED = [
    'ANONYMAT', 'N231', 'A000', # Primary targets
    'NOMBRE', 'PAGES',           # Secondary targets
    'nonymat', 'nombre',               # Truncated / lowercase variants
    'nombr', 'anonyma'                # Truncated variants
]
CROP_FRACTION_FOR_SEARCH = 0.4 # Fraction of the image height (from the top) given to OCR
CONFIDENCE_THRESHOLD = 38 # Minimum Tesseract confidence for a token to be considered
MARGIN_PIXELS = 5 # Extra pixels removed below the matched keyword
# Tesseract configuration: PSM 6 assumes a single uniform block of text.
TESSERACT_CONFIG = '--psm 6 -l fra' # PSM 6 + Language French

# --- Important Dependencies ---
# NOTE: This script requires the Tesseract OCR engine to be installed on your system.
#       See: https://github.com/tesseract-ocr/tesseract
# NOTE: Ensure the necessary Tesseract language data ('fra' for French) is installed.
#       On Linux (Debian/Ubuntu): sudo apt-get install tesseract-ocr-fra
#       Check Tesseract documentation for other OS.

# Lists for final reporting
keywords_not_found_initially = [] # Files not cropped in the first pass and not rescued by the second
files_cropped_in_second_pass = [] # Files successfully cropped using shared coords
files_failed_second_pass = []     # Files that failed even in second pass (e.g., coords too low)
errors_list = []                  # General processing errors


def _append_once(items, value):
    """Append value to items unless it is already present."""
    if value not in items:
        items.append(value)


def _discard(items, value):
    """Remove value from items if present (no error when absent)."""
    if value in items:
        items.remove(value)


def _parse_confidence(raw_conf):
    """Convert a Tesseract confidence value to int; return -1 when it is not numeric."""
    try:
        return int(float(raw_conf))
    except (TypeError, ValueError):
        return -1


def _binarize_top(img_pil, search_height):
    """Return an inverted Otsu-binarized array of the top search_height rows.

    The strip is converted to RGB first so grayscale, palette and RGBA images
    are all accepted by the OpenCV grayscale conversion.
    """
    top_strip = img_pil.crop((0, 0, img_pil.width, search_height)).convert('RGB')
    gray = cv2.cvtColor(np.array(top_strip), cv2.COLOR_RGB2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    return binary


def _find_keyword(ocr_data):
    """Find the highest-priority keyword among the OCR tokens.

    Returns (keyword, confidence, box) where box has 'left', 'top', 'width'
    and 'height' keys, or (None, None, None) when nothing matches.
    """
    fields = ('text', 'conf', 'left', 'top', 'width', 'height')
    missing = [name for name in fields if name not in ocr_data]
    if missing:
        print(f"    Warning: Missing key in OCR data: {missing}. Skipping keyword search.")
        return None, None, None

    # zip() stops at the shortest column, so every tuple is fully populated.
    words = list(zip(*(ocr_data[name] for name in fields)))
    if not words:
        print("    OCR returned no recognized words.")
        return None, None, None

    for keyword in TARGET_KEYWORDS_ORDERED:
        # Tokens are upper-cased before comparison, so the keyword must be too;
        # otherwise lowercase entries in the list could never match.
        wanted = keyword.upper()
        for text, raw_conf, left, top, width, height in words:
            if not text or text.isspace():
                continue
            confidence = _parse_confidence(raw_conf)
            if confidence < CONFIDENCE_THRESHOLD:
                continue
            if text.strip().upper() == wanted:
                box = {'left': left, 'top': top, 'width': width, 'height': height}
                return keyword, confidence, box

    return None, None, None


def _save_cropped_below(img_pil, crop_top_y, image_path):
    """Overwrite image_path with the part of img_pil at and below crop_top_y."""
    img_pil.crop((0, crop_top_y, img_pil.width, img_pil.height)).save(image_path)


print(f"Starting processing in main folder: {MAIN_FOLDER_PATH}")
print(f"Target filenames to process: {TARGET_FILENAMES}")
print(f"Keywords to search in order: {TARGET_KEYWORDS_ORDERED}")
print(f"Using Tesseract config: {TESSERACT_CONFIG}")
print(f"Confidence threshold: {CONFIDENCE_THRESHOLD}")
print(f"Searching top {int(CROP_FRACTION_FOR_SEARCH*100)}% of images.")
print("-" * 30)

# Iterate through subfolders in the main folder
if not os.path.exists(MAIN_FOLDER_PATH):
    print(f"Error: Main folder not found at {MAIN_FOLDER_PATH}")
else:
    for subfolder_name in sorted(os.listdir(MAIN_FOLDER_PATH)):
        subfolder_path = os.path.join(MAIN_FOLDER_PATH, subfolder_name)

        if not os.path.isdir(subfolder_path):
            print(f"Skipping '{subfolder_name}' as it is not a directory.")
            continue
        if subfolder_name.startswith('.'):
            # e.g. '.ipynb_checkpoints' created by the notebook environment
            print(f"Skipping hidden directory '{subfolder_name}'.")
            continue

        print(f"\n>>> Entering subfolder: {subfolder_path}")

        # --- Subfolder Level Data ---
        subfolder_successful_crop_y = None # First crop Y derived from OCR in this folder
        subfolder_files_ocr_failed = []    # Files not cropped in the first pass

        # --- First Pass: Process each target file, try OCR ---
        print("--- First Pass: OCR and Immediate Crop ---")
        for filename in TARGET_FILENAMES:
            current_image_path = os.path.join(subfolder_path, filename)
            if not os.path.exists(current_image_path):
                continue

            print(f"\n  Processing file: {current_image_path}")
            processed_successfully = False
            file_missing = False

            try:
                with Image.open(current_image_path) as img_pil:
                    # Read all pixel data now: the file is overwritten below.
                    img_pil.load()
                    original_width, original_height = img_pil.size

                    # At least one row, never more than the image itself.
                    search_crop_height = min(
                        original_height,
                        max(1, int(original_height * CROP_FRACTION_FOR_SEARCH)),
                    )
                    binary_top_part = _binarize_top(img_pil, search_crop_height)

                    ocr_data = None
                    try:
                        ocr_data = pytesseract.image_to_data(
                            binary_top_part,
                            config=TESSERACT_CONFIG,
                            output_type=pytesseract.Output.DICT,
                        )
                    except pytesseract.TesseractNotFoundError:
                        print("\n\nERROR: Tesseract is not installed or not in your PATH.")
                        print("Please install Tesseract OCR: https://github.com/tesseract-ocr/tesseract")
                        print("Exiting script.")
                        import sys
                        sys.exit(1) # Nothing can be processed without Tesseract
                    except Exception as ocr_error: # Any other Tesseract failure
                        print(f"    ERROR during Tesseract execution for {current_image_path}: {ocr_error}")
                        errors_list.append(f"{current_image_path} (Tesseract execution error: {ocr_error})")

                    if ocr_data is not None:
                        found_keyword, confidence, word_coords = _find_keyword(ocr_data)

                        if found_keyword is None:
                            print("    Keyword not found with sufficient confidence (First Pass).")
                        else:
                            print(f"    Found '{found_keyword}' (Conf: {confidence}%) at Y={word_coords['top']} (First Pass)")
                            final_crop_top_y = word_coords['top'] + word_coords['height'] + MARGIN_PIXELS

                            if final_crop_top_y < original_height:
                                # Store the *first* successful coordinate for this subfolder
                                if subfolder_successful_crop_y is None:
                                    subfolder_successful_crop_y = final_crop_top_y
                                    print(f"    * Storing successful crop Y={subfolder_successful_crop_y} for this subfolder.")

                                _save_cropped_below(img_pil, final_crop_top_y, current_image_path)
                                print("    Successfully cropped & saved (First Pass).")
                                processed_successfully = True
                            else:
                                print(f"    Warning: Found keyword '{found_keyword}' too low to crop (Crop Y={final_crop_top_y} >= H={original_height}).")
                                errors_list.append(f"{current_image_path} (Keyword '{found_keyword}' found too low)")
                                # The coordinate may still be usable for taller images in this folder.
                                if subfolder_successful_crop_y is None:
                                    subfolder_successful_crop_y = final_crop_top_y
                                    print(f"    * Storing successful (but too low for this file) crop Y={subfolder_successful_crop_y} for this subfolder.")

            except cv2.error as cv_err:
                print(f"    ERROR during OpenCV processing for {current_image_path}: {cv_err}")
                errors_list.append(f"{current_image_path} (OpenCV Error: {cv_err})")
            except FileNotFoundError:
                print(f"    ERROR: File not found during processing (might have been deleted?): {current_image_path}")
                errors_list.append(f"{current_image_path} (File Not Found Error)")
                file_missing = True # Nothing to retry in the second pass
            except Exception as e:
                print(f"    UNEXPECTED ERROR during first pass for {current_image_path}: {e}")
                errors_list.append(f"{current_image_path} (Unexpected Error: {e})")

            # Every existing file that was not cropped is queued for the second
            # pass and tracked globally, so the final report lists it if the
            # second pass cannot rescue it either.
            if not processed_successfully and not file_missing:
                _append_once(subfolder_files_ocr_failed, current_image_path)
                _append_once(keywords_not_found_initially, current_image_path)

        # --- Second Pass: Apply stored coordinates to failed files ---
        print(f"\n--- Second Pass: Applying saved coordinates for {subfolder_name} ---")
        if not subfolder_files_ocr_failed:
            print("  No files required second pass in this subfolder.")
            continue
        if subfolder_successful_crop_y is None:
            print(f"  No successful OCR coordinate found in this subfolder, cannot apply second pass crop to {len(subfolder_files_ocr_failed)} file(s).")
            # These files remain in the 'keywords_not_found_initially' list
            continue

        print(f"  Found successful crop Y={subfolder_successful_crop_y} in this subfolder. Applying to {len(subfolder_files_ocr_failed)} file(s) where OCR failed or keyword was too low.")

        for failed_image_path in subfolder_files_ocr_failed:
            if not os.path.exists(failed_image_path): # Might have been deleted meanwhile
                print(f"    Skipping {failed_image_path}, file no longer exists.")
                _discard(keywords_not_found_initially, failed_image_path)
                continue

            print(f"  Retrying crop for: {failed_image_path}")
            try:
                with Image.open(failed_image_path) as img_pil_retry:
                    # Read all pixel data now: the file is overwritten below.
                    img_pil_retry.load()
                    retry_height = img_pil_retry.height

                    # Ensure crop coordinate is valid for this specific image
                    if 0 < subfolder_successful_crop_y < retry_height:
                        _save_cropped_below(img_pil_retry, subfolder_successful_crop_y, failed_image_path)
                        print(f"    Successfully cropped & saved using shared Y={subfolder_successful_crop_y} (Second Pass).")
                        files_cropped_in_second_pass.append(failed_image_path)
                        # Rescued: no longer an ultimate failure
                        _discard(keywords_not_found_initially, failed_image_path)
                    else:
                        print(f"    Warning: Saved coordinate Y={subfolder_successful_crop_y} is invalid for this image (H={retry_height}). Cannot crop.")
                        _append_once(
                            files_failed_second_pass,
                            f"{failed_image_path} (Saved Y={subfolder_successful_crop_y} invalid for H={retry_height})",
                        )
                        # Stays in keywords_not_found_initially

            except FileNotFoundError:
                print(f"    ERROR: File not found during second pass (might have been deleted?): {failed_image_path}")
                errors_list.append(f"{failed_image_path} (Second Pass File Not Found Error)")
                _discard(keywords_not_found_initially, failed_image_path)

            except Exception as e:
                print(f"    ERROR during second pass for {failed_image_path}: {e}")
                _append_once(files_failed_second_pass, f"{failed_image_path} (Second Pass Error: {e})")
                errors_list.append(f"{failed_image_path} (Second Pass Error: {e})")
                # Stays in keywords_not_found_initially

    # --- Final Report ---
    print("\n" + "-" * 30)
    print("Processing Complete. Final Summary:")
    print("-" * 30)

    if files_cropped_in_second_pass:
        print(f"\nFiles successfully cropped using shared coordinates (Second Pass): {len(files_cropped_in_second_pass)}")

    # Files that were not cropped in the first pass AND could not be rescued by the second
    if keywords_not_found_initially:
        print(f"\nFiles where keyword search ultimately FAILED (OCR failed & no valid shared coord): {len(keywords_not_found_initially)}")
        for fpath in keywords_not_found_initially:
            print(f"  - {fpath}")

    if files_failed_second_pass:
        print(f"\nFiles specifically FAILED second pass crop (e.g., shared coord invalid): {len(files_failed_second_pass)}")
        for fpath_reason in files_failed_second_pass:
            print(f"  - {fpath_reason}")

    if errors_list:
        print(f"\nProcessing errors encountered: {len(errors_list)}")
        for error_info in errors_list:
            print(f"  - {error_info}")
    else:
        print("\nNo processing errors encountered.")

    print(f"\nFinished processing subfolders in: {MAIN_FOLDER_PATH}")

Starting processing in main folder: /content/folder
Target filenames to process: ['page_02.png', 'page_06.png', 'page_10.png', 'page_14.png', 'page_18.png']
Keywords to search in order: ['ANONYMAT', 'N231', 'A000', 'NOMBRE', 'PAGES', 'nonymat', 'nombre', 'nombr', 'anonyma']
Using Tesseract config: --psm 6 -l fra
Confidence threshold: 38
Searching top 40% of images.
------------------------------

>>> Entering subfolder: /content/folder/Dire, est-ce autre chose que vouloir dire ? (CAPES 2021 - note : 12)
--- First Pass: OCR and Immediate Crop ---

--- Second Pass: Applying saved coordinates for Dire, est-ce autre chose que vouloir dire ? (CAPES 2021 - note : 12) ---
  No files required second pass in this subfolder.

>>> Entering subfolder: /content/folder/Est-ce une chance de naître humain ? (AGREG interne 2023 - note : 13)
--- First Pass: OCR and Immediate Crop ---

  Processing file: /content/folder/Est-ce une chance de naître humain ? (AGREG interne 2023 - note : 13)/page_02.png
 

In [None]:
import cv2
import numpy as np
import math
import os
import collections

print(f"OpenCV version: {cv2.__version__}")
print(f"NumPy version: {np.__version__}")

# --- 1. Configuration ---
# Module-level settings read by remove_header_strict_zone() and the main loop below.
BASE_FOLDER = "/content/folder"  # Main folder; only its subfolders are scanned for target pages
SAVE_DEBUG_IMAGES = False # Set to True to save intermediate steps (uses hardcoded paths in /content/)
CONSISTENCY_Y_TOLERANCE = 30 # Max allowed pixel difference in Y for detected lines within the same group/subfolder

# --- Define Target Filenames ---
# Only files with exactly these names are processed. The two groups are
# tracked separately by the per-subfolder consistency check.
GROUP1_FILES = {
    "page_01.png", "page_05.png", "page_09.png", "page_13.png", "page_17.png"
}
GROUP2_FILES = {
    "page_02.png", "page_06.png", "page_10.png", "page_14.png", "page_18.png"
}
TARGET_FILES = GROUP1_FILES.union(GROUP2_FILES)

# --- Parameters for Line Detection ( *** TUNING AREA *** ) ---
# These parameters are applied individually to each image. Fine-tune as needed.
CANNY_LOW_THRESHOLD = 50   # Lower hysteresis threshold passed to cv2.Canny
CANNY_HIGH_THRESHOLD = 150 # Upper hysteresis threshold passed to cv2.Canny
HOUGH_RHO = 1              # Distance resolution of the Hough accumulator (pixels)
HOUGH_THETA = np.pi / 180  # Angular resolution of the Hough accumulator (1 degree, in radians)
HOUGH_THRESHOLD = 50  # Adjust if target line not detected
HOUGH_MIN_LINE_LENGTH = 1200 # Minimum segment length in pixels; adjust based on typical image width (may need tuning if widths vary drastically)
HOUGH_MAX_LINE_GAP = 15      # Max gap (pixels) bridged when joining collinear segments
HEADER_AREA_MIN_PERCENT = 0.002 # Search zone starts 0.2% of the image height down from the top
# Search zone ends 16% of the image height down from the top.
# NOTE(review): this comment previously said 18%, and the recorded run output
# (zone 5-485 px on a 2695 px-high page) corresponds to 0.18 - confirm which
# value is intended.
HEADER_AREA_MAX_PERCENT = 0.16
HORIZONTAL_ANGLE_TOLERANCE = 3  # Allow lines within +/- degrees of horizontal
POST_FILTER_MIN_WIDTH_PERCENT = 0.50 # Require line >= 50% of image width
CROP_PADDING = 5 # Pixels to add below the detected line (reduced from 10 for potentially thin headers)

# --- 2. Processing Function (Modified to return crop_y) ---

def remove_header_strict_zone(image_path):
    """
    Crop the header off a single page image, overwriting the file in place.

    The image is edge-detected (Canny) and probabilistic Hough is run on the
    vertical band [HEADER_AREA_MIN_PERCENT, HEADER_AREA_MAX_PERCENT] of the
    image height. Among the near-horizontal segments that span at least
    POST_FILTER_MIN_WIDTH_PERCENT of the width, the lowest one is taken as the
    header separator; everything above it (plus CROP_PADDING pixels) is
    removed and the result is written back to ``image_path``.

    Args:
        image_path: Path of the image to process. The file is OVERWRITTEN on
            success, so calling this twice on the same file crops it twice.

    Returns:
        A ``(crop_y, status)`` tuple:
          * ``(int, True)``   - line found, cropped image written.
          * ``(int, False)``  - line found but the crop was too small or the
                                write failed; the file was not replaced.
          * ``(None, False)`` - no usable line found; file untouched.
          * ``(None, None)``  - hard failure (image unreadable, invalid search
                                zone, or unexpected exception).
        ``crop_y`` is the unpadded absolute Y of the detected line.
    """
    print(f"--- Processing image: {image_path}")
    try:
        image = cv2.imread(image_path)
        if image is None:
            print(f"ERROR: Could not load image at {image_path}")
            return None, None

        height, width = image.shape[:2]
        print(f"  Image dimensions: {width}x{height}")

        min_required_width = int(width * POST_FILTER_MIN_WIDTH_PERCENT)

        # Use the configured minimum length, but never less than 40% of the width.
        dynamic_min_line_length = max(int(width * 0.4), HOUGH_MIN_LINE_LENGTH)
        if dynamic_min_line_length >= width:
            # The configured length cannot fit in this image, so no segment
            # could ever qualify. Fall back to the post-filter width so that
            # narrow images are still processable.
            dynamic_min_line_length = max(int(width * 0.4), min_required_width)

        # --- Calculate Strict Search Zone Coordinates ---
        min_search_y = int(height * HEADER_AREA_MIN_PERCENT)
        max_search_y = int(height * HEADER_AREA_MAX_PERCENT)
        # Ensure max_search_y is at least a few pixels beyond min_search_y
        max_search_y = max(max_search_y, min_search_y + 5)
        print(f"  Strict Search Zone: y = {min_search_y} to {max_search_y} pixels")
        if min_search_y >= max_search_y or max_search_y > height:
            print(f"ERROR: Invalid search zone [{min_search_y}-{max_search_y}] for height {height}. Check percentages.")
            return None, None

        # --- Preprocessing ---
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, CANNY_LOW_THRESHOLD, CANNY_HIGH_THRESHOLD, apertureSize=3)
        if SAVE_DEBUG_IMAGES:
            debug_filename = os.path.join("/content", f"debug_edges_{os.path.basename(image_path)}")
            cv2.imwrite(debug_filename, edges)

        # --- Hough Line Transform (restricted to the search band of the edge map) ---
        lines = cv2.HoughLinesP(
            edges[min_search_y:max_search_y, :],
            rho=HOUGH_RHO, theta=HOUGH_THETA, threshold=HOUGH_THRESHOLD,
            minLineLength=dynamic_min_line_length,
            maxLineGap=HOUGH_MAX_LINE_GAP
        )

        # --- Filter Lines ---
        valid_horizontal_lines = []
        # The annotated copy is only needed for debug output; skip the
        # full-image copy otherwise.
        image_with_lines = image.copy() if SAVE_DEBUG_IMAGES else None

        if lines is not None:
            for line in lines:
                # Coordinates are relative to the ROI; shift Y back to absolute.
                # Plain ints keep the returned crop_y free of NumPy scalar types.
                x1, y1_rel, x2, y2_rel = (int(v) for v in line[0])
                y1 = y1_rel + min_search_y
                y2 = y2_rel + min_search_y

                # 1. Angle check (the zone check is implicit in the ROI)
                if x2 - x1 != 0:
                    angle = math.degrees(math.atan2(y2 - y1, x2 - x1))
                else:
                    angle = 90  # Vertical
                if abs(angle) > HORIZONTAL_ANGLE_TOLERANCE:
                    continue

                # 2. Width post-filter
                if abs(x2 - x1) < min_required_width:
                    continue

                valid_horizontal_lines.append((x1, y1, x2, y2))
                if SAVE_DEBUG_IMAGES:
                    cv2.line(image_with_lines, (x1, y1), (x2, y2), (0, 255, 0), 2)  # Green

            print(f"  Found {len(valid_horizontal_lines)} final candidate horizontal lines.")
        else:
            print("  HoughLinesP did not detect any lines within the zone.")

        # --- Visualization: Detected Lines & Zone (Only if saving debug images) ---
        if SAVE_DEBUG_IMAGES:
            # Search zone boundaries; (0, 0, 255) is red in OpenCV's BGR order.
            cv2.line(image_with_lines, (0, min_search_y), (width, min_search_y), (0, 0, 255), 2)
            cv2.line(image_with_lines, (0, max_search_y), (width, max_search_y), (0, 0, 255), 2)
            debug_filename = os.path.join("/content", f"debug_detected_lines_{os.path.basename(image_path)}")
            cv2.imwrite(debug_filename, image_with_lines)

        # --- Find Crop Line (lowest line within the valid set) ---
        if not valid_horizontal_lines:
            print(f"  WARNING: Could not find a suitable horizontal line in {os.path.basename(image_path)}.")
            print(f"           Check zone [{min_search_y}-{max_search_y}] & parameters (Hough Threshold/MinLength, Width Filter).")
            print(f"           Skipping crop for this file.")
            # If debug images are on, save the annotated image even on failure
            if SAVE_DEBUG_IMAGES:
                cv2.putText(image_with_lines, "NO SUITABLE LINE FOUND", (width//2 - 200, height//2),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,0,255), 3)
                debug_filename = os.path.join("/content", f"debug_FAILED_detection_{os.path.basename(image_path)}")
                cv2.imwrite(debug_filename, image_with_lines)
            return None, False

        # Lowest point of each segment is max(y1, y2); take the lowest segment overall.
        crop_y_unpadded = max(max(seg[1], seg[3]) for seg in valid_horizontal_lines)
        print(f"  Lowest suitable horizontal line FOUND. Max Y: {crop_y_unpadded}")

        if crop_y_unpadded <= 0:
            # Defensive guard kept from the original flow.
            print(f"  ERROR: Internal logic error - crop_y is zero after supposedly finding a line.")
            return None, False

        # --- Crop Image ---
        crop_y_padded = min(crop_y_unpadded + CROP_PADDING, height)
        print(f"  Performing crop at y = {crop_y_padded} (line at {crop_y_unpadded} + padding {CROP_PADDING})")
        image_without_header = image[crop_y_padded:height, 0:width]

        if image_without_header.shape[0] < 10 or image_without_header.shape[1] < 10:
            print(f"  ERROR: Crop resulted in a very small image ({image_without_header.shape[1]}x{image_without_header.shape[0]}). Potential error in line detection. Skipping save.")
            return crop_y_unpadded, False

        # --- Visualization: Crop Line (Only if saving debug images) ---
        if SAVE_DEBUG_IMAGES:
            image_with_crop_line = image.copy()
            cv2.line(image_with_crop_line, (0, min_search_y), (width, min_search_y), (0, 0, 255), 1)
            cv2.line(image_with_crop_line, (0, max_search_y), (width, max_search_y), (0, 0, 255), 1)
            # (255, 0, 0) is blue in BGR order.
            cv2.line(image_with_crop_line, (0, crop_y_padded), (width, crop_y_padded), (255, 0, 0), 3)
            debug_filename = os.path.join("/content", f"debug_crop_line_{os.path.basename(image_path)}")
            cv2.imwrite(debug_filename, image_with_crop_line)

        # --- Save Cropped Image (Overwrite) ---
        # cv2.imwrite reports most failures by returning False rather than
        # raising, so the return value must be checked before claiming success.
        try:
            saved = cv2.imwrite(image_path, image_without_header)
        except Exception as e:
            print(f"  ERROR saving cropped image (overwrite) for {image_path}: {e}")
            return crop_y_unpadded, False
        if not saved:
            print(f"  ERROR saving cropped image (overwrite) for {image_path}: cv2.imwrite returned False")
            return crop_y_unpadded, False

        print(f"  SUCCESS: Cropped image saved by overwriting: {image_path}")
        return crop_y_unpadded, True

    except Exception as e:
        print(f"!! UNEXPECTED ERROR processing {image_path}: {e}")
        import traceback
        traceback.print_exc()
        # Same (None, None) marker as the other hard failures above.
        return None, None


# --- 3. Main Execution Logic ---

# Walk every subfolder of BASE_FOLDER, crop the header off each target page,
# check that detected line positions agree within a group, and print a summary.
print("\nStarting header removal process...")
print(f"Base Folder: {BASE_FOLDER}")
print(f"Target Files: {len(TARGET_FILES)} specific names")
print(f"Consistency Check Tolerance (Y-pixel diff): {CONSISTENCY_Y_TOLERANCE}")
print("-" * 50)

if not os.path.isdir(BASE_FOLDER):
    print(f"ERROR: Base folder '{BASE_FOLDER}' not found or is not a directory.")
else:
    processed_files_count = 0
    skipped_files_count = 0
    error_files_count = 0
    consistency_warnings = 0

    # First detected crop_y for each group within each subfolder.
    # Key: subfolder_path, Value: {1: first_crop_y_group1, 2: first_crop_y_group2}
    subfolder_consistency_y = collections.defaultdict(dict)

    for root, _dirs, files in os.walk(BASE_FOLDER):
        # Skip the top-level base folder itself, only process subfolders.
        # (Remove this check if images may also sit directly in BASE_FOLDER.)
        if root == BASE_FOLDER:
            continue

        print(f"\n>>> Entering Subfolder: {root}")
        files_in_subfolder = 0

        # Sorted order makes page_01 the reference for group 1, page_02 for group 2.
        files.sort()

        for filename in files:
            if filename not in TARGET_FILES:
                continue

            files_in_subfolder += 1
            image_path = os.path.join(root, filename)

            # Determine group for consistency check
            current_group = 1 if filename in GROUP1_FILES else 2

            # Run header removal
            detected_y, status_ok = remove_header_strict_zone(image_path)

            if detected_y is not None:  # A line was detected (even if the crop failed later)
                # --- Consistency Check ---
                group_reference = subfolder_consistency_y[root]
                if current_group not in group_reference:
                    # First file of this group in this subfolder becomes the reference
                    group_reference[current_group] = detected_y
                    print(f"  Consistency: First file for group {current_group} in this subfolder. Storing Y={detected_y}.")
                else:
                    first_y = group_reference[current_group]
                    diff = abs(detected_y - first_y)
                    if diff > CONSISTENCY_Y_TOLERANCE:
                        print(f"  !! WARNING: Consistency check failed for {filename} (Group {current_group}) !!")
                        print(f"     Detected Y={detected_y}, First Y for group was {first_y}. Difference={diff} > Tolerance={CONSISTENCY_Y_TOLERANCE}")
                        consistency_warnings += 1
                # --- End Consistency Check ---

            # Update counters based on status.
            # remove_header_strict_zone reports hard failures (unreadable image,
            # invalid search zone) with status None, and "no line found" or a
            # failed crop/save with status False. Testing `not status_ok` here
            # would lump "no line found" into the error count, contradicting
            # the summary labels below.
            if status_ok:
                processed_files_count += 1
            elif detected_y is None and status_ok is None:
                error_files_count += 1
            else:
                # Line not found, or found but the original was not overwritten
                skipped_files_count += 1

        if files_in_subfolder == 0:
            print("   No target image files found in this subfolder.")
        else:
            print(f"<<< Finished Subfolder: {root}")

    print("-" * 50)
    print("Processing Summary:")
    print(f"  Successfully processed (cropped & overwritten): {processed_files_count}")
    print(f"  Skipped (line not found or crop/save failed): {skipped_files_count}")
    print(f"  Errors (load/unexpected issues):             {error_files_count}")
    print(f"  Consistency Warnings Issued:                 {consistency_warnings}")
    print("-" * 50)

# --- End of Script ---

OpenCV version: 4.11.0
NumPy version: 2.0.2

Starting header removal process...
Base Folder: /content/folder
Target Files: 10 specific names
Consistency Check Tolerance (Y-pixel diff): 30
--------------------------------------------------

>>> Entering Subfolder: /content/folder/Peut-on vivre en paix avec son inconscient ? (AGREG interne 2021 -note : 14)
--- Processing image: /content/folder/Peut-on vivre en paix avec son inconscient ? (AGREG interne 2021 -note : 14)/page_01.png
  Image dimensions: 2481x2695
  Strict Search Zone: y = 5 to 485 pixels
  Found 10 final candidate horizontal lines.
  Lowest suitable horizontal line FOUND. Max Y: 85
  Performing crop at y = 90 (line at 85 + padding 5)
  SUCCESS: Cropped image saved by overwriting: /content/folder/Peut-on vivre en paix avec son inconscient ? (AGREG interne 2021 -note : 14)/page_01.png
  Consistency: First file for group 1 in this subfolder. Storing Y=85.
--- Processing image: /content/folder/Peut-on vivre en paix avec son inc