## Massen Image Cropping
OpenCV (Open Source Computer Vision Library) is an open-source computer vision and machine learning software library. It offers a rich set of image processing capabilities, including image transformation, filtering, feature detection, image segmentation, and more. Observations of scanned newspaper documents reveal the following characteristics: 1. The quality of the newspaper scans overall is good. 2. Some of the newspaper pages scanned include unwanted content (for example, a small part of the right page is included in the scan of the left page).

In [29]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

In [30]:
def auto_crop_image(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)

    kernel = np.ones((5,5), np.uint8)
    dilated_edges = cv2.dilate(edges, kernel, iterations=1)

    mask = np.ones_like(dilated_edges) * 255
    mask[dilated_edges == 255] = 0

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        sorted_contours = sorted(contours, key=cv2.contourArea, reverse=True)
        x, y, w, h = cv2.boundingRect(sorted_contours[0])
        cropped_image = image[y:y+h, x:x+w]
        return image, cropped_image
    else:
        return image, None

In [18]:
original_image, cropped_image = auto_crop_image('/home/vivek/Desktop/Freiheitskampf/1938/01/files/02/files')
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

axes[0].imshow(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))
axes[0].set_title("Original Image")
axes[0].axis('off')

if cropped_image is not None:
    axes[1].imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
    axes[1].set_title("Cropped Image")
    save_path = '/home/vivek/Desktop/'
    cv2.imwrite(save_path, cropped_image)
else:
    axes[1].text(0.5, 0.5, 'No Change', horizontalalignment='center', verticalalignment='center')
    save_path = '/Users/yu/Desktop/linshi/result/00000003_uncropped.jpg'
    cv2.imwrite(save_path, original_image)
axes[1].axis('off')

plt.show()

error: OpenCV(4.6.0) /croot/opencv-suite_1691620365762/work/modules/imgproc/src/color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


In [15]:
original_image, cropped_image = auto_crop_image('/Users/yu/Desktop/00000004.tif.large.jpg')
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

axes[0].imshow(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))
axes[0].set_title("Original Image")
axes[0].axis('off')

if cropped_image is not None:
    axes[1].imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
    axes[1].set_title("Cropped Image")
    save_path = '/Users/yu/Desktop/linshi/result/00000004_cropped.jpg'
    cv2.imwrite(save_path, cropped_image)
else:
    axes[1].text(0.5, 0.5, 'No Change', horizontalalignment='center', verticalalignment='center')
    save_path = '/Users/yu/Desktop/linshi/result/00000004_uncropped.jpg'
    cv2.imwrite(save_path, original_image)
axes[1].axis('off')

plt.show()

[ WARN:0@1086.434] global /croot/opencv-suite_1691620365762/work/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('/Users/yu/Desktop/00000004.tif.large.jpg'): can't open/read file: check file path/integrity


error: OpenCV(4.6.0) /croot/opencv-suite_1691620365762/work/modules/imgproc/src/color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


In [None]:
import cv2
import numpy as np
import os
import glob

def auto_crop_image(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)

    kernel = np.ones((5,5), np.uint8)
    dilated_edges = cv2.dilate(edges, kernel, iterations=1)

    mask = np.ones_like(dilated_edges) * 255
    mask[dilated_edges == 255] = 0

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        sorted_contours = sorted(contours, key=cv2.contourArea, reverse=True)
        x, y, w, h = cv2.boundingRect(sorted_contours[0])
        cropped_image = image[y:y+h, x:x+w]
        return image, cropped_image
    else:
        return image, None
    
source_folder = "/home/vivek/Desktop/Freiheitskampf/1942"
result_folder = "/home/vivek/Desktop/result"


# Ensure the result folder exists
if not os.path.exists(result_folder):
    os.makedirs(result_folder)

for month in range(1, 13):
    month_str = str(month).zfill(2)
    month_source_folder = os.path.join(source_folder, month_str, "files")
    month_result_folder = os.path.join(result_folder, month_str)

    # Ensure the month folder in the result directory exists
    if not os.path.exists(month_result_folder):
        os.makedirs(month_result_folder)

    for day in range(1, 32):  # Assuming up to 31 days in a month
        day_str = str(day).zfill(2)
        day_image_folder = os.path.join(month_source_folder, day_str, "files")

        if os.path.isdir(day_image_folder):
            for image_file in glob.glob(os.path.join(day_image_folder, "*.jpg")):
                original_image, cropped_image = auto_crop_image(image_file)
                
                # Decide which image to save (cropped or original)
                image_to_save = cropped_image if cropped_image is not None else original_image
                if image_to_save is not None:
                    # Create a unique file name to prevent overwriting
                    image_basename = os.path.basename(image_file)
                    new_image_name = f"{month_str}_{day_str}_{image_basename.split('.')[0]}_cropped.jpg" if cropped_image is not None else f"{month_str}_{day_str}_{image_basename}"
                    save_path = os.path.join(month_result_folder, new_image_name)
                    cv2.imwrite(save_path, image_to_save)
