# Mass Image cropping
In this phase, the previously developed image-cropping algorithm is applied to all the newspaper images from 1942. Observations and tests have shown that the majority of images require cropping because they contain content from other pages. However, the bulk-scanned images vary and generally fall into two cases: one where the correct page outline is much fainter than other lines on the page, and another where the correct outline is strong but overshadowed by even stronger lines on the page. The former requires greater sensitivity in contour recognition, while the latter requires multiple iterations of edge dilation to make the image edges more prominent, which helps separate the main part of the image more clearly. **(The algorithm used remains the same, but the parameters are adjusted differently for the two situations.)**

**The core objective is to achieve the best cropping results for the various types of scanned images while minimizing the manual effort required for mass processing.**

## 1. Algorithm adjustment 
Adjust the algorithm parameters according to different scan conditions.

In [78]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

1. First case: the correct outline of the page is fainter/lighter than the other frames on the page, so the focus is on the threshold settings used during edge recognition:

In [3]:
def auto_crop_image(image_path, *, low_threshold=1, high_threshold=2,
                    dilate_iterations=1):
    """Crop an image to the bounding box of its largest edge-free region.

    Variant tuned for the first case: the correct page outline is fainter
    than the other lines on the page, so the Canny thresholds default to
    very small values.

    Args:
        image_path: Path of the image file to load with ``cv2.imread``.
        low_threshold: First threshold passed to ``cv2.Canny``.
        high_threshold: Second threshold passed to ``cv2.Canny``.
        dilate_iterations: Number of dilation passes applied to the edge map
            with a 5x5 kernel.

    Returns:
        A tuple ``(image, cropped_image)``. ``cropped_image`` is ``None`` when
        no contour is found. Both elements are ``None`` when the file cannot
        be read as an image.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable/missing file by returning None;
        # passing that on to cvtColor would raise, so report it to the caller.
        return None, None

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Tests have shown that minimal values are most effective in identifying
    # extreme cases where edges are very faint and almost indistinguishable.
    edges = cv2.Canny(gray, low_threshold, high_threshold)

    kernel = np.ones((5, 5), np.uint8)
    dilated_edges = cv2.dilate(edges, kernel, iterations=dilate_iterations)

    # Invert the edge map: edge pixels become 0, everything else 255, so the
    # contours found below outline the regions enclosed by the dilated edges.
    mask = np.where(dilated_edges == 255, 0, 255).astype(np.uint8)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return image, None

    # Only the largest region is needed, so a full sort is unnecessary.
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    cropped_image = image[y:y + h, x:x + w]
    return image, cropped_image

2. Second case: the correct outline is strong but overshadowed by even stronger lines on the page, so the focus is on the number of dilation iterations:

In [187]:
def auto_crop_image(image_path, *, low_threshold=50, high_threshold=150,
                    dilate_iterations=4):
    """Crop an image to the bounding box of its largest edge-free region.

    Variant tuned for the second case: the correct page outline is strong but
    overshadowed by even stronger lines, so the edge map is dilated several
    times before the regions are extracted.

    Args:
        image_path: Path of the image file to load with ``cv2.imread``.
        low_threshold: First threshold passed to ``cv2.Canny``.
        high_threshold: Second threshold passed to ``cv2.Canny``.
        dilate_iterations: Number of dilation passes applied to the edge map
            with a 5x5 kernel.

    Returns:
        A tuple ``(image, cropped_image)``. ``cropped_image`` is ``None`` when
        no contour is found. Both elements are ``None`` when the file cannot
        be read as an image.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable/missing file by returning None;
        # passing that on to cvtColor would raise, so report it to the caller.
        return None, None

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, low_threshold, high_threshold)

    kernel = np.ones((5, 5), np.uint8)
    # Tests have shown that setting iterations to 4 yielded the desired results.
    dilated_edges = cv2.dilate(edges, kernel, iterations=dilate_iterations)

    # Invert the edge map: edge pixels become 0, everything else 255, so the
    # contours found below outline the regions enclosed by the dilated edges.
    mask = np.where(dilated_edges == 255, 0, 255).astype(np.uint8)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return image, None

    # Only the largest region is needed, so a full sort is unnecessary.
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    cropped_image = image[y:y + h, x:x + w]
    return image, cropped_image

## 2. Execute mass processing
Iterate over all images according to the directory structure in which the bulk files are stored on the local computer (`<year>/<month>/files/<day>/files/*.jpg`), cropping each image and saving the result.

In [39]:
import cv2
import numpy as np
import os
import glob

def auto_crop_image(image_path, *, low_threshold=50, high_threshold=150,
                    dilate_iterations=1):
    """Crop an image to the bounding box of its largest edge-free region.

    Args:
        image_path: Path of the image file to load with ``cv2.imread``.
        low_threshold: First threshold passed to ``cv2.Canny``.
        high_threshold: Second threshold passed to ``cv2.Canny``.
        dilate_iterations: Number of dilation passes applied to the edge map
            with a 5x5 kernel.

    Returns:
        A tuple ``(image, cropped_image)``. ``cropped_image`` is ``None`` when
        no contour is found. Both elements are ``None`` when the file cannot
        be read as an image.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable/missing file by returning None;
        # passing that on to cvtColor would raise, so report it to the caller.
        return None, None

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, low_threshold, high_threshold)

    kernel = np.ones((5, 5), np.uint8)
    dilated_edges = cv2.dilate(edges, kernel, iterations=dilate_iterations)

    # Invert the edge map: edge pixels become 0, everything else 255, so the
    # contours found below outline the regions enclosed by the dilated edges.
    mask = np.where(dilated_edges == 255, 0, 255).astype(np.uint8)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return image, None

    # Only the largest region is needed, so a full sort is unnecessary.
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    cropped_image = image[y:y + h, x:x + w]
    return image, cropped_image
    
# Batch-crop every scanned page of the year and write the results into one
# folder per month. Source layout: <source_folder>/<MM>/files/<DD>/files/*.jpg
source_folder = "/home/vivek/Desktop/Freiheitskampf/1942"
result_folder = "/home/vivek/Desktop/result"


# Ensure the result folder exists (no error if it is already there)
os.makedirs(result_folder, exist_ok=True)

for month in range(1, 13):
    month_str = str(month).zfill(2)
    month_source_folder = os.path.join(source_folder, month_str, "files")
    month_result_folder = os.path.join(result_folder, month_str)

    # Ensure the month folder in the result directory exists
    os.makedirs(month_result_folder, exist_ok=True)

    for day in range(1, 32):  # Assuming up to 31 days in a month
        day_str = str(day).zfill(2)
        day_image_folder = os.path.join(month_source_folder, day_str, "files")

        # Days that do not exist (e.g. 31 February) or were not scanned are skipped
        if not os.path.isdir(day_image_folder):
            continue

        for image_file in glob.glob(os.path.join(day_image_folder, "*.jpg")):
            original_image, cropped_image = auto_crop_image(image_file)

            # Decide which image to save (cropped or original)
            image_to_save = cropped_image if cropped_image is not None else original_image
            if image_to_save is None:
                print(f"Skipping unreadable image: {image_file}")
                continue

            # Create a unique file name to prevent overwriting. splitext keeps
            # any dots inside the stem, so "a.b.jpg" and "a.c.jpg" do not
            # collapse to the same output name.
            image_basename = os.path.basename(image_file)
            if cropped_image is not None:
                image_stem = os.path.splitext(image_basename)[0]
                new_image_name = f"{month_str}_{day_str}_{image_stem}_cropped.jpg"
            else:
                new_image_name = f"{month_str}_{day_str}_{image_basename}"

            save_path = os.path.join(month_result_folder, new_image_name)
            # cv2.imwrite reports failure through its return value rather than
            # by raising, so check it explicitly.
            if not cv2.imwrite(save_path, image_to_save):
                print(f"Failed to write image: {save_path}")


## 3. Post-processing
Images that were not successfully recognized by the first algorithm can undergo further processing using the second algorithm. It's important to note that in this process, there is no need for manual selection of successfully recognized images; all images can be collectively processed using the second algorithm.

In [None]:
# Re-run the cropping on the output of the first pass and store the results
# in a separate folder.
first_folder = "/home/vivek/Desktop/result/first_01"
second_folder = "/home/vivek/Desktop/result/second_01"

# cv2.imwrite does not create missing directories, so make sure the target exists
os.makedirs(second_folder, exist_ok=True)

for filename in os.listdir(first_folder):
    # Build the complete file path
    file_path = os.path.join(first_folder, filename)

    # Only regular files are processed; sub-directories are skipped
    if not os.path.isfile(file_path):
        continue

    # Apply auto cropping
    _, cropped_image = auto_crop_image(file_path)

    # If there's a cropping result, save it to the target folder
    if cropped_image is not None:
        # Build the save path
        save_path = os.path.join(second_folder, filename)
        # Save the cropped image
        cv2.imwrite(save_path, cropped_image)