## Requirements

In [1]:
import numpy as np
from rembg import remove
import sys, os
import cv2
import shutil

## Data preprocessing

### Test data

In [7]:
# Remove background of test images

data_path = "./data/test_image_headmind"
# Folder that receives the background-free test images.
preprocessed_test_dir = "./data/preprocessed_test"

# cv2.imwrite signals failure only through its boolean return value, so make
# sure the destination exists up front instead of losing every output silently.
os.makedirs(preprocessed_test_dir, exist_ok=True)

# List the folder once so the progress percentage is relative to the real
# number of files rather than a hard-coded count.
test_image_names = os.listdir(data_path)
n_test_images = len(test_image_names)

for i, image_name in enumerate(test_image_names):

    # Report progress every fourth file.
    if i % 4 == 0:
        print(f"{(100*i/n_test_images):.1f}%")

    input_path = data_path + "/" + image_name
    output_path = f"{preprocessed_test_dir}/{image_name}"

    # cv2.imread returns None (it does not raise) when a file cannot be decoded.
    input_img = cv2.imread(input_path)
    if input_img is None:
        print(f"Skipping {image_name}: Not a valid image")
        continue

    # Cut out the subject and composite it onto an opaque white background.
    output_img = remove(input_img, bgcolor=(255, 255, 255, 255))

    if not cv2.imwrite(output_path, output_img):
        print(f"Failed to write {output_path}")

0.0%


KeyboardInterrupt: 

In [None]:
# Change photo sizes to 256x256

def change_image_size(input_dir, output_dir, target_size=256):
    """Pad every image in ``input_dir`` to a white square and resize it.

    Each image that OpenCV can decode is centred on a white square canvas
    whose side equals the image's longer edge, scaled to
    ``target_size`` x ``target_size`` pixels with bilinear interpolation, and
    written to ``output_dir`` under its original file name. Entries that
    cannot be decoded are skipped with a message. A failure on one file is
    printed and does not abort the rest of the batch.

    Args:
        input_dir: Directory whose entries are processed.
        output_dir: Directory that receives the results; created if missing.
            It may be the same as ``input_dir``, in which case files are
            overwritten in place (the listing is taken before any write).
        target_size: Side length, in pixels, of the square output images.
            Defaults to 256.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for file_name in os.listdir(input_dir):
        file_path = os.path.join(input_dir, file_name)

        try:
            # cv2.imread returns None instead of raising for undecodable files.
            img = cv2.imread(file_path)
            if img is None:
                print(f"Skipping {file_name}: Not a valid image")
                continue

            height, width = img.shape[:2]

            # The canvas side is the longer edge, so the image always fits.
            square_size = max(width, height)

            # White (255) three-channel canvas.
            square_img = np.full((square_size, square_size, 3), 255, dtype=np.uint8)

            # Offsets that centre the original image on the canvas.
            x_offset = (square_size - width) // 2
            y_offset = (square_size - height) // 2
            square_img[y_offset:y_offset+height, x_offset:x_offset+width] = img

            resized_img = cv2.resize(
                square_img, (target_size, target_size), interpolation=cv2.INTER_LINEAR
            )

            output_path = os.path.join(output_dir, file_name)
            # cv2.imwrite reports failure through its return value, so check it
            # rather than assuming the file was written.
            if not cv2.imwrite(output_path, resized_img):
                print(f"Failed to process {file_name}: could not write {output_path}")
        except Exception as e:
            # Best effort: report the problem and keep going with the next file.
            print(f"Failed to process {file_name}: {e}")

# Resize the preprocessed test images in place: source and destination are the
# same folder, so each file is overwritten by its padded and resized version.
input_directory = output_directory = "./data/preprocessed_test"

change_image_size(input_directory, output_directory)


### Training data

In [6]:
data_path = "../../data/DAM"
output_dir = "../../data/DAM_white_background"

os.makedirs(output_dir, exist_ok=True)

# Explicit blacklist: exact file names that are never processed.
blacklist = [
    'MONSIEUR2XWY1I.jpeg', 'MISDB1UXRH4D0.jpeg',
    'MISDB1UXRH4C0.jpeg', 'MISDB1UXRB0O0.jpeg', 'M1710ZTEIM927.jpeg'
]

# Allowed prefixes: a file whose name starts with 'S' is processed only if it
# also starts with one of these.
whitelisted_prefixes = ['S204', 'S0949', 'S0918', 'S0856', 'S0204J', 'S0204OA', 'S0204OC', 'S0204OL', 'S0204OO', 'S0204P']

# str.startswith accepts a tuple of candidates, so build it once outside the loop.
allowed_s_prefixes = tuple(whitelisted_prefixes)

# Remove the background of the training images (only names starting with "M" or "S").
print("Starting to process training images and remove background...")

for i, file_name in enumerate(os.listdir(data_path)):
    # The counter advances for every directory entry, including skipped ones.
    if i % 100 == 0:
        print(f"i={i}")

    # Only handle files whose name starts with "M" or "S".
    if file_name[0] not in {"M", "S"}:
        continue

    # Skip files that are explicitly blacklisted.
    if file_name in blacklist:
        continue

    # Skip files starting with "S" that match none of the allowed prefixes.
    if file_name.startswith('S') and not file_name.startswith(allowed_s_prefixes):
        continue

    input_path = os.path.join(data_path, file_name)
    output_path = os.path.join(output_dir, file_name)

    try:
        # cv2.imread returns None instead of raising for undecodable files;
        # report that directly rather than through an error raised later.
        input_img = cv2.imread(input_path)
        if input_img is None:
            print(f"Error processing {file_name}: not a readable image")
            continue

        # Remove the background and composite the subject onto opaque white.
        output_img = remove(input_img, bgcolor=(255, 255, 255, 255))

        # Save the result; cv2.imwrite reports failure through its return
        # value, so check it instead of assuming the file was written.
        if not cv2.imwrite(output_path, output_img):
            print(f"Error processing {file_name}: could not write {output_path}")
    except Exception as e:
        # Best effort: log the failure and continue with the next file.
        print(f"Error processing {file_name}: {e}")

print("Finished processing training images.")


Starting to process training images and remove background...
i=0
i=100
i=200
i=300
i=400
i=500
i=600
i=700
i=800
i=900
i=1000
i=1100
i=1200
i=1300
i=1400
i=1500
i=1600
i=1700
i=1800
i=1900
i=2000
i=2100
i=2200
i=2300
i=2400
i=2500
i=2600
i=2700
Finished processing training images.


## Training data consistency check

In [8]:
def compare_directory_filenames(dir1, dir2):
    """Print whether two directories contain the same set of entry names.

    Only the names returned by ``os.listdir`` are compared; file contents are
    never read. When the sets differ, the names unique to each directory are
    listed in sorted order, followed by a summary line.

    Args:
        dir1: Path of the first directory ("directory 1" in the output).
        dir2: Path of the second directory ("directory 2" in the output).

    Returns:
        None. The result is reported with ``print`` only.
    """
    names_first = set(os.listdir(dir1))
    names_second = set(os.listdir(dir2))

    # Identical sets: report and stop early.
    if not (names_first != names_second):
        print("The two directories have exactly the same file names.")
        return

    # One (heading, names) pair per side; a side with nothing unique is silent.
    reports = (
        ("Files only in directory 1:", sorted(names_first - names_second)),
        ("Files only in directory 2:", sorted(names_second - names_first)),
    )
    for heading, unique_names in reports:
        if not unique_names:
            continue
        print(heading)
        for file_name in unique_names:
            print(f"  {file_name}")

    print("\nThe directories do not have the same file names.")

# Compare the white-background output folder against the original DAM folder.
dir1, dir2 = "../../data/DAM_white_background", "../../data/DAM"

compare_directory_filenames(dir1, dir2)

The two directories have exactly the same file names.
