In [1]:
import os
import zipfile
import shutil
import cv2
from skimage.metrics import structural_similarity as ssim
from tqdm import tqdm
import pickle
from logging_file import *
from utils.plot_utils import *

In [12]:
def extract_zip(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def make_data_folder(zip_dir, data_dir):
    images_dir = os.path.join(data_dir, "images")
    annotations_dir = os.path.join(data_dir, "annotations")

    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(annotations_dir, exist_ok=True)

    zip_files = [f for f in os.listdir(zip_dir) if f.endswith(".zip")]
    
    for zip_file in tqdm(zip_files):
        zip_path = os.path.join(zip_dir, zip_file)
        extract_to = os.path.join(zip_dir, "tmp")
        
        extract_zip(zip_path, extract_to)
        
        intermediate_folders = os.listdir(extract_to)
        if len(intermediate_folders) != 1:
            continue 
        intermediate_folder = os.path.join(extract_to, intermediate_folders[0])
        
        for class_dir in os.listdir(intermediate_folder):
            class_path = os.path.join(intermediate_folder, class_dir)
            
            if os.path.isdir(class_path):  
                for file in os.listdir(class_path):
                    file_path = os.path.join(class_path, file)
                    
                    if file.endswith((".jpg")):
                        shutil.move(file_path, os.path.join(images_dir, file))
                    
                    elif file.endswith(".txt"):
                        shutil.move(file_path, os.path.join(annotations_dir, file))
        
        shutil.rmtree(extract_to)

def clean_and_group_files(image_folder='data/labelized/images/', annotation_folder='data/labelized/annotations/', backup_folder="backup_invalid_files", similarity_threshold=0.8):
    logger = init_preprocess_logger()
    
    os.makedirs(backup_folder, exist_ok=True)

    image_files = set(f.replace('.jpg', '') for f in os.listdir(image_folder) if f.endswith('.jpg'))
    annotation_files = set(f.replace('.txt', '') for f in os.listdir(annotation_folder) if f.endswith('.txt'))

    images_to_delete = image_files - annotation_files
    annotations_to_delete = annotation_files - image_files

    for img in images_to_delete:
        img_path = os.path.join(image_folder, img + '.jpg')
        try:
            shutil.move(img_path, os.path.join(backup_folder, os.path.basename(img_path)))
            logger.info(f"Moved unmatched image {img_path} to backup folder.")
        except FileNotFoundError:
            logger.error(f"File not found error: {img_path}")
        except Exception as e:
            logger.error(f"Error moving {img_path}: {e}")

    for txt in annotations_to_delete:
        txt_path = os.path.join(annotation_folder, txt + '.txt')
        try:
            shutil.move(txt_path, os.path.join(backup_folder, os.path.basename(txt_path)))
            logger.info(f"Moved unmatched annotation {txt_path} to backup folder.")
        except FileNotFoundError:
            logger.error(f"File not found error: {txt_path}")
        except Exception as e:
            logger.error(f"Error moving {txt_path}: {e}")

    images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if img.endswith(".jpg")])
    groups = [] 
    current_group = [images[0]]

    for i in tqdm(range(1, 100)):
        try:
            score = compare_images(images[i-1], images[i], logger)

            if score >= similarity_threshold:
                current_group.append(images[i])
            else:
                groups.append(current_group)
                current_group = [images[i]]
        
        except Exception as e:
            logger.error(f"Error index {i}, image {images[i]} -> {str(e)}")
            shutil.move(images[i], os.path.join(backup_folder, os.path.basename(images[i])))
            logger.info(f"Moved invalid image {images[i]} to backup folder.")
            continue  

    return groups


def compare_images(img_path1, img_path2, logger):
    img1 = cv2.imread(img_path1)
    img2 = cv2.imread(img_path2)

    if img1 is None or img2 is None:
        logger.warning(f"One of the images is None: {img_path1}, {img_path2}")
        return None 

    if img1.size != img2.size:
        return 0

    img1_gray = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    img2_gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

    score = ssim(img1_gray, img2_gray)
    return score

In [7]:
groups = clean_and_group_files()
with open('groups.pkl', 'wb') as file:
    pickle.dump(groups, file)

100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [00:22<00:00,  4.35it/s]


In [13]:
for data in groups[0]:
    print(data.split('/')[-1].replace('.jpg', '.txt'))

FR_N0431633-924_W0000256-9860_20210401_Ardea-cinerea_RCNX0031_SmallestMaxSize.txt
FR_N0431633-924_W0000256-9860_20210401_Ardea-cinerea_RCNX0032_SmallestMaxSize.txt
FR_N0431633-924_W0000256-9860_20210401_Ardea-cinerea_RCNX0033_SmallestMaxSize.txt
