# Check and move corrupted TIFF files

This code scans a directory for TIFF files and detects corrupted ones by verifying and loading each image. It uses a PyTorch DataLoader to process files in batches, handling various exceptions to identify corrupted files accurately. Additionally, it prints the current subdirectory being processed for monitoring purposes.

In [1]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import csv
import shutil
import pandas as pd

In [2]:
# os.cpu_count() returns None when the number of CPUs cannot be determined.
# Fall back to 1 so that num_cores is always a usable integer (it is later
# used as the default number of DataLoader workers).
num_cores = os.cpu_count() or 1
print(f"Number of CPU cores: {num_cores}")

Number of CPU cores: 32


In [3]:
# Define the path to the main directory that is scanned recursively for TIFF files

# Alternative input directory, kept for reference (currently disabled):
#main_dir = "../Project003a_Plankton_imager/data_tar/2023-06-07_error"
main_dir = "data/DETAILED_merged"

In [4]:
# Define the dataset class and the scanning function

class TiffDataset(Dataset):
    """Dataset that checks image files for corruption.

    Each item is a tuple ``(file_path, is_corrupted)`` where ``is_corrupted``
    is True when the file could not be opened, verified or fully loaded.
    """

    def __init__(self, file_paths):
        """Store the sequence of image file paths to check."""
        self.file_paths = file_paths

    def __len__(self):
        """Return the number of files to check."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Check the file at position ``idx``.

        Returns:
            ``(file_path, False)`` if the file verifies and loads cleanly,
            ``(file_path, True)`` if any error is raised while checking it.
        """
        file_path = self.file_paths[idx]
        # Track which phase is running so the error message can distinguish
        # a failed verification from a failed (e.g. truncated) load.
        # IOError is an alias of OSError in Python 3, so two separate except
        # clauses for them can never both be reached.
        stage = "verify"
        try:
            # First, try to open and verify the image
            with Image.open(file_path) as img:
                img.verify()  # Verify the image integrity

            # verify() leaves the image object unusable, so re-open the file
            # before loading the pixel data.
            stage = "load"
            with Image.open(file_path) as img:
                img.load()  # Load the image to catch truncation issues
        except (OSError, SyntaxError) as e:
            if stage == "load":
                # Truncated files or other load issues
                print(f"File loading issue: {file_path}\nError: {e}")
            else:
                # General corruption or syntax issues
                print(f"Corrupted file detected: {file_path}\nError: {e}")
            return file_path, True  # File is corrupted
        except Exception as e:
            # Handle any unexpected errors separately
            print(f"Unexpected error for file: {file_path}\nError: {e}")
            return file_path, True  # Consider unexpected errors as corruption
        return file_path, False  # File is not corrupted

def find_corrupted_tiff_files(main_dir, batch_size=600, num_workers=num_cores):
    """Recursively scan ``main_dir`` for TIFF files and return the corrupted ones.

    Args:
        main_dir: Directory that is walked recursively.
        batch_size: Number of files checked per DataLoader batch.
        num_workers: Number of DataLoader worker processes. ``None`` is
            treated as 0, i.e. files are checked in the main process.

    Returns:
        List of paths of the files flagged as corrupted by ``TiffDataset``
        (empty if no TIFF files were found or none is corrupted).
    """
    all_tiff_files = []

    # Traverse the directory recursively and gather all TIFF files
    for root, _, files in os.walk(main_dir):
        print(f"Processing directory: {root}")  # Monitor current subdirectory
        all_tiff_files.extend(
            os.path.join(root, file)
            for file in files
            if file.lower().endswith((".tiff", ".tif"))
        )

    if not all_tiff_files:
        print("No TIFF files found.")
        return []

    # Create a PyTorch Dataset and DataLoader
    dataset = TiffDataset(all_tiff_files)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        # os.cpu_count() may yield None; DataLoader needs an integer here.
        num_workers=num_workers or 0,
        shuffle=False,
    )

    corrupted_files = []

    # Each batch is a pair: the file paths and the matching corruption flags
    for paths, flags in tqdm(dataloader, desc="Processing TIFF files", unit="batch"):
        for file_path, is_corrupted in zip(paths, flags):
            if is_corrupted:
                corrupted_files.append(file_path)
                print(f"\nCorrupted file found: {file_path}")

    if not corrupted_files:
        print("No corrupted files found.")
    return corrupted_files


In [5]:
%%time

# Scan main_dir for corrupted TIFF files; the %%time cell magic reports how long the scan took
corrupted_files = find_corrupted_tiff_files(main_dir)



Processing directory: data/DETAILED_merged
Processing directory: data/DETAILED_merged/Cnidaria_Hydrozoa-polyp
Processing directory: data/DETAILED_merged/Crustacea_Cirripedia-larvae
Processing directory: data/DETAILED_merged/artefacts
Processing directory: data/DETAILED_merged/Crustacea_Copepoda-nauplii
Processing directory: data/DETAILED_merged/Crustacea_Copepoda_Monstrilloidae
Processing directory: data/DETAILED_merged/Mollusca_Gastropoda
Processing directory: data/DETAILED_merged/Cnidaria_Scyphozoa-ephyrae
Processing directory: data/DETAILED_merged/Fish-eggs
Processing directory: data/DETAILED_merged/Crustacea_Cladocera
Processing directory: data/DETAILED_merged/Crustacea_Copepoda_Calanoida
Processing directory: data/DETAILED_merged/Cnidaria_Hydrozoa-medusa
Processing directory: data/DETAILED_merged/Bryozoa-larvae
Processing directory: data/DETAILED_merged/Platyhelminthes
Processing directory: data/DETAILED_merged/Ciliophora
Processing directory: data/DETAILED_merged/Echinodermata_As

Processing TIFF files: 100%|██████████| 95/95 [00:08<00:00, 10.74batch/s]

No corrupted files found.
CPU times: user 267 ms, sys: 154 ms, total: 421 ms
Wall time: 9.02 s





In [6]:
# Export filenames of corrupted files to csv file

# Specify the output CSV file path
output_csv_path = "corrupted_files_v01.csv"

# Write the corrupted files to the CSV file, one path per row.
# The encoding is pinned to UTF-8 so that paths with non-ASCII characters are
# written the same way on every platform and match the default encoding that
# pandas.read_csv uses when the file is read back.
with open(output_csv_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Corrupted File Path"])  # Write header
    writer.writerows([file_path] for file_path in corrupted_files)

print(f"Corrupted files have been exported to {output_csv_path}.")

Corrupted files have been exported to corrupted_files_v01.csv.


In [7]:
# Move corrupted files to new folder

# Load the CSV file listing the corrupted file paths
csv_file_path = "corrupted_files_v01.csv"
df = pd.read_csv(csv_file_path)

# Specify the new folder where you want to move the corrupted files
new_folder_path = "corrupted_files_v01"
os.makedirs(new_folder_path, exist_ok=True)  # Create the folder if it doesn't exist

# Iterate over the file paths and move each file to the new folder
for file_path in df["Corrupted File Path"]:
    try:
        # All files end up in one flat folder, so two files with the same name
        # from different subdirectories would collide and the second move
        # would fail. Pick a free name by appending a numeric suffix.
        base_name = os.path.basename(file_path)
        stem, extension = os.path.splitext(base_name)
        destination = os.path.join(new_folder_path, base_name)
        counter = 1
        while os.path.exists(destination):
            destination = os.path.join(new_folder_path, f"{stem}_{counter}{extension}")
            counter += 1

        # Move the file
        shutil.move(file_path, destination)
        print(f"Moved: {file_path}")
    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"Error moving {file_path}: {e}")

print("File moving process is complete.")


File moving process is complete.


In [8]:
# Checking for duplicate files

import hashlib
from pathlib import Path
from collections import defaultdict

def hash_file(filepath, block_size=65536):
    """Return the SHA-256 hex digest of the file at ``filepath``.

    The file is read in binary mode in chunks of ``block_size`` bytes, so the
    whole file never has to be held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while True:
            chunk = stream.read(block_size)
            if not chunk:  # Empty bytes object marks end of file
                break
            digest.update(chunk)
    return digest.hexdigest()

# Root folder whose files are compared against each other
base_folder = Path("data") / "WMR"

# Maps each content hash to every file that produced it
hash_to_files = defaultdict(list)

# Visit every entry below the root folder and hash the regular files
for candidate in base_folder.rglob("*"):
    if not candidate.is_file():
        continue
    hash_to_files[hash_file(candidate)].append(candidate)

# Files sharing a hash are duplicates of each other; report each such group
duplicate_groups = [group for group in hash_to_files.values() if len(group) > 1]
for group in duplicate_groups:
    print("Duplicate files:")
    for path in group:
        print(f"  - {path}")
    print()

duplicates_found = bool(duplicate_groups)
if not duplicates_found:
    print("No duplicate files found.")


No duplicate files found.
