In [4]:
import os
import random

def get_region_dirs(base_path):
    """
    Gets a list of all subdirectories in the base_path.
    This function assumes that each subdirectory is a region.
    """
    try:
        all_entries = os.listdir(base_path)
        # Filter out files, keep only directories
        region_dirs = [os.path.join(base_path, d) for d in all_entries if os.path.isdir(os.path.join(base_path, d))]
        if not region_dirs:
            print(f"Warning: No subdirectories found in '{base_path}'. Please check the path.")
        return region_dirs
    except FileNotFoundError:
        print(f"Error: The directory '{base_path}' was not found.")
        return []

def count_patches_in_dirs(dir_list, file_extension='.npy'):
    """
    Counts all files with a specific extension in a list of directories.
    """
    total_patches = 0
    for region_dir in dir_list:
        try:
            # List all files in the directory and count the ones with the correct extension
            num_files = len([name for name in os.listdir(region_dir) if name.endswith(file_extension)])
            total_patches += num_files
        except FileNotFoundError:
            print(f"Warning: Directory '{region_dir}' not found during counting.")
            continue
    return total_patches

def analyze_data_split():
    """
    Performs the data split by region and reports the resulting patch counts and percentages.
    """
    # --- Configuration ---
    # 1. Set the path to your parent directory containing the region folders.
    sequences_base_path = "data/sequences"
    
    # 2. (Optional) Set a seed for a reproducible shuffle. Use the same seed to get the same split every time.
    # If you want a different random split each time, comment this line out.
    random_seed = 42
    random.seed(random_seed)
    
    # 3. Define the file extension of your patch files.
    patch_file_extension = '.npy'
    # --- End of Configuration ---

    print("Starting data split analysis...")

    # 1. Get all region directories
    all_dirs = get_region_dirs(sequences_base_path)

    if not all_dirs:
        print("Analysis stopped because no region directories were found.")
        return

    print(f"Found {len(all_dirs)} total regions.")

    # 2. Shuffle and split the list of region directories
    random.shuffle(all_dirs)
    split_index = int(0.7 * len(all_dirs))
    
    train_dirs = all_dirs[:split_index]
    val_dirs = all_dirs[split_index:]

    # 3. Count the actual number of patches in each set of directories
    num_train_patches = count_patches_in_dirs(train_dirs, patch_file_extension)
    num_val_patches = count_patches_in_dirs(val_dirs, patch_file_extension)
    total_patches = num_train_patches + num_val_patches

    # 4. Print the detailed report
    print("\n--- Data Split Analysis Report ---")
    print(f"Random Seed Used: {random_seed}")
    print("-" * 34)
    
    print(f"Region Split:")
    print(f"  - Training regions:   {len(train_dirs)}")
    print(f"  - Validation regions: {len(val_dirs)}")
    
    print("\nPatch Count Split:")
    if total_patches > 0:
        train_percentage = (num_train_patches / total_patches) * 100
        val_percentage = (num_val_patches / total_patches) * 100
        print(f"  - Training patches:   {num_train_patches} ({train_percentage:.2f}%)")
        print(f"  - Validation patches: {num_val_patches} ({val_percentage:.2f}%)")
        print(f"  - Total patches:      {total_patches}")
    else:
        print("  - No patch files found to analyze.")
        
    print("\nTraining Regions:")
    for i, dir_path in enumerate(train_dirs):
        print(f"  {i+1}. {os.path.basename(dir_path)}")
        
    print("\nValidation Regions:")
    for i, dir_path in enumerate(val_dirs):
        print(f"  {i+1}. {os.path.basename(dir_path)}")
        
    print("\n--- End of Report ---\n")


if __name__ == "__main__":
    analyze_data_split()


Starting data split analysis...
Found 13 total regions.

--- Data Split Analysis Report ---
Random Seed Used: 42
----------------------------------
Region Split:
  - Training regions:   9
  - Validation regions: 4

Patch Count Split:
  - Training patches:   58200 (55.25%)
  - Validation patches: 47142 (44.75%)
  - Total patches:      105342

Training Regions:
  1. MGD
  2. GUP
  3. CQC
  4. NET
  5. WET
  6. EIU
  7. SEQ
  8. MUL
  9. CYP

Validation Regions:
  1. DEU
  2. BRB
  3. CHC
  4. NWH

--- End of Report ---

