In [1]:
import os

In [2]:
# --- Configuration ---
# Root directory of the dataset: the folder that directly contains the 'train'
# and 'val' sub-directories. The path is relative, so it is resolved against the
# kernel's current working directory. The generated '<split>_list.txt' files are
# written into this same folder.
DATASET_ROOT = "../data/classifier_data/"

In [3]:

# --- Main Script ---

# File extensions (compared case-insensitively) that are treated as images by default.
_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp")


def _list_image_files(class_dir, image_extensions):
    """Split the entries of ``class_dir`` into (sorted image file names, skipped names)."""
    allowed = tuple(ext.lower() for ext in image_extensions)
    image_names, skipped = [], []
    # Sort so the generated list is deterministic across runs and platforms.
    for name in sorted(os.listdir(class_dir)):
        is_regular_file = os.path.isfile(os.path.join(class_dir, name))
        # Hidden files are rejected even with an image extension: macOS resource
        # forks such as '._photo.jpg' are not decodable images.
        if is_regular_file and not name.startswith(".") and name.lower().endswith(allowed):
            image_names.append(name)
        else:
            skipped.append(name)
    return image_names, skipped


def create_label_files(root_dir, image_extensions=_IMAGE_EXTENSIONS):
    """
    Create train_list.txt and val_list.txt for PaddleClas.

    For each split ('train', 'val') found under ``root_dir``, writes
    ``<root_dir>/<split>_list.txt`` with one line per image in the form
    ``"<class_name>/<image_name> <class_index>"``. Paths are relative to the
    split folder and always use forward slashes.

    Only regular, non-hidden files whose extension is in ``image_extensions``
    are listed. Sub-directories, hidden files (e.g. '.DS_Store') and other
    stray files are skipped and reported, so they never end up as bogus
    training samples.

    Args:
        root_dir: Dataset root containing the 'train' and/or 'val' folders,
            each with one sub-folder per class ('handwritten', 'printed').
        image_extensions: Iterable of file extensions (with leading dot,
            matched case-insensitively) that count as images.

    Returns:
        None. Progress and problems are reported via ``print``. A missing
        root, split or class directory is reported and skipped, not raised.
    """
    print(f"Scanning dataset root: {root_dir}")

    # Check if the root directory exists
    if not os.path.isdir(root_dir):
        print(f"Error: Directory not found at '{root_dir}'. Please check the DATASET_ROOT path.")
        return

    # Define the class names and their corresponding integer labels
    # This must match the order in your label_list.txt!
    class_to_idx = {"handwritten": 0, "printed": 1}
    print(f"Using class mapping: {class_to_idx}")

    # Process both 'train' and 'val' splits
    for split in ['train', 'val']:
        print(f"\nProcessing '{split}' set...")

        split_dir = os.path.join(root_dir, split)
        if not os.path.isdir(split_dir):
            print(f"  - '{split}' directory not found. Skipping.")
            continue

        output_file_path = os.path.join(root_dir, f"{split}_list.txt")
        image_count = 0

        with open(output_file_path, 'w', encoding='utf-8') as f:
            # Loop through each class folder ('handwritten', 'printed')
            for class_name, class_idx in class_to_idx.items():
                class_dir = os.path.join(split_dir, class_name)
                if not os.path.isdir(class_dir):
                    print(f"  - Class directory '{class_name}' not found in '{split}' set. Skipping.")
                    continue

                image_names, skipped = _list_image_files(class_dir, image_extensions)
                if skipped:
                    print(
                        f"  - Skipped {len(skipped)} non-image entries in "
                        f"'{class_name}' (e.g. '{skipped[0]}')."
                    )

                for img_filename in image_names:
                    # Always join with '/', so the list file is portable
                    # regardless of the OS path separator.
                    f.write(f"{class_name}/{img_filename} {class_idx}\n")
                    image_count += 1

        print(f"  - Successfully created '{output_file_path}' with {image_count} entries.")



In [4]:
# Generate the label files for the configured dataset root.
# Inside a notebook kernel __name__ is "__main__", so this guard passes and the
# call runs whenever the cell is executed.
if __name__ == "__main__":
    create_label_files(DATASET_ROOT)

Scanning dataset root: ../data/classifier_data/
Using class mapping: {'handwritten': 0, 'printed': 1}

Processing 'train' set...
  - Successfully created '../data/classifier_data/train_list.txt' with 1342 entries.

Processing 'val' set...
  - Successfully created '../data/classifier_data/val_list.txt' with 335 entries.
