In [2]:
import os
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Root directory of the dataset. analyze_image_distribution() lists its
# immediate sub-directories and treats each one as a disease.
cnn_root_dir = "D:/DATASET/CNN"  # Replace with the correct path

# A single ImageDataGenerator, reused for every split that gets scanned.
# Its only setting is rescale=1/255, which multiplies pixel values by that factor.
datagen = ImageDataGenerator(rescale=1.0/255.0)  # Normalization (if used in your model)

def analyze_image_distribution(
    datagen,
    dataset_path,
    csv_path="D:/PATENT/disease_label_distribution_balanced.csv",
    splits=("balanced_train", "test"),
):
    """
    Count images per disease, split and label, and save the table as a CSV.

    Every sub-directory of ``dataset_path`` is treated as a disease. For each
    disease, every directory named in ``splits`` is handed to
    ``datagen.flow_from_directory``; the labels and per-label counts are taken
    from the returned iterator's ``class_indices`` and ``labels`` attributes.

    Args:
        datagen: Object exposing ``flow_from_directory`` (e.g. a Keras
            ``ImageDataGenerator``).
        dataset_path: Root directory that contains one folder per disease.
        csv_path: Destination of the CSV report. Its parent directory is
            created when missing.
        splits: Names of the split sub-directories to look for under each
            disease folder. Splits that do not exist are skipped.

    Returns:
        pandas.DataFrame with the columns ``Disease``, ``Split``, ``Label``
        and ``Image_Count`` -- the same table that is written to ``csv_path``.
        The columns are present even when no images were found.
    """
    columns = ["Disease", "Split", "Label", "Image_Count"]
    distribution_data = []

    # os.listdir() order depends on the filesystem; sorting keeps the report
    # reproducible from run to run and machine to machine.
    for disease in sorted(os.listdir(dataset_path)):
        disease_path = os.path.join(dataset_path, disease)
        if not os.path.isdir(disease_path):  # Skip stray files in the root
            continue

        for split in splits:
            split_path = os.path.join(disease_path, split)
            # isdir (not exists): a regular file that happens to be named
            # like a split must not be passed to flow_from_directory.
            if not os.path.isdir(split_path):
                continue

            # flow_from_directory only indexes the files here; no batch is
            # ever requested, so target_size/batch_size do not affect counts.
            generator = datagen.flow_from_directory(
                split_path,
                target_size=(299, 299),
                batch_size=32,
                class_mode='categorical',
                shuffle=False
            )

            for label, idx in generator.class_indices.items():
                # generator.labels holds one class index per image found.
                count = int((generator.labels == idx).sum())
                distribution_data.append({
                    "Disease": disease,
                    "Split": split,
                    "Label": label,
                    "Image_Count": count
                })

    # Explicit columns so an empty result still yields a CSV with a header.
    df = pd.DataFrame(distribution_data, columns=columns)

    # Create the output folder up front so the scan is not wasted on a
    # failed write at the very end.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(csv_path, index=False)

    print(f"Image distribution analysis completed. Results saved to {csv_path}.")
    return df

# Run the analysis over every disease folder under cnn_root_dir; the per-label
# image counts are written to a CSV file and a confirmation line is printed.
analyze_image_distribution(datagen, cnn_root_dir)


Found 13199 images belonging to 2 classes.
Found 381 images belonging to 2 classes.
Found 5349 images belonging to 5 classes.
Found 1278 images belonging to 5 classes.
Found 11997 images belonging to 3 classes.
Found 491 images belonging to 3 classes.
Found 25700 images belonging to 4 classes.
Found 7705 images belonging to 4 classes.
Image distribution analysis completed. Results saved to D:/PATENT/disease_label_distribution_balanced.csv.
