In [2]:
# Install the third-party packages imported by the next cell into the kernel's
# environment (the `pillow` distribution provides the `PIL` module).
%pip install pandas
%pip install pillow
%pip install matplotlib
%pip install tqdm


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import random
import pandas as pd
from PIL import Image
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm import tqdm


In [8]:
# Input: root folder of the image dataset that the cells below scan.
dataset_path = "../dataset/raw"

# Output: folder that receives every plot, CSV and log file written by this
# notebook. It is created up front (a no-op if it already exists).
OUTPUT_DIR = "../dataset/exploration"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)


In [9]:

# Detect dataset structure
#
# Three layouts are recognised:
#   * "split"     - `dataset_path` contains both a `train` and a `test` folder;
#                   each of them is scanned as its own subset.
#   * "container" - `dataset_path` has at least one sub-folder and none of its
#                   sub-folders is empty.
#   * "flat"      - anything else (no sub-folders at all, or at least one empty
#                   sub-folder).
# For "container" and "flat" the whole of `dataset_path` is scanned as a single
# subset named 'all'.
subdirs = [d for d in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, d))]
has_train_test_split = 'train' in subdirs and 'test' in subdirs

if has_train_test_split:
    dataset_type = "split"
    dataset_subdirs = {'train': os.path.join(dataset_path, 'train'), 'test': os.path.join(dataset_path, 'test')}
# `all()` is vacuously True for an empty iterable, so explicitly require at
# least one sub-folder; otherwise a root without any sub-folders would be
# reported as "container" instead of "flat".
elif subdirs and all(len(os.listdir(os.path.join(dataset_path, d))) > 0 for d in subdirs):
    dataset_type = "container"
    dataset_subdirs = {'all': dataset_path}
else:
    dataset_type = "flat"
    dataset_subdirs = {'all': dataset_path}

print(f"Detected dataset structure: {dataset_type}")


Detected dataset structure: split


In [10]:
# Count images per class
#
# Builds `class_counts` as {class_name: {subset_name: image_count}}. A class is
# any directory directly under a subset path; an image is any entry in that
# directory whose name ends in .png/.jpg/.jpeg (case-insensitive).
class_counts = defaultdict(dict)

for subset, path in dataset_subdirs.items():
    for class_dir in os.listdir(path):
        class_path = os.path.join(path, class_dir)
        if os.path.isdir(class_path):
            count = len([f for f in os.listdir(class_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
            class_counts[class_dir][subset] = count

# Format and display class distribution: one row per class, one column per
# subset, with 0 filled in for a class that is missing from a subset.
print("\nClass distribution summary:")
if dataset_type == "split":
    df_dist = pd.DataFrame(class_counts).T.fillna(0).astype(int)
else:
    df_dist = pd.DataFrame.from_dict(class_counts, orient='index').fillna(0).astype(int)

if class_counts:
    # Rows are ordered by the last subset column, largest count first.
    print(df_dist.sort_values(by=df_dist.columns[-1], ascending=False))
else:
    # Without any class directory the frame has no columns, so indexing
    # `df_dist.columns[-1]` would raise IndexError; report the situation instead.
    print(f"No class directories found under: {', '.join(dataset_subdirs.values())}")


Class distribution summary:
                     train  test
Unknown_Normal        1651   189
Benign_tumors         1093   121
Eczema                1010   112
Tinea                  923   102
Psoriasis              820    88
Actinic_Keratosis      748    83
Vitiligo               714    82
SkinCancer             693    77
Acne                   593    65
Warts                  580    64
Lichen                 553    61
DrugEruption           547    61
Vascular_Tumors        543    60
Infestations_Bites     524    60
Bullous                504    55
Vasculitis             461    52
Seborrh_Keratoses      455    51
Moles                  361    40
Lupus                  311    34
Sun_Sunlight_Damage    312    34
Rosacea                254    28
Candidiasis            248    27


In [11]:
# Bar plot of class distribution
#
# Draws `df_dist` as a bar chart (stacked per subset for a train/test split,
# plain sky-blue bars otherwise) and writes it to
# OUTPUT_DIR/class_distribution.png. The figure is closed after saving, so
# nothing is rendered inline.
fig = plt.figure(figsize=(12, 6))
ax = fig.gca()

# Only the bar styling and the title depend on the dataset layout.
if dataset_type == "split":
    bar_style = {'stacked': True, 'colormap': 'tab10'}
    chart_title = 'Class Distribution (Train/Test Split)'
else:
    bar_style = {'color': 'skyblue'}
    chart_title = 'Class Distribution'

df_dist.plot(kind='bar', edgecolor='black', ax=ax, **bar_style)
ax.set_title(chart_title)
ax.set_xlabel('Class')
ax.set_ylabel('Number of Images')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, 'class_distribution.png'))
plt.close(fig)


In [12]:
# Image properties collection
#
# For every class directory of every subset, inspect a random sample of at most
# SAMPLES_PER_CLASS image files and record size, aspect ratio, format and file
# size. The records are written to OUTPUT_DIR/image_properties.csv and
# summarised in three histogram/bar figures saved next to it. Files that cannot
# be processed are appended to OUTPUT_DIR/errors.txt instead of aborting the run.
SAMPLES_PER_CLASS = 50                        # upper bound on images inspected per class and subset
SAMPLE_SEED = 42                              # fixed seed so reruns inspect the same files
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')  # matched case-insensitively against file names
ERROR_LOG_PATH = os.path.join(OUTPUT_DIR, "errors.txt")

# Private generator: makes the sample reproducible without touching the state
# of the global `random` module.
sampler = random.Random(SAMPLE_SEED)

image_data = []
for subset, path in dataset_subdirs.items():
    # Listings are sorted because os.listdir() order is arbitrary; a seeded
    # sample is only reproducible when drawn from a stable sequence.
    for class_dir in tqdm(sorted(os.listdir(path)), desc=f"Processing {subset} set"):
        class_path = os.path.join(path, class_dir)
        if not os.path.isdir(class_path):
            continue

        all_images = sorted(f for f in os.listdir(class_path) if f.lower().endswith(IMAGE_EXTENSIONS))
        sampled_files = sampler.sample(all_images, min(SAMPLES_PER_CLASS, len(all_images)))

        for img_file in sampled_files:
            img_path = os.path.join(class_path, img_file)
            try:
                with Image.open(img_path) as img:
                    width, height = img.size
                    image_data.append({
                        'subset': subset,
                        'class': class_dir,
                        'filename': img_file,
                        'width': width,
                        'height': height,
                        'aspect_ratio': width / height,
                        'format': img.format,
                        'file_size_kb': os.path.getsize(img_path) / 1024
                    })
            except Exception as e:
                # Deliberately best-effort: one bad file must not stop the
                # exploration. Explicit UTF-8 so a non-ASCII path cannot make
                # the logging itself fail.
                with open(ERROR_LOG_PATH, "a", encoding="utf-8") as err_log:
                    err_log.write(f"{img_path} - {str(e)}\n")

# Create DataFrame
df = pd.DataFrame(image_data)
df.to_csv(os.path.join(OUTPUT_DIR, 'image_properties.csv'), index=False)

if not image_data:
    # An empty frame has no 'width'/'height'/... columns, so the plots below
    # would fail with KeyError; skip them and say why.
    print("No readable images were sampled; skipping the image property plots.")
else:
    # Image size stats: three side-by-side histograms in one figure.
    size_panels = [
        ('width', 'skyblue', 'Image Width Distribution', 'Width (px)'),
        ('height', 'salmon', 'Image Height Distribution', 'Height (px)'),
        ('aspect_ratio', 'lightgreen', 'Aspect Ratio Distribution', 'Aspect Ratio (W/H)'),
    ]
    plt.figure(figsize=(14, 5))
    for position, (column, color, title, xlabel) in enumerate(size_panels, start=1):
        plt.subplot(1, 3, position)
        df[column].hist(bins=20, color=color, edgecolor='black')
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'image_size_distributions.png'))
    plt.close()

    # Format distribution
    plt.figure(figsize=(6, 4))
    df['format'].value_counts().plot(kind='bar', color='purple', edgecolor='black')
    plt.title('Image Format Distribution')
    plt.xlabel('Format')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'image_format_distribution.png'))
    plt.close()

    # File size distribution
    plt.figure(figsize=(8, 4))
    df['file_size_kb'].hist(bins=30, color='orange', edgecolor='black')
    plt.title('Image File Size Distribution')
    plt.xlabel('File Size (KB)')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'image_file_size_distribution.png'))
    plt.close()

print(f"\n📊 All exploration results have been saved to: {OUTPUT_DIR}")


Processing train set: 100%|██████████| 22/22 [00:00<00:00, 132.22it/s]
Processing test set: 100%|██████████| 22/22 [00:00<00:00, 171.09it/s]



📊 All exploration results have been saved to: ../dataset/exploration
