In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Configuration
# Folder that holds one sub-folder per class.
DATA_DIR = 'Kather_texture_2016_image_tiles_5000'

# Class sub-folder names; a folder's position (1-based) is its integer label.
_CLASS_FOLDERS = (
    '01_TUMOR',
    '02_STROMA',
    '03_COMPLEX',
    '04_LYMPHO',
    '05_DEBRIS',
    '06_MUCOSA',
    '07_ADIPOSE',
    '08_EMPTY',
)
CLASSES = {folder: index for index, folder in enumerate(_CLASS_FOLDERS, start=1)}

# Fraction of the full dataset assigned to each split.
TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.8, 0.1, 0.1

# Folder that receives the generated split files.
OUTPUT_DIR = 'dataset_splits'

# Create the output folder up front; an already existing folder is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1. Collect images and labels
# Visit every class folder listed in CLASSES and record each .tif file together
# with the integer label of the folder it was found in.
image_paths = []
labels = []

for class_folder, label in CLASSES.items():
    folder_path = os.path.join(DATA_DIR, class_folder)
    # isdir (not exists): a plain file with this name must not reach os.listdir.
    if not os.path.isdir(folder_path):
        print(f"Warning: Folder {folder_path} does not exist")
        continue
    # os.listdir() returns entries in arbitrary order. Sorting fixes the row
    # order of `data`, so the seeded splits further down are reproducible.
    for img_name in sorted(os.listdir(folder_path)):
        if img_name.lower().endswith('.tif'):
            img_path = os.path.join(folder_path, img_name)
            image_paths.append(img_path)
            labels.append(label)
        else:
            print(f"Ignoring non-.tif file: {img_name}")

# Convert to DataFrame
data = pd.DataFrame({
    'image_path': image_paths,
    'label': labels
})

# Fail here with a clear message rather than later inside the splitting step.
if data.empty:
    raise FileNotFoundError(f"No .tif images found under {DATA_DIR}")

# 2. Verify class distribution
# Count images per label (ascending label order) and print each class's share
# of the whole dataset.
class_counts = data['label'].value_counts().sort_index()
total_images = len(data)
print("Class distribution:")
for label_id, n_images in class_counts.items():
    # Reverse lookup: folder name(s) in CLASSES that map to this label.
    matching_folders = [folder for folder, idx in CLASSES.items() if idx == label_id]
    share = n_images / total_images * 100
    print(f"{matching_folders[0]} (Label {label_id}): {n_images} images ({share:.2f}%)")

# 3. Split into train + (validation + test)
def _stratified_split(paths, targets, fraction):
    """Split paths/targets, sending `fraction` of the rows to the first part.

    The split is stratified on `targets` and uses the fixed seed 42.
    Returns (first_paths, second_paths, first_targets, second_targets).
    """
    return train_test_split(
        paths,
        targets,
        train_size=fraction,
        stratify=targets,
        random_state=42,
    )


train_data, temp_data, train_labels, temp_labels = _stratified_split(
    data['image_path'], data['label'], TRAIN_RATIO
)

# 4. Split temp_data into validation and test
# Re-express the validation share relative to the held-out remainder.
val_ratio_adjusted = VAL_RATIO / (VAL_RATIO + TEST_RATIO)
val_data, test_data, val_labels, test_labels = _stratified_split(
    temp_data, temp_labels, val_ratio_adjusted
)

# 5. Create DataFrames for each set
# 6. Shuffle within each set
def _shuffled_frame(paths, targets):
    """Pair paths with labels, shuffle the rows (seed 42) and renumber the index."""
    frame = pd.DataFrame({'image_path': paths, 'label': targets})
    return frame.sample(frac=1, random_state=42).reset_index(drop=True)


train_df = _shuffled_frame(train_data, train_labels)
val_df = _shuffled_frame(val_data, val_labels)
test_df = _shuffled_frame(test_data, test_labels)

# 7. Save to CSV
# One file per set inside OUTPUT_DIR, written without the index column.
for csv_name, frame in (
    ('train_split.csv', train_df),
    ('val_split.csv', val_df),
    ('test_split.csv', test_df),
):
    frame.to_csv(os.path.join(OUTPUT_DIR, csv_name), index=False)

# 8. Report sizes
# Both reports below walk the same (display name, frame) pairs.
report_sets = [('Training', train_df), ('Validation', val_df), ('Test', test_df)]
total_images = len(data)
print("\nSet sizes:")
for set_name, frame in report_sets:
    share_of_total = len(frame) / total_images * 100
    print(f"{set_name}: {len(frame)} images ({share_of_total:.2f}%)")

# 9. Verify stratification
# Print per-class counts within each set, as a share of that set's size.
print("\nClass distribution in each set:")
for set_name, frame in report_sets:
    print(f"\n{set_name}:")
    per_label = frame['label'].value_counts().sort_index()
    set_size = len(frame)
    for label_id, n_images in per_label.items():
        # Reverse lookup: folder name(s) in CLASSES that map to this label.
        matching_folders = [folder for folder, idx in CLASSES.items() if idx == label_id]
        share_of_set = n_images / set_size * 100
        print(f"{matching_folders[0]} (Label {label_id}): {n_images} images ({share_of_set:.2f}%)")

Class distribution:
01_TUMOR (Label 1): 625 images (12.50%)
02_STROMA (Label 2): 625 images (12.50%)
03_COMPLEX (Label 3): 625 images (12.50%)
04_LYMPHO (Label 4): 625 images (12.50%)
05_DEBRIS (Label 5): 625 images (12.50%)
06_MUCOSA (Label 6): 625 images (12.50%)
07_ADIPOSE (Label 7): 625 images (12.50%)
08_EMPTY (Label 8): 625 images (12.50%)

Set sizes:
Training: 4000 images (80.00%)
Validation: 500 images (10.00%)
Test: 500 images (10.00%)

Class distribution in each set:

Training:
01_TUMOR (Label 1): 500 images (12.50%)
02_STROMA (Label 2): 500 images (12.50%)
03_COMPLEX (Label 3): 500 images (12.50%)
04_LYMPHO (Label 4): 500 images (12.50%)
05_DEBRIS (Label 5): 500 images (12.50%)
06_MUCOSA (Label 6): 500 images (12.50%)
07_ADIPOSE (Label 7): 500 images (12.50%)
08_EMPTY (Label 8): 500 images (12.50%)

Validation:
01_TUMOR (Label 1): 63 images (12.60%)
02_STROMA (Label 2): 63 images (12.60%)
03_COMPLEX (Label 3): 62 images (12.40%)
04_LYMPHO (Label 4): 62 images (12.40%)
05_DEB