In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import albumentations as A
from albumentations.pytorch import ToTensorV2

# --- Constants ---
# Filesystem locations: raw tiles, CSV split files, and the output folder for
# augmented copies written to disk.
DATA_DIR = 'Kather_texture_2016_image_tiles_5000'
SPLIT_DIR = 'dataset_splits'
AUGMENTED_DIR = 'Kather_texture_2016_augmented'

# Size every image is resized to; the two entries are handed to A.Resize in order.
IMG_SIZE = (224, 224)

# Class id (numbered from 1) -> class name.
CLASSES = dict(enumerate(
    (
        '01_TUMOR', '02_STROMA', '03_COMPLEX', '04_LYMPHO',
        '05_DEBRIS', '06_MUCOSA', '07_ADIPOSE', '08_EMPTY',
    ),
    start=1,
))
NUM_CLASSES = len(CLASSES)

# --- Training-time transform pipeline ---
# Resize, apply random geometric and photometric augmentation, then normalize
# and convert to a tensor.
train_transforms = A.Compose([
    A.Resize(*IMG_SIZE),
    # Geometric augmentation.
    A.Rotate(limit=360, p=0.5),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    # Photometric augmentation.
    A.RandomBrightnessContrast(p=0.3),
    A.GaussNoise(p=0.2),
    A.CLAHE(clip_limit=2, tile_grid_size=(8, 8), p=0.2),
    # NOTE(review): these look like the usual ImageNet channel statistics --
    # confirm they match the backbone used for training.
    A.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
    ToTensorV2(),
])

# --- Modified to Save Augmented Images ---
def save_augmented_images(csv_file, output_dir, num_augmentations=5):
    """
    Generate and save augmented versions of each image in the CSV.

    Every readable image listed in ``csv_file`` is kept in the result and is
    additionally expanded into ``num_augmentations`` randomly augmented JPEG
    copies written to ``output_dir``. Images that cannot be read, and
    augmented copies that cannot be written, are reported and skipped.

    Args:
        csv_file: Path to input CSV (e.g., train_split.csv) with
            'image_path' and 'label' columns.
        output_dir: Directory to save augmented images (created if missing).
        num_augmentations: Number of augmented versions per image.
    Returns:
        New DataFrame with 'image_path' and 'label' columns covering the
        original and the augmented images.
    """
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(csv_file)

    # Normalization and ToTensorV2 are excluded so the result stays a regular
    # uint8 image that can be written to disk. The pipeline is loop-invariant,
    # so it is built once; each call still draws fresh random parameters.
    aug_transforms = A.Compose([
        A.Resize(IMG_SIZE[0], IMG_SIZE[1]),
        A.Rotate(limit=360, p=0.5),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.3),
        A.GaussNoise(p=0.2),
        A.CLAHE(clip_limit=2, tile_grid_size=(8, 8), p=0.2),
    ])

    new_rows = []
    for idx, row in df.iterrows():
        img_path = row['image_path']
        label = row['label']
        image = cv2.imread(img_path)
        if image is None:
            print(f"Warning: Could not load image: {img_path}. Skipping.")
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Add original image to new DataFrame
        new_rows.append({'image_path': img_path, 'label': label})

        # Strip only the final extension: file names may contain earlier dots
        # (e.g. "a.tif_Row_1_Col_1.tif"), and cutting at the first dot would
        # map distinct tiles onto the same output name. The row index is
        # appended so equal base names from different folders cannot
        # overwrite each other either.
        img_name = os.path.splitext(os.path.basename(img_path))[0]
        for i in range(num_augmentations):
            aug_image = aug_transforms(image=image)['image']

            aug_img_path = os.path.join(output_dir, f"{img_name}_{idx}_aug_{i}.jpg")
            # cv2.imwrite reports failure through its return value; do not
            # list a file in the CSV that was never written.
            written = cv2.imwrite(aug_img_path, cv2.cvtColor(aug_image, cv2.COLOR_RGB2BGR))
            if not written:
                print(f"Warning: Could not save image: {aug_img_path}. Skipping.")
                continue
            new_rows.append({'image_path': aug_img_path, 'label': label})

    # Explicit columns keep the schema intact even when no image was usable.
    return pd.DataFrame(new_rows, columns=['image_path', 'label'])

# --- Build the augmented training split ---
train_csv_path = os.path.join(SPLIT_DIR, 'train_split.csv')
augmented_csv_path = os.path.join(SPLIT_DIR, 'train_split_augmented.csv')

print("Generating and saving augmented training images...")

# Request 5 augmented copies for every training image.
augmented_train_df = save_augmented_images(
    csv_file=train_csv_path,
    output_dir=AUGMENTED_DIR,
    num_augmentations=5,
)

# Persist the combined (original + augmented) listing next to the other splits.
augmented_train_df.to_csv(augmented_csv_path, index=False)
print(f"Saved augmented training CSV to {augmented_csv_path}")
print(f"New training set size: {len(augmented_train_df)} images")

# --- Rest of the Original Code (with updated CSV for training) ---
from torch.utils.data import Dataset, DataLoader
import torch

class HistologyDataset(Dataset):
    """Image dataset driven by a CSV with 'image_path' and 'label' columns.

    The label column is shifted down by one when the CSV is loaded, so labels
    numbered from 1 in the file are served numbered from 0.
    """

    def __init__(self, csv_file, transforms=None):
        """Load the CSV listing and remember the optional transform pipeline."""
        frame = pd.read_csv(csv_file)
        frame['label'] = frame['label'] - 1
        self.dataframe = frame
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows in the listing."""
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        record = self.dataframe.iloc[idx]
        img_path = record['image_path']
        label = record['label']

        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Could not load image: {img_path}")

        # OpenCV loads BGR; convert to RGB before transforming.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transforms:
            image = self.transforms(image=image)["image"]

        return image, torch.tensor(label, dtype=torch.long)

# --- Setup DataLoaders ---
# Evaluation pipeline: resize and normalize only, with no random augmentation.
val_test_transforms = A.Compose([
    A.Resize(IMG_SIZE[0], IMG_SIZE[1]),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

val_csv_path = os.path.join(SPLIT_DIR, 'val_split.csv')
test_csv_path = os.path.join(SPLIT_DIR, 'test_split.csv')

try:
    # NOTE(review): the augmented CSV already contains offline-augmented
    # images and train_transforms applies random augmentation again at load
    # time -- confirm that stacking both is intended.
    train_dataset = HistologyDataset(csv_file=augmented_csv_path, transforms=train_transforms)
    val_dataset = HistologyDataset(csv_file=val_csv_path, transforms=val_test_transforms)
    test_dataset = HistologyDataset(csv_file=test_csv_path, transforms=val_test_transforms)
except FileNotFoundError:
    print(f"Error: Make sure the split CSV files exist in '{SPLIT_DIR}'.")
    # Bare raise re-raises the active exception with its traceback untouched.
    raise

BATCH_SIZE = 32
# NOTE(review): with num_workers > 0 the worker processes must be able to
# import HistologyDataset; on Windows / inside a notebook this can fail when
# the loaders are iterated -- confirm, or fall back to 0 there.
NUM_WORKERS = os.cpu_count() or 0
# Pinned host memory only speeds up host-to-GPU copies; requesting it without
# CUDA just produces a warning, so enable it only when a GPU is present.
PIN_MEMORY = torch.cuda.is_available()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

print("\nDatasets and DataLoaders configured successfully.")
print(f"Training set size: {len(train_dataset)} images")
print(f"Validation set size: {len(val_dataset)} images")
print(f"Test set size: {len(test_dataset)} images")
print(f"Number of training batches: {len(train_loader)}")
print(f"Number of validation batches: {len(val_loader)}")
print(f"Number of test batches: {len(test_loader)}")

Generating and saving augmented training images...
Saved augmented training CSV to dataset_splits\train_split_augmented.csv
New training set size: 24000 images

Datasets and DataLoaders configured successfully.
Training set size: 24000 images
Validation set size: 500 images
Test set size: 500 images
Number of training batches: 750
Number of validation batches: 16
Number of test batches: 16
