In [1]:
# ## 1. Preparation: Load Data and Define Dataset Class

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# --- 1a. Redefine the Dataset Class (copied from the EDA Notebook) ---
class CancerDataset(Dataset):
    """Map-style dataset of histopathologic image patches and their labels.

    Each item is read from ``<data_path>/train/<id>.tif``, converted to RGB,
    optionally passed through ``transform``, and returned with its label.

    Args:
        df: DataFrame with an ``id`` column (image file stem) and a ``label``
            column. Rows are addressed by position (``iloc``), so the index
            does not need to be reset after a train/validation split.
        data_path: Root directory that contains the ``train`` image folder.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, data_path, transform=None):
        self.df = df
        self.data_path = data_path
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            A ``(image, label)`` tuple: the RGB image (after ``transform``,
            if one was given) and the value of the row's ``label`` column.
        """
        # Fetch the row once instead of one positional lookup per column.
        row = self.df.iloc[idx]
        image_path = os.path.join(self.data_path, 'train', f"{row['id']}.tif")

        # Pillow opens files lazily and the handle may remain open after the
        # pixel data is read (notably for formats that can hold several
        # frames, such as TIFF). The context manager releases it
        # deterministically; convert() returns an independent in-memory copy.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)
        return image, row['label']

# --- 1b. Load data and create train/validation splits ---
# Raw string: in a normal literal, '\d' and '\h' are invalid escape sequences
# that only work by accident and trigger a warning on recent Python versions.
DATA_PATH = r'E:\data\histopathologic' # ‼️ Make sure this path is correct
df_labels = pd.read_csv(os.path.join(DATA_PATH, 'train_labels.csv'))

# To evaluate the model, we need to split our data into a training and a validation set.
# stratify=df_labels['label'] ensures that the class distribution is the same in both splits.
# random_state is fixed so the split is reproducible between runs.
train_df, val_df = train_test_split(
    df_labels,
    test_size=0.2,
    random_state=42,
    stratify=df_labels['label'],
)

print(f"Original dataset size: {len(df_labels)}")
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# --- 1c. Define Image Transforms ---
# Convert the PIL image to a tensor, then normalize each channel with the
# given per-channel mean and standard deviation.
data_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# --- 1d. Create Dataset instances for training and validation ---
# Both splits share the same transform pipeline.
train_dataset = CancerDataset(df=train_df, data_path=DATA_PATH, transform=data_transforms)
val_dataset = CancerDataset(df=val_df, data_path=DATA_PATH, transform=data_transforms)


# ## 2. Create DataLoaders

# --- 2a. Define hyperparameters ---
BATCH_SIZE = 64 # Number of images to process in a batch

# --- 2b. Create DataLoader instances ---
# Create a DataLoader for the training set, with shuffling to randomize order each epoch.
# num_workers=0 keeps data loading in the main process.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
# Create a DataLoader for the validation set, no need to shuffle
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

print("\n✅ DataLoaders created successfully!")
print(f"Each batch will contain {BATCH_SIZE} images.")


# --- 2c. Test the DataLoader ---
print("\n--- Testing the DataLoader ---")
# A DataLoader is iterable; next(iter(...)) pulls just the first batch.
images_batch, labels_batch = next(iter(train_loader))

print(f"Shape of the retrieved images batch: {images_batch.shape}")
print(f"Shape of the retrieved labels batch: {labels_batch.shape}")
# Report the shape actually retrieved rather than a hard-coded one, so the
# message stays correct if BATCH_SIZE or the image size changes.
print(f"(Image shape {list(images_batch.shape)} means [Batch Size, Channels, Height, Width])")

Original dataset size: 220025
Training set size: 176020
Validation set size: 44005

✅ DataLoaders created successfully!
Each batch will contain 64 images.

--- Testing the DataLoader ---
Shape of the retrieved images batch: torch.Size([64, 3, 96, 96])
Shape of the retrieved labels batch: torch.Size([64])
(Image shape [64, 3, 96, 96] means [Batch Size, Channels, Height, Width])
