In [4]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the file inside the Kaggle dataset that should be loaded
file_path = "Folds.csv"

# Load the latest version of the dataset file through the pandas adapter.
# Additional arguments such as sql_query or pandas_kwargs may be supplied;
# see the documentation for more information:
# https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
dataset_handle = "ambarish/breakhis"
df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)

# Show a short preview of what was loaded
first_rows = df.head()
print("First 5 records:", first_rows)

Downloading from https://www.kaggle.com/api/v1/datasets/download/ambarish/breakhis?dataset_version_number=4&file_name=Folds.csv...


100%|██████████| 120k/120k [00:00<00:00, 632kB/s]

Extracting zip of Folds.csv...





First 5 records:    fold  mag    grp                                           filename
0     1  100  train  BreaKHis_v1/histology_slides/breast/benign/SOB...
1     1  100  train  BreaKHis_v1/histology_slides/breast/benign/SOB...
2     1  100  train  BreaKHis_v1/histology_slides/breast/benign/SOB...
3     1  100  train  BreaKHis_v1/histology_slides/breast/benign/SOB...
4     1  100  train  BreaKHis_v1/histology_slides/breast/benign/SOB...


In [1]:
import os
from os import listdir
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from pathlib import Path

import glob
import random
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset  # Ensure Dataset is imported
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.io import read_image

import cv2
from matplotlib.image import imread

In [2]:
import glob
# Recursively collect every PNG below data/ and print the first few paths.
breast_img_paths = glob.glob('data/**/*.png', recursive=True)
preview_count = 5
for img_path in breast_img_paths[:preview_count]:
    img_name = Path(img_path).name  # basename of the image; not used further in this cell
    print(img_path)



data/malignant/SOB/mucinous_carcinoma/SOB_M_MC_14-13418DE/100X/SOB_M_MC-14-13418DE-100-009.png
data/malignant/SOB/mucinous_carcinoma/SOB_M_MC_14-13418DE/100X/SOB_M_MC-14-13418DE-100-008.png
data/malignant/SOB/mucinous_carcinoma/SOB_M_MC_14-13418DE/100X/SOB_M_MC-14-13418DE-100-003.png
data/malignant/SOB/mucinous_carcinoma/SOB_M_MC_14-13418DE/100X/SOB_M_MC-14-13418DE-100-002.png
data/malignant/SOB/mucinous_carcinoma/SOB_M_MC_14-13418DE/100X/SOB_M_MC-14-13418DE-100-014.png


In [3]:
# Top-level tumour classes
benign, malignant = [], []
# Benign subtypes
A, F, PT, TA = [], [], [], []
# Malignant subtypes
DC, LC, MC, PC = [], [], [], []

# Subtype buckets that are identified by the character at index 6 alone.
# 'P' is handled separately because it needs the following character as well.
subtype_by_initial = {'A': A, 'F': F, 'T': TA, 'D': DC, 'L': LC, 'M': MC}

for img in breast_img_paths:
    img_name = Path(img).name
    initial = img_name[6]
    if initial == 'P':
        # "PT" goes to PT, any other 'P…' code goes to PC
        bucket = PT if img_name[7] == 'T' else PC
    else:
        bucket = subtype_by_initial.get(initial)
    if bucket is not None:
        bucket.append(img)

    # The character at index 4 marks the class: 'B' is benign, anything else malignant
    (benign if img_name[4] == 'B' else malignant).append(img)

In [4]:
# Quick sanity check: a few benign paths and the number of malignant images
benign_preview = benign[:5]
print(benign_preview)
malignant_count = len(malignant)
print(malignant_count)

['data/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-011.png', 'data/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-005.png', 'data/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-004.png', 'data/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-010.png', 'data/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-006.png']
5429


In [9]:
class BreakHisDataset(Dataset):
    """Image dataset over a BreakHis folder, filtered by magnification and split.

    PNG files are discovered recursively below ``data_folder_path``. An image
    is kept when its file name contains ``-<mag>-`` for one of the requested
    magnifications and its file name also appears (as the last path component)
    in the ``filename`` column of the CSV at ``labels_path``. The ``grp``
    column of that CSV decides whether the image belongs to the train split
    (``grp == 'train'``) or to the test split (any other value).

    Args:
        data_folder_path: Root folder that is searched recursively for PNGs.
        labels_path: CSV file with at least the columns ``filename`` and ``grp``.
        magnification: Iterable of magnification values to keep.
        transform: Optional callable applied to the float image tensor.
        train: ``True`` keeps the train split, ``False`` keeps the test split.

    Attributes:
        data: Object array of shape ``(N, 3)`` whose rows are
            ``(image_path, label, train_test)`` with ``label`` 0 for benign /
            1 otherwise and ``train_test`` 0 for train / 1 for test.
    """

    def __init__(self, data_folder_path, labels_path = "Folds.csv", magnification = [40], transform=None, train=True):
        self.data_folder_path = data_folder_path
        self.labels_path = labels_path
        self.transform = transform
        # Copy so neither the shared default list nor the caller's list is aliased.
        self.magnification = list(magnification)
        self.train = train

        self._load_data()
        target_split = 0 if self.train else 1
        self.data = self.data[self.data[:, 2].astype(int) == target_split]

    def _load_data(self):
        """Populate ``self.data`` with one ``(path, label, train_test)`` row per image."""
        # Sorted so that sample order does not depend on filesystem enumeration order.
        pattern = os.path.join(str(self.data_folder_path), '**', '*.png')
        breast_img_paths = sorted(glob.glob(pattern, recursive=True))

        # Keep only images of the selected magnifications; any() guarantees an
        # image is listed once even if several magnification tokens match.
        tokens = [f"-{mag}-" for mag in self.magnification]
        filtered_img_paths = [
            img for img in breast_img_paths
            if any(token in Path(img).name for token in tokens)
        ]

        # Build a basename -> grp lookup once instead of scanning the whole CSV
        # with a regex for every image. When a basename occurs in several rows
        # the first row wins, which matches the previous `values[0]` behaviour.
        df = pd.read_csv(self.labels_path)
        grp_by_name = {}
        for csv_path, grp in zip(df['filename'], df['grp']):
            grp_by_name.setdefault(str(csv_path).rsplit('/', 1)[-1], grp)

        triplets = []
        for image_path in filtered_img_paths:
            grp = grp_by_name.get(Path(image_path).name)
            if grp is None:
                # Image is not listed in the CSV: skip it.
                continue
            label = 0 if 'benign' in Path(image_path).parts else 1
            train_test = 0 if grp == 'train' else 1
            triplets.append((image_path, label, train_test))

        # dtype=object keeps labels as ints (a plain np.array would turn every
        # field into a string); reshape keeps the array 2-D even when empty.
        self.data = np.array(triplets, dtype=object).reshape(-1, 3)

    def __getitem__(self, idx):
        """Return ``(image, label, train_set)`` for the sample at ``idx``.

        The image is read with ``read_image``, converted to float and divided
        by 255 before the optional transform is applied. ``label`` and
        ``train_set`` are returned as Python ints.
        """
        img_path, label, train_set = self.data[idx]
        image = read_image(img_path)
        image = image.float() / 255.0

        if self.transform:
            image = self.transform(image)

        return image, int(label), int(train_set)

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.data)
        


In [6]:
# Read the fold/split table from the local Folds.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Folds.csv")

In [7]:
# Rewrite the dataset-internal path prefix so the CSV entries point at the
# local data/ folder. The bare `df.head()` that used to precede this line was
# dropped: only the last expression of a cell is rendered, so it showed nothing.
df['filename'] = df['filename'].str.replace('BreaKHis_v1/histology_slides/breast/', 'data/', regex=False)


In [8]:
# Display the first five rows to confirm the rewritten file paths
df.head(n=5)

Unnamed: 0,fold,mag,grp,filename
0,1,100,train,data/benign/SOB/adenosis/SOB_B_A_14-22549AB/10...
1,1,100,train,data/benign/SOB/adenosis/SOB_B_A_14-22549AB/10...
2,1,100,train,data/benign/SOB/adenosis/SOB_B_A_14-22549AB/10...
3,1,100,train,data/benign/SOB/adenosis/SOB_B_A_14-22549AB/10...
4,1,100,train,data/benign/SOB/adenosis/SOB_B_A_14-22549AB/10...


In [12]:
# Load all image path
# Load all image paths (sorted so the result does not depend on filesystem order)
breast_img_paths = sorted(glob.glob('data/**/*.png', recursive = True))

# Keep only the ones with selected magnification; any() ensures an image is
# listed once even if more than one magnification token matches its name.
magnification_tokens = [f"-{mag}-" for mag in [40, 100]]
filtered_img_paths = [
    img for img in breast_img_paths
    if any(token in Path(img).name for token in magnification_tokens)
]

# Basename -> grp lookup built once, instead of a regex scan over the whole
# DataFrame for every image. The first row wins when a basename is repeated,
# which matches the previous `values[0]` behaviour.
grp_by_name = {}
for csv_path, grp in zip(df['filename'], df['grp']):
    grp_by_name.setdefault(str(csv_path).rsplit('/', 1)[-1], grp)

triplets = []
for image_path in filtered_img_paths:
    grp = grp_by_name.get(Path(image_path).name)
    if grp is None:
        # Image is not listed in the CSV: skip it.
        continue
    label = 0 if 'benign' in Path(image_path).parts else 1
    train_test = 0 if grp == 'train' else 1
    # Use the loop variable; the old code appended the stale global `img_path`
    # left over from an earlier cell, so every row carried the same path.
    triplets.append((image_path, label, train_test))
triplets = np.array(triplets)


In [7]:
# Random colour augmentation followed by per-channel normalisation.
# The mean/std values match the widely used ImageNet channel statistics.
color_jitter = transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)
channel_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform_pipeline = transforms.Compose([color_jitter, channel_normalize])

In [10]:
# Evaluation transform: normalisation only. The training pipeline contains a
# random ColorJitter; applying it to the test split would make evaluation
# results change from run to run. Keep mean/std identical to the training
# pipeline's Normalize step.
eval_transform = transforms.Compose([
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 1. Create the datasets
train_dataset = BreakHisDataset(
    data_folder_path='data/',
    magnification=[40, 100],
    transform=transform_pipeline,   # augmentation + normalisation
    train=True
)

test_dataset = BreakHisDataset(
    data_folder_path='data/',
    magnification=[40, 100],
    transform=eval_transform,       # deterministic: no augmentation
    train=False
)

# 2. Create the DataLoaders
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,      # Always shuffle training data
    num_workers=4      # Parallel data loading
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,     # No need to shuffle test data
    num_workers=4
)