In [None]:
# https://www.kaggle.com/code/prakharbhartiya1/breast-cancer-detection/data

In [1]:
# https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images

### Defining Dataset Paths and Train/Validation/Test Splits


In [16]:
import os
INPUT_DATASET = 'datasets/original'
BASE_PATH = 'datasets/idc'
TRAIN_PATH = os.path.sep.join([BASE_PATH, 'training'])
VAL_PATH = os.path.sep.join([BASE_PATH, 'validation'])
TEST_PATH = os.path.sep.join([BASE_PATH, 'testing'])
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.1

### Preparing Dataset Folders and Splitting Images by Labels


In [None]:
from imutils import paths 
import random, shutil, os

originalPaths = list(paths.list_images(INPUT_DATASET))
random.seed(7)
random.shuffle(originalPaths)

index = int(len(originalPaths) * TRAIN_SPLIT)
trainPaths = originalPaths[:index] 
testPaths = originalPaths[index:]

index = int(len(trainPaths) * VAL_SPLIT)
valPaths = trainPaths[:index]
trainPaths = trainPaths[index:]

datasets = [
    ("training", trainPaths, TRAIN_PATH), 
    ("validation", valPaths, VAL_PATH), 
    ("testing", testPaths, TEST_PATH)
]

for (setType, originalPaths, basePath) in datasets: 
    print(f'Building {setType} set')
    if not os.path.exists(basePath):
        print(f'Building directory {basePath}') 
        os.makedirs(basePath)
    
    for path in originalPaths:
        file = path.split(os.path.sep)[-1] 
        label = file[-5:-4]
        labelPath = os.path.sep.join([basePath, label])
        
        if not os.path.exists(labelPath):
            print(f'Building directory {labelPath}') 
            os.makedirs(labelPath)
        
        newPath = os.path.sep.join([labelPath, file]) 
        shutil.copy2(path, newPath)


Building the CNN model

In [19]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras import backend as K

class CancerNet:
    def build(width, height, depth, classes):
        model = tf.keras.models.Sequential()
        shape = (height, width, depth)
        channelDim = -1

        if K.image_data_format() == "channels_first":
            shape = (depth, height, width)
            channelDim = 1

        model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu', input_shape=shape))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2))
        model.add(tf.keras.layers.Dropout(0.25))

        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        
        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        
        model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2))
        model.add(tf.keras.layers.Dropout(0.25))
        
        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        
        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        
        model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        
        model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2))
        model.add(tf.keras.layers.Dropout(0.25))
        
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(units=256, activation='relu'))
        
        model.add(tf.keras.layers.BatchNormalization(axis=channelDim))
        model.add(tf.keras.layers.Dropout(0.5))
        
        model.add(tf.keras.layers.Dense(units=classes, activation='softmax'))
        
        return model
        



### Loading and Augmenting Training Images with ImageDataGenerator

In [20]:
import matplotlib
matplotlib.use('Agg')

train_datagen = ImageDataGenerator(rescale = 1./255,
                                  shear_range=0.2,
                                   zoom_range=0.2,
                                  horizontal_flip=True)
training_set = train_datagen.flow_from_directory('datasets/idc/training/',
                                                target_size=(64,64),
                                                batch_size=32,
                                                class_mode='binary')


Found 203468 images belonging to 2 classes.


In [21]:
len(training_set)

6359

### Creating ImageDataGenerators for Training, Validation, and Testing


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import LearningRateScheduler
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imutils import paths
import matplotlib.pyplot as plt
import numpy as np
import os

NUM_EPOCHS = 3
INIT_LR = 1e-2
BS = 32

trainPaths = list(paths.list_images(TRAIN_PATH))
lenTrain = len(trainPaths)
lenVal = len(list(paths.list_images(VAL_PATH)))
lenTest = len(list(paths.list_images(TEST_PATH)))

trainLabels = [int(p.split(os.path.sep)[-2]) for p in trainPaths]
trainLabels = to_categorical(trainLabels)

classTotals = trainLabels.sum(axis=0)
classWeight = classTotals.max() / classTotals

trainAug = ImageDataGenerator(
    rescale=1/255.0,
    rotation_range=20,
    zoom_range=0.05,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.05,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode="nearest"
)

valAug = ImageDataGenerator(rescale=1/255.0)

trainGen = trainAug.flow_from_directory(
    TRAIN_PATH,
    class_mode="categorical",
    target_size=(48, 48),
    color_mode="rgb",
    shuffle=True,
    batch_size=BS
)

valGen = valAug.flow_from_directory(
    VAL_PATH,
    class_mode="categorical",
    target_size=(48, 48),
    color_mode="rgb",
    shuffle=False,
    batch_size=BS
)

testGen = valAug.flow_from_directory(
    TEST_PATH,
    class_mode="categorical",
    target_size=(48, 48),
    color_mode="rgb",
    shuffle=False,
    batch_size=BS
)


Found 203468 images belonging to 2 classes.
Found 23537 images belonging to 2 classes.
Found 58484 images belonging to 2 classes.


Safe Generator for checking for bad images in the dataset

In [23]:
# Define a safe generator wrapper to skip corrupted batches
def safe_generator(generator):
    """
    Yields batches, skipping any batch that triggers an error during data loading.
    """
    while True:
        try:
            yield next(generator)
        except Exception as e:
            print("[Warning] Skipping a corrupted batch:", e)


Making sure that no bad images get into the model

In [31]:
# Wrap the generators so they are protected from corrupted images
safe_trainGen = safe_generator(trainGen)
safe_testGen = safe_generator(testGen)
safe_valGen = safe_generator(valGen)


### Creating CancerNet Architecture and Displaying Model Summary


In [25]:
model = CancerNet.build(width = 48, height = 48, depth = 3, classes =2)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

model.summary()

Training the model

In [None]:
# Calculate steps per epoch manually
steps_per_epoch = trainGen.samples // trainGen.batch_size
validation_steps = valGen.samples // valGen.batch_size

# Train the model using the safe generators
M = model.fit(
    x=safe_trainGen,
    steps_per_epoch=steps_per_epoch,
    validation_data=safe_valGen,
    validation_steps=validation_steps,
    epochs=3
)


Epoch 1/3
[1m6358/6358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3697s[0m 579ms/step - accuracy: 0.8096 - loss: 0.4572 - val_accuracy: 0.8180 - val_loss: 0.4112
Epoch 2/3
[1m6358/6358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4499s[0m 708ms/step - accuracy: 0.8424 - loss: 0.3670 - val_accuracy: 0.8468 - val_loss: 0.4098
Epoch 3/3
[1m6358/6358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3258s[0m 513ms/step - accuracy: 0.8535 - loss: 0.3501 - val_accuracy: 0.8573 - val_loss: 0.3489


Testing and preparing Confusion Matrix

In [None]:
print("Now evaluating the model...")

# Reset the test generator
testGen.reset()

# Predict using the safe test generator
pred_indices = model.predict(
    safe_testGen,
    steps=(lenTest // BS) + 1
)

# Convert predictions to class labels
pred_indices = np.argmax(pred_indices, axis=1)

# Check lengths
print("Predictions shape:", pred_indices.shape)
print("Ground truth shape:", testGen.classes.shape)

# Trim to the same length
min_len = min(len(pred_indices), len(testGen.classes))
pred_indices = pred_indices[:min_len]
true_classes = testGen.classes[:min_len]

# Classification report
print("\nClassification Report:")
print(classification_report(
    true_classes,
    pred_indices,
    target_names=list(testGen.class_indices.keys())
))

# Confusion matrix
cm = confusion_matrix(true_classes, pred_indices)
print("\nConfusion Matrix:")
print(cm)

# Compute metrics
total = sum(sum(cm))
accuracy = (cm[0,0] + cm[1,1]) / total
specificity = cm[1,1] / (cm[1,0] + cm[1,1])
sensitivity = cm[0,0] / (cm[0,0] + cm[0,1])

print(f"\nAccuracy: {accuracy:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Sensitivity: {sensitivity:.4f}")

Now evaluating the model...
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1352s[0m 740ms/step
Predictions shape: (58484,)
Ground truth shape: (58484,)

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     42060
           1       0.79      0.67      0.73     16424

    accuracy                           0.86     58484
   macro avg       0.83      0.80      0.81     58484
weighted avg       0.85      0.86      0.85     58484


Confusion Matrix:
[[39163  2897]
 [ 5433 10991]]

Accuracy: 0.8576
Specificity: 0.6692
Sensitivity: 0.9311


Plotting a graph

In [36]:
# Plot training history
N = len(M.history["loss"])   # Automatically detect number of epochs
plt.style.use("ggplot")
plt.figure(figsize=(10, 6))

plt.plot(np.arange(0, N), M.history["loss"], label="train_loss")
plt.plot(np.arange(0, N), M.history["val_loss"], label="val_loss")
plt.plot(np.arange(0, N), M.history["accuracy"], label="train_acc")
plt.plot(np.arange(0, N), M.history["val_accuracy"], label="val_acc")

plt.title("Training Loss and Accuracy on the IDC Dataset")
plt.xlabel("Epoch")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")
plt.tight_layout()
plt.savefig("plot.png")
print("\nTraining plot saved as 'plot.png'")



Training plot saved as 'plot.png'
