
# Breast Cancer CNN — **PyTorch** (Modernized)

**Converted on:** 2025-10-17 01:43:26  
**Why PyTorch?** It's the current industry standard in research and production, with flexible APIs and fast GPU support.

### What's new vs. the original TensorFlow/Keras notebook
- Uses **PyTorch** (`torch`, `torchvision`).
- Adds **Batch Normalization** after conv layers.
- Uses **AdamW** optimizer (often better generalization than plain Adam).
- Includes **Early Stopping** and **Model Checkpointing**.
- Supports **GPU** automatically when available.
- Optional **Automatic Mixed Precision (AMP)** for faster training on GPU.
- Same dataset and intent; structure is comparable for apples-to-apples results.

> 💡 Each section below is self-contained and **fully editable**. You can tweak paths, transforms, model layers, and training hyperparameters cell-by-cell.


## 1) Environment & Imports

In [6]:

# Editable: core imports
import os
import random
import json
import math
import time
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
from torchvision.utils import make_grid

# Visualization & metrics
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import itertools

# Reproducibility
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value handed to every random number generator seeded here.
    """
    # Seed each generator in turn: stdlib, NumPy, torch (CPU), torch (all CUDA devices).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Ask cuDNN for deterministic kernels and switch off its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

SEED = 42
set_seed(SEED)

# Device: use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
print("CUDA available:", torch.cuda.is_available())

# Optional: allow TF32 kernels for matmul and cuDNN convolutions.
# NOTE(review): TF32 trades some internal precision for speed on GPUs that
# support it — confirm that is acceptable for this experiment.
if torch.cuda.is_available():
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Enabled TF32 on supported GPUs.")
    except Exception as e:
        # Best effort only: report the failure and carry on.
        print("TF32 not enabled:", e)

# Optional Mixed Precision: switched on exactly when CUDA is available.
USE_AMP = torch.cuda.is_available()
print("Using AMP:", USE_AMP)


Using device: cpu
CUDA available: False
Using AMP: False



## 2) Data Paths & (Optional) Kaggle Download

Update these paths to point to your **Kaggle dataset**.  
If your images are arranged like `root/class_x/...jpg` and `root/class_y/...jpg`, you can use `ImageFolder` directly.

- If you already have the dataset locally, set `DATA_ROOT` to that folder.
- If you'd like to **download from Kaggle** in-notebook, fill in `KAGGLE_DATASET` (or `KAGGLE_COMPETITION`) below and ensure your `kaggle.json` is configured.


In [7]:

# === Editable ===
# Point DATA_ROOT at the dataset once it is downloaded and extracted locally.
# Directory layout expected by ImageFolder:
# DATA_ROOT/
#   |-- train/
#   |    |-- benign/ *.png|*.jpg
#   |    `-- malignant/ *.png|*.jpg
#   `-- val/   (optional; when absent, a validation split is taken from train)
DATA_ROOT = Path("/path/to/your/breast_cancer_dataset")  # <-- EDIT THIS
TRAIN_DIR = DATA_ROOT.joinpath("train")
VAL_DIR = DATA_ROOT.joinpath("val")    # optional
TEST_DIR = DATA_ROOT.joinpath("test")  # optional

# (Optional) Kaggle dataset/competition settings — needed ONLY for in-notebook downloads.
# Example dataset slug: "paultimothymooney/breast-histopathology-images"
KAGGLE_DATASET = ""  # e.g., "paultimothymooney/breast-histopathology-images"
KAGGLE_COMPETITION = ""  # leave empty unless it's a competition
KAGGLE_DOWNLOAD_DIR = Path("/mnt/data/kaggle_download")  # default download target

def kaggle_download(dataset:str="", competition:str="", out_dir:Path=KAGGLE_DOWNLOAD_DIR):
    """Download a Kaggle dataset or competition with the ``kaggle`` CLI and unzip it.

    Best-effort helper: any failure is printed rather than raised.

    Args:
        dataset: Kaggle dataset slug; takes precedence when both are given.
        competition: Kaggle competition name, used only when ``dataset`` is empty.
        out_dir: Directory that receives the download and the extracted files.
    """
    if not (dataset or competition):
        print("No Kaggle dataset/competition specified. Skipping download.")
        return

    out_dir.mkdir(parents=True, exist_ok=True)
    try:
        import subprocess
        import zipfile

        # Pick the CLI sub-command; a dataset wins over a competition.
        if dataset:
            print(f"Downloading Kaggle dataset: {dataset}")
            cli_args = ["kaggle", "datasets", "download", "-d", dataset, "-p", str(out_dir)]
        else:
            print(f"Downloading Kaggle competition: {competition}")
            cli_args = ["kaggle", "competitions", "download", "-c", competition, "-p", str(out_dir)]
        subprocess.check_call(cli_args)

        # Extract every archive sitting directly in out_dir.
        for archive in out_dir.glob("*.zip"):
            print(f"Unzipping {archive.name} ...")
            with zipfile.ZipFile(archive, 'r') as zf:
                zf.extractall(out_dir)
        print("Kaggle download and unzip complete. Inspect files and set DATA_ROOT accordingly.")
    except Exception as e:
        # Deliberately broad: a missing CLI, a failed download or a bad zip are all just reported.
        print("Kaggle download failed or kaggle CLI not available:", e)

# Uncomment to trigger a download (if needed):
# kaggle_download(dataset=KAGGLE_DATASET, competition=KAGGLE_COMPETITION, out_dir=KAGGLE_DOWNLOAD_DIR)


## 3) Transforms, Datasets, and Dataloaders

In [8]:

# === Editable ===
IMG_SIZE = 224   # Common choice for many CNNs
BATCH_SIZE = 32
VAL_SPLIT = 0.2  # used only if VAL_DIR doesn't exist
NUM_WORKERS = os.cpu_count() or 2  # cpu_count() can return None

# Typical histopathology augmentations can be mild; adjust as needed
train_tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Deterministic pipeline for validation/test: resize + normalise only.
eval_tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

assert TRAIN_DIR.exists(), f"TRAIN_DIR not found: {TRAIN_DIR}. Please set DATA_ROOT correctly."
train_dataset_full = datasets.ImageFolder(root=str(TRAIN_DIR), transform=train_tfms)

if VAL_DIR.exists():
    val_dataset = datasets.ImageFolder(root=str(VAL_DIR), transform=eval_tfms)
    train_dataset = train_dataset_full
else:
    # Split from train (seeded so the partition is reproducible).
    val_len = int(len(train_dataset_full) * VAL_SPLIT)
    train_len = len(train_dataset_full) - val_len
    train_dataset, val_subset = random_split(
        train_dataset_full,
        [train_len, val_len],
        generator=torch.Generator().manual_seed(SEED),
    )
    # The subset returned by random_split still reads through train_tfms, which
    # would apply random flips/rotations to validation images. Re-wrap the same
    # indices over a second view of TRAIN_DIR that uses eval_tfms instead.
    # Both views scan the same directory, so the indices address the same files.
    train_eval_view = datasets.ImageFolder(root=str(TRAIN_DIR), transform=eval_tfms)
    val_dataset = torch.utils.data.Subset(train_eval_view, val_subset.indices)

if TEST_DIR.exists():
    test_dataset = datasets.ImageFolder(root=str(TEST_DIR), transform=eval_tfms)
else:
    test_dataset = None

class_names = train_dataset_full.classes if hasattr(train_dataset_full, "classes") else None
print("Classes:", class_names)

# Only the training loader shuffles; memory is pinned when CUDA is available.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available()) if test_dataset is not None else None


AssertionError: TRAIN_DIR not found: /path/to/your/breast_cancer_dataset/train. Please set DATA_ROOT correctly.

## 4) Model Definition (Slightly Modernized CNN)

In [None]:

# === Editable ===
# A compact CNN with BatchNorm and Dropout. Feel free to tweak channels/depth.
class BreastCancerCNN(nn.Module):
    """Compact CNN classifier: four Conv-BatchNorm-ReLU-MaxPool stages plus an MLP head.

    The head expects a 256x14x14 feature map, which is what a 3x224x224 input
    produces. For any other input resolution the feature map is adaptively
    average-pooled to 14x14 first, so the model no longer crashes when the
    image size is changed. The 224x224 path and the state-dict keys are
    unchanged.

    Args:
        num_classes: Number of output logits.
    """

    # Spatial size the classifier head was sized for (224 / 2**4).
    _HEAD_GRID = (14, 14)

    def __init__(self, num_classes: int):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),  # -> 32x224x224
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),                              # -> 32x112x112

            nn.Conv2d(32, 64, kernel_size=3, padding=1),  # -> 64x112x112
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),                              # -> 64x56x56

            nn.Conv2d(64, 128, kernel_size=3, padding=1), # -> 128x56x56
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),                              # -> 128x28x28

            nn.Conv2d(128, 256, kernel_size=3, padding=1),# -> 256x28x28
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),                              # -> 256x14x14
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(256 * 14 * 14, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.4),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for an image batch ``x``."""
        x = self.features(x)
        # Only resample when the grid differs, so 224x224 inputs follow the original path.
        if tuple(x.shape[-2:]) != self._HEAD_GRID:
            x = nn.functional.adaptive_avg_pool2d(x, self._HEAD_GRID)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

# Size the output layer from the discovered classes; assume binary when none are known.
num_classes = 2 if not class_names else len(class_names)
model = BreastCancerCNN(num_classes=num_classes)
model = model.to(device)

# Optional: better init
def init_weights(m):
    """Kaiming-normal initialise Conv2d/Linear weights and zero their biases.

    Intended for ``nn.Module.apply``; every other module type is left untouched.

    Args:
        m: The module to (possibly) re-initialise in place.
    """
    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return
    nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
    # Layers built with bias=False carry no bias tensor.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.0)

# Run init_weights over the model via nn.Module.apply, then print the architecture.
model.apply(init_weights)
print(model)


## 5) Training Setup

In [None]:

# === Editable ===
# Optimisation hyperparameters.
EPOCHS = 15
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
PATIENCE = 5  # early stopping patience

# Checkpoint location; the directory is created right away if missing.
CHECKPOINT_DIR = Path("./checkpoints")
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
CKPT_PATH = CHECKPOINT_DIR.joinpath("best_model.pt")

# Loss and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Gradient scaler for mixed precision; constructed disabled when USE_AMP is False.
scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)

def accuracy_from_logits(logits, targets):
    """Return the fraction of rows whose arg-max over dim 1 equals the target.

    Args:
        logits: Score tensor; the predicted class is the arg-max along dim 1.
        targets: Tensor of true class indices compared element-wise with the predictions.

    Returns:
        The mean of the per-sample hit indicator, as a Python float.
    """
    hits = logits.argmax(dim=1).eq(targets)
    return hits.float().mean().item()

def train_one_epoch(model, loader, optimizer, criterion, scaler, device):
    """Run one optimisation pass over ``loader`` and report epoch averages.

    Args:
        model: Network to train; switched to training mode here.
        loader: Iterable yielding ``(imgs, labels)`` batches.
        optimizer: Optimizer stepped once per batch through ``scaler``.
        criterion: Loss called as ``criterion(logits, labels)``.
        scaler: Gradient scaler used for the backward pass and optimizer step.
        device: Device both tensors of each batch are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)``, each weighted by batch size.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.train()
    loss_sum, acc_sum, seen = 0.0, 0.0, 0

    for imgs, labels in loader:
        imgs = imgs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        # Autocast is governed by the module-level USE_AMP flag.
        with torch.cuda.amp.autocast(enabled=USE_AMP):
            logits = model(imgs)
            loss = criterion(logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Weight per-batch means by batch size so a short last batch is not over-counted.
        count = imgs.size(0)
        loss_sum += loss.item() * count
        acc_sum += accuracy_from_logits(logits, labels) * count
        seen += count

    return loss_sum / seen, acc_sum / seen

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Compute average loss and accuracy over ``loader`` without updating weights.

    Args:
        model: Network to evaluate; switched to eval mode here.
        loader: Iterable yielding ``(imgs, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device both tensors of each batch are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)``, each weighted by batch size.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()
    loss_sum, acc_sum, seen = 0.0, 0.0, 0

    for imgs, labels in loader:
        imgs = imgs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Autocast is governed by the module-level USE_AMP flag.
        with torch.cuda.amp.autocast(enabled=USE_AMP):
            logits = model(imgs)
            loss = criterion(logits, labels)

        count = imgs.size(0)
        loss_sum += loss.item() * count
        acc_sum += accuracy_from_logits(logits, labels) * count
        seen += count

    return loss_sum / seen, acc_sum / seen


## 6) Train (with Early Stopping & Checkpointing)

In [None]:

# Train for up to EPOCHS epochs, checkpointing on every validation-loss
# improvement and stopping after PATIENCE consecutive epochs without one.
best_val_loss = float("inf")
epochs_no_improve = 0
history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}

for epoch in range(1, EPOCHS + 1):
    # perf_counter is monotonic, so the duration is immune to wall-clock adjustments.
    t0 = time.perf_counter()
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, scaler, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    dt = time.perf_counter() - t0

    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)
    history["train_acc"].append(train_acc)
    history["val_acc"].append(val_acc)

    print(f"Epoch {epoch:02d}/{EPOCHS} | "
          f"train_loss={train_loss:.4f} val_loss={val_loss:.4f} | "
          f"train_acc={train_acc:.4f} val_acc={val_acc:.4f} | {dt:.1f}s")

    # Early Stopping & Checkpoint
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        torch.save({"model_state": model.state_dict(),
                    "optimizer_state": optimizer.state_dict(),
                    "epoch": epoch,
                    "val_loss": val_loss,
                    "class_names": class_names}, CKPT_PATH)
        # The arrow in this message was mis-encoded in the original; restored here.
        print(f"  ↳ Saved new best checkpoint to {CKPT_PATH}")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print("Early stopping triggered.")
            break

# Load best weights for evaluation
# NOTE(review): a checkpoint left over from an earlier run is loaded too if this
# run never saved one — confirm that is intended.
if CKPT_PATH.exists():
    ckpt = torch.load(CKPT_PATH, map_location=device)
    model.load_state_dict(ckpt["model_state"])
    print(f"Loaded best model from epoch {ckpt.get('epoch')} with val_loss={ckpt.get('val_loss'):.4f}")


## 7) Loss & Accuracy Curves

In [None]:

# Plot training curves: one figure per metric, train vs. validation.
epochs_range = range(1, len(history["train_loss"]) + 1)

for metric_key, legend_word, axis_word in (
    ("loss", "Loss", "Loss"),
    ("acc", "Acc", "Accuracy"),
):
    plt.figure()
    plt.plot(epochs_range, history[f"train_{metric_key}"], label=f"Train {legend_word}")
    plt.plot(epochs_range, history[f"val_{metric_key}"], label=f"Val {legend_word}")
    plt.xlabel("Epoch")
    plt.ylabel(axis_word)
    plt.title(f"{axis_word} over Epochs")
    plt.legend()
    plt.show()


## 8) Evaluation: Confusion Matrix & Classification Report

In [None]:

@torch.no_grad()
def get_all_preds_targets(model, loader, device):
    """Collect arg-max predictions and true labels for every batch in ``loader``.

    Args:
        model: Network to run; switched to eval mode here.
        loader: Iterable yielding ``(imgs, labels)`` batches.
        device: Device the images are moved to before the forward pass.

    Returns:
        ``(preds, targets)`` as two NumPy arrays in loader order.
    """
    model.eval()
    pred_list = []
    target_list = []
    for imgs, labels in loader:
        logits = model(imgs.to(device, non_blocking=True))
        batch_preds = torch.argmax(logits, dim=1).cpu().numpy()
        pred_list += batch_preds.tolist()
        # Labels are never moved to the device, so they convert directly.
        target_list += labels.numpy().tolist()
    return np.array(pred_list), np.array(target_list)

# Evaluate on the test split when one exists, otherwise on validation.
eval_loader = test_loader if test_loader is not None else val_loader
preds, targets = get_all_preds_targets(model, eval_loader, device)

# Pin the label set to the known classes. Without it, a class missing from this
# split makes classification_report reject target_names and shrinks the
# confusion matrix below the number of tick labels used when plotting.
label_ids = list(range(len(class_names))) if class_names else None

print("Classification Report:")
print(classification_report(targets, preds, labels=label_ids, target_names=class_names if class_names else None, digits=4))

cm = confusion_matrix(targets, preds, labels=label_ids)
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap='Blues'):
    """Draw a confusion matrix as an annotated heat map.

    Args:
        cm: Square matrix of counts; rows are true labels, columns predictions.
        classes: Tick labels, one per row/column.
        normalize: When true, each row is divided by its sum before plotting.
        title: Figure title.
        cmap: Matplotlib colormap. The default runs light to dark, which the
            white-on-high / black-on-low cell text assumes.
    """
    cm = np.asarray(cm)
    if normalize:
        # Rows with no true samples stay at 0 instead of turning into NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        as_float = cm.astype('float')
        cm = np.divide(as_float, row_totals, out=np.zeros_like(as_float), where=row_totals != 0)
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum are dark under the default colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

# Draw the raw-count matrix first, then the row-normalised one, with the same tick labels.
for cm_normalize, cm_title in ((False, 'Confusion Matrix'), (True, 'Confusion Matrix (Normalized)')):
    plot_confusion_matrix(
        cm,
        class_names if class_names else [str(i) for i in range(num_classes)],
        normalize=cm_normalize,
        title=cm_title,
    )


## 9) Inference Helper

In [None]:

from PIL import Image

@torch.no_grad()
def predict_image(model, image_path: str, transform, device):
    """Classify a single image file.

    Args:
        model: Trained network; switched to eval mode here.
        image_path: Path of the image to open.
        transform: Callable turning the RGB PIL image into a tensor.
        device: Device the input tensor is moved to.

    Returns:
        ``(pred_idx, probs)``: the arg-max class index as an int and the
        softmax probabilities as a 1-D NumPy array.
    """
    model.eval()
    # Close the file handle deterministically; convert() returns a loaded copy.
    with Image.open(image_path) as raw:
        img = raw.convert("RGB")
    x = transform(img).unsqueeze(0).to(device)  # add the batch dimension
    logits = model(x)
    probs = F.softmax(logits, dim=1).cpu().numpy()[0]
    pred_idx = int(np.argmax(probs))
    return pred_idx, probs

# Example usage (edit path):
# img_path = "/path/to/sample.jpg"
# idx, probs = predict_image(model, img_path, eval_tfms, device)
# print("Predicted:", class_names[idx], "probs=", probs)


## 10) Save / Load Utilities

In [None]:

FINAL_MODEL_PATH = Path("./breast_cancer_cnn_pytorch.pt")

def save_final(model, path=FINAL_MODEL_PATH):
    """Write the model weights plus the module-level ``class_names`` to ``path``.

    Args:
        model: Network whose ``state_dict`` is saved.
        path: Destination file.
    """
    payload = {
        "model_state": model.state_dict(),
        "class_names": class_names,
    }
    torch.save(payload, path)
    print(f"Saved model to {path}")

def load_final(path=FINAL_MODEL_PATH, device=device):
    """Rebuild a ``BreastCancerCNN`` from a file containing weights and class names.

    Args:
        path: File holding a dict with ``"model_state"`` and ``"class_names"``.
        device: Device tensors are mapped to and the model is moved to.

    Returns:
        ``(model, class_names)``. The model's train/eval mode is not changed here.
    """
    # NOTE(review): depending on the PyTorch version and its weights_only default,
    # torch.load may unpickle arbitrary objects — only load files you trust.
    ckpt = torch.load(path, map_location=device)
    saved_classes = ckpt["class_names"]
    model = BreastCancerCNN(num_classes=len(saved_classes))
    model = model.to(device)
    model.load_state_dict(ckpt["model_state"])
    return model, saved_classes

# Example:
# save_final(model, FINAL_MODEL_PATH)
# loaded_model, loaded_classes = load_final(FINAL_MODEL_PATH, device=device)



## 11) Notes on Reproducibility & Matching Results

- Seeding (`SEED=42`) and deterministic cuDNN settings (`deterministic=True`, `benchmark=False`) are used for **stable results**.
- Minor differences between TensorFlow and PyTorch (initialization, data order, AMP) can cause small metric variations.
- To get as close as possible to the original:
  - Use identical image sizes, augmentations, splits, and batch size.
  - Keep learning rate, epochs, and optimizer comparable (we use **AdamW** with modest weight decay for better generalization).
  - If exact parity is required, set `USE_AMP = False` and consider removing BatchNorm or Dropout changes.


## 12) Sanity Check: Visualize a Batch

In [None]:

def show_batch(dl):
    """Show up to 16 images from the first batch of ``dl`` with their labels.

    The class names are looked up on the loader's dataset (or on the dataset it
    wraps), falling back to the module-level ``class_names``. The original
    computed them but never displayed them; they now appear under the title.

    Args:
        dl: DataLoader yielding ``(imgs, labels)`` batches.
    """
    imgs, labels = next(iter(dl))
    grid = make_grid(imgs[:16], nrow=8, padding=2, normalize=True)
    plt.figure(figsize=(10,5))
    # Channels-last NumPy array is what imshow expects.
    plt.imshow(grid.permute(1,2,0).cpu().numpy())
    plt.axis('off')
    if hasattr(dl.dataset, 'dataset') and hasattr(dl.dataset.dataset, 'classes'):
        classes = dl.dataset.dataset.classes
    elif hasattr(dl.dataset, 'classes'):
        classes = dl.dataset.classes
    else:
        classes = class_names

    # assumes labels is a 1-D tensor of integer class indices — TODO confirm
    shown_ids = labels[:16].tolist()
    shown_names = [classes[i] if classes else str(i) for i in shown_ids]
    # One caption line per grid row (8 images each).
    caption_rows = [", ".join(shown_names[k:k + 8]) for k in range(0, len(shown_names), 8)]
    plt.title("A few training samples\n" + "\n".join(caption_rows), fontsize=9)
    plt.show()

# Uncomment to preview:
# show_batch(train_loader)
