## Brest cancer classifier

In [12]:
import os
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models, mixed_precision

import warnings
warnings.filterwarnings('ignore')

### End-to-End CNN Strategy (1st)

#### Overview
Utilize a Convolutional Neural Network (CNN) to directly learn features from 50×50 histology image patches and classify them as benign (Class 0) or malignant (Class 1).

#### Key Steps

1. **Model Architecture**
   - **Convolutional Layers:** Automatically extract local features such as edges and textures.
   - **Pooling Layers:** Reduce spatial dimensions, making the model more robust to small translations.
   - **Fully Connected Layers:** Integrate the learned features to map them to a binary classification output.

2. **Data Augmentation**
   - **Techniques:** Apply rotations, flips, zooming, and shifts.
   - **Purpose:** Increase the effective size and variability of the dataset to reduce overfitting and improve generalization.

3. **Training with Labeled Data**
   - **Supervised Learning:** Use the provided labels with a loss function (e.g., cross-entropy) to train the network.
   - **Backpropagation:** Adjust the network weights iteratively to minimize classification errors.

4. **Optimization Techniques**
   - **Early Stopping:** Monitor validation performance to avoid overfitting.
   - **Learning Rate Scheduling:** Adapt the learning rate during training to ensure stable convergence.
   - **Dropout:** Randomly deactivate neurons during training to force the network to learn robust features.

5. **Evaluation Metrics**
   - **Metrics:** Assess performance using accuracy, precision, recall, and F1-score.
   - **Clinical Relevance:** Emphasize metrics that capture the balance between false positives and false negatives.

In [13]:
# -------------------------------
# Step 1: Build DataFrame from Directory Structure
# -------------------------------
data_dir = 'data/IDC_regular_ps50_idx5'  # Update this path

filepaths = []
labels = []

for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            file_path = os.path.join(root, file)
            # Assumes label is the name of the immediate parent folder
            label = os.path.basename(os.path.dirname(file_path))
            filepaths.append(file_path)
            labels.append(label)

# Create DataFrame
df = pd.DataFrame({
    'filename': filepaths,
    'class': labels
})

# Split DataFrame into training and validation sets (80/20 split)
train_df, valid_df = train_test_split(df, test_size=0.2, stratify=df['class'], random_state=42)

# Create a mapping from class names to integer labels
classes = sorted(df['class'].unique())
class_to_index = {cls: idx for idx, cls in enumerate(classes)}
num_classes = len(classes)

# Map class labels to integer indices
train_df['label'] = train_df['class'].map(class_to_index)
valid_df['label'] = valid_df['class'].map(class_to_index)

# -------------------------------
# Step 2: Setup tf.data Pipeline with Mixed Precision and Augmentation
# -------------------------------
# Enable mixed precision
mixed_precision.set_global_policy('mixed_float16')

# Define constants
TARGET_SIZE = (50, 50)
BATCH_SIZE = 32
AUTOTUNE = tf.data.experimental.AUTOTUNE

def preprocess_image(filename, label, training=False):
    # Read and decode the image
    image = tf.io.read_file(filename)
    image = tf.image.decode_image(image, channels=3, expand_animations=False)
    # Resize and rescale
    image = tf.image.resize(image, TARGET_SIZE)
    image = tf.cast(image, tf.float32) / 255.0
    return image, label

def augment_image(image, label):
    # Apply random augmentations for training
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, max_delta=0.1)
    image = tf.image.random_contrast(image, lower=0.9, upper=1.1)
    # Note: For rotation, consider using tf.keras.layers.RandomRotation in TF2.6+
    return image, label

# Create a Dataset from the training DataFrame
train_ds = tf.data.Dataset.from_tensor_slices((train_df['filename'].values, train_df['label'].values))
train_ds = train_ds.map(lambda f, l: preprocess_image(f, l, training=True), num_parallel_calls=AUTOTUNE)
train_ds = train_ds.map(augment_image, num_parallel_calls=AUTOTUNE)
train_ds = train_ds.shuffle(1000).batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Create a Dataset from the validation DataFrame
valid_ds = tf.data.Dataset.from_tensor_slices((valid_df['filename'].values, valid_df['label'].values))
valid_ds = valid_ds.map(lambda f, l: preprocess_image(f, l, training=False), num_parallel_calls=AUTOTUNE)
valid_ds = valid_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# One-hot encode the labels within the dataset (optional: you can also use sparse labels)
def one_hot(image, label):
    label = tf.one_hot(label, depth=num_classes)
    return image, label

train_ds = train_ds.map(one_hot, num_parallel_calls=AUTOTUNE)
valid_ds = valid_ds.map(one_hot, num_parallel_calls=AUTOTUNE)

# -------------------------------
# Step 3: Define the CNN Model
# -------------------------------
input_shape = (TARGET_SIZE[0], TARGET_SIZE[1], 3)

model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2)),

    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2)),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    # Output layer: note that mixed precision might require the final output to be cast back to float32
    layers.Dense(num_classes, activation='softmax', dtype='float32')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [14]:
model.summary()

In [15]:
# -------------------------------
# Step 4: Train the Model
# -------------------------------
epochs = 15
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=valid_ds
)

Epoch 1/15


[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 10ms/step - accuracy: 0.8186 - loss: 0.4446 - val_accuracy: 0.8421 - val_loss: 0.3537
Epoch 2/15
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 8ms/step - accuracy: 0.8527 - loss: 0.3575 - val_accuracy: 0.7961 - val_loss: 0.4579
Epoch 3/15
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 8ms/step - accuracy: 0.8583 - loss: 0.3405 - val_accuracy: 0.8578 - val_loss: 0.3743
Epoch 4/15
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 8ms/step - accuracy: 0.8636 - loss: 0.3293 - val_accuracy: 0.8628 - val_loss: 0.3337
Epoch 5/15
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 8ms/step - accuracy: 0.8671 - loss: 0.3207 - val_accuracy: 0.8572 - val_loss: 0.3439
Epoch 6/15
[1m6939/6939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 8ms/step - accuracy: 0.8697 - loss: 0.3149 - val_accuracy: 0.8599 - val_loss: 0.3343
Epoch 7/15
[1m6939/