## Brest cancer classifier

In [14]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### End-to-End CNN Strategy (1st)

#### Overview
Utilize a Convolutional Neural Network (CNN) to directly learn features from 50×50 histology image patches and classify them as benign (Class 0) or malignant (Class 1).

#### Key Steps

1. **Model Architecture**
   - **Convolutional Layers:** Automatically extract local features such as edges and textures.
   - **Pooling Layers:** Reduce spatial dimensions, making the model more robust to small translations.
   - **Fully Connected Layers:** Integrate the learned features to map them to a binary classification output.

2. **Data Augmentation**
   - **Techniques:** Apply rotations, flips, zooming, and shifts.
   - **Purpose:** Increase the effective size and variability of the dataset to reduce overfitting and improve generalization.

3. **Training with Labeled Data**
   - **Supervised Learning:** Use the provided labels with a loss function (e.g., cross-entropy) to train the network.
   - **Backpropagation:** Adjust the network weights iteratively to minimize classification errors.

4. **Optimization Techniques**
   - **Early Stopping:** Monitor validation performance to avoid overfitting.
   - **Learning Rate Scheduling:** Adapt the learning rate during training to ensure stable convergence.
   - **Dropout:** Randomly deactivate neurons during training to force the network to learn robust features.

5. **Evaluation Metrics**
   - **Metrics:** Assess performance using accuracy, precision, recall, and F1-score.
   - **Clinical Relevance:** Emphasize metrics that capture the balance between false positives and false negatives.

In [15]:
# -------------------------------
# Step 1: Build DataFrame from Directory Structure
# -------------------------------
data_dir = 'data/IDC_regular_ps50_idx5'  # Update this path

filepaths = []
labels = []

# Traverse directory tree
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            file_path = os.path.join(root, file)
            # Assumes the label is the name of the immediate parent folder ("0" or "1")
            label = os.path.basename(os.path.dirname(file_path))
            filepaths.append(file_path)
            labels.append(label)

# Create a DataFrame with the file paths and labels
df = pd.DataFrame({
    'filename': filepaths,
    'class': labels
})

# Optional: Save DataFrame to CSV for future reuse
# df.to_csv("image_paths.csv", index=False)

# Split DataFrame into training and validation sets (80/20 split)
train_df, valid_df = train_test_split(df, test_size=0.2, stratify=df['class'], random_state=42)

# -------------------------------
# Step 2: Setup ImageDataGenerators
# -------------------------------
batch_size = 32
target_size = (50, 50)
color_mode = 'rgb'  # Change to 'grayscale' if your images are grayscale

# Data augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1.0/255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Only rescaling for validation
valid_datagen = ImageDataGenerator(rescale=1.0/255)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='filename',
    y_col='class',
    target_size=target_size,
    batch_size=batch_size,
    class_mode='categorical',  # Uses one-hot encoding for labels
    color_mode=color_mode,
    shuffle=True,
    seed=42,
    workers=4,  # Apply to validation set too
    use_multiprocessing=True
)

validation_generator = valid_datagen.flow_from_dataframe(
    dataframe=valid_df,
    x_col='filename',
    y_col='class',
    target_size=target_size,
    batch_size=batch_size,
    class_mode='categorical',
    color_mode=color_mode,
    shuffle=False,
    workers=4,  # Number of CPU cores for data loading
    use_multiprocessing=True  # Enables multiprocessing
)

# -------------------------------
# Step 3: Define the CNN Model
# -------------------------------
# Adjust input_shape to (50,50,1) if using grayscale images.
input_shape = (50, 50, 3)

model = Sequential([
    Input(shape=input_shape),
    Conv2D(32, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')  # 2 classes: "0" and "1"
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Found 106868 validated image filenames belonging to 2 classes.
Found 26717 validated image filenames belonging to 2 classes.


In [16]:
model.summary()

In [19]:
# -------------------------------
# Step 4: Train the Model
# -------------------------------
epochs = 15
history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=validation_generator
)

Epoch 1/15
[1m3340/3340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 65ms/step - accuracy: 0.8383 - loss: 0.3774 - val_accuracy: 0.7364 - val_loss: 1.0375
Epoch 2/15
[1m3340/3340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 62ms/step - accuracy: 0.8523 - loss: 0.3554 - val_accuracy: 0.7814 - val_loss: 0.7039
Epoch 3/15
[1m3340/3340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 62ms/step - accuracy: 0.8538 - loss: 0.3487 - val_accuracy: 0.8271 - val_loss: 0.3968
Epoch 4/15
[1m3340/3340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 62ms/step - accuracy: 0.8574 - loss: 0.3378 - val_accuracy: 0.8560 - val_loss: 0.3424
Epoch 5/15
[1m3340/3340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 63ms/step - accuracy: 0.8621 - loss: 0.3304 - val_accuracy: 0.8453 - val_loss: 0.3696
Epoch 6/15
[1m3340/3340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 63ms/step - accuracy: 0.8647 - loss: 0.3252 - val_accuracy: 0.8111 - val_loss: 0.446

In [4]:
print(tf.test.is_built_with_cuda())  # Should return True
print(tf.test.is_gpu_available())  # Deprecated but should return True for TF <2.11
print(tf.config.list_physical_devices('GPU'))  # Should list GPUs

True
False
[]


W0000 00:00:1743096014.675976     658 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [8]:
print(tf.sysconfig.get_build_info())

OrderedDict({'cpu_compiler': '/usr/lib/llvm-18/bin/clang', 'cuda_compute_capabilities': ['sm_60', 'sm_70', 'sm_80', 'sm_89', 'compute_90'], 'cuda_version': '12.5.1', 'cudnn_version': '9', 'is_cuda_build': True, 'is_rocm_build': False, 'is_tensorrt_build': False})
