<a href="https://colab.research.google.com/github/Kishan-jobs/Calypso/blob/main/breast_cancer(cnn).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ambarish/breakhis")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/breakhis


In [None]:
import os
import shutil

source_root = "/kaggle/input/breakhis/BreaKHis_v1/histology_slides/breast"
target_root = "/content/breakhis_40x_clean"
os.makedirs(f"{target_root}/benign", exist_ok=True)
os.makedirs(f"{target_root}/malignant", exist_ok=True)

# Copy only 40X images into the clean folders
for label in ["benign", "malignant"]:
    class_path = os.path.join(source_root, label)
    for subdir, _, files in os.walk(class_path):
        if "40X" in subdir:  # filter only 40X folders
            for file in files:
                if file.lower().endswith((".png", ".jpg", ".jpeg")):
                    src = os.path.join(subdir, file)
                    dst = os.path.join(target_root, label, file)
                    shutil.copy2(src, dst)

print("✅ Copied 40X images into:", target_root)


✅ Copied 40X images into: /content/breakhis_40x_clean


In [None]:
# Step A: Upload your kaggle.json
from google.colab import files
files.upload()  # choose kaggle.json from your device

# Step B: Move it to correct location
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Step C: Download BreakHis
!kaggle datasets download -d ambarish/breakhis

# Step D: Unzip the dataset
!unzip breakhis.zip -d /content/breakhis


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/malignant/SOB/ductal_carcinoma/SOB_M_DC_14-12312/40X/SOB_M_DC-14-12312-40-026.png  
  inflating: /content/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/malignant/SOB/ductal_carcinoma/SOB_M_DC_14-12312/40X/SOB_M_DC-14-12312-40-027.png  
  inflating: /content/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/malignant/SOB/ductal_carcinoma/SOB_M_DC_14-12312/40X/SOB_M_DC-14-12312-40-028.png  
  inflating: /content/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/malignant/SOB/ductal_carcinoma/SOB_M_DC_14-12312/40X/SOB_M_DC-14-12312-40-029.png  
  inflating: /content/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/malignant/SOB/ductal_carcinoma/SOB_M_DC_14-12312/40X/SOB_M_DC-14-12312-40-030.png  
  inflating: /content/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/malignant/SOB/ductal_carcinoma/SOB_M_DC_1

In [None]:
import zipfile

zip_path = "/content/breakhis.zip"
extract_path = "/content/breakhis"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("✅ Unzipped to:", extract_path)


✅ Unzipped to: /content/breakhis


In [None]:
import os

# Check top level extracted folder
print("🔍 Top-level folders inside /content/breakhis:")
print(os.listdir("/content/breakhis"))


🔍 Top-level folders inside /content/breakhis:
['Folds.csv', 'BreaKHis_v1']


In [None]:
# Show full path to 'breast' folder if it exists
for root, dirs, files in os.walk("/content/breakhis"):
    for name in dirs:
        if name.lower() == "breast":
            print("✅ Found breast folder at:", os.path.join(root, name))


✅ Found breast folder at: /content/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast


In [None]:
import os
import shutil

source_root = "/content/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast"
target_root = "/content/breakhis_40x_clean"

os.makedirs(f"{target_root}/benign", exist_ok=True)
os.makedirs(f"{target_root}/malignant", exist_ok=True)

count = {"benign": 0, "malignant": 0}

for label in ["benign", "malignant"]:
    class_path = os.path.join(source_root, label)
    for subdir, _, files in os.walk(class_path):
        if "40X" in subdir:  # filter only 40X folders
            for file in files:
                if file.lower().endswith((".png", ".jpg", ".jpeg")):
                    src = os.path.join(subdir, file)
                    dst = os.path.join(target_root, label, file)
                    shutil.copy2(src, dst)
                    count[label] += 1

print(f"✅ Copied {count['benign']} benign and {count['malignant']} malignant images into /content/breakhis_40x_clean")


✅ Copied 625 benign and 1370 malignant images into /content/breakhis_40x_clean


In [None]:
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# === CONFIG ===
dataset_path = "/content/breakhis_40x_clean"
image_size   = (224, 224)
batch_size   = 32
AUTOTUNE     = tf.data.AUTOTUNE

# === LOAD DATA ===
train_ds = tf.keras.utils.image_dataset_from_directory(
    dataset_path,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=image_size,
    batch_size=batch_size,
    label_mode="binary"
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    dataset_path,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=image_size,
    batch_size=batch_size,
    label_mode="binary"
)

train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds   = val_ds.cache().prefetch(AUTOTUNE)

# === CLASS WEIGHTS ===
labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
class_weights = dict(enumerate(weights))

# === MODEL: MobileNetV2 + Custom Head ===
base_model = tf.keras.applications.MobileNetV2(
    input_shape=(*image_size, 3),
    include_top=False,
    weights='imagenet'
)
base_model.trainable = False

model = tf.keras.Sequential([
    tf.keras.layers.Rescaling(1./255),
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# === TRAINING: Phase 1 (frozen base) ===
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(train_ds, validation_data=val_ds, epochs=5, callbacks=[early_stop], class_weight=class_weights)

# === TRAINING: Phase 2 (fine-tune base) ===
base_model.trainable = True
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5), loss='binary_crossentropy', metrics=['accuracy'])
model.fit(train_ds, validation_data=val_ds, epochs=10, initial_epoch=5, callbacks=[early_stop], class_weight=class_weights)

# === EVALUATION ===
test_ds = tf.keras.utils.image_dataset_from_directory(
    dataset_path,
    image_size=image_size,
    batch_size=batch_size,
    shuffle=False,
    label_mode="binary"
).cache().prefetch(AUTOTUNE)

y_true, y_pred = [], []
for images, labels in test_ds:
    preds = model.predict(images)
    y_true.extend(labels.numpy())
    y_pred.extend((preds > 0.5).astype("int32").flatten())

print("\n📋 Classification Report")
print(classification_report(y_true, y_pred, target_names=["Benign", "Malignant"]))

cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=["Benign", "Malignant"],
            yticklabels=["Benign", "Malignant"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


Found 1995 files belonging to 2 classes.
Using 1596 files for training.
Found 1995 files belonging to 2 classes.
Using 399 files for validation.


  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatch()]
  labels = [int(label.numpy()) for _, label in train_ds.unbatc

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/5
[1m 4/50[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m48s[0m 1s/step - accuracy: 0.6693 - loss: 0.7602

KeyboardInterrupt: 

In [None]:
# 🔄 Add Augmentation to Training Pipeline
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.2),
    tf.keras.layers.RandomZoom(0.2),
    tf.keras.layers.RandomContrast(0.2),
    tf.keras.layers.RandomBrightness(0.2),
])

train_ds_aug = train_ds.map(lambda x, y: (data_augmentation(x, training=True), y)).cache().prefetch(tf.data.AUTOTUNE)


In [None]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import classification_report

# Define your test dataset again (unaltered, base version)
test_ds = tf.keras.utils.image_dataset_from_directory(
    "/content/breakhis_40x_clean",
    image_size=(224, 224),
    batch_size=32,
    shuffle=False,
    label_mode="binary"
).cache().prefetch(tf.data.AUTOTUNE)

# Define augmentations for TTA
tta_augment = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.15),
    tf.keras.layers.Rescaling(1./255)
])

# Run TTA predictions
y_true, y_pred_tta = [], []

for images, labels in test_ds:
    tta_preds = []
    for _ in range(5):  # 5 augmented versions
        augmented = tta_augment(images, training=True)
        preds = model.predict(augmented)
        tta_preds.append(preds)
    final_preds = np.mean(tta_preds, axis=0)
    y_true.extend(labels.numpy())
    y_pred_tta.extend((final_preds > 0.5).astype("int32").flatten())

print("\n📋 Classification Report (TTA)")
print(classification_report(y_true, y_pred_tta, target_names=["Benign", "Malignant"]))


Found 1995 files belonging to 2 classes.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
