In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("aryashah2k/breast-ultrasound-images-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/aryashah2k/breast-ultrasound-images-dataset?dataset_version_number=1...


100%|██████████| 195M/195M [00:07<00:00, 26.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1


In [2]:
import random
import os
import shutil

In [3]:
os.makedirs("data")

FileExistsError: [Errno 17] File exists: 'data'

In [None]:
shutil.move("/root/.cache/kagglehub/datasets/aryashah2k/breast-ultrasound-images-dataset/versions/1", "data")

In [3]:
import tensorflow
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Flatten
from keras.applications.vgg16 import VGG16


In [None]:
l_benign = []
l_normal = []
l_malignant = []

In [None]:
for dirname, _, filenames in os.walk('data/1/Dataset_BUSI_with_GT'):
    for filename in filenames:
        if ("mask" not in filename):
            if("benign" in filename):
                l_benign.append(os.path.join(dirname, filename))
            elif ("normal" in filename):
                l_normal.append(os.path.join(dirname, filename))
            elif ("malignant" in filename):
                l_malignant.append(os.path.join(dirname, filename))

In [None]:
random.shuffle(l_benign)
random.shuffle(l_normal)
random.shuffle(l_malignant)

In [None]:
split_ratio = 0.8

# Benign split
split_idx_benign = int(len(l_benign) * split_ratio)
benign_train = l_benign[:split_idx_benign]
benign_test = l_benign[split_idx_benign:]

# Normal split
split_idx_normal = int(len(l_normal) * split_ratio)
normal_train = l_normal[:split_idx_normal]
normal_test = l_normal[split_idx_normal:]

# Malignant split
split_idx_malignant = int(len(l_malignant) * split_ratio)
malignant_train = l_malignant[:split_idx_malignant]
malignant_test = l_malignant[split_idx_malignant:]


In [None]:
# create train and test directory
# train_dir = "/kaggle/working/breast-ultrasound-images-dataset/train"
# test_dir = "/kaggle/working/breast-ultrasound-images-dataset/test"

# os.makedirs(train_dir)
# os.makedirs(test_dir)

In [None]:
list_of_train_groups = [benign_train, normal_train, malignant_train]
list_of_test_groups = [benign_test, normal_test, malignant_test]

In [None]:
import os
import shutil

# Define the base paths
base_dir = "final_data"
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")

# Class names and their corresponding lists
classes = {
    "benign": {"train": benign_train, "test": benign_test},
    "malignant": {"train": malignant_train, "test": malignant_test},
    "normal": {"train": normal_train, "test": normal_test}
}

# Create directories
for split in ["train", "test"]:
    for cls in classes.keys():
        dir_path = os.path.join(base_dir, split, cls)
        os.makedirs(dir_path, exist_ok=True)

# Copy images to their respective directories
for cls, data in classes.items():
    for split in ["train", "test"]:
        for img_path in data[split]:
            # Destination path
            dest_path = os.path.join(base_dir, split, cls, os.path.basename(img_path))
            shutil.copy(img_path, dest_path)

print("Dataset organized successfully!")


Data Preprocessing

In [1]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define image size and batch size
image_size = (224, 224)
batch_size = 32

# Data augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1./255,         # Normalize pixel values
    rotation_range=20,      # Random rotation
    width_shift_range=0.1,  # Horizontal shift
    height_shift_range=0.1, # Vertical shift
    shear_range=0.1,        # Shear transformation
    zoom_range=0.1,         # Random zoom
    horizontal_flip=True,   # Random horizontal flip
    fill_mode='nearest'     # Filling strategy for new pixels
)

# Only rescaling for validation/test data
test_datagen = ImageDataGenerator(rescale=1./255)

# Load training images
train_generator = train_datagen.flow_from_directory(
    directory='final_data/train',
    target_size=image_size,
    batch_size=batch_size,
    class_mode='sparse'  # For multi-class classification
)

# Load testing images
test_generator = test_datagen.flow_from_directory(
    directory='final_data/test',
    target_size=image_size,
    batch_size=batch_size,
    class_mode='sparse'  # Same as training
)

# Check the class indices assigned
print("Class indices:", train_generator.class_indices)


Found 623 images belonging to 3 classes.
Found 157 images belonging to 3 classes.
Class indices: {'benign': 0, 'malignant': 1, 'normal': 2}


VGG16 model edited

In [4]:
conv_base = VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3)
)

In [5]:
conv_base.summary()

In [6]:

set_trainable = False

for layer in conv_base.layers:
  if(layer.name == "block5_conv1"):
    set_trainable = True
  if(set_trainable == True):
    layer.trainable = True
  else:
    layer.trainable = False


In [7]:
for layer in conv_base.layers:
  print(layer.name, layer.trainable)

input_layer False
block1_conv1 False
block1_conv2 False
block1_pool False
block2_conv1 False
block2_conv2 False
block2_pool False
block3_conv1 False
block3_conv2 False
block3_conv3 False
block3_pool False
block4_conv1 False
block4_conv2 False
block4_conv3 False
block4_pool False
block5_conv1 True
block5_conv2 True
block5_conv3 True
block5_pool True


Local model

In [8]:
from tensorflow.keras.layers import BatchNormalization

In [9]:
model = Sequential()

model.add(conv_base)
model.add(Flatten())
model.add(Dense(512, activation="relu"))
model.add(BatchNormalization())
model.add(Dense(256, activation="relu"))
model.add(BatchNormalization())
model.add(Dense(3, activation="softmax"))

In [10]:
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

In [11]:
model.summary()

In [None]:
model.fit(train_generator, epochs=15, validation_data=test_generator)

  self._warn_if_super_not_called()


Epoch 1/15
[1m 3/20[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m7:18[0m 26s/step - accuracy: 0.3576 - loss: 2.2759

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
