# Setting up the environment

In [None]:
# Colab-only helper: opens a browser file picker so a file can be uploaded
# into the runtime's working directory. The next cell copies kaggle.json
# from there.
from google.colab import files

# The call's return value is kept in `uploaded`.
uploaded = files.upload()

In [None]:
# Copy kaggle.json from the working directory into ~/.kaggle/.
# -p keeps mkdir from failing when the directory already exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the breast-histopathology-images dataset archive with the Kaggle CLI.
!kaggle datasets download -d paultimothymooney/breast-histopathology-images

In [None]:
# Create the extraction target directory. -p makes the cell safe to re-run:
# plain mkdir errors out when the directory already exists.
!mkdir -p dataset

In [None]:
# List the working directory; the next cell expects the downloaded archive here.
# NOTE(review): looks like a debugging leftover — consider removing from the final notebook.
!ls

In [None]:
# Extract the archive into dataset/ quietly (-q). -n skips files that already
# exist, so re-running the cell does not stop at unzip's interactive
# "replace?" prompt.
!unzip -q -n breast-histopathology-images.zip -d dataset

## Imports and random seeds


In [None]:
import numpy as np
import glob
import random
import warnings
warnings.filterwarnings(action = 'ignore')
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from glob import glob

from PIL import Image

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight

from glob import glob
from os import listdir

import time
import copy
from tqdm import tqdm_notebook as tqdm


import tensorflow as tf

# Seed every random number generator the notebook relies on. Python's
# `random` drives the sample preview, NumPy drives the dataset shuffle, and
# TensorFlow's own generator must be seeded separately for training runs
# to be repeatable.
SEED = 98
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Collect every PNG patch under the dataset folder. glob returns paths in an
# arbitrary, filesystem-dependent order, so the list is sorted to make the
# seeded sampling and the "first N per class" selection further down
# repeatable across machines and runs.
samples = sorted(glob('dataset/IDC_regular_ps50_idx5/**/*.png', recursive=True))

# Preview the first few paths.
samples[:3]

In [None]:
# Partition the patch paths by class. The character immediately before the
# ".png" extension (index -5) is compared with '0': a match goes to the
# negative list, anything else is treated as positive.
IDC_negative = [path for path in samples if path[-5] == '0']
IDC_positive = [path for path in samples if path[-5] != '0']

# Show a few example paths and the size of each class.
for label, paths in (('Negative', IDC_negative), ('Positive', IDC_positive)):
    print(label + ' samples: ', paths[:3])
for label, paths in (('Negative', IDC_negative), ('Positive', IDC_positive)):
    print(label + ' sample size: ', len(paths))

In [None]:
# Draw the same number of random patches from each class for a visual check.
sample_size = 6  # patches shown per class (one class per row)
random_negative_samples = random.sample(IDC_negative, sample_size)
random_positive_samples = random.sample(IDC_positive, sample_size)

# Row 0 holds the IDC-negative patches, row 1 the IDC-positive ones.
grid_rows = (
    ('IDC Negative', random_negative_samples),
    ('IDC Positive', random_positive_samples),
)

plt.figure(figsize=(12, 6))
for row, (title, paths) in enumerate(grid_rows):
    for col, path in enumerate(paths):
        # Subplot indices are 1-based and run row by row across the grid.
        plt.subplot(2, sample_size, row * sample_size + col + 1)
        plt.imshow(mpimg.imread(path))
        plt.axis('off')
        plt.title(title)

plt.tight_layout()
plt.show()

In [None]:
from itertools import islice

from PIL import Image
import numpy as np

# Target patch size; INPUT_SHAPE adds the RGB channel axis for the model input.
IMG_SIZE = (96, 96)
INPUT_SHAPE = (96, 96, 3)
# Maximum number of images loaded from each class.
NUM_SAMPLES_PER_CLASS = 5000

# Accumulators filled by extract_features_labels (reset on every run of this cell).
X = []
y = []


def extract_features_labels(sample_paths, label_value):
    """Load images from ``sample_paths`` into the global ``X`` / ``y`` lists.

    At most ``NUM_SAMPLES_PER_CLASS`` paths are consumed, in the order given.
    Each image is converted to RGB, resized to ``IMG_SIZE`` with Lanczos
    resampling and scaled to the range [0, 1] as float32.

    Args:
        sample_paths: iterable of image file paths.
        label_value: label appended to ``y`` for every image loaded.

    Returns:
        None. ``X`` and ``y`` are extended in place.
    """
    # NOTE(review): only the first NUM_SAMPLES_PER_CLASS paths are used, so the
    # subset depends on the order of the input list — confirm this is intended.
    for path in islice(sample_paths, NUM_SAMPLES_PER_CLASS):
        # The context manager closes the file promptly; convert('RGB')
        # guarantees three channels so all arrays stack into one tensor.
        with Image.open(path) as img:
            resized = img.convert('RGB').resize(IMG_SIZE, Image.LANCZOS)
        # float32 halves the memory of the default float64 result.
        X.append(np.asarray(resized, dtype=np.float32) / 255.0)
        y.append(label_value)


# IDC-negative patches get label 0, IDC-positive patches get label 1
# (up to NUM_SAMPLES_PER_CLASS each).
extract_features_labels(IDC_negative, label_value=0)
extract_features_labels(IDC_positive, label_value=1)

# Convert lists to NumPy arrays
X = np.array(X)
y = np.array(y)

# Shuffle features and labels together with one shared permutation so the
# two classes are interleaved instead of sitting in two contiguous runs.
shuffled_indices = np.arange(len(y))
np.random.shuffle(shuffled_indices)
X = X[shuffled_indices]
y = y[shuffled_indices]

# Check the shape of the features and labels arrays
print("Features shape:", X.shape)
print("Labels shape:", y.shape)


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Hold out 30% of the samples as the final prediction set.
X_train, X_predict, y_train, y_true = train_test_split(X, y, test_size=0.3, random_state=7)

# Cut-off index inside the remaining samples. Note that `rate` is applied to
# the size of the full dataset, not to the size of the post-split portion.
rate = 0.5
num = int(X.shape[0] * rate)

# The first `num` rows stay in the training set; the rest are stored under
# the *_test names.
X_train, X_test = X_train[:num], X_train[num:]
y_train, y_test = y_train[:num], y_train[num:]

# One-hot encode the labels into two columns.
y_train, y_test, y_true = (
    to_categorical(labels, 2) for labels in (y_train, y_test, y_true)
)

# Report the shape of every split.
for name, array in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('X_predict', X_predict),
    ('y_train', y_train),
    ('y_test', y_test),
    ('y_true', y_true),
):
    print('{} shape: {}'.format(name, array.shape))


In [None]:
# Load VGG16 with ImageNet weights and without its fully connected head.
# NOTE(review): `base_model` is reassigned to MobileNetV2 in the next cell
# before it is ever used, so this cell only costs a weight download and
# memory — consider deleting it.
base_model = tf.keras.applications.VGG16(weights='imagenet', include_top=False)  # Exclude the top fully connected layers

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Rescaling

# ImageNet-pretrained MobileNetV2 without its classification head, used as
# the feature extractor.
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=INPUT_SHAPE)

# Freeze the backbone so only the new classification layers are trained.
base_model.trainable = False

# Classifier: rescale -> frozen backbone -> global pooling -> dense head.
model = Sequential([
    Input(shape=INPUT_SHAPE),
    # The image arrays were scaled to [0, 1] when loaded, but the pretrained
    # MobileNetV2 weights expect inputs in [-1, 1]. Map x -> 2x - 1 so the
    # frozen features are computed on the input range they were trained on.
    Rescaling(scale=2.0, offset=-1.0),
    base_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu'),
    # Two softmax outputs, matching the two-column one-hot labels.
    Dense(2, activation='softmax'),
])

# Print the summary of the model
model.summary()


In [None]:
# Import from tensorflow.keras, like the rest of the notebook, instead of
# the standalone keras package.
from tensorflow.keras.optimizers import Adam

# Bind the instance to its own name. Rebinding `Adam` to the instance would
# shadow the class and make this cell fail the second time it is run.
optimizer = Adam(learning_rate=0.0001)

# categorical_crossentropy is the matching loss for a softmax output with
# one-hot labels. For two classes it yields the same loss value as
# binary_crossentropy over the two outputs.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Model Training

In [None]:
# Early-stopping callback: watch the validation loss, give up after five
# epochs in a row without improvement, log a message when stopping, and
# restore the weights from the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Train for at most 30 epochs in batches of 32. X_test / y_test serve as the
# validation data, and the early-stopping callback may end the run sooner.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32,
    callbacks=[early_stopping],
)

In [None]:
# Accuracy per epoch: training curve first, then the validation curve
# (labelled 'test' to match the X_test / y_test naming).
for metric_key, curve_label in (('accuracy', 'train'), ('val_accuracy', 'test')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Model Accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Loss per epoch: training curve first, then the validation curve
# (labelled 'test' to match the X_test / y_test naming).
for metric_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Model Loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(loc='upper left')
plt.show()