In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the Data

In [None]:
# import libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df_meta = pd.read_csv('/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/csv/meta.csv')
df_meta.head()

In [None]:
# load dicom info file
df_dicom = pd.read_csv('/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/csv/dicom_info.csv')
df_dicom.head()

In [None]:
# check image types in dataset
df_dicom.SeriesDescription.unique()

In [None]:
# check image path in dataset
# cropped images
cropped_images = df_dicom[df_dicom.SeriesDescription=='cropped images'].image_path
cropped_images.head(5)

In [None]:
#full mammogram images
full_mammo = df_dicom[df_dicom.SeriesDescription=='full mammogram images'].image_path
full_mammo.head(5)

In [None]:
# ROI images
roi_img = df_dicom[df_dicom.SeriesDescription=='ROI mask images'].image_path
roi_img.head(5)

In [None]:
# set correct image path for image types
imdir = '../input/cbis-ddsm-breast-cancer-image-dataset/jpeg'

In [None]:
# Point every path at the JPEG directory by swapping the 'CBIS-DDSM/jpeg'
# prefix recorded in dicom_info.csv for `imdir`.
cropped_images, full_mammo, roi_img = (
    paths.replace('CBIS-DDSM/jpeg', imdir, regex=True)
    for paths in (cropped_images, full_mammo, roi_img)
)

# Show the first rewritten path of each image type as a sanity check.
for heading, paths in (
    ('Cropped Images paths:\n', cropped_images),
    ('Full mammo Images paths:\n', full_mammo),
    ('ROI Mask Images paths:\n', roi_img),
):
    print(heading)
    print(paths.iloc[0])

In [None]:
# organize image paths
def _index_by_parent_dir(paths):
    """Return a dict mapping each image's parent folder name to its path.

    The key is the name of the directory that directly contains the file,
    so it does not depend on how many components the path prefix has.
    When several paths share a folder, the last one wins.
    """
    return {os.path.basename(os.path.dirname(path)): path for path in paths}


full_mammo_dict = _index_by_parent_dir(full_mammo)
cropped_images_dict = _index_by_parent_dir(cropped_images)
# The ROI lookup has to go into its own dict: assigning into `roi_img`
# would add rows to the Series being iterated and leave roi_img_dict empty.
roi_img_dict = _index_by_parent_dir(roi_img)

# view keys
next(iter((full_mammo_dict.items())))

# Mass Dataset

In [None]:
# load the mass dataset
mass_train = pd.read_csv('/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/csv/mass_case_description_train_set.csv')
mass_test = pd.read_csv('/kaggle/input/cbis-ddsm-breast-cancer-image-dataset/csv/mass_case_description_test_set.csv')

mass_train.head()

In [None]:
# fix image paths
def fix_image_path(data, full_lookup=None, cropped_lookup=None):
    """Replace DICOM-style paths in `data` with real JPEG paths, in place.

    Columns at positions 11 and 12 are rewritten (presumably the full
    mammogram and cropped image path columns of the case-description CSV --
    verify against the file). For each value, the third "/"-separated
    component is looked up in the matching dictionary.

    Values that are already one of the lookup's target paths are left
    untouched, so calling this twice on the same frame is safe.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to update; modified in place.
    full_lookup : dict, optional
        Key -> path for column 11. Defaults to the notebook-level
        `full_mammo_dict`.
    cropped_lookup : dict, optional
        Key -> path for column 12. Defaults to the notebook-level
        `cropped_images_dict`.

    Raises
    ------
    KeyError
        If a path's key is missing from the corresponding lookup.
    """
    if full_lookup is None:
        full_lookup = full_mammo_dict
    if cropped_lookup is None:
        cropped_lookup = cropped_images_dict

    if len(data) == 0:
        return

    for col_pos, lookup in ((11, full_lookup), (12, cropped_lookup)):
        already_fixed = set(lookup.values())
        fixed = [
            raw_path if raw_path in already_fixed else lookup[raw_path.split("/")[2]]
            for raw_path in data.iloc[:, col_pos]
        ]
        # one column-wide write instead of one .iloc write per row
        data.iloc[:, col_pos] = fixed
        
# apply to datasets
fix_image_path(mass_train)
fix_image_path(mass_test)

In [None]:
# check unique values in pathology column
mass_train.pathology.unique()

In [None]:
mass_train.info()

In [None]:
# Give the space-separated column names snake_case equivalents so they can be
# used as attributes; every other column keeps its name.
spaced_columns = [
    'left or right breast',
    'image view',
    'abnormality id',
    'abnormality type',
    'mass shape',
    'mass margins',
    'image file path',
    'cropped image file path',
    'ROI mask file path',
]
mass_train = mass_train.rename(
    columns={old: old.replace(' ', '_') for old in spaced_columns}
)

mass_train.head(5)

In [None]:
# check for null values
mass_train.isnull().sum()

In [None]:
# fill in missing values using the backwards fill method
# (Series.bfill() is the supported spelling; fillna(method='bfill') is deprecated)
# NOTE(review): a gap in the last row has no later value to copy and stays null.
mass_train['mass_shape'] = mass_train['mass_shape'].bfill()
mass_train['mass_margins'] = mass_train['mass_margins'].bfill()

#check null values
mass_train.isnull().sum()

In [None]:
# quantitative summary of features
mass_train.describe()

In [None]:
# view mass_test
mass_test.head()

In [None]:
# check datasets shape
print(f'Shape of mass_train: {mass_train.shape}')
print(f'Shape of mass_test: {mass_test.shape}')

In [None]:
mass_test.isnull().sum()

In [None]:
# Show the raw column names of mass_test before renaming.
print(mass_test.columns)
print('\n')

# Apply the same snake_case renaming that was used for mass_train.
spaced_columns = [
    'left or right breast',
    'image view',
    'abnormality id',
    'abnormality type',
    'mass shape',
    'mass margins',
    'image file path',
    'cropped image file path',
    'ROI mask file path',
]
mass_test = mass_test.rename(
    columns={old: old.replace(' ', '_') for old in spaced_columns}
)

# view renamed columns
mass_test.columns

In [None]:
# fill in missing values using the backwards fill method
# (Series.bfill() is the supported spelling; fillna(method='bfill') is deprecated)
mass_test['mass_margins'] = mass_test['mass_margins'].bfill()

#check null values
mass_test.isnull().sum()

# Visualizations

In [None]:
# pathology distributions
value = mass_train['pathology'].value_counts()
plt.figure(figsize=(8,8))

plt.pie(value, labels=value.index, autopct='%1.1f%%')
plt.title('Breast Cancer Mass Types', fontsize=14)
#plt.savefig('/kaggle/working/pathology_distributions.png')
plt.show()

In [None]:
# examine breast assessment categories, split by pathology
# The legend text in the title describes BI-RADS assessment categories;
# NOTE(review): this assumes `assessment` holds BI-RADS 0-5 as in the
# CBIS-DDSM description -- confirm against the dataset documentation.
plt.figure(figsize=(10,8))
sns.countplot(mass_train, y='assessment', hue='pathology', palette='viridis')
plt.title('Breast Cancer Assessment (BI-RADS)\n\n0: Incomplete || 1: Negative\n2: Benign || 3: Probably Benign\n4: Suspicious || 5: Highly Suggestive of Malignancy',
          fontsize=12)
plt.ylabel('BI-RADS Assessment Category')
plt.xlabel('Count')
#plt.savefig('/kaggle/working/breast_assessment.png')
plt.show()

In [None]:
# examine cancer subtlety
plt.figure(figsize=(10,8))
sns.countplot(mass_train, x='subtlety', palette='viridis')
plt.title('Breast Cancer Mass Subtlety', fontsize=12)
plt.xlabel('Subtlety Grade')
plt.ylabel('Count')
#plt.savefig('/kaggle/working/cancer_subtlety.png')
plt.show()

In [None]:
# view breast mass shape distribution against pathology
plt.figure(figsize=(10,8))

sns.countplot(mass_train, x='mass_shape', hue='pathology')
plt.title('Mass Shape Distribution by Pathology', fontsize=14)
plt.xlabel('Mass Shape')
plt.xticks(rotation=30, ha='right')
plt.ylabel('Pathology Count')
plt.legend()
#plt.savefig('/kaggle/working/mass_pathology.png')
plt.show()

In [None]:
# breast density against pathology
plt.figure(figsize=(10,8))

sns.countplot(mass_train, x='breast_density', hue='pathology')
plt.title('Breast Density vs Pathology\n\n1: fatty || 2: Scattered Fibroglandular Density\n3: Heterogenously Dense || 4: Extremely Dense',
          fontsize=14)
plt.xlabel('Density Grades')
plt.ylabel('Count')
plt.legend()
#plt.savefig('/kaggle/working/density_pathology.png')
plt.show()

In [None]:
# Display some images
import matplotlib.image as mpimg

# create function to display images
def display_images(column, number):
    """Show the first `number` images of `mass_train` side by side.

    Parameters
    ----------
    column : str
        Name of the `mass_train` column holding the image file paths.
    number : int
        How many leading rows to display (one subplot per row).

    Each subplot is titled with the row's `pathology` value.
    """
    sample = mass_train.head(number)

    # squeeze=False keeps `axes` two-dimensional even when number == 1,
    # so it can always be iterated.
    fig, axes = plt.subplots(1, number, figsize=(15,5), squeeze=False)
    panels = axes[0]

    # Pair axes with rows by position; the DataFrame's index labels are not
    # guaranteed to be 0..n-1 and must not be used to index `axes`.
    for ax, (_, row) in zip(panels, sample.iterrows()):
        image = mpimg.imread(row[column])
        ax.imshow(image, cmap='gray')
        ax.set_title(f"{row['pathology']}")
        ax.axis('off')

    # Hide any panels left over when the frame has fewer rows than requested.
    for ax in panels[len(sample):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

print('Full Mammograms:\n')
display_images('image_file_path', 5)
print('Cropped Mammograms:\n')
display_images('cropped_image_file_path', 5)

# Preprocessing of Images

In [None]:
import tensorflow
import cv2
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

def image_processor(image_path, target_size):
    """Load an image from disk and prepare it as CNN input.

    Parameters
    ----------
    image_path : str
        Path to the image file; resolved to an absolute path before reading.
    target_size : sequence
        `target_size[0]` is used as the output height and `target_size[1]`
        as the output width.

    Returns
    -------
    numpy.ndarray
        RGB image resized to the target size with pixel values divided by 255.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read the file (missing, unreadable or not an image).
    """
    absolute_image_path = os.path.abspath(image_path)
    image = cv2.imread(absolute_image_path)
    # cv2.imread signals failure by returning None rather than raising;
    # fail here with the offending path instead of inside cvtColor.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {absolute_image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    # cv2.resize expects (width, height)
    image = cv2.resize(image, (target_size[1], target_size[0]))
    image_array = image / 255.0  # Normalize pixels
    return image_array

# Train_Test_Validation

In [None]:
# merge datasets; ignore_index gives the combined frame unique 0..n-1 labels
# instead of repeating the row labels of mass_train and mass_test
full_mass = pd.concat([mass_train, mass_test], axis=0, ignore_index=True)

# Define the target size
target_size = (224, 224, 3)

# apply preprocessor to every image of the combined dataset
full_mass['processed_images'] = full_mass['image_file_path'].apply(lambda x: image_processor(x, target_size))

# create a binary mapper
class_mapper = {'MALIGNANT': 1, 'BENIGN': 0, 'BENIGN_WITHOUT_CALLBACK': 0}

# Convert the processed_images column to an array
X_resized = np.array(full_mass['processed_images'].tolist())

# Verify the shape of the resized array
print(X_resized.shape)

# apply class mapper to pathology column
# .map yields integer labels directly; any pathology value that is not in
# class_mapper is reported instead of silently becoming an extra class.
labels = full_mass['pathology'].map(class_mapper)
unmapped = full_mass.loc[labels.isna(), 'pathology'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected pathology values: {list(unmapped)}")
full_mass['labels'] = labels.astype(int)

# check number of classes
num_classes = len(full_mass['labels'].unique())

# set customary feature and target variables
X = X_resized
y = full_mass['labels'].values

# Reshape X to (samples, height, width, channels) using target_size
X = X.reshape(-1, *target_size)

# Split data into train, test, and validation sets (70, 20, 10):
# 30% is held out first, then 33% of that hold-out becomes the validation set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)

In [None]:
# convert integer labels to one-hot encoded labels
# Only 1-D label vectors are encoded, so re-running this cell does not
# one-hot encode arrays that are already one-hot.
y_train, y_test, y_val = (
    to_categorical(labels, num_classes) if np.ndim(labels) == 1 else labels
    for labels in (y_train, y_test, y_val)
)

In [None]:
unique_labels = full_mass['labels'].unique()
num_unique_labels = len(unique_labels)

print("Unique labels:", unique_labels)
print("Number of unique labels:", num_unique_labels)
print("Num classes:", num_classes)

# CNN Architecture

In [None]:
# import necessary tensorflow libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import plot_model

# Random geometric augmentations applied to training batches on the fly.
augmentation_settings = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_settings)

# Generator yielding augmented training batches of 16 images.
train_data_augmented = train_datagen.flow(X_train, y_train, batch_size=16)

# Baseline CNN: five conv/max-pool stages, then a dense head.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),                                  # flatten feature maps
    Dense(512, activation='relu'),              # fully connected layer
    Dense(num_classes, activation='softmax'),   # output layer
])

# Compile with the Adam optimizer and track accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the augmented generator and validate on the held-out validation set.
history = model.fit(
    train_data_augmented,
    epochs=30,
    validation_data=(X_val, y_val),
)

# Write a diagram of the architecture (with tensor shapes) to a PNG file.
plot_model(model, to_file='model-1_architecture.png', show_shapes=True)

In [None]:
# model summary
model.summary()

# Evaluation

In [None]:
model.evaluate(X_test, y_test)

# Hyperparameter Tuning

In [None]:
# increase epoch, batch size, reduce number of layers, add dropout ratio

# Same augmentation as the first model, but with batches of 32.
train_data_augmented_2 = train_datagen.flow(X_train, y_train, batch_size=32)

# instantiate second model
model_2 = Sequential()

# Add layers: four conv/max-pool stages, each followed by 25% dropout
model_2.add(Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)))
model_2.add(MaxPooling2D((2, 2)))
model_2.add(Dropout(0.25))

model_2.add(Conv2D(64, (3,3), activation='relu'))
model_2.add(MaxPooling2D((2, 2)))
model_2.add(Dropout(0.25))

model_2.add(Conv2D(64, (3, 3), activation='relu'))
model_2.add(MaxPooling2D((2, 2)))
model_2.add(Dropout(0.25))

model_2.add(Conv2D(128, (3, 3), activation='relu'))
model_2.add(MaxPooling2D((2, 2)))
model_2.add(Dropout(0.25))

# Dense head with 50% dropout before the softmax output
model_2.add(Flatten())
model_2.add(Dense(128, activation='relu'))
model_2.add(Dropout(0.5))
model_2.add(Dense(num_classes, activation='softmax'))

# compile
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras optimizers.
model_2.compile(optimizer=Adam(learning_rate=0.0001),
                loss='binary_crossentropy',
                metrics=['accuracy'])

# fit model
history_2 = model_2.fit(train_data_augmented_2,
                    epochs=35,
                    validation_data=(X_val, y_val)
                   )

# save model architecture
plot_model(model_2, to_file='model-2_architecture.png', show_shapes=True)

In [None]:
# model summary
model_2.summary()

# Evaluation

In [None]:
model_2.evaluate(X_test, y_test)

# Classification Report

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# create labels for confusion matrix
# Names are listed in class-id order: class_mapper assigns 0 to the benign
# pathologies and 1 to MALIGNANT, and both classification_report and
# confusion_matrix order their rows by sorted label value.
cm_labels = ['BENIGN', 'MALIGNANT']

# obtain predictions
y_pred_test = model.predict(X_test)
y_pred_val = model.predict(X_val)

# convert predicted probabilities to class predictions
y_pred_classes_test = np.argmax(y_pred_test, axis=1)
y_pred_classes_val = np.argmax(y_pred_val, axis=1)

# y_test and y_val are one-hot encoded; recover the integer class ids
y_true_classes_test = np.argmax(y_test, axis=1)
y_true_classes_val = np.argmax(y_val, axis=1)

# generate classification reports for test and val sets
test_report = classification_report(y_true_classes_test, y_pred_classes_test, target_names=cm_labels)
val_report = classification_report(y_true_classes_val, y_pred_classes_val, target_names=cm_labels)

# generate confusion matrices for test and validation sets
test_cm = confusion_matrix(y_true_classes_test, y_pred_classes_test)
val_cm = confusion_matrix(y_true_classes_val, y_pred_classes_val)

# create function to print confusion matrix
def plot_confusion_matrix(cm, labels, title):
    """Draw a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : array-like
        Square matrix of counts; rows are true classes, columns predicted.
    labels : list of str
        Tick labels, in the same order as the matrix rows/columns.
    title : str
        Figure title.
    """
    plt.figure(figsize=(8, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=labels, yticklabels=labels)
    plt.title(title, fontsize=14)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# print test and validation reports and matrices
print(f"Test Set Classification report:\n {test_report}\n")
plot_confusion_matrix(test_cm, cm_labels, 'Test Set Confusion Matrix')

In [None]:
print(f"Validation Set Classification report:\n {val_report}\n")
plot_confusion_matrix(val_cm, cm_labels, 'Validation Set Confusion Matrix')

# ROC_AUC Curves

In [None]:
from sklearn.metrics import roc_curve, auc

# Use the trained model to predict probabilities for the test set
y_pred_prob = model.predict(X_test)

# Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(y_test[:, 1], y_pred_prob[:, 1])
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

# Print the AUC score
print(f'AUC: {roc_auc:.2f}')

# Visualizing Loss vs Epoch/Accuracy vs Epoch 

In [None]:
history_dict = history.history
history_dict.keys()

In [None]:
# plot training loss vs validation loss
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
acc = history_dict['accuracy']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, loss_values, 'b', label='Training Loss')
plt.plot(epochs, val_loss_values, 'r', label='Validation Loss')
plt.title('Training and Validation Loss', fontsize=12)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
#history_df = pd.DataFrame(history.history)
#history_df[['loss', 'val_loss']].plot()

#history_df = pd.DataFrame(history.history)
#history_df[['accuracy', 'val_accuracy']].plot()

In [None]:
# plot training vs validation accuracy
val_acc_values = history_dict['val_accuracy']
acc = history_dict['accuracy']
# Recompute the x-axis here so this cell does not depend on an `epochs`
# variable left over from whichever cell ran last.
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'b', label='Training Accuracy')
plt.plot(epochs, val_acc_values, 'r', label='Validation Accuracy')
plt.title('Training and Validation Accuracy', fontsize=12)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # this figure shows accuracy, not loss
plt.legend()
plt.show()

# Transfer Learning

In [None]:
# use VGG19
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import GlobalAveragePooling2D

# Augmented training batches of 16 images for the transfer-learning run.
train_data_aug_3 = train_datagen.flow(X_train, y_train, batch_size=16)

vgg_model = Sequential()

# ImageNet-pretrained VGG19 convolutional base without its classifier head.
# (`classes` only applies when include_top=True, so it is not passed.)
# NOTE(review): inputs are scaled to [0, 1] by image_processor rather than
# passed through VGG19's own preprocess_input -- confirm this is intended.
pretrained_model = VGG19(include_top=False,
                         input_shape=(224, 224, 3),
                         weights='imagenet')

# Freeze the pretrained weights so only the new head is trained.
for layer in pretrained_model.layers:
    layer.trainable=False

# add layers
vgg_model.add(pretrained_model)
# Global average pooling over the base's feature maps. Assigning to
# `pretrained_model.layers[-1]` does not change the model, so the pooling
# layer is added to the Sequential stack explicitly; its output is already
# flat, which makes a separate Flatten layer unnecessary.
vgg_model.add(GlobalAveragePooling2D())
vgg_model.add(Dense(512, activation='relu'))
vgg_model.add(Dense(num_classes, activation='softmax'))

# compile model
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras optimizers.
vgg_model.compile(optimizer=Adam(learning_rate=0.001),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

# fit model
history_3 = vgg_model.fit(train_data_aug_3,
                            epochs=30,
                            validation_data=(X_val, y_val))

# save architecture
plot_model(vgg_model, to_file='transfer_learning-1_archictect.png', show_shapes=True)

In [None]:
# model summary
vgg_model.summary()

# Classification Report: Transfer Learning

In [None]:
# classification report and confusion matrix

#obtain predictions
y_pred_test_res = vgg_model.predict(X_test)
y_pred_val_res = vgg_model.predict(X_val)

# convert predicted probabilities to class predictions
y_pred_classes_test_res = np.argmax(y_pred_test_res, axis=1)
y_pred_classes_val_res = np.argmax(y_pred_val_res, axis=1)

# get true classes
y_true_classes_test_res = np.argmax(y_test, axis=1)
y_true_classes_val_res = np.argmax(y_val, axis=1)

# generate classification report
test_report_res = classification_report(y_true_classes_test_res, y_pred_classes_test_res, target_names=cm_labels)
val_report_res = classification_report(y_true_classes_val_res, y_pred_classes_val_res, target_names=cm_labels)

# generate confusion matrix
test_cm_res = confusion_matrix(y_true_classes_test_res, y_pred_classes_test_res)
val_cm_res = confusion_matrix(y_true_classes_val_res, y_pred_classes_val_res)

In [None]:
print(f"Test Set Classification report:\n {test_report_res}\n")
plot_confusion_matrix(test_cm_res, cm_labels, 'Test Set Confusion Matrix: VGG19')

In [None]:
# Print the validation-set report and confusion matrix for the VGG19 model.
print(f'Validation Set Classification report:\n {val_report_res}\n')
plot_confusion_matrix(val_cm_res, cm_labels, 'Validation Set Confusion Matrix: VGG19')

# ROC-AUC Curves: Transfer Learning

In [None]:
# ROC-AUC Curves

# Use the trained model to predict probabilities for the test set
y_pred_prob = vgg_model.predict(X_test)

# Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(y_test[:, 1], y_pred_prob[:, 1])
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

# Print the AUC score
print(f'AUC: {roc_auc:.2f}')

# Epochs-Loss-Accuracy Visualization: Transfer Learning

In [None]:
history_3_dict = history_3.history
history_3_dict.keys()

In [None]:
# plot training loss vs validation loss
loss_values = history_3_dict['loss']
val_loss_values = history_3_dict['val_loss']
acc = history_3_dict['accuracy']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, loss_values, 'b', label='Training Loss')
plt.plot(epochs, val_loss_values, 'r', label='Validation Loss')
plt.title('Training and Validation Loss', fontsize=12)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# plot training vs validation accuracy
val_acc_values = history_3_dict['val_accuracy']
# Read accuracy and rebuild the x-axis here so this cell does not depend on
# `acc` / `epochs` left over from another cell.
acc = history_3_dict['accuracy']
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'b', label='Training Accuracy')
plt.plot(epochs, val_acc_values, 'r', label='Validation Accuracy')
plt.title('Training and Validation Accuracy', fontsize=12)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # this figure shows accuracy, not loss
plt.legend()
plt.show()

# Save Model

# Demo

In [None]:
#image_pred = vgg_model.predict(y_test)

In [None]:
#image_output_class = labels[np.argmax(image_pred)] # to obtain a human readable output
#print('The predicted class is', image_output_class)