# Catherine de Andres Arceno
# Breast Cancer detection using Ensemble of homogeneous and heterogeneous Transfer learning methods

## Approach to complete problem statement
- **Handling imbalanced distribution -** After loading the data, we first balance the two classes.
- **Data Augmentation -** We then augment the data for better predictions.


In [None]:
#import libraries
import pandas as pd
import numpy as np
import keras
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob

from keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical
import cv2
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Collect every path three directory levels below the dataset root.
# glob returns paths in arbitrary order; the loaders below read the class
# label from the character just before the ".png" extension.
files = glob.glob('/kaggle/input/breast-histopathology-images/*/*/*')

In [None]:
def show_img(files):
    """Display a 5x5 grid of 25 randomly picked images from ``files``.

    Indices are drawn independently, so the same image can appear more than
    once. Every image is loaded at 150x150 and drawn without axes.
    """
    plt.figure(figsize=(10, 10))
    picks = np.random.randint(0, len(files), 25)
    for slot, pick in enumerate(picks, start=1):
        plt.subplot(5, 5, slot)
        pixels = img_to_array(load_img(files[pick], target_size=(150, 150)))
        plt.axis("off")
        plt.imshow(pixels.astype("uint8"))


show_img(files)

**Load Data**

In [None]:

def load_data(files, lower_limit, upper_limit):
    """Load the labelled PNG images found in ``files[lower_limit:upper_limit]``.

    The class label is the character right before the ``.png`` extension
    (``file[-5]``): ``"1"`` -> 1, ``"0"`` -> 0. Files that are not PNGs, or
    whose label character is neither ``"0"`` nor ``"1"``, are skipped entirely
    so that images and labels always stay aligned.

    Args:
        files: sequence of file paths.
        lower_limit: start index (inclusive) of the slice to load.
        upper_limit: end index (exclusive) of the slice to load.

    Returns:
        A tuple ``(data, labels)`` where ``data`` is the stacked array of
        images resized to 50x50 and scaled by 1/255, and ``labels`` is a list
        of ints of the same length.

    Raises:
        ValueError: if the slice contains no labelled PNG image.
    """
    data = []
    labels = []
    for file in files[lower_limit:upper_limit]:
        if not file.endswith(".png"):
            continue
        class_char = file[-5]
        if class_char not in ("0", "1"):
            # Unlabelled image: skip it instead of appending pixels without a
            # matching label.
            continue
        pixels = img_to_array(load_img(file, target_size=(50, 50)))
        pixels /= 255
        data.append(pixels)
        labels.append(int(class_char))

    if not data:
        raise ValueError("no labelled .png images found in the requested range")
    return np.stack(data), labels
    

In [None]:
from sklearn.model_selection import train_test_split
#loading 90000 imgs of each cls
#x_train, y_train = load_data(files, 0, 90000)
#20000 imgs for testing
#x_test, y_test = load_data(files, 90000, 110000)

In [None]:
#lets visualize the distribution of data in both classes
#sns.countplot(y_train)
#plt.title("class distribution in trainin data")
#plt.show()

### Handling Imbalance Distribution of data

In [None]:
# Handling Data Imbalances
def load_balanced_data(files, size, start_index):
    """Load a class-balanced set of images from ``files[start_index:]``.

    The class label is the character right before the ``.png`` extension
    (``file[-5]``). Up to ``size / 2`` positive (``"1"``) PNGs are loaded
    first, then at most the same number of negative (``"0"``) PNGs, so the
    result never holds more negatives than positives.

    Args:
        files: sequence of file paths.
        size: total number of images requested (half per class).
        start_index: index in ``files`` from which scanning starts.

    Returns:
        A tuple ``(images, labels)``: the stacked array of images resized to
        50x50 and scaled by 1/255, and the list of int labels. All positives
        come first, followed by the negatives (the data is not shuffled).

    Raises:
        ValueError: if no matching image was found.
    """
    half_size = int(size / 2)
    res = []
    y = []

    # Pass 1: positives, stopping as soon as the quota is reached.
    positives = 0
    for file in files[start_index:]:
        if positives == half_size:
            break
        if file.endswith(".png") and file[-5] == "1":
            pixels = img_to_array(load_img(file, target_size=(50, 50)))
            pixels /= 255
            res.append(pixels)
            y.append(1)
            positives += 1

    # Pass 2: as many negatives as positives were actually found.
    remaining = positives
    for file in files[start_index:]:
        if remaining == 0:
            break
        # Same ".png" filter as the positive pass, so non-image files are
        # never handed to load_img.
        if file.endswith(".png") and file[-5] == "0":
            pixels = img_to_array(load_img(file, target_size=(50, 50)))
            pixels /= 255
            res.append(pixels)
            y.append(0)
            remaining -= 1

    if not res:
        raise ValueError("no matching .png images found from start_index onward")
    return np.stack(res), y

In [None]:
# Balanced training set: 30000 images requested, i.e. up to 15000 per class,
# scanning the file list from index 0.
X_train2, Y_train2 = load_balanced_data(files,30000, 0)
# Balanced test set: 6000 images requested, i.e. up to 3000 per class,
# scanning the file list from index 120000 onward.
# NOTE(review): the training scan is not capped at index 120000, so the two
# sets could overlap if positives are sparse - confirm against the data.
X_test2, Y_test2 = load_balanced_data(files, 6000, 120000)

In [None]:
# Visualize the class distribution of the balanced training labels.
# The label vector is passed as `x=`: newer seaborn releases accept only
# `data` positionally, so a positional vector is no longer read as `x`.
sns.countplot(x=Y_train2)
plt.title("Training data distribution")
plt.xlabel("0-IDC Negative      1-IDC Positive")
plt.savefig("Training_distri.png")
plt.show()

In [None]:
# One-hot encode the integer 0/1 labels so they match the two-unit output
# layers of the models defined below.
Y_train2 = to_categorical(Y_train2)
Y_test2 = to_categorical(Y_test2)

## Data Augmentation

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Augmentation for the training images: random shifts, flips, zoom and shear.
# No `rescale` here: load_balanced_data already divides the pixels by 255,
# so rescaling again would squash the inputs into [0, 1/255].
train_datagen = ImageDataGenerator(height_shift_range=0.2,
                                width_shift_range=0.2,
                                horizontal_flip = True,
                                vertical_flip = True,
                                zoom_range=0.2,
                                shear_range=0.2)

# Held-out images are passed through unchanged (already scaled to [0, 1]).
test_datagen = ImageDataGenerator()

In [None]:
# Wrap the in-memory arrays in batch iterators of 32 images each:
# train_generator applies train_datagen's transforms, val_generator uses
# test_datagen.
train_generator = train_datagen.flow(X_train2, Y_train2, batch_size=32)
val_generator = test_datagen.flow(X_test2, Y_test2, batch_size=32)

In [None]:
# Number of training images (size of the first axis of X_train2); used below
# to work out how many batches to draw from the generator.
tot_imgs = X_train2.shape[0]
tot_imgs

In [None]:
# Materialise augmented batches from the generator into X_train / y_train.
# A single next() yields only one batch, so we loop to collect them.
from tqdm import tqdm
batch_size=32  # must match the batch_size passed to train_datagen.flow(...)
train_generator.reset()
# Draw whole batches only (a final partial batch is left out), but always at
# least one batch.
n_batches = max(1, tot_imgs // batch_size)
image_batches = []
label_batches = []
for _ in tqdm(range(n_batches)):
    img, label = next(train_generator)
    image_batches.append(img)
    label_batches.append(label)
# Concatenate once at the end: calling np.append inside the loop would copy
# the whole accumulated array on every iteration.
X_train = np.concatenate(image_batches, axis=0)
y_train = np.concatenate(label_batches, axis=0)
print(X_train.shape, y_train.shape)


## Creating a Vgg16 with Cross-validation

### Architecture of VGG16 with data augmentation
- Split the training data into 5 folds using KFold cross-validation
- Created a VGG16 model object with an image input shape of 50x50x3
- Froze the existing pre-trained layers (trainable set to False) and added our own head: a dense layer with 1024 units followed by the output layer
- Compiled the model using the Adam optimizer and binary cross-entropy as the loss function
- Finally, trained the model on each fold for 35 epochs with a batch size of 32

In [None]:
# Libraries needed to create the VGG16 architecture
from keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, BatchNormalization, Dropout, Add, ReLU, Input, Lambda
from tensorflow.keras import backend as K
from tensorflow.keras.metrics import Recall, Precision, AUC 
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.model_selection import StratifiedKFold, KFold

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
# K-fold cross-validation with 5 folds; samples are shuffled before splitting,
# with a fixed seed (10) so the folds are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=10)

In [None]:
# 5-fold cross-validation of the frozen-VGG16 classifier on X_train / y_train.
cvscores = []  # validation accuracy (in %) of every fold
output_classes=2
# enumerate keeps the fold number fixed for the whole iteration, so the
# printed fold, the plot titles and the saved file names all agree.
for i, (train, test) in enumerate(kfold.split(X_train, y_train), start=1):

    print(f"Fold {i}")
    # Build a fresh model per fold so no trained weights carry over.
    vgg = VGG16(input_shape=(50,50,3),weights='imagenet', include_top=False)

    # We do not want to train the existing pre-trained weights.
    for layer in vgg.layers:
        layer.trainable = False

    # Add the new classification head on top of the convolutional base.
    x = Flatten()(vgg.output)
    x = Dense(1024, activation='relu')(x)
    prediction = Dense(output_classes, activation='sigmoid')(x) #last prediction layer

    # Create a model object.
    model = Model(inputs=vgg.input, outputs=prediction)

    # Uncomment for the summary of each fold's model.
    #model.summary()

    # Compile the model with its loss, optimizer and metric.
    model.compile(loss = "binary_crossentropy", optimizer="adam", metrics=["acc"])

    # Fit on the training indices of this fold, validate on the held-out ones.
    history = model.fit(X_train[train], y_train[train], validation_data=(X_train[test], y_train[test]), epochs=35, batch_size=32)

    # Evaluate on the held-out indices; scores[1] is the "acc" metric.
    scores = model.evaluate(X_train[test], y_train[test], verbose=0)

    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    print("\n")
    cvscores.append(scores[1] * 100)

    # Accuracy curves of this fold.
    plt.figure(figsize=(6, 4))
    plt.plot(history.history['acc'], label="train-acc")
    plt.plot(history.history['val_acc'], label="val-acc")
    plt.title(f"Fold {i} analyzing accuracy")
    plt.legend(loc="best")
    plt.xlabel("Number of epochs")
    plt.savefig(f"Fold_{i}_acc.png")
    plt.show()

    # Loss curves of this fold.
    plt.figure(figsize=(6,4))
    plt.plot(history.history['loss'], label="train-loss")
    plt.plot(history.history['val_loss'], label="val-loss")
    plt.title(f"Fold {i} analyzing loss")
    plt.legend()
    plt.xlabel("Number of epochs")
    plt.savefig(f"Fold_{i}_loss.png")
    plt.show()


In [None]:
# Mean validation accuracy (in %) across the cross-validation folds.
np.mean(cvscores)

In [None]:
def recall(y_true, y_pred):
    """Keras metric: true positives / (actual positives + epsilon).

    Products and labels are clipped to [0, 1] and rounded before being
    summed; ``K.epsilon()`` guards against a zero denominator.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision(y_true, y_pred):
    """Keras metric: true positives / (predicted positives + epsilon).

    Products and predictions are clipped to [0, 1] and rounded before being
    summed; ``K.epsilon()`` guards against a zero denominator.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

## VGG16 Model

In [None]:
# VGG-16 transfer-learning model, trained directly on the balanced arrays.
output_classes = 2

# Pre-trained convolutional base without the ImageNet classifier on top.
vgg = VGG16(weights='imagenet', include_top=False, input_shape=(50, 50, 3))

# Freeze the pre-trained layers so that only the new head is trained.
for layer in vgg.layers:
    layer.trainable = False

# New head: flatten -> 1024-unit dense layer -> prediction layer.
x = Dense(1024, activation='relu')(Flatten()(vgg.output))
prediction = Dense(output_classes, activation='sigmoid')(x)

# Assemble the full model (call model.summary() to inspect it).
model = Model(inputs=vgg.input, outputs=prediction)

# Compile with accuracy plus the custom recall / precision metrics.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", recall, precision],
)

# Train for 20 epochs, validating on the balanced test arrays.
history = model.fit(
    X_train2,
    Y_train2,
    validation_data=(X_test2, Y_test2),
    epochs=20,
    batch_size=32,
)
    

In [None]:
# Evaluate the VGG16 model on the balanced test arrays. The values come back
# in the order loss, then the compiled metrics (accuracy, recall, precision).
# Recall and precision are stored under test_* names so that the `recall` and
# `precision` metric functions are not overwritten by floats.
loss, accuracy, test_recall, test_precision = model.evaluate(X_test2, Y_test2,verbose=1)
print("VGG16 Metrics Score")
print(f"accuracy: {accuracy} \nprecision: {test_precision} \nrecall: {test_recall} \nloss: {loss}")

In [None]:
# Accuracy curves (training vs. validation) of the VGG16 model.
plt.figure(figsize=(6, 4))
for metric_key, curve_label in (("accuracy", "train-acc"), ("val_accuracy", "val-acc")):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title("Analyzing Accuracy of VGG16")
plt.legend(loc="best")
plt.xlabel("Number of epochs")
plt.savefig("VGG16_accuracy.png")
plt.show()

In [None]:
# Loss curves (training vs. validation) of the VGG16 model.
plt.figure(figsize=(6, 4))
for metric_key, curve_label in (("loss", "train-loss"), ("val_loss", "val-loss")):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title("Analyzing Loss for VGG16")
plt.legend()
plt.xlabel("Number of epochs")
plt.savefig("VGG16_loss.png")
plt.show()

## Resnet50 Model


In [None]:
from tensorflow.keras.applications import ResNet50

In [None]:
#base_model = Sequential()
#base_model.add(ResNet50(include_top=False, weights='imagenet', pooling='max'))
#base_model.add(Dense(2, activation='sigmoid'))

In [None]:
#compile model
#base_model.compile(optimizer = tf.keras.optimizers.SGD(lr=0.0001), loss = 'binary_crossentropy', metrics = ['acc',recall, precision])

#training resnet
#resnet_history = base_model.fit(X_train2, Y_train2, validation_data = (X_test2, Y_test2), epochs = 20)


In [None]:
#loss, accuracy, recall, precision = base_model.evaluate(X_test2, Y_test2,verbose=1)
#print("Resnet Metrics")
#print(f"accuracy: {accuracy} \nprecision: {precision} \nrecall: {recall} \nloss: {loss}")

In [None]:
#plt.figure(figsize=(6, 4))
#plt.plot(resnet_history.history['acc'], label="train-acc")
#plt.plot(resnet_history.history['val_acc'], label="val-acc")
#plt.title("Analyzing Accuracy of ResNet50")
#plt.legend(loc="best")
#plt.savefig("resnet_accuracy.png")
#plt.show()

In [None]:
#Loss graph
#plt.figure(figsize=(6,4))
#plt.plot(resnet_history.history['loss'], label="train-loss")
#plt.plot(resnet_history.history['val_loss'], label="val-loss")
#plt.title("Analyzing Loss for Homogenous ResNet50")
#plt.legend()
#plt.savefig("homo_resnet_loss.png")
#plt.show()

## Creating Ensemble of Transfer Learning

Ensemble
- Define the ensemble function first.

- Inside it, we iterate over all the models and collect the output of each model's last layer.
- Then we add a merge layer (average) that computes the average of the output scores of all the models.
- Finally, we compile the ensemble model.

In [None]:
# Creating an Ensemble Model
def create_model(img_size=50, channels=3):
    """Build and compile a ResNet50-based two-output classifier.

    Args:
        img_size: height and width of the (square) input images. ResNet50
            requires at least 32.
        channels: number of input channels. The ImageNet weights used here
            require 3.

    Returns:
        A compiled Sequential model: ResNet50 base (ImageNet weights, global
        max pooling, no classifier top) followed by a 2-unit sigmoid layer.
    """
    model = Sequential()
    # Pass the input shape explicitly so the model (and any Input tensor
    # derived from model.input_shape) has concrete spatial dimensions.
    model.add(ResNet50(include_top=False, weights='imagenet', pooling='max',
                       input_shape=(img_size, img_size, channels)))
    model.add(Dense(2, activation='sigmoid'))
    # `learning_rate` is the supported argument name; `lr` is deprecated.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001), loss = 'binary_crossentropy',
                  metrics = ['acc'])
    return model

def create_vgg(img_size=50, channels=3, output_classes=2):
    """Build and compile a frozen-VGG16 classifier.

    Args:
        img_size: height and width of the (square) input images.
        channels: number of input channels. The ImageNet weights used here
            require 3.
        output_classes: number of units in the final sigmoid layer. It is a
            parameter with a default so the function no longer depends on a
            module-level variable set in another cell.

    Returns:
        A compiled functional model: VGG16 base (ImageNet weights, all layers
        frozen) -> Flatten -> Dense(1024, relu) -> Dense(output_classes,
        sigmoid).
    """
    vgg = VGG16(input_shape=(img_size, img_size, channels), weights='imagenet', include_top=False)
    # Keep the pre-trained weights fixed; only the new head is trained.
    for layer in vgg.layers:
        layer.trainable = False

    # Add the new classification head.
    x = Flatten()(vgg.output)
    x = Dense(1024, activation='relu')(x)
    prediction = Dense(output_classes, activation='sigmoid')(x)

    # Create and compile the model object.
    model = Model(inputs=vgg.input, outputs=prediction)
    model.compile(loss = "binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [None]:
def ensemble(model, model_input):
    """Build and compile a model that averages the outputs of several models.

    Args:
        model: iterable of Keras models to combine. (The parameter name is
            kept for backward compatibility; it holds the whole collection.)
        model_input: input tensor that is fed to every member model.

    Returns:
        A compiled Keras model named "ensemble" whose output is the
        element-wise average of the member outputs. Its summary is printed as
        a side effect.
    """
    # Use the models passed in by the caller rather than a module-level list.
    member_outputs = [member(model_input) for member in model]
    avg = keras.layers.average(member_outputs)

    modelEnsemble = Model(inputs=model_input, outputs=avg, name="ensemble")
    modelEnsemble.summary()

    # `learning_rate` is the supported argument name; `lr` is deprecated.
    modelEnsemble.compile(tf.keras.optimizers.Adam(learning_rate=0.0001), loss="binary_crossentropy", metrics=["acc"])
    return modelEnsemble

## Ensemble of ResNet50 and ResNet50(Homogeneous Resnet)

In [None]:
# Homogeneous ensemble members: two separately built ResNet50 classifiers.
model_1 = create_model(channels=3)
model_2 = create_model(channels=3)

# Give each member its own name before collecting them.
model_1._name = 'model_1'
model_2._name = 'model_2'
models = [model_1, model_2]

# Shared input tensor, shaped like the first member's input (batch axis dropped).
model_input = Input(shape=models[0].input_shape[1:])

In [None]:
# Build the averaging ensemble of the two ResNet50 models and train it for
# 20 epochs on the balanced training arrays, validating on the test arrays.
resnet_model = ensemble(models, model_input)
resnet_history = resnet_model.fit(X_train2, Y_train2, validation_data = (X_test2, Y_test2), epochs=20)

In [None]:
# Accuracy curves of the homogeneous (ResNet50 + ResNet50) ensemble.
plt.figure(figsize=(6, 4))
for metric_key, curve_label in (("acc", "train-acc"), ("val_acc", "val-acc")):
    plt.plot(resnet_history.history[metric_key], label=curve_label)
plt.title("Analyzing accuracy of ensemble of homogeneous")
plt.legend(loc="best")
plt.savefig("resnethomo_accuracy.png")
plt.show()

In [None]:
# Loss curves of the homogeneous (ResNet50 + ResNet50) ensemble.
plt.figure(figsize=(6, 4))
for metric_key, curve_label in (("loss", "train-loss"), ("val_loss", "val-loss")):
    plt.plot(resnet_history.history[metric_key], label=curve_label)
plt.title("Analyzing loss for ensemble of homogeneous")
plt.legend()
plt.savefig("homo_loss.png")
plt.show()

## Ensemble of VGG16 AND RESNET50

In [None]:
# Heterogeneous ensemble members: one ResNet50 classifier and one VGG16 classifier.
model_1 = create_model(channels=3)
model_2 = create_vgg()

# Give each member its own name before collecting them.
model_1._name = 'model_1'
model_2._name = 'model_2'
models = [model_1, model_2]

# Shared input tensor, shaped like the first member's input (batch axis dropped).
model_input = Input(shape=models[0].input_shape[1:])

In [None]:
# Build the averaging ensemble of the ResNet50 and VGG16 models.
ensemble_model = ensemble(models, model_input)

In [None]:
# Train the heterogeneous ensemble for 20 epochs on the balanced training
# arrays, validating on the balanced test arrays.
ensemble_history = ensemble_model.fit(X_train2, Y_train2, validation_data = (X_test2, Y_test2), epochs=20)

In [None]:
# Accuracy curves of the heterogeneous (ResNet50 + VGG16) ensemble.
plt.figure(figsize=(6, 4))
for metric_key, curve_label in (("acc", "train-acc"), ("val_acc", "val-acc")):
    plt.plot(ensemble_history.history[metric_key], label=curve_label)
plt.title("Analyzing Accuracy for ensemble of heterogeneous")
plt.legend(loc="best")
plt.xlabel("Number of epochs")
plt.savefig("hetero_ensemble_accuracy.png")
plt.show()

In [None]:
# Loss curves of the heterogeneous (ResNet50 + VGG16) ensemble.
plt.figure(figsize=(6, 4))
for metric_key, curve_label in (("loss", "train-loss"), ("val_loss", "val-loss")):
    plt.plot(ensemble_history.history[metric_key], label=curve_label)
plt.title("Analyzing loss for ensemble of heterogeneous")
plt.legend()
plt.xlabel("Number of epochs")
plt.savefig("hetero_ensemble_loss.png")
plt.show()