In [2]:
#import required libraries
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_DETERMINISTIC_OPS'] = '1'
import cv2
import numpy as np
from imutils import paths
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, matthews_corrcoef, precision_score, f1_score, recall_score
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, SimpleRNN,Dropout
from tensorflow.keras.utils import to_categorical
import pandas as pd
import time
import psutil
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications import VGG19
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input, decode_predictions
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
import keras

tf.keras.utils.set_random_seed(0)
tf.config.experimental.enable_op_determinism()



In [3]:
#!pip install tensorflow-tensorrt


In [4]:
#!pip install opencv-python
#!pip install imutils
#!pip install scikit-learn
#!pip install xgboost


In [5]:
#Class to load the dataset images from device

class SimpleDatasetLoader:
    # Method: Constructor
    def __init__(self, preprocessors=None):
        """
        :param preprocessors: List of image preprocessors
        """
        self.preprocessors = preprocessors

        if self.preprocessors is None:
            self.preprocessors = []

    # Method: Used to load a list of images for pre-processing
    def load(self, image_paths, verbose=-1):
        """
        :param image_paths: List of image paths
        :param verbose: Parameter for printing information to console
        :return: Tuple of data and labels
        """
        data, labels = [], []

        for i, image_path in enumerate(image_paths):
            image = cv2.imread(image_path)
            label = image_path.split(os.path.sep)[-2]

            if self.preprocessors is not None:
                for p in self.preprocessors:
                    image = p.preprocess(image)

            data.append(image)
            labels.append(label)

            if verbose > 0 and i > 0 and (i+1) % verbose == 0:
                print('[INFO]: Processed {}/{}'.format(i+1, len(image_paths)))

        return (np.array(data), np.array(labels))

In [6]:
#Class Preprocessror 
class SimplePreprocessor:
    # Method: Constructor
    def __init__(self, width, height, interpolation=cv2.INTER_AREA):
        """
        :param width: Image width
        :param height: Image height
        :param interpolation: Interpolation algorithm
        """
        self.width = width
        self.height = height
        self.interpolation = interpolation

    # Method: Used to resize the image to a fixed size (ignoring the aspect ratio)
    def preprocess(self, image):
        """
        :param image: Image
        :return: Re-sized image
        """
        return cv2.resize(image, (self.width, self.height), interpolation=self.interpolation)

In [7]:
from imutils import paths
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from __main__ import SimplePreprocessor
from __main__ import SimpleDatasetLoader

In [8]:
# Function to load and preprocess data using SimpleDatasetLoader
def load_and_preprocess_data(image_paths, target_size):
    sp = SimplePreprocessor(target_size[0], target_size[1])
    sdl = SimpleDatasetLoader(preprocessors=[sp])
    data, labels = sdl.load(image_paths, verbose=1000)

    print(data.shape)
    print(labels.shape)

    # Convert labels to one-hot encoding
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    #labels = to_categorical(labels)

    return data, labels



# Get list of image paths
image_paths = list(paths.list_images("../BreaKHis_v1"))

# Define target size for images
target_size = (224, 224)  # Change this to your desired size

# Load and preprocess data
print('[INFO]: Images loading....')
data, labels = load_and_preprocess_data(image_paths, target_size)

[INFO]: Images loading....
[INFO]: Processed 1000/7909
[INFO]: Processed 2000/7909
[INFO]: Processed 3000/7909
[INFO]: Processed 4000/7909
[INFO]: Processed 5000/7909
[INFO]: Processed 6000/7909
[INFO]: Processed 7000/7909
(7909, 224, 224, 3)
(7909,)


In [9]:
from tensorflow.keras.applications import  ResNet50, EfficientNetB0
from tensorflow.keras.models import Model
with tf.device('/CPU:0'):
    base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(target_size[0], target_size[1], 3))
    model = Model(inputs=base_model.input, outputs=base_model.output)
    features = model.predict(data, batch_size=32, verbose=1)

# Flatten the features
features_flatten = features.reshape(features.shape[0], -1)
    
print(features_flatten.shape)

2024-04-03 18:14:55.367551: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38380 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:65:00.0, compute capability: 8.0


(7909, 62720)


In [10]:
data = features_flatten

In [11]:
import gc
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
#tf.config.experimental.set_memory_growth(tf.config.list_physical_devices('GPU')[0], True)
#tf.config.experimental.enable_tensor_float_32_execution(True)
#tf.config.threading.set_inter_op_parallelism_threads(1)
#tf.config.threading.set_intra_op_parallelism_threads(1)
# Initialize a DataFrame to store results
results_df = pd.DataFrame(columns=['Model', 'Average Accuracy', 'Average Sensitivity', 'Average Specificity',
                                   'Average AUC-ROC', 'Average MCC', 'Average Precision', 'Average F1 Score',
                                   'Memory Used (MB)', 'Time (s)'])

# Function to evaluate a model
def evaluate_model(model, name, data, labels):
    if name == "VGG16":
        labels = to_categorical(labels)

    elif name == "VGG19":  
        labels = to_categorical(labels)
    else:
        data = data
    accuracy_list = []
    sensitivity_list = []
    specificity_list = []
    auc_roc_list = []
    mcc_list = []
    precision_list = []
    f1_list = []
    time_start = time.time()
    memory_start = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    if name == "VGG16" or name == "VGG19":


        with tf.device('/GPU:0'):
            tf.random.set_seed(0)
            for fold, (train_index, test_index) in enumerate(skf.split(data, labels.argmax(axis=1))):
                print(f'\n[INFO] Fold {fold + 1} / 10 for {name}')

                X_train, X_test = data[train_index], data[test_index]
                y_train, y_test = labels[train_index], labels[test_index]


                # Adding early stopping to prevent overfitting
                early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
                keras.utils.set_random_seed(0)
                np.random.seed(0)
                tf.random.set_seed(0)
                #tf.config.run_functions_eagerly(True)
                tf.data.experimental.enable_debug_mode()
                # Train the model
                # Example of using tf.data.Dataset
                train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
                train_dataset = train_dataset.shuffle(buffer_size=len(X_train)).batch(64)

                model.fit(train_dataset, epochs=20, callbacks=[early_stopping])

                # Evaluate the model
                predictions = model.predict(X_test)
                y_pred = np.argmax(predictions, axis=1)
                y_test_encoded = np.argmax(y_test, axis=1)

                
                # Calculate evaluation metrics for the current fold
                accuracy = np.mean(y_pred == y_test_encoded)
                sensitivity = recall_score(y_test_encoded, y_pred, pos_label=1)
                specificity = recall_score(y_test_encoded, y_pred, pos_label=0)
                auc_roc = roc_auc_score(y_test_encoded, y_pred)
                mcc = matthews_corrcoef(y_test_encoded, y_pred)
                precision = precision_score(y_test_encoded, y_pred, pos_label=1)
                f1 = f1_score(y_test_encoded, y_pred, pos_label=1)
                # Append metrics to lists
                accuracy_list.append(accuracy)
                sensitivity_list.append(sensitivity)
                specificity_list.append(specificity)
                auc_roc_list.append(auc_roc)
                mcc_list.append(mcc)
                precision_list.append(precision)
                f1_list.append(f1)

                # Print metrics for the current fold
                print(f"Accuracy: {accuracy}")
                print(f"Sensitivity: {sensitivity}")
                print(f"Specificity: {specificity}")
                print(f"AUC-ROC: {auc_roc}")
                print(f"MCC: {mcc}")
                print(f"Precision: {precision}")
                print(f"F1 Score: {f1}")
                gc.collect()

            # Calculate average metrics
            average_accuracy = np.mean(accuracy_list)
            average_sensitivity = np.mean(sensitivity_list)
            average_specificity = np.mean(specificity_list)
            average_auc_roc = np.mean(auc_roc_list)
            average_mcc = np.mean(mcc_list)
            average_precision = np.mean(precision_list)
            average_f1 = np.mean(f1_list)

            time_end = time.time()
            memory_end = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
            del X_train, X_test, y_train, y_test
            gc.collect()
            tf.keras.backend.clear_session()
    else:
        for fold, (train_index, test_index) in enumerate(skf.split(data, labels)):
            print(f'\n[INFO] Fold {fold + 1} / 10 for {name}')

            X_train, X_test = data[train_index], data[test_index]
            y_train, y_test = labels[train_index], labels[test_index]

            # Train the model
            model.fit(X_train, y_train)

            # Evaluate the model
            predictions = model.predict(X_test)
            y_pred = predictions
            y_test_encoded = y_test

            # Calculate evaluation metrics for the current fold
            accuracy = np.mean(y_pred == y_test_encoded)
            sensitivity = recall_score(y_test, y_pred, pos_label=1)
            specificity = recall_score(y_test, y_pred, pos_label=0)
            auc_roc = roc_auc_score(y_test, predictions)
            mcc = matthews_corrcoef(y_test, y_pred)
            precision = precision_score(y_test, y_pred, pos_label=1)
            f1 = f1_score(y_test, y_pred, pos_label=1)
            # Append metrics to lists
            accuracy_list.append(accuracy)
            sensitivity_list.append(sensitivity)
            specificity_list.append(specificity)
            auc_roc_list.append(auc_roc)
            mcc_list.append(mcc)
            precision_list.append(precision)
            f1_list.append(f1)

            # Print metrics for the current fold
            print(f"Accuracy: {accuracy}")
            print(f"Sensitivity: {sensitivity}")
            print(f"Specificity: {specificity}")
            print(f"AUC-ROC: {auc_roc}")
            print(f"MCC: {mcc}")
            print(f"Precision: {precision}")
            print(f"F1 Score: {f1}")
            gc.collect()

        # Calculate average metrics
        average_accuracy = np.mean(accuracy_list)
        average_sensitivity = np.mean(sensitivity_list)
        average_specificity = np.mean(specificity_list)
        average_auc_roc = np.mean(auc_roc_list)
        average_mcc = np.mean(mcc_list)
        average_precision = np.mean(precision_list)
        average_f1 = np.mean(f1_list)

        time_end = time.time()
        memory_end = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
        

    # Append results to DataFrame
    results_df.loc[len(results_df)] = [name, average_accuracy, average_sensitivity, average_specificity,
                                       average_auc_roc, average_mcc, average_precision, average_f1,
                                       memory_end - memory_start, time_end - time_start]





In [11]:
from sklearn.linear_model import LogisticRegression
# SVM
lr_model = LogisticRegression(n_jobs=-1, random_state=0)
evaluate_model(lr_model, 'LR', data, labels)




[INFO] Fold 1 / 10 for LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8938053097345132
Sensitivity: 0.9281767955801105
Specificity: 0.8185483870967742
AUC-ROC: 0.8733625913384424
MCC: 0.7517833409378984
Precision: 0.9180327868852459
F1 Score: 0.923076923076923

[INFO] Fold 2 / 10 for LR
Accuracy: 0.9152970922882427
Sensitivity: 0.9355432780847146
Specificity: 0.8709677419354839
AUC-ROC: 0.9032555100100992
MCC: 0.8039005400984744
Precision: 0.9407407407407408
F1 Score: 0.9381348107109879

[INFO] Fold 3 / 10 for LR
Accuracy: 0.9077117572692794
Sensitivity: 0.9373848987108656
Specificity: 0.842741935483871
AUC-ROC: 0.8900634170973684
MCC: 0.7845083236977783
Precision: 0.9288321167883211
F1 Score: 0.9330889092575619

[INFO] Fold 4 / 10 for LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9127686472819216
Sensitivity: 0.9502762430939227
Specificity: 0.8306451612903226
AUC-ROC: 0.8904607021921227
MCC: 0.7947636980074239
Precision: 0.9247311827956989
F1 Score: 0.9373297002724796

[INFO] Fold 5 / 10 for LR
Accuracy: 0.9051833122629582
Sensitivity: 0.9502762430939227
Specificity: 0.8064516129032258
AUC-ROC: 0.8783639279985742
MCC: 0.7760916229723746
Precision: 0.9148936170212766
F1 Score: 0.9322493224932249

[INFO] Fold 6 / 10 for LR
Accuracy: 0.9102402022756005
Sensitivity: 0.9539594843462247
Specificity: 0.8145161290322581
AUC-ROC: 0.8842378066892413
MCC: 0.7881399915108255
Precision: 0.9184397163120568
F1 Score: 0.935862691960253

[INFO] Fold 7 / 10 for LR
Accuracy: 0.9140328697850821
Sensitivity: 0.9613259668508287
Specificity: 0.8104838709677419
AUC-ROC: 0.8859049189092854
MCC: 0.7968992989349142
Precision: 0.9173989455184535
F1 Score: 0.9388489208633094

[INFO] Fold 8 / 10 for LR
Accuracy: 0.9152970922882427
Sensitivity: 0.9502762430939227
Specificity: 0.8

In [12]:
#NB
nb_model = GaussianNB()
evaluate_model(nb_model, 'NB', data, labels)


[INFO] Fold 1 / 10 for NB
Accuracy: 0.7243994943109987
Sensitivity: 0.7863720073664825
Specificity: 0.5887096774193549
AUC-ROC: 0.6875408423929187
MCC: 0.3697201594190471
Precision: 0.8071833648393195
F1 Score: 0.7966417910447762

[INFO] Fold 2 / 10 for NB
Accuracy: 0.7471554993678887
Sensitivity: 0.8158379373848987
Specificity: 0.5967741935483871
AUC-ROC: 0.7063060654666429
MCC: 0.4126121309332858
Precision: 0.8158379373848987
F1 Score: 0.8158379373848987

[INFO] Fold 3 / 10 for NB
Accuracy: 0.7724399494310998
Sensitivity: 0.8471454880294659
Specificity: 0.6088709677419355
AUC-ROC: 0.7280082278857007
MCC: 0.4635224184066914
Precision: 0.8258527827648114
F1 Score: 0.8363636363636362

[INFO] Fold 4 / 10 for NB
Accuracy: 0.7142857142857143
Sensitivity: 0.7808471454880295
Specificity: 0.5685483870967742
AUC-ROC: 0.6746977662924019
MCC: 0.34507156095382246
Precision: 0.7984934086629002
F1 Score: 0.7895716945996274

[INFO] Fold 5 / 10 for NB
Accuracy: 0.7509481668773704
Sensitivity: 0.8066

In [13]:

# KNN
knn_model = KNeighborsClassifier()
evaluate_model(knn_model, 'KNN', data, labels)




[INFO] Fold 1 / 10 for KNN
Accuracy: 0.7648546144121365
Sensitivity: 0.9797421731123389
Specificity: 0.29435483870967744
AUC-ROC: 0.6370485059110081
MCC: 0.41274394060548086
Precision: 0.7524752475247525
F1 Score: 0.8512000000000001

[INFO] Fold 2 / 10 for KNN
Accuracy: 0.7800252844500632
Sensitivity: 0.9797421731123389
Specificity: 0.34274193548387094
AUC-ROC: 0.661242054298105
MCC: 0.45814827391075497
Precision: 0.7654676258992805
F1 Score: 0.8594507269789983

[INFO] Fold 3 / 10 for KNN
Accuracy: 0.7914032869785083
Sensitivity: 0.990791896869245
Specificity: 0.3548387096774194
AUC-ROC: 0.6728153032733322
MCC: 0.4978158993767716
Precision: 0.7707736389684814
F1 Score: 0.8670427074939565

[INFO] Fold 4 / 10 for KNN
Accuracy: 0.7711757269279393
Sensitivity: 0.9815837937384899
Specificity: 0.31048387096774194
AUC-ROC: 0.6460338323531158
MCC: 0.43307516298673854
Precision: 0.7571022727272727
F1 Score: 0.8548516439454691

[INFO] Fold 5 / 10 for KNN
Accuracy: 0.7585335018963337
Sensitivity

In [14]:
# Random Forest
rf_model = RandomForestClassifier(random_state=0, n_jobs=-1)
evaluate_model(rf_model, 'Random Forest', data, labels)



[INFO] Fold 1 / 10 for Random Forest
Accuracy: 0.809102402022756
Sensitivity: 0.9502762430939227
Specificity: 0.5
AUC-ROC: 0.7251381215469613
MCC: 0.5315282216167191
Precision: 0.80625
F1 Score: 0.8723584108199494

[INFO] Fold 2 / 10 for Random Forest
Accuracy: 0.8369152970922883
Sensitivity: 0.9631675874769797
Specificity: 0.5604838709677419
AUC-ROC: 0.7618257292223609
MCC: 0.6061930757862746
Precision: 0.8275316455696202
F1 Score: 0.8902127659574468

[INFO] Fold 3 / 10 for Random Forest
Accuracy: 0.8394437420986094
Sensitivity: 0.9760589318600368
Specificity: 0.5403225806451613
AUC-ROC: 0.7581907562525989
MCC: 0.6158782107800638
Precision: 0.8229813664596274
F1 Score: 0.8930075821398483

[INFO] Fold 4 / 10 for Random Forest
Accuracy: 0.8027812895069533
Sensitivity: 0.9558011049723757
Specificity: 0.46774193548387094
AUC-ROC: 0.7117715202281234
MCC: 0.514835864381323
Precision: 0.7972350230414746
F1 Score: 0.8693467336683417

[INFO] Fold 5 / 10 for Random Forest
Accuracy: 0.806573957

In [12]:
import lightgbm
# LGBM
lgb_model = lightgbm.LGBMClassifier(random_state=0, n_jobs = -1)
evaluate_model(lgb_model, 'LGBM', data, labels)


[INFO] Fold 1 / 10 for LGBM
[LightGBM] [Info] Number of positive: 4886, number of negative: 2232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 18.795533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15993600
[LightGBM] [Info] Number of data points in the train set: 7118, number of used features: 62720
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.686429 -> initscore=0.783476
[LightGBM] [Info] Start training from score 0.783476
Accuracy: 0.8672566371681416
Sensitivity: 0.9447513812154696
Specificity: 0.6975806451612904
AUC-ROC: 0.8211660131883799
MCC: 0.6822580199187609
Precision: 0.8724489795918368
F1 Score: 0.9071618037135278

[INFO] Fold 2 / 10 for LGBM
[LightGBM] [Info] Number of positive: 4886, number of negative: 2232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 18.815784 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM

In [13]:
# Save results to a CSV file
results_df.to_csv('DS3_Efficient_LGBM.csv', index=False)

In [17]:
results_df

Unnamed: 0,Model,Average Accuracy,Average Sensitivity,Average Specificity,Average AUC-ROC,Average MCC,Average Precision,Average F1 Score,Memory Used (MB),Time (s)
0,LR,0.909723,0.94511,0.832258,0.888684,0.788301,0.925102,0.934942,589.589844,2007.300548
1,NB,0.742446,0.80549,0.604435,0.704963,0.406976,0.816786,0.811027,1703.523438,102.190678
2,KNN,0.775318,0.98379,0.318952,0.651371,0.446698,0.759776,0.857383,1714.421875,142.875629
3,Random Forest,0.81793,0.96574,0.494355,0.730047,0.557284,0.807168,0.879303,1666.3125,804.392058
