# Transfer Learning

### Import Libraries

In [1]:
import pathlib
import os
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
# import keras_tuner as kt
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold, train_test_split


## Train with Skin Cancer Dataset

### Hyperparameters

In [None]:
# Folder that contains the images used for training (one sub-folder per class).
data_root = pathlib.Path("..") / "data" / "Augmented_Images"

# Input pipeline settings
img_height = img_width = 224    # every image is resized to img_height x img_width
batch_size = 32                 # number of samples per batch fed to the model

# One dict per experiment. Keys read by the rest of the notebook:
#   learning_rate    - optimizer learning rate for the frozen-base phase
#   optimizer        - "adam" or "sgd"
#   epochs           - maximum number of epochs for the frozen-base phase
#   save_metrics     - whether plots/reports are written for the best fold
#   folds            - number of folds created for cross-validation
#   fine_tune        - whether a fine-tuning phase follows the frozen-base phase
#   fine_tune_epochs - maximum number of epochs for the fine-tuning phase
#   fine_tune_at     - index of the first base-model layer that gets unfrozen
configs = [
    dict(
        learning_rate=0.001,
        optimizer="adam",
        epochs=50,
        save_metrics=True,
        folds=5,
        fine_tune=False,
        fine_tune_epochs=25,
        fine_tune_at=150,
    ),
    # Disabled alternatives (same learning_rate/optimizer/epochs/save_metrics/fine_tune_epochs as above):
    # dict(learning_rate=0.001, optimizer="adam", epochs=50, save_metrics=True, folds=3, fine_tune=False, fine_tune_epochs=25, fine_tune_at=150),
    # dict(learning_rate=0.001, optimizer="adam", epochs=50, save_metrics=True, folds=5, fine_tune=True, fine_tune_epochs=25, fine_tune_at=152),
    # dict(learning_rate=0.001, optimizer="adam", epochs=50, save_metrics=True, folds=3, fine_tune=True, fine_tune_epochs=25, fine_tune_at=152),
]

# Root folder for everything the training loop saves; created up front if missing.
save_dir = "../saved_models"
os.makedirs(save_dir, exist_ok=True)

## Training 

### Setup

In [None]:
# Load the whole image folder as a single dataset (no train/validation split yet).
dataset = tf.keras.utils.image_dataset_from_directory(
    data_root,                                  # loads images from the data_root directory
    image_size=(img_height, img_width),         # resizes all images to (img_height, img_width) pixels
    batch_size=batch_size,                      # set the batch size
    shuffle=True,                               # shuffle data when loaded
    seed=42                                     # fixed seed so the loading order (and thus the split below) is reproducible
)

class_names = np.array(dataset.class_names)     # get the class names for the data
num_classes = len(class_names)                  # get the number of classes in the dataset

# Materialise the dataset as NumPy arrays so scikit-learn's index-based splitters
# (train_test_split / StratifiedKFold) can be applied to it.
# NOTE: despite its name, `image_paths` holds the decoded image arrays, not file paths;
# the name is kept because later cells refer to it.
image_batches, label_batches = [], []
for image_batch, label_batch in dataset:
    image_batches.append(image_batch.numpy())
    label_batches.append(label_batch.numpy())

# Join whole batches in one call instead of growing a Python list image by image.
image_paths = np.concatenate(image_batches, axis=0)
labels = np.concatenate(label_batches, axis=0)
del image_batches, label_batches                # drop the per-batch copies

# Split the dataset into training/validation (90%) and held-out test (10%) sets,
# stratified so each split keeps the class proportions of `labels`.
train_val_images, test_images, train_val_labels, test_labels = train_test_split(
    image_paths, labels, test_size=0.10, random_state=42, stratify=labels
)

def callbacks_setup(checkpoint_filepath):
    """
    Builds the three Keras callbacks used for a training run.

    All three callbacks watch the validation loss ('val_loss') in 'min' mode,
    i.e. a lower value counts as an improvement.

    Args:
        checkpoint_filepath: Path the ModelCheckpoint callback saves weights to.

    Returns:
        Tuple of (early_stopping, model_checkpoint, reduce_lr).
    """
    # Settings shared by every callback: what to watch and in which direction.
    watched = dict(monitor='val_loss', mode='min')

    # Halve the learning rate after 4 epochs without improvement, never going below 1e-6.
    reduce_lr = ReduceLROnPlateau(factor=0.5, patience=4, min_lr=1e-6, **watched)

    # Keep only the best weights (not the full model) at checkpoint_filepath.
    model_checkpoint = ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        save_best_only=True,
        **watched
    )

    # Stop after 6 epochs without improvement. The best weights are not restored here;
    # the caller reloads them from the checkpoint file instead.
    early_stopping = EarlyStopping(patience=6, restore_best_weights=False, **watched)

    return early_stopping, model_checkpoint, reduce_lr

### Metrics

In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from typing import Tuple, Dict, Any, List
from sklearn.metrics import (
    precision_score,
    classification_report,
    roc_auc_score,
    roc_curve,
    auc,
    recall_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.preprocessing import label_binarize

def save_confusion_matrix(true_labels: np.ndarray, predicted_labels: np.ndarray, 
                          class_names: List[str], save_path: str) -> None:
    """
    Plots and saves the confusion matrix for multi-class classification.

    The matrix always has one row/column per entry of `class_names`, even when a
    class does not occur in `true_labels` or `predicted_labels`.

    Args:
        true_labels (np.ndarray): Array of true class labels (class indices).
        predicted_labels (np.ndarray): Array of predicted class labels (class indices).
        class_names (List[str]): List of class names corresponding to class indices.
        save_path (str): Path to save the confusion matrix plot.
    """
    # Pin the label set to the class indices. Without it scikit-learn derives the labels
    # from the data, so a class missing from this split would shrink the matrix and it
    # would no longer line up with display_labels.
    class_ids = np.arange(len(class_names))
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        disp.plot(cmap=plt.cm.Blues, ax=ax)

        ax.set_title("Confusion Matrix")
        ax.set_xlabel("Predicted label", fontsize=12)
        ax.set_ylabel("True label", fontsize=12)

        # Rotate x-axis labels for better readability
        plt.setp(ax.get_xticklabels(), rotation=30, ha='right')

        # Prevent labels from being cut off
        fig.tight_layout()

        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

def save_loss_curve(history: Dict[str, Any], save_path: str) -> None:
    """
    Draws the training and validation loss per epoch and writes the plot to disk.

    Args:
        history (Dict[str, Any]): Training history; its 'loss' and 'val_loss'
            entries are plotted.
        save_path (str): Path to save the loss curve plot.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # (history key, legend text, line colour) for each curve, drawn in this order.
    curves = (
        ('loss', 'Training Loss', 'blue'),
        ('val_loss', 'Validation Loss', 'orange'),
    )
    for history_key, legend_text, line_color in curves:
        ax.plot(history[history_key], label=legend_text, color=line_color)

    ax.set_title("Training and Validation Loss Over Epochs")
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Loss")
    ax.legend()
    ax.grid(True)

    fig.savefig(save_path)
    plt.close(fig)

def save_roc_auc(true_labels: np.ndarray, predicted_probs: np.ndarray, class_names: list, save_path: str = None):
    """
    Draws one one-vs-rest ROC curve per class, with its AUC in the legend.

    Args:
        true_labels (np.ndarray): True class labels (class indices).
        predicted_probs (np.ndarray): Predicted class scores; column i is used for class i.
        class_names (list): List of class names.
        save_path (str, optional): Where to write the plot. When omitted (or empty)
            the figure is shown instead of saved. Defaults to None.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for class_index, class_name in enumerate(class_names):
        # Treat the current class as "positive" and every other class as "negative".
        is_positive = true_labels == class_index
        fpr, tpr, _ = roc_curve(is_positive, predicted_probs[:, class_index])
        ax.plot(fpr, tpr, label=f'{class_name} (AUC = {auc(fpr, tpr):.2f})')

    # Diagonal reference line
    ax.plot([0, 1], [0, 1], 'k--', label='Random Chance')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC AUC Curve')
    ax.legend(loc='lower right')

    if not save_path:
        plt.show()
        return

    fig.savefig(save_path)
    plt.close(fig)

def save_evaluation_metrics(true_labels: np.ndarray, predicted_labels: np.ndarray, 
                            predicted_probs: np.ndarray, save_path: str) -> Dict[str, float]:
    """
    Computes evaluation metrics for multi-class classification and saves a bar chart.
    The metrics include accuracy, precision, recall, F1 score, and ROC AUC
    (precision, recall, F1 and ROC AUC are macro-averaged).

    Args:
        true_labels (np.ndarray): Array of true class labels (class indices).
        predicted_labels (np.ndarray): Array of predicted class labels (class indices).
        predicted_probs (np.ndarray): Array of predicted probabilities (shape: [n_samples, n_classes]).
        save_path (str): Path to save the evaluation metrics bar chart.

    Returns:
        Dict[str, float]: Dictionary containing computed metrics.
    """
    # Accept plain Python sequences too: `list == list` would be a single bool, not element-wise.
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)
    predicted_probs = np.asarray(predicted_probs)

    # Calculate accuracy by comparing predicted and true labels
    accuracy = np.mean(predicted_labels == true_labels)
    # Compute macro-averaged metrics for multi-class classification
    recall = recall_score(true_labels, predicted_labels, average='macro')
    precision = precision_score(true_labels, predicted_labels, average='macro')
    f1 = f1_score(true_labels, predicted_labels, average='macro')

    # For ROC AUC, first binarize the true labels to one indicator column per class
    n_classes = predicted_probs.shape[1]
    true_labels_binarized = label_binarize(true_labels, classes=list(range(n_classes)))
    if n_classes == 2:
        # label_binarize collapses two classes into ONE column (indicator of class 1);
        # rebuild both columns so the shape matches predicted_probs.
        true_labels_binarized = np.hstack([1 - true_labels_binarized, true_labels_binarized])
    # Compute ROC AUC with a one-vs-rest approach and macro average
    roc_auc = roc_auc_score(true_labels_binarized, predicted_probs, multi_class='ovr', average='macro')

    # Store metrics in a dictionary
    metrics = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Sensitivity (Recall)": recall,
        "F1-Score": f1,
        "ROC AUC": roc_auc
    }

    # Plot metrics as a bar chart
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        bars = ax.bar(list(metrics.keys()), list(metrics.values()),
                      color=['darkturquoise', 'sandybrown', 'hotpink', 'limegreen', 'mediumpurple'])
        # Annotate each bar with its value
        for bar in bars:
            yval = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, yval, f'{yval:.4f}', ha='center', va='bottom')
        ax.set_title("Model Evaluation Metrics")
        ax.set_ylim([0, 1])
        ax.set_yticks(np.arange(0, 1.1, 0.1))
        ax.set_ylabel("Score")
        fig.savefig(save_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

    return metrics

def save_classification_report(true_labels: np.ndarray, predicted_labels: np.ndarray, 
                               class_names: List[str], save_path: str) -> None:
    """
    Saves the classification report to a text file for multi-class classification.

    The report always lists every entry of `class_names`, even when a class does
    not occur in `true_labels` or `predicted_labels`.

    Args:
        true_labels (np.ndarray): Array of true class labels (class indices).
        predicted_labels (np.ndarray): Array of predicted class labels (class indices).
        class_names (List[str]): List of class names; entry i names class index i.
        save_path (str): Path to save the classification report.
    """
    # Pass the label set explicitly: with target_names but no labels, scikit-learn raises
    # a ValueError as soon as a class is absent from the evaluated data.
    class_ids = list(range(len(class_names)))
    report = classification_report(
        true_labels,
        predicted_labels,
        labels=class_ids,
        target_names=class_names,
        digits=4,
    )
    # Explicit encoding so the file does not depend on the platform default.
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(report)

def calculate_metrics(true_labels: np.ndarray, predictions: np.ndarray) -> Tuple[float, float, float, float, float]:
    """
    Calculates evaluation metrics for multi-class classification.

    Precision, recall, F1 and ROC AUC are macro-averaged over the classes.

    Args:
        true_labels (np.ndarray): Array of true class labels (class indices).
        predictions (np.ndarray): Array of predicted probabilities (shape: [n_samples, n_classes]).

    Returns:
        Tuple[float, float, float, float, float]: A tuple containing accuracy, precision, recall, 
            F1 score, and ROC AUC score.
    """
    # Accept plain Python sequences too: `list == list` would be a single bool, not element-wise.
    true_labels = np.asarray(true_labels)
    predictions = np.asarray(predictions)

    # Convert predicted probabilities to predicted class labels using argmax
    predicted_labels = np.argmax(predictions, axis=1)
    accuracy = np.mean(predicted_labels == true_labels)
    precision = precision_score(true_labels, predicted_labels, average='macro')
    recall = recall_score(true_labels, predicted_labels, average='macro')
    f1 = f1_score(true_labels, predicted_labels, average='macro')

    # Binarize true labels (one indicator column per class) for ROC AUC calculation
    n_classes = predictions.shape[1]
    true_labels_binarized = label_binarize(true_labels, classes=list(range(n_classes)))
    if n_classes == 2:
        # label_binarize collapses two classes into ONE column (indicator of class 1);
        # rebuild both columns so the shape matches `predictions`.
        true_labels_binarized = np.hstack([1 - true_labels_binarized, true_labels_binarized])
    # Named roc_auc (not auc) so the imported sklearn.metrics.auc is not shadowed here.
    roc_auc = roc_auc_score(true_labels_binarized, predictions, multi_class='ovr', average='macro')

    return accuracy, precision, recall, f1, roc_auc

def save_best_model_visuals(history: tf.keras.callbacks.History, model: tf.keras.Model, 
                              val_ds: tf.data.Dataset, class_names: List[str], 
                              weights_path: str, fold: int) -> None:
    """
    Writes the evaluation artefacts of one fold's model into `weights_path`.

    Five files are produced, each carrying the fold number in its name: confusion
    matrix, loss curve, ROC curve plot and evaluation-metrics bar chart (PNG), plus
    the classification report (TXT).

    Args:
        history (tf.keras.callbacks.History): Training history object; its `.history`
            dict feeds the loss curve.
        model (tf.keras.Model): Trained model.
        val_ds (tf.data.Dataset): Dataset of (images, labels) batches to evaluate on.
        class_names (List[str]): List of class names.
        weights_path (str): Directory path to save visuals.
        fold (int): Current fold number.
    """
    def artefact_path(stem: str, extension: str) -> str:
        # ("loss_curve", "png") -> <weights_path>/loss_curve_fold<fold>.png
        return os.path.join(weights_path, f"{stem}_fold{fold}.{extension}")

    # Per-class scores from the model, and the class index with the highest score.
    scores = model.predict(val_ds)
    predicted_ids = np.argmax(scores, axis=1)

    # Ground-truth labels, gathered batch by batch from the same dataset.
    labels_per_batch = [batch_labels for _, batch_labels in val_ds]
    true_labels = np.concatenate(labels_per_batch, axis=0)

    # Confusion matrix
    save_confusion_matrix(true_labels, predicted_ids, class_names,
                          artefact_path("confusion_matrix", "png"))

    # Loss curve (the only artefact that uses the training history)
    save_loss_curve(history.history, artefact_path("loss_curve", "png"))

    # ROC curves, drawn from the predicted scores
    save_roc_auc(true_labels, scores, class_names, artefact_path("roc_auc_curve", "png"))

    # Metrics bar chart (the scores are passed along for the ROC AUC calculation)
    save_evaluation_metrics(true_labels, predicted_ids, scores,
                            artefact_path("evaluation_metrics", "png"))

    # Classification report as a text file
    save_classification_report(true_labels, predicted_ids, class_names,
                               artefact_path("classification_report", "txt"))


### Automated Hyperparameter Tuning

In [5]:
# # Split data into training/validation set for hyperparameter tuning
# train_images_tuning, val_images_tuning, train_labels_tuning, val_labels_tuning = train_test_split(
#     image_paths, labels, test_size=0.1, random_state=42, stratify=labels
# )

# # Define the hypermodel for hyperparameter tuning
# def build_model(hp):
#     base_model = tf.keras.applications.MobileNetV2(
#         input_shape=(img_height, img_width, 3),
#         include_top=False,
#         weights='imagenet'
#     )
#     base_model.trainable = False  # Freeze layers initially
    
#     model = Sequential([
#         base_model,
#         layers.GlobalAveragePooling2D(),
#         layers.Dense(num_classes)
#     ])

#     # Tune hyperparameters
#     learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')
#     optimizer = hp.Choice('optimizer', values=['adam', 'sgd'])

#     if optimizer == 'adam':
#         opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
#     else:
#         opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)

#     model.compile(
#         optimizer=opt,
#         loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#         metrics=['accuracy']
#     )
    
#     return model

# # Set up the tuner for hyperparameter tuning
# tuner = kt.RandomSearch(
#     build_model,
#     objective='val_accuracy',  # Optimize for validation accuracy
#     max_trials=10,             # Try 10 different hyperparameter combinations
#     executions_per_trial=1,    # Run each combination once
#     directory='hyperparameter_tuning',
#     project_name='best_hyperparams_tuning'
# )

# # Prepare TensorFlow datasets for training and validation
# train_ds = tf.data.Dataset.from_tensor_slices((train_images_tuning, train_labels_tuning)).batch(batch_size)
# val_ds = tf.data.Dataset.from_tensor_slices((val_images_tuning, val_labels_tuning)).batch(batch_size)

# # Perform the hyperparameter search on the validation set
# tuner.search(train_ds, validation_data=val_ds, epochs=10)

# # Get the best hyperparameters after the search
# best_hyperparams = tuner.get_best_hyperparameters(num_trials=1)[0]

# # Print the best hyperparameters
# print(f"Best Hyperparameters: {best_hyperparams.values}")


### Model creation and fine tuning

In [6]:
# Function to create and compile the model
def create_model(num_classes, config, fine_tune=None):
    """
    Builds and compiles a new MobileNetV2-based classifier with a frozen base.

    The model is the ImageNet-pretrained MobileNetV2 (without its top), followed by
    global average pooling and a Dense layer with `num_classes` outputs. The Dense
    layer has no activation, so the model outputs logits and the loss is configured
    with `from_logits=True`. Input size comes from the notebook-level `img_height`
    and `img_width`.

    Args:
        num_classes: Number of output classes.
        config: Dict providing "optimizer" ("adam" or "sgd") and "learning_rate".
        fine_tune: Must be falsy (None or False). This function has no access to an
            existing model, so it cannot "continue" one for fine-tuning; unfreeze
            layers of the returned model with fine_tune_model() instead.

    Returns:
        The compiled tf.keras model.

    Raises:
        ValueError: If the optimizer is unsupported, or if `fine_tune` is truthy.
    """
    # Validate the cheap things first, before any network is built.
    optimizer_name = config["optimizer"]
    if optimizer_name not in ("adam", "sgd"):
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")
    if fine_tune:
        # Previously this path fell through to model.compile() with `model` unbound.
        raise ValueError(
            "create_model() always builds a new model with a frozen base; call it with "
            "fine_tune=False and then unfreeze layers with fine_tune_model()."
        )

    # instantiate mobilenet as the feature extractor
    base_model = tf.keras.applications.MobileNetV2(
        input_shape=(img_height, img_width, 3),     # set the input it will receive
        include_top=False,                          # do not include top layer to perform transfer learning
        weights='imagenet'                          # load weights from imagenet dataset
    )
    base_model.trainable = False                    # Freeze the base model

    # add a classification head for our dataset on top of the base model
    model = Sequential([
        base_model,                         # use base_model as the start of your model
        layers.GlobalAveragePooling2D(),    # pool the feature maps into one vector per image
        layers.Dense(num_classes)           # one output (logit) per class in the dataset
    ])

    # select optimizer and learning rate based on configuration
    if optimizer_name == "adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=config["learning_rate"])
    else:
        optimizer = tf.keras.optimizers.SGD(learning_rate=config["learning_rate"])

    # compile the model
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )

    return model

# fine tune model by unfreezing the layers after the first fine_tune_at layers
def fine_tune_model(base_model, fine_tune_at):
    """
    Makes `base_model` trainable from layer index `fine_tune_at` onwards.

    Layers before `fine_tune_at` are frozen, layers from `fine_tune_at` on are
    unfrozen. The model that contains `base_model` must be re-compiled afterwards
    for the change to take effect.

    Args:
        base_model: Model whose `.layers` are (un)frozen in place.
        fine_tune_at: Index of the first layer to unfreeze.
    """
    # The parent flag has to be switched back on first: create_model() froze the whole
    # base with `base_model.trainable = False`, and a Keras model whose own `trainable`
    # is False reports no trainable weights regardless of its child layers' flags.
    base_model.trainable = True

    # Freeze the first fine_tune_at layers, unfreeze everything after them
    for layer in base_model.layers[:fine_tune_at]:
        layer.trainable = False
    for layer in base_model.layers[fine_tune_at:]:
        layer.trainable = True


### Training loop

In [None]:
train_metrics = []      # list to save training metrics
val_metrics = []        # list to save validation metrics

# Loop-invariant helpers, created once instead of once per fold.
AUTOTUNE = tf.data.AUTOTUNE
# Rescales pixel values by 1/255.
# NOTE(review): the Keras docs for MobileNetV2 describe `mobilenet_v2.preprocess_input`
# as scaling inputs to [-1, 1], while this layer produces [0, 1] — confirm this is intended
# before changing it, since already-saved models were trained with this scaling.
normalization_layer = layers.Rescaling(1.0 / 255)

# Held-out test set. It is built here so later cells can use it, but it is deliberately
# NOT consulted inside the cross-validation loop: picking the best fold by test score
# would leak the test set into model selection.
test_ds = tf.data.Dataset.from_tensor_slices((test_images, test_labels))
test_ds = test_ds.map(lambda x, y: (normalization_layer(x), y)).batch(batch_size)

for i, config in enumerate(configs):
    print(f"Training model {i + 1}/{len(configs)} with config: {config}")

    # K-fold Cross Validation
    kfold = StratifiedKFold(n_splits=config['folds'], shuffle=True, random_state=42)

    # Define the base path for saving models
    model_subdir = os.path.join(save_dir, f'model{i + 1}')
    os.makedirs(model_subdir, exist_ok=True)

    # Define the base path for saving checkpoints for model
    checkpoint_folder = os.path.join(model_subdir, 'checkpoints')
    os.makedirs(checkpoint_folder, exist_ok=True)

    # Define the base path for saving the model with the best f1-score
    best_f1_dir = os.path.join(model_subdir, 'best_f1score_fold')
    os.makedirs(best_f1_dir, exist_ok=True)

    # Best validation F1 seen so far for this config; -inf so the first fold always wins.
    best_f1_score = -float('inf')

    # Training and validation loop for each fold
    for fold, (train_idx, val_idx) in enumerate(kfold.split(train_val_images, train_val_labels), start=1):
        print(f"\nFold {fold}/{config['folds']}...")

        checkpoint_filepath = os.path.join(checkpoint_folder, f'checkpoint_fold{fold}.weights.h5')

        # Create subset datasets for training and validation
        train_images, train_labels = train_val_images[train_idx], train_val_labels[train_idx]
        val_images, val_labels = train_val_images[val_idx], train_val_labels[val_idx]

        # Convert NumPy arrays back to TensorFlow datasets and normalize them
        train_ds = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
        val_ds = tf.data.Dataset.from_tensor_slices((val_images, val_labels))
        train_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
        val_ds = val_ds.map(lambda x, y: (normalization_layer(x), y))

        # batch, cache the batches in memory, and prefetch so that data preparation
        # overlaps with model execution
        train_ds = train_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
        val_ds = val_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)

        # Step 1: Train model with frozen layers
        print(f"Training with frozen base layers for {config['epochs']} epochs...")

        # Create and compile a fresh model for each fold
        model = create_model(num_classes, config, fine_tune=False)

        # setup callbacks
        early_stopping, model_checkpoint, reduce_lr = callbacks_setup(checkpoint_filepath)

        # train the model on the training set for at most the configured number of epochs
        history_frozen = model.fit(
            train_ds,                                       # dataset used for training
            validation_data=val_ds,                         # dataset used for validation
            epochs=config['epochs'],                        # epochs used for training
            callbacks=[early_stopping, model_checkpoint, reduce_lr],
            verbose=1
        )

        # load the best weights from ModelCheckpoint after training
        model.load_weights(checkpoint_filepath)

        if config["fine_tune"]:
            # Step 2: Unfreeze layers and fine-tune
            print(f"Unfreezing layers starting from layer {config['fine_tune_at']} for fine-tuning...")
            fine_tune_model(model.layers[0], config['fine_tune_at'])      # model.layers[0] is the base model

            # re-compile the model with a lower learning rate for fine-tuning
            fine_tune_lr = config['learning_rate'] * 0.01

            model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=fine_tune_lr),
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy']
            )

            print(f"Fine-tuning for {config['fine_tune_epochs']} epochs...")

            # Fresh callbacks for the fine-tuning phase. callbacks_setup() returns three
            # callbacks; as before, only early stopping and checkpointing are used here.
            # The checkpoint is written to the same file as in the frozen phase.
            early_stopping, model_checkpoint, _ = callbacks_setup(checkpoint_filepath)

            history_fine_tune = model.fit(
                train_ds,                                       # dataset used for training
                validation_data=val_ds,                         # dataset used for validation
                epochs=config['fine_tune_epochs'],              # epochs used for fine-tuning
                callbacks=[early_stopping, model_checkpoint],
                verbose=1
            )

            # load weights after fine-tuning
            model.load_weights(checkpoint_filepath)

        # evaluate on validation set after training
        # (val_ds is built from val_labels without shuffling, so the label order matches)
        val_predictions = model.predict(val_ds)
        avg_val_loss = model.evaluate(val_ds, verbose=0)[0]
        avg_val_accuracy, avg_val_precision, avg_val_recall, avg_val_f1, avg_val_auc = calculate_metrics(
            val_labels, val_predictions
        )

        print(f"\nValidation: \tFold {fold} - Loss: {avg_val_loss:.4f}, Accuracy: {avg_val_accuracy:.4f}, Precision: {avg_val_precision:.4f}, Recall: {avg_val_recall:.4f}, F1 Score: {avg_val_f1:.4f}, AUC Score: {avg_val_auc:.4f}")

        # Model selection uses the VALIDATION F1 of this fold, never the test set.
        # If this fold produces the best F1 score so far, save the model and visuals.
        if avg_val_f1 > best_f1_score:
            best_f1_score = avg_val_f1
            # Save the best model (using model.export for TensorFlow SavedModel format)
            model.export(best_f1_dir)
            print(f"Best model updated at Fold {fold} with F1 Score: {best_f1_score:.4f}")
            if config.get('save_metrics', False):
                # The loss curve shows the frozen-base phase only.
                save_best_model_visuals(history_frozen, model, val_ds, class_names, model_subdir, fold)

# save metrics after training
# np.save(os.path.join(save_dir, 'train_metrics.npy'), train_metrics)
# np.save(os.path.join(save_dir, 'val_metrics.npy'), val_metrics)

Training model 1/1 with config: {'learning_rate': 0.001, 'optimizer': 'adam', 'epochs': 50, 'save_metrics': True, 'folds': 5, 'fine_tune': False, 'fine_tune_epochs': 25, 'fine_tune_at': 150}

Fold 1/5...
Training with frozen base layers for 50 epochs...
Epoch 1/50
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 243ms/step - accuracy: 0.5861 - loss: 1.1096 - val_accuracy: 0.7651 - val_loss: 0.6243 - learning_rate: 0.0010
Epoch 2/50
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 229ms/step - accuracy: 0.8055 - loss: 0.5389 - val_accuracy: 0.8129 - val_loss: 0.5182 - learning_rate: 0.0010
Epoch 3/50
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 224ms/step - accuracy: 0.8503 - loss: 0.4265 - val_accuracy: 0.8266 - val_loss: 0.4678 - learning_rate: 0.0010
Epoch 4/50
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 221ms/step - accuracy: 0.8768 - loss: 0.3641 - val_accuracy: 0.8350 - val_loss: 0.4380 - learning_rat

NameError: name 'best_f1_score' is not defined

## Testing

In [None]:
# NOTE(review): the training loop in this notebook exports its best model to
# <save_dir>/model<N>/best_f1score_fold, not to this .h5 file — confirm which
# artefact is meant to be evaluated here.
model = tf.keras.models.load_model('../saved_models/mobilenetv2_best_f1score_fold_1.h5')

# once training is complete, evaluate on the held-out test set
print("Evaluating the best model on the held-out test set...")

# Use a rescaling layer owned by this cell (same 1/255 scaling as in training) so the
# evaluation does not depend on a variable left behind by the training cell.
test_rescaling = layers.Rescaling(1.0 / 255)
test_ds = tf.data.Dataset.from_tensor_slices((test_images, test_labels))
test_ds = test_ds.map(lambda x, y: (test_rescaling(x), y)).batch(batch_size)

test_predictions = model.predict(test_ds)
avg_test_loss = model.evaluate(test_ds, verbose=0)[0]
# test_ds is built from test_labels without shuffling, so the labels can be used directly.
avg_test_accuracy, avg_test_precision, avg_test_recall, avg_test_f1, avg_test_auc = calculate_metrics(
    test_labels, test_predictions
)

print(f"\nTest Set Evaluation - Loss: {avg_test_loss:.4f}, Accuracy: {avg_test_accuracy:.4f}, Precision: {avg_test_precision:.4f}, Recall: {avg_test_recall:.4f}, F1 Score: {avg_test_f1:.4f}, AUC Score: {avg_test_auc:.4f}")




Evaluating the best model on the held-out test set...
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 103ms/step

Test Set Evaluation - Loss: 0.2367, Accuracy: 0.9219, Precision: 0.9218, Recall: 0.9200, F1 Score: 0.9208, AUC Score: 0.9586
