# Utilities and Constants

# Import

In [100]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from keras import layers
import matplotlib.pyplot as plt
from tqdm import tqdm
import shutil
from tensorflow.keras import optimizers
import random as rn
import os
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Path Constants and Classes


In [101]:
# Project root on the mounted Google Drive; every other location is derived from it.
DRIVE_DIR = "/content/drive/MyDrive/Colab Notebooks/B-CellLymphoblastsClassification"
DATA_DIR = DRIVE_DIR + "/dataset/C-NMC_training_data"
PREPROCESSED_DIR = DRIVE_DIR + "/dataset-cleaned"
SETS_DIR = DRIVE_DIR + "/dataset-splits"
MODELS_PATH = DRIVE_DIR + "/Models"

# Single seed value reused wherever randomness has to be reproducible.
SEED = 123

# Names of the two classes.
CLASSES = ['all', 'hem']

In [102]:
# Image dimensions; IMAGE_SIZE is ordered (height, width).
IMAGE_HEIGHT, IMAGE_WIDTH = 224, 224
IMAGE_SIZE = (IMAGE_HEIGHT, IMAGE_WIDTH)

# Number of samples per batch.
BATCH_SIZE = 64

# Set Seed


In [103]:
# Seed the three random number generators in use (Python's `random`, NumPy,
# TensorFlow) with the same value so that runs are repeatable.
rn.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Object Handling Utilities

In [104]:
#provides functions to store and load objects from files
import pickle

def saveObject(obj, path):
    """Serialize an object with pickle into the file ``path + '.pkl'``.

    :param obj: any picklable object. Object to save
    :param path: str. Destination file path, without the '.pkl' extension
    """
    destination = path + ".pkl"
    print("Saving " + destination)
    with open(destination, 'wb') as stream:
        pickle.dump(obj, stream)

def loadObject(path):
    """Read back an object pickled in the file ``path + '.pkl'``.

    NOTE: unpickling can execute arbitrary code, so only load files
    that come from a trusted source.

    :param path: str. File path of the object to load, without the '.pkl' extension
    :return: obj: the unpickled object, or None when the file cannot be opened/read
    """
    source = path + '.pkl'
    try:
        with open(source, 'rb') as stream:
            return pickle.load(stream)
    except OSError:  # IOError is an alias of OSError in Python 3
        return None

# Training Utility


In [105]:
def load_data_splits (img_size, batch_size, shuffle_on_val=True):
  """
    load_data_splits builds the training, validation and test datasets from the
    'training_set', 'validation_set' and 'test_set' sub-directories of PREPROCESSED_DIR.

    :param img_size: image size forwarded to image_dataset_from_directory
    :param batch_size: batch size forwarded to image_dataset_from_directory
    :param shuffle_on_val: if True the validation set is shuffled
    :return: the training, validation and test datasets, in this order
  """
  def _from_directory(subdirectory, shuffle):
    # the label of each sample is obtained from the name of its directory
    return tf.keras.preprocessing.image_dataset_from_directory(
      PREPROCESSED_DIR + '/' + subdirectory,
      labels='inferred',
      seed=SEED,
      shuffle=shuffle,
      image_size=img_size,
      batch_size=batch_size,
    )

  # training data is always shuffled, test data never is
  train_ds = _from_directory('training_set', True)
  val_ds = _from_directory('validation_set', shuffle_on_val)
  test_ds = _from_directory('test_set', False)
  return train_ds, val_ds, test_ds

def split_dataset(dataset, dataset_size, train_percentage=0.6, val_percentage=0.2, test_percentage=0.2, shuffle=True):
  """
    split_dataset splits the dataset into training, validation and test sets.

    The training and validation sizes are the truncated products of their
    percentage and dataset_size; the test set receives all remaining elements
    up to dataset_size.

    :param dataset: a list representing the whole dataset
    :param dataset_size: number of elements in the dataset
    :param train_percentage: the percentage of the dataset that will be used for training
    :param val_percentage: the percentage of the dataset that will be used for validation
    :param test_percentage: the percentage of the dataset that will be used for testing
    :param shuffle: if True the elements are randomly shuffled before splitting
                    (a copy is shuffled, the list passed by the caller is left untouched)
    :return: three lists representing the training, validation and test sets,
             or (None, None, None) if the percentages do not add up to 1
  """
  total_percentage = train_percentage + val_percentage + test_percentage
  # compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly 1 in floating point
  if abs(total_percentage - 1) > 1e-9:
    print('Total of percentages must be 1')
    return None, None, None

  if shuffle:
    # the `random` module is imported as `rn` in this notebook
    dataset = list(dataset)
    rn.shuffle(dataset)

  train_size = int(train_percentage * dataset_size)
  val_size = int(val_percentage * dataset_size)
  val_end = train_size + val_size

  train_set = dataset[0:train_size]
  val_set = dataset[train_size:val_end]
  test_set = dataset[val_end:dataset_size]

  return train_set, val_set, test_set


def compile_model(model, metrics='accuracy', loss='binary_crossentropy', optimizer='adam', learning_rate = 0.0005):
  '''
    compile_model is used to compile the current model
    :param model: model to compile
    :param metrics: metric passed (wrapped in a list) to model.compile
    :param loss: loss passed to model.compile
    :param optimizer: optimizer to be used, either 'adam' or 'rmsprop'
    :param learning_rate: learning rate parameter for the optimizer
    :return: the compiled model, or None if the optimizer is not supported
  '''
  if optimizer == 'adam':
    optimizer = optimizers.Adam(learning_rate=learning_rate)
  elif optimizer == 'rmsprop':
    optimizer = optimizers.RMSprop(learning_rate=learning_rate)
  else:
    # keep returning None for backward compatibility, but tell the user why
    print("Unsupported optimizer '" + str(optimizer) + "': expected 'adam' or 'rmsprop'. Model not compiled.")
    return None

  model.compile(loss=loss,
    optimizer=optimizer,
    metrics=[metrics])

  return model

def run_model (model, model_name, train_ds, val_ds, epochs=50, patience=5, monitor='val_loss'):
  '''
  run_model trains the model with Early Stopping and Model Checkpoint, then
  stores the best checkpoint and the training history on Drive
  :param model: model to run
  :param model_name: name given to save the model
  :param train_ds: dataset used for training
  :param val_ds: dataset used for validation
  :param epochs: how many epochs to do
  :param patience: patience value for Early Stopping
  :param monitor: what to monitor for Early Stopping and Model Checkpoint
  :return: the model reloaded from the best checkpoint and the History object returned by fit
  '''
  # local save path for the checkpoint and Drive directory for the final artifacts
  local_path = "/content/model/" + model_name + '.h5'
  drive_path = MODELS_PATH + '/' + model_name

  # make sure the local checkpoint directory exists and drop any checkpoint left
  # by a previous run, so that a stale file can never be copied to Drive
  os.makedirs(os.path.dirname(local_path), exist_ok=True)
  if os.path.exists(local_path):
    os.remove(local_path)

  # deletes the old model directory on Drive (if any) and recreates it,
  # creating missing parent directories as well
  if os.path.isdir(drive_path):
    shutil.rmtree(drive_path)
  os.makedirs(drive_path, exist_ok=True)

  callbacks_list = [
    # we implement EarlyStopping to prevent overfitting
    keras.callbacks.EarlyStopping(monitor=monitor, patience=patience),
    # keep on disk only the best model according to the monitored quantity
    keras.callbacks.ModelCheckpoint(
      filepath=local_path,
      monitor=monitor,
      verbose=1,
      save_best_only=True),
  ]
  history = model.fit(train_ds,
                      epochs=epochs,
                      validation_data=val_ds,
                      callbacks=callbacks_list)

  # plain dict copy of every recorded series: it can be pickled and it does not
  # assume that the model was compiled with the 'accuracy' metric
  history_to_save = {name: list(values) for name, values in history.history.items()}

  # save on Drive only the best model
  shutil.copy(local_path, drive_path + '/' + model_name + '.h5')
  # save on Drive also the history
  saveObject(history_to_save, drive_path + '/history')

  return tf.keras.models.load_model(local_path), history


We provide Keras layers that perform data augmentation in order to fight overfitting.

In [106]:
# Augmentation pipeline, applied in this order:
#   1. horizontal flip on a random 50% of the images
#   2. vertical flip on a random 50% of the images
#   3. contrast adjustment by a random factor in the range [-20%, +20%]
data_augmentation_layers = keras.Sequential([
  layers.RandomFlip("horizontal"),
  layers.RandomFlip("vertical"),
  layers.RandomContrast(0.20),
])

# Evaluation Utilities


In [107]:
def plot_accuracy_and_loss_history(history):
  """
    plot_accuracy_and_loss_history draws two figures: training vs validation
    accuracy and training vs validation loss, one point per epoch.

    :param history: either the History object returned by model.fit or a plain
                    dict (such as the one persisted by run_model) holding the
                    'accuracy', 'val_accuracy', 'loss' and 'val_loss' series
  """
  # a History object exposes the series through `.history`; a dict is used as it is
  records = getattr(history, 'history', history)

  acc = records['accuracy']
  val_acc = records['val_accuracy']
  loss = records['loss']
  val_loss = records['val_loss']

  epochs = range(len(acc))

  plt.plot(epochs, acc, 'bo', label='Training accuracy')
  plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
  plt.title('Training and validation accuracy')
  plt.xlabel('Epoch')
  plt.ylabel('Accuracy')
  plt.legend()

  # second figure for the loss curves
  plt.figure()

  plt.plot(epochs, loss, 'bo', label='Training loss')
  plt.plot(epochs, val_loss, 'b', label='Validation loss')
  plt.title('Training and validation loss')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.legend()

  plt.show()

def _collect_labels_and_predictions(model, dataset):
  """
    Runs the model over the dataset in a single pass and returns the true and
    the predicted class names as two aligned 1-D arrays.

    Labels and predictions are gathered batch by batch from the same iteration,
    so they stay aligned even if the dataset yields its elements in a different
    order every time it is iterated.

    :param model: model exposing predict_on_batch
    :param dataset: iterable of (data_batch, labels_batch) pairs
    :return: (y_true, y_pred) arrays whose values are "all" or "hem"
    :raises ValueError: if the dataset yields no batch
  """
  true_batches, score_batches = [], []
  for data_batch, labels_batch in dataset:
    score_batches.append(np.asarray(model.predict_on_batch(data_batch)))
    true_batches.append(np.asarray(labels_batch))

  if not true_batches:
    raise ValueError("The dataset did not yield any batch")

  # flatten so that labels and scores share the same 1-D shape
  y_true = np.concatenate(true_batches, axis=0).ravel()
  y_score = np.concatenate(score_batches, axis=0).ravel()
  y_pred = np.rint(y_score)  # round value to 0 or 1

  # 0 --> all and 1 --> hem
  return np.where(y_true == 0, "all", "hem"), np.where(y_pred == 0, "all", "hem")

def evaluate_model(model, test_ds):
  """
    evaluate_model prints the classification report of the model on the dataset.

    :param model: model to evaluate
    :param test_ds: dataset of (data_batch, labels_batch) pairs
  """
  y_true, y_pred = _collect_labels_and_predictions(model, test_ds)

  print("Classification report:")
  # labels is given explicitly so that the report keeps both rows even when
  # one class is missing from the data
  print(metrics.classification_report(y_true, y_pred, labels=["all", "hem"], target_names=["all", "hem"], digits=4))

def plot_confusionmatrix(model, test_ds):
  """
    plot_confusionmatrix draws the confusion matrix of the model on the dataset.

    :param model: model to evaluate
    :param test_ds: dataset of (data_batch, labels_batch) pairs
  """
  y_true, y_pred = _collect_labels_and_predictions(model, test_ds)

  # produce and show confusion matrix
  cm = confusion_matrix(y_true, y_pred, labels=["all", "hem"])
  fig, ax = plt.subplots(figsize=(10, 10))
  ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["all", "hem"]).plot(cmap="viridis", ax=ax)



In [108]:
# Mount Google Drive at /content/drive so that the Drive-based paths used by
# this notebook (all located under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
