# Colab training routine

## Set Up the Environment

In [None]:
! git clone https://github.com/LeonardoDiCaterina/DL.git

## Hardware Testing

In [None]:
from psutil import virtual_memory
import tensorflow as tf
print('TensorFlow version:', tf.__version__)

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

In [None]:
print("GPU Available:", tf.config.list_physical_devices('GPU'))

In [None]:
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("Is GPU available:", tf.test.is_gpu_available())

In [None]:
dummy_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(256, 256, 3)),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(10)
])


dummy_input = tf.random.normal((1, 256, 256, 3))
out = dummy_model(dummy_input)
print(out.shape)

## prepare the direcory for the dataset and preprcess the data

In [3]:
! mkdir /content/DL/data/downloaded_dataset

mkdir: /content/DL/data: No such file or directory


In [None]:
! gdown --id 1PyxqW_nsORX4PetkQo6OIL0mUL1pFsTD --output /content/DL/data/downloaded_dataset/rare_species.zip

In [None]:
! unzip data/downloaded_dataset/rare_species.zip -d data/downloaded_dataset

In [None]:
%cd DL

copy this in the config file to the `preprocessing_config.py` file
```python
DATA_DIR = 'data/downloaded_dataset'
DEST_DIR = 'data/rearranged'
CSV_PATH = f'{DATA_DIR}/metadata.csv'
N_SPLITS = 5 #it's a positive integer
TEST_SIZE = 0.2 # it's a ratio therefore has to be between 0 and 1
OVERSAMPLE = True
LOG_LEVEL = 'INFO'
LABEL_COL = 'family'
OVERCLASS_COL = None
```


In [23]:
! python -m data_preprocessing.main

2025-05-02 16:20:36,948 - __main__ - INFO - Removing existing directory data/rearranged
2025-05-02 16:20:36,948 - __main__ - INFO - Starting preprocessing pipeline
2025-05-02 16:20:37,002 - __main__ - INFO - Loaded metadata with 11983 entries
2025-05-02 16:20:37,002 - data_preprocessing.splitting - INFO - Splitting dataset into train/test and folds
2025-05-02 16:20:37,030 - data_preprocessing.splitting - INFO - Created 5 stratified folds
2025-05-02 16:20:37,030 - __main__ - INFO - Data splitting completed
2025-05-02 16:20:37,030 - __main__ - INFO - Creating directory data/rearranged/fold_0
2025-05-02 16:20:37,030 - __main__ - INFO - Renaming and saving the fold 0
2025-05-02 16:20:37,030 - data_preprocessing.augmentation - INFO - Starting oversampling and saving process
2025-05-02 16:20:37,030 - data_preprocessing.augmentation - INFO - origin_root: data/downloaded_dataset --> dest_root: data/rearranged/fold_0
2025-05-02 16:20:37,030 - data_preprocessing.data_utils - INFO - Calculating n

# train the model with the whole data

copy this in the config file to the `training/training_config.py` file
```python
NUM_CLASSES = 202
INPUT_SHAPE = (256, 256, 3)
N_EPOCHS_4CV = 5 #number of epochs for cross-validation
N_EPOCHS_4FULL_TRAIN = 10 #number of epochs for test set
```

In [26]:
# load the dataset
from data_preprocessing.data_loading import load_data
train_folds,test_ds = load_data()

2025-05-02 16:23:26,111 - data_preprocessing.data_loading - INFO - Loading data from data/rearranged


Found 2397 files belonging to 202 classes.


2025-05-02 16:23:26,608 - data_preprocessing.data_loading - INFO - Loaded test dataset with 75 batches
2025-05-02 16:23:26,608 - data_preprocessing.data_loading - INFO - Test dataset shape: (TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 202), dtype=tf.float32, name=None))
Loading folds:   0%|          | 0/5 [00:00<?, ?it/s]2025-05-02 16:23:26,610 - data_preprocessing.data_loading - INFO - Loading fold 0 from data/rearranged/fold_0


Found 1918 files belonging to 202 classes.


2025-05-02 16:23:26,709 - data_preprocessing.data_loading - INFO - Loading fold 1 from data/rearranged/fold_1


Found 1917 files belonging to 202 classes.


Loading folds:  40%|████      | 2/5 [00:00<00:00,  6.87it/s]2025-05-02 16:23:26,903 - data_preprocessing.data_loading - INFO - Loading fold 2 from data/rearranged/fold_2


Found 1917 files belonging to 202 classes.


2025-05-02 16:23:26,996 - data_preprocessing.data_loading - INFO - Loading fold 3 from data/rearranged/fold_3


Found 1917 files belonging to 202 classes.


Loading folds:  80%|████████  | 4/5 [00:00<00:00,  7.13it/s]2025-05-02 16:23:27,175 - data_preprocessing.data_loading - INFO - Loading fold 4 from data/rearranged/fold_4


Found 1917 files belonging to 202 classes.


Loading folds: 100%|██████████| 5/5 [00:00<00:00,  7.57it/s]
2025-05-02 16:23:27,271 - data_preprocessing.data_loading - INFO - Loaded 5 folds


In [None]:
from functools import reduce
from training.model_selection_utils import build_model

from tf.keras.optimizers import Adam
from tf.keras.callbacks import EarlyStopping

def train_best_model(train_folds, test_data,configuration, input_shape = (256, 256, 3) , num_classes = 202 ,
                      epochs=10):

    model_name, freeze_until, dense_layers, learning_rate = configuration

    # Combine all folds into one dataset
    full_train_ds = reduce(lambda x, y: x.concatenate(y), train_folds)
    full_train_ds = full_train_ds.cache().prefetch(tf.data.AUTOTUNE)

    # Build model with best configuration
    model = build_model(model_name, freeze_until, dense_layers, input_shape, num_classes)

    # Compile with specified learning rate
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Optional early stopping for safety
    early_stop = EarlyStopping(monitor='accuracy', patience=3, restore_best_weights=True)

    print(f"\nTraining final model with freeze_until={freeze_until}, "
          f"dense_layers={dense_layers}, learning_rate={learning_rate:.2e}")

    # Fit the model with validation data
    history = model.fit(full_train_ds,
                        epochs=epochs,
                        validation_data=test_data,
                        callbacks=[early_stop],
                        verbose=1)

    return model, history


In [None]:
configuration = ('ResNet50', 50, [512], 0.0001)
best_model, best_history = train_best_model(train_folds,test_ds,configuration)

In [None]:
import matplotlib.pyplot as plt
def plot_history(history):
    """
    Plots training and validation accuracy and loss from a Keras history object.

    Args:
        history: A History object returned by model.fit().
    """
    acc = history.history.get('accuracy')
    val_acc = history.history.get('val_accuracy')
    loss = history.history.get('loss')
    val_loss = history.history.get('val_loss')

    if acc and loss:
        epochs = range(1, len(acc) + 1)

        # Plot accuracy
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        plt.plot(epochs, acc, 'o-', label='Training Accuracy')
        if val_acc:
            plt.plot(epochs, val_acc, 's-', label='Validation Accuracy')
        plt.title('Model Accuracy per Epoch')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.grid(True)

        # Plot loss
        plt.subplot(1, 2, 2)
        plt.plot(epochs, loss, 'o-', label='Training Loss')
        if val_loss:
            plt.plot(epochs, val_loss, 's-', label='Validation Loss')
        plt.title('Model Loss per Epoch')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)

        plt.tight_layout()
        plt.show()
    else:
        print("History object does not contain accuracy or loss data.")
        
plot_history(best_history)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import tqdm

def plot_confusion_matrix_from_dataset_with_class_names(model, test_dataset):
    """
    Generates and plots a confusion matrix using class names from the dataset.

    Args:
        model: A trained TensorFlow Keras model.
        test_dataset: A TensorFlow PrefetchDataset with a 'class_names' attribute.
    """
    if not hasattr(test_dataset, 'class_names'):
        raise AttributeError("The test_dataset must have a 'class_names' attribute.")

    #class_names = test_dataset.class_names
    y_true = []
    y_pred = []
    print(len(test_dataset))
    for images, labels in tqdm(test_dataset, desc="Predicting"):
        y_true.extend(np.argmax(labels.numpy(), axis=1))
        y_pred.extend(np.argmax(model.predict(images, verbose=0), axis=1))
    cm = confusion_matrix(y_true, y_pred)
    normalized_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.figure(figsize=(10, 8))
    sns.heatmap(normalized_cm, annot=False, fmt='.2f', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Normalized Confusion Matrix')
    plt.show()

plot_confusion_matrix_from_dataset_with_class_names(best_model, test_ds)