In [12]:
# Import Required Libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import sys
sys.path.append('./src')
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score, classification_report


In [21]:
# Colab optional setup

IS_COLAB = "google.colab" in sys.modules
print("Running on Colab:", IS_COLAB)
if IS_COLAB:
  from google.colab import drive
  drive.mount('/content/drive/', force_remount=True)
  #Adapt the folder to your specific one where you have downloaded the code
  %cd /content/drive/My Drive/progetto-daml


Running on Colab: True
Mounted at /content/drive/
/content/drive/My Drive/progetto-daml


# Model Evaluation on PlantVillage Dataset

This notebook evaluates the trained model using the test set. It includes metrics, confusion matrix, ROC curves, and a classification report.

Be careful if you are running this on COLAB or locally.
Due to some bugs, datasets creation is different.
Change the lower COLAB bool to True if running on COLAB.

In [22]:
COLAB = True  # if True, run on Google Colab, else on local repository

In [23]:
IMG_SIZE = (128, 128)
BATCH_SIZE = 32


In [86]:
#define prerpocess function
def preprocess(image, label, image_size=(128, 128)):
    image = tf.image.resize(image, image_size)
    image = tf.cast(image, tf.float32) / 255.0
    return image, label


IF RUNNING ON COLAB

In [159]:
if COLAB: #if running on colab
    ds_info = tfds.builder('plant_village').info
    ds_test = tfds.load('plant_village', split='train[95%:]', as_supervised=True)

    class_names = ds_info.features['label'].names
    families = sorted({n.split('___')[0] for n in class_names})
    split_labels = families
    family_map = tf.constant([families.index(n.split('___')[0]) for n in class_names], dtype=tf.int32)

    # returns label as one-hot
    def to_ohe(img, lbl):
      idx = tf.gather(family_map, lbl)
      return img, tf.one_hot(idx, len(split_labels))

    test_ds  = ds_test.map(to_ohe)

IF RUNNING ON LOCAL REPOSITORY

In [160]:
# Define local path to generate split
if not COLAB:
    from pathlib import Path
    from preprocessing import preprocess

    base_path = Path(r"D:\progetto-daml") #Change according to the path where PlantVillage-Dataset is cloned. Same as training notebook
    base_path = base_path / "PlantVillage-Dataset"

    OUTPUT_ROOT = base_path / "by_family" # 14 families splitting

    # read test set
    split_labels = sorted([p.name for p in (OUTPUT_ROOT/"train").iterdir() if p.is_dir()]) #folders names
    test_ds = tf.keras.utils.image_dataset_from_directory(
        str(OUTPUT_ROOT/"test"),
        image_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        label_mode='categorical',
        shuffle=False,
    )


# Model selection and test evaluation

In [161]:
models = {
    0: "best_model_basecnn_14_families.h5", # run with import from directory
    1: "best_model_14_families_categorical_l2coeff_1e-3.h5", # run with import from directory
    2: "best_model_14_families_focal_l2coeff_1e-3.h5", # run with import from directory
    3: "best_model_14_families_transfer_mobilenet.h5" # run with import through tfds.load
}

# model selection through index (if you change model, please recompile from preprocess function)
n = 1

model_name = models.get(n)
if model_name is None:
    print(f"No file associated to {n}")
else:
    model_path = f"./weights/{model_name}"
    print(f"Model file name: {model_name}")

#comment following line if you have trained and saved new files after Sunday 27/07, 9 PM
if n != 3 and not COLAB:
    test_ds = test_ds.map(lambda img, lbl: (tf.image.resize(img, IMG_SIZE), lbl)).prefetch(tf.data.AUTOTUNE)
if n != 3 and COLAB:
    test_ds = test_ds.map(lambda img, lbl: (tf.image.resize(img, IMG_SIZE), lbl)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
if n == 3 and COLAB:
    test_ds = test_ds.map(lambda img, lbl: preprocess(img, lbl, IMG_SIZE)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
if n == 3 and not COLAB:
    test_ds = test_ds.map(lambda img, lbl: preprocess(img, lbl, IMG_SIZE)).prefetch(tf.data.AUTOTUNE)

Model file name: best_model_14_families_categorical_l2coeff_1e-3.h5


In [162]:
import tensorflow_datasets as tfds

# Load Trained Model and Test Data
model = keras.models.load_model(model_path)




In [163]:
# Generate Predictions on Test Set
y_true = []
y_pred = []
y_score = []
for images, labels in test_ds:
    y_true.extend(np.argmax(labels.numpy(), axis=1))
    preds = model.predict(images)
    y_pred.extend(np.argmax(preds, axis=1))
    y_score.append(preds)
y_score = np.concatenate(y_score)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 416ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 339ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 313ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 320ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 315ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 316ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 319ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 317ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [164]:
# Calculate Evaluation Metrics (Accuracy, Precision, Recall, F1)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
print(f"Accuracy: {accuracy:.6f}")
print(f"Precision: {precision:.6f}")
print(f"Recall: {recall:.6f}")
print(f"F1-score: {f1:.6f}")


Accuracy: 0.990424
Precision: 0.990500
Recall: 0.990424
F1-score: 0.990421


In [None]:
  import matplotlib.pyplot as plt
from plotting import plot_confusion_matrix

plt.figure()  # Set a larger figure size for better readability
plot_confusion_matrix(
    np.array(y_true),
    np.array(y_pred),
    classes=split_labels,
    normalize=True,
    title="Confusion Matrix (Test Set)",
    cmap="plasma"
)
plt.tight_layout()  # Ensure labels and ticks are not cut off
plt.savefig(f'./conf-matrix/confusion_matrix_v{model_name}.png', dpi=400, bbox_inches='tight')
plt.show()


In [None]:
# Plot ROC Curves for Each Class
n_classes = y_score.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(np.eye(n_classes)[y_true][:, i], y_score[:, i])
    roc_auc[i] = roc_auc_score(np.eye(n_classes)[y_true][:, i], y_score[:, i])
plt.figure(figsize=(12, 12))
auc_and_idx = sorted([(roc_auc[i], i) for i in range(n_classes)], reverse=True)
for auc, i in auc_and_idx:
    plt.plot(fpr[i], tpr[i], label=f'{split_labels[i]} (AUC = {auc:.4f})')
plt.plot([0, 1], [0, 1], 'r--', lw=2, label='Random Classifier (AUC = 0.5)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - One vs Rest (Test Set)')
plt.legend(fontsize='small', bbox_to_anchor=(1.05, 1), loc='best')
plt.savefig(f'./roc-curves/roc_curve_v{model_name}.png', dpi=400, bbox_inches='tight')
plt.show()


In [None]:
# Display Classification Report
report = classification_report(y_true, y_pred, target_names=split_labels)
print(report)

# Save the classification report to a text file
with open(f'./reports/report_v{model_name}.txt', 'w') as f:
    f.write(report)
