# Progetto d'esame di Data Analysis in Experimental Physics with Machine Learning

Gruppo composto dagli studenti Luca Attinà, Sharis Feriotto e Matteo Marchisio Caprioglio

Ipotesi dataset iniziale: https://www.kaggle.com/datasets/vipoooool/new-plant-diseases-dataset

Questo dataset non va bene perché applica la data augmentation anche al validation set; si ripiega quindi sul dataset PlantVillage originale: https://www.tensorflow.org/datasets/catalog/plant_village

In [None]:
# Initial imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import tensorflow as tf
import tensorflow_datasets as tfds

import os
import sys
sys.path.append('./src')
import pickle

# from tqdm.notebook import tqdm


In [None]:
# Colab optional setup
IS_COLAB = "google.colab" in sys.modules
print("Running on Colab:", IS_COLAB)
if IS_COLAB:
  from google.colab import drive
  drive.mount('/content/drive/', force_remount=True)
  #Adapt the folder to your specific one where you have downloaded the code
  %cd /content/drive/My Drive/path_to/exam-project


In [None]:
# Report the GPUs visible to TensorFlow (an empty list means none were found).
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)


In [None]:
# Reproducibility: seed every random number generator used in this notebook
# (Python's `random`, NumPy, TensorFlow and Keras) with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
tf.keras.utils.set_random_seed(SEED)


## Load the PlantVillage dataset from TFDS

In [None]:
# PlantVillage ships a single 'train' split in TFDS, so the training and
# validation sets are carved out of it: first 80% for training, the next 15%
# for validation (the last 5% is left untouched by this notebook).
#
# `shuffle_files` in `tfds.load` is a single boolean that applies to every
# requested split, so each split is loaded with its own call in order to
# shuffle the input files of the training split only.
ds_train, ds_info = tfds.load(
    'plant_village',
    split='train[:80%]',
    shuffle_files=True,
    as_supervised=True,  # returns (image, label) pairs
    with_info=True,
)
ds_val = tfds.load(
    'plant_village',
    split='train[80%:95%]',
    shuffle_files=False,
    as_supervised=True,  # returns (image, label) pairs
)

# print the number of elements in each dataset
print(f"Number of training images: {ds_info.splits['train[:80%]'].num_examples}")
print(f"Number of validation images: {ds_info.splits['train[80%:95%]'].num_examples}")


In [None]:
# Preview a 4x4 grid of training samples.
# Flip SAVE_EXAMPLES to True to also write the figure to 'example_images.png'.
SAVE_EXAMPLES = False
fig = tfds.show_examples(ds_train, ds_info, rows=4, cols=4)
if SAVE_EXAMPLES:
    fig.savefig('example_images.png')


In [None]:
# Walk the training split once and keep only the label of each example
# (the image half of every pair is discarded).
labels_list = [label.numpy() for _, label in ds_train]

# A pandas Series gives us value_counts() for the per-class tally.
labels_series = pd.Series(labels_list)

# Class metadata comes from the dataset info rather than from the data itself.
label_feature = ds_info.features['label']
number_of_classes = label_feature.num_classes
class_names = label_feature.names
print(f"Number of classes: {number_of_classes}")

# Occurrences per label id, ordered by label id. Labels that never occur in
# the split do not appear in the result.
class_counts = labels_series.value_counts().sort_index()

# Print the counts with class names
print("Number of images per class:")
for idx, count in class_counts.items():
    print(f"{class_names[idx]}: {count} images")


### Visualize the class distribution in the training dataset

In [None]:
NORMALIZE = False  # Set to True to plot each class as a fraction of all counted images

# Divide by the total once when normalizing; dividing by 1 keeps the raw counts.
scale = np.sum(class_counts) if NORMALIZE else 1

plt.figure(figsize=(12, 6))
# One alpha value per class id, fading from opaque to 0.4.
alphas = np.linspace(1, 0.4, ds_info.features['label'].num_classes)
# `class_counts` is indexed by label id and only holds labels that actually
# occur in the split, so the class name and alpha are looked up by id. Pairing
# it positionally with the full `class_names` list would mislabel every bar
# after the first missing class.
for label_id, n_images in class_counts.items():
    plt.bar(class_names[label_id], n_images / scale, color='green', alpha=alphas[label_id])
plt.xlabel('Class')
plt.xticks(rotation=90)
plt.grid(axis='y')
if NORMALIZE:
    plt.ylabel('Normalized #Images per Class')
    plt.title('Normalized Class Distribution in PlantVillage Dataset')
    out_file = 'norm_class_distr.png'
else:
    plt.ylabel('#Images per Class')
    plt.title('Class Distribution in PlantVillage Dataset')
    out_file = 'class_distr.png'
plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave space for the title
plt.savefig(out_file)
plt.show()


In [None]:
# ---- Notebook-wide constants ----

# Number of target classes, taken from the dataset metadata.
NUM_CLASSES = ds_info.features['label'].num_classes

# Input geometry and training hyperparameters.
IMG_SIZE = (128, 128)      # Image size for resizing in preprocessing
IMG_CHANNELS = 3           # Number of channels in the images (RGB)
BATCH_SIZE = 64            # Batch size for training
N_EPOCHS = 50              # Number of epochs for training
STARTING_LR = 2e-4         # Starting learning rate for the optimizer
EARLY_STOP_PATIENCE = 4    # Patience for early stopping
REDUCE_LR_PATIENCE = 3     # Patience for reducing learning rate

# Output locations: every artifact of this run shares COMMON_FILENAME as stem.
COMMON_FILENAME = 'base_model'
WEIGHTS_DIR = './weights'
HISTORY_DIR = './train-hist'
os.makedirs(WEIGHTS_DIR, exist_ok=True)
os.makedirs(HISTORY_DIR, exist_ok=True)
CHECKPOINT_FILE = WEIGHTS_DIR + '/' + COMMON_FILENAME + '.h5'
HISTORY_FILE = HISTORY_DIR + '/history_' + COMMON_FILENAME  # extension added by the writers



## Preprocess and batch the datasets

In [None]:
from preprocessing import preprocess

def _to_batches(dataset):
    """Apply `preprocess` to every (image, label) pair, then batch and prefetch.

    `preprocess` is the project helper imported from `preprocessing`; it is
    called with the dataset info and the target IMG_SIZE.
    """
    mapped = dataset.map(
        lambda image, label: preprocess(image, label, ds_info, IMG_SIZE),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    return mapped.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


# Both splits go through exactly the same pipeline.
train_ds = _to_batches(ds_train)
val_ds = _to_batches(ds_val)


## Build the base architecture

In [None]:
from keras.models import Sequential, Model
from keras.layers import Activation, BatchNormalization, Dense, Conv2D, MaxPooling2D, Dropout, Flatten, GlobalAveragePooling2D, ReLU, Rescaling
from keras.optimizers.legacy import Adam, SGD
from keras.losses import CategoricalCrossentropy

from keras.metrics import CategoricalAccuracy, Precision, Recall
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


In [None]:
def simple_cnn(input_shape=(IMG_SIZE[0], IMG_SIZE[1], IMG_CHANNELS),
                   num_classes=NUM_CLASSES,
                   drop_rate=0.4,
                   ):
    """Build the baseline CNN: two conv/pool stages and a softmax classifier.

    The model is returned uncompiled.

    Args:
        input_shape: shape handed to the first Conv2D layer as `input_shape`.
        num_classes: number of units in the final softmax Dense layer.
        drop_rate: rate of the Dropout layer placed right after the first
            convolution.

    Returns:
        A Keras `Sequential` model.
    """
    # Layers are created in the same order in which they are stacked.
    conv_stages = [
        Conv2D(16, (3, 3), activation='relu', padding='same', input_shape=input_shape),
        Dropout(drop_rate),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(32, (3, 3), activation='relu', padding='same'),
        MaxPooling2D(pool_size=(2, 2)),
    ]
    classifier_head = [
        Flatten(),
        Dense(num_classes, activation='softmax'),
    ]
    return Sequential(conv_stages + classifier_head)


In [None]:
# Instantiate the baseline network with its default arguments, build it for a
# variable batch dimension and print the layer-by-layer summary.
model = simple_cnn()
batch_input_shape = (None,) + tuple(IMG_SIZE[:2]) + (IMG_CHANNELS,)
model.build(input_shape=batch_input_shape)
model.summary()


## Train the base architecture

In [None]:
# All three callbacks watch the validation accuracy:
#  - EarlyStopping stops training after EARLY_STOP_PATIENCE epochs without
#    improvement and restores the best weights;
#  - ModelCheckpoint writes the best model seen so far to CHECKPOINT_FILE;
#  - ReduceLROnPlateau multiplies the learning rate by 0.2 after
#    REDUCE_LR_PATIENCE epochs without improvement.
training_callbacks = [
    EarlyStopping(
        monitor='val_accuracy',
        patience=EARLY_STOP_PATIENCE,
        restore_best_weights=True,
        verbose=1,
    ),
    ModelCheckpoint(
        CHECKPOINT_FILE,
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1,
    ),
    ReduceLROnPlateau(
        monitor='val_accuracy',
        factor=0.2,
        patience=REDUCE_LR_PATIENCE,
        verbose=1,
    ),
]

optimizer = Adam(learning_rate=STARTING_LR)
model.compile(optimizer=optimizer, loss=CategoricalCrossentropy(), metrics=['accuracy'])

n_epochs = N_EPOCHS

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=N_EPOCHS,
    callbacks=training_callbacks,
)


### Visualize the model training history

In [None]:
# Persist the recorded training history (the `history.history` dict) so the
# curves can be inspected later without retraining.
with open(f'{HISTORY_FILE}.pkl', 'wb') as history_out:
    pickle.dump(history.history, history_out)


In [None]:
from plotting import plot_model_history

# `plot_model_history` is the project helper from `plotting`; presumably it
# draws the training curves on the current figure (confirm in src/plotting).
# The current figure is then exported next to the pickled history.
plot_model_history(history)
history_png = f'{HISTORY_FILE}.png'
plt.savefig(history_png, dpi=400, bbox_inches='tight')


### For the model evaluation, execute the notebook model_evaluation.ipynb

In [None]:
# # Plot ROC AUC for each class (one-vs-rest)
# from sklearn.metrics import roc_auc_score, roc_curve
# import matplotlib.pyplot as plt

# # Get true labels and predicted probabilities (NOTE: the loop below iterates `test_ds`, which this notebook does not define; use `val_ds` for the validation set)
# y_true = []
# y_score = []

# for images, labels in test_ds:
#     y_true.append(labels.numpy())
#     y_score.append(model.predict(images))

# y_true = np.concatenate(y_true)
# y_score = np.concatenate(y_score)

# # Compute ROC AUC for each class
# n_classes = y_true.shape[1]
# fpr = dict()
# tpr = dict()
# roc_auc = dict()

# for i in range(n_classes):
#     fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_score[:, i])
#     roc_auc[i] = roc_auc_score(y_true[:, i], y_score[:, i])


In [None]:
# # Plot ROC curve for each class, legend sorted by AUC score
# plt.figure(figsize=(12, 12))

# # Prepare list of (auc, i) and sort descending
# auc_and_idx = sorted([(roc_auc[i], i) for i in range(n_classes)], reverse=True)

# for auc, i in auc_and_idx:
#     plt.plot(fpr[i], tpr[i], label=f'{class_names[i]} (AUC = {auc:.4f})')
# plt.plot([0, 1], [0, 1], 'r--', lw=2, label='Random Classifier')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve - One vs Rest (Validation Set)')
# plt.legend(fontsize='small', bbox_to_anchor=(1.05, 1), loc='upper left')


In [None]:
# # Plot confusion matrix for the validation set
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics import confusion_matrix

# # Get true and predicted labels for the validation set
# y_true = []
# y_pred = []

# for images, labels in val_ds:
#     y_true.extend(np.argmax(labels.numpy(), axis=1))
#     preds = model.predict(images)
#     y_pred.extend(np.argmax(preds, axis=1))

# # Compute confusion matrix
# cm = confusion_matrix(y_true, y_pred, normalize='true')


In [None]:
# plt.figure(figsize=(14, 12))
# sns.heatmap(cm, annot=False, fmt='d', cmap='magma', 
#             xticklabels=class_names, yticklabels=class_names)
# plt.xlabel('Predicted label')
# plt.ylabel('True label')
# plt.title('Confusion Matrix (Validation Set)')
# plt.xticks(rotation=90)
# plt.yticks(rotation=0)
# plt.tight_layout()
# plt.show()


In [None]:
# 
