# Progetto d'esame di Data Analysis in Experimental Physics with Machine Learning
Gruppo composto dagli studenti Luca Attinà, Sharis Feriotto e Matteo Marchisio Caprioglio

Dataset inizialmente ipotizzato: https://www.kaggle.com/datasets/vipoooool/new-plant-diseases-dataset

Questo dataset non va bene perché la data augmentation è stata applicata anche al validation set; come fallback usiamo il dataset PlantVillage originale: https://www.tensorflow.org/datasets/catalog/plant_village

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_datasets as tfds
import os
import pandas as pd

from tqdm.notebook import tqdm


In [None]:
# List the GPUs visible to TensorFlow. The device type must be a real type
# name: filtering on an empty string matches no device and always prints [].
print(tf.config.list_physical_devices('GPU'))


In [None]:
# Load PlantVillage from TFDS (the alternative Kaggle dataset augmented its
# validation set, which is wrong). TFDS only ships a 'train' split here, so it
# is sliced into 80% train / 15% validation / 5% test.
splits, ds_info = tfds.load(
    name='plant_village',
    split=['train[:80%]', 'train[80%:95%]', 'train[95%:]'],
    with_info=True,
    as_supervised=True,  # yield (image, label) tuples
    shuffle_files=True,
)
ds_train, ds_val, ds_test = splits


In [None]:
# Visualize a few labelled samples from the training split
tfds.show_examples(ds=ds_train, ds_info=ds_info)


In [None]:
# Report how many classes the label feature has, then keep their names
label_feature = ds_info.features['label']
print(f"Number of classes: {label_feature.num_classes}")
class_names = label_feature.names
print(f"Class names: {class_names}")


In [None]:
# Display the dataset metadata object returned by tfds.load as the cell output
ds_info


In [None]:
# Collect only the integer labels of the training split. Building a DataFrame
# from the full dataset would also hold every image in memory, which is far
# more than is needed just to count samples per class.
train_labels = np.fromiter(
    ds_train.map(lambda image, label: label).as_numpy_iterator(),
    dtype=np.int64,
)

# NOTE: `df` keeps its name but now contains only the 'label' column.
df = pd.DataFrame({'label': train_labels})

# Count the number of samples per class, ordered by class index
class_counts = df['label'].value_counts().sort_index()

# Print the counts with class names
for idx, count in class_counts.items():
    print(f"{class_names[idx]}: {count} images")


In [None]:
# Shared settings for the input pipeline and the training run
NUM_CLASSES = ds_info.features['label'].num_classes  # number of label classes
N_EPOCHS = 30                     # number of epochs passed to model.fit
APPLY_DATA_AUGMENTATION = False   # toggles the augmented training pipeline
BATCH_SIZE = 64                   # samples per batch
IMG_SIZE = (128, 128)             # target size images are resized to


In [None]:
# Random image transformations used only on the training data.
data_augmentation = tf.keras.Sequential([
    # tf.keras.layers.RandomFlip("horizontal"),
    # tf.keras.layers.RandomFlip("vertical"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomContrast(0.1),
])


def _scale_and_encode(image, label):
    """Scale a resized image by 1/255 and one-hot encode its label."""
    image = tf.cast(image, tf.float32) / 255.0
    return image, tf.one_hot(label, NUM_CLASSES)


def preprocess(image, label):
    """Resize an image to IMG_SIZE, scale it by 1/255 and one-hot the label.

    Args:
        image: image tensor of one dataset example.
        label: integer class index of the example.

    Returns:
        A tuple (image, one_hot_label) where the image is float32 and the
        label is a vector of length NUM_CLASSES.
    """
    image = tf.image.resize(image, IMG_SIZE)
    return _scale_and_encode(image, label)


def preprocess_with_aug(image, label):
    """Same as `preprocess`, with random augmentation applied after resizing.

    The augmentation runs before the 1/255 scaling, i.e. on the resized
    image still in its original pixel range.

    Args:
        image: image tensor of one dataset example.
        label: integer class index of the example.

    Returns:
        A tuple (augmented_image, one_hot_label).
    """
    image = tf.image.resize(image, IMG_SIZE)
    # training=True is passed explicitly so the random layers are active when
    # called from Dataset.map, outside of model.fit.
    image = data_augmentation(image, training=True)
    return _scale_and_encode(image, label)



In [None]:
# Preprocess and batch the datasets
AUTOTUNE = tf.data.AUTOTUNE
SHUFFLE_BUFFER_SIZE = 1000  # number of raw examples held in the shuffle buffer

# Only the training split is (optionally) augmented; validation and test
# always use the plain preprocessing.
if APPLY_DATA_AUGMENTATION:
    print("Data augmentation is enabled.")
    train_map_fn = preprocess_with_aug
else:
    print("Data augmentation is disabled.")
    train_map_fn = preprocess

train_ds = (
    ds_train
    # Shuffle at example level so batches differ between epochs
    # (model.fit does not shuffle tf.data inputs itself).
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .map(train_map_fn, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

val_ds = (
    ds_val
    .map(preprocess, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
test_ds = (
    ds_test
    .map(preprocess, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)


In [None]:
# # Count samples per class in the training set
# def count_classes(dataset, class_names):
#     class_counts = [0] * len(class_names)
    
#     for images, labels in dataset:
#         for label in labels:
#             class_idx = np.argmax(label.numpy())
#             class_counts[class_idx] += 1
    
#     return class_counts

# # Count classes in training set
# train_counts = count_classes(train_ds.unbatch(), class_names)

# # Display results
# print("Training set class distribution:")
# for i, (class_name, count) in enumerate(zip(class_names, train_counts)):
#     print(f"{class_name}: {count} images")

# print(f"\nTotal training images: {sum(train_counts)}")
# print(f"Min samples: {min(train_counts)}")
# print(f"Max samples: {max(train_counts)}")
# print(f"Average per class: {sum(train_counts)/len(train_counts):.1f}")
# print(f"Imbalance ratio: {max(train_counts)/min(train_counts):.2f}")


In [None]:
from keras.models import Sequential, Model
from keras.layers import Activation, BatchNormalization, Dense, Conv2D, MaxPooling2D, Dropout, Flatten, GlobalAveragePooling2D, ReLU, Rescaling
from keras.optimizers.legacy import Adam, SGD
from keras.losses import CategoricalCrossentropy

from keras.metrics import CategoricalAccuracy, Precision, Recall
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


In [None]:
def simple_cnn(input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3), num_classes=NUM_CLASSES):
    """Build a small convolutional classifier with two conv/pool stages.

    The model expects images whose pixel values have already been divided by
    255: the preprocessing functions of this notebook perform that scaling,
    so no Rescaling layer is used here (applying both would scale the
    inputs twice).

    Args:
        input_shape: (height, width, channels) of the input images.
        num_classes: number of units of the final softmax layer.

    Returns:
        An uncompiled Sequential model.
    """
    model = Sequential([
        Conv2D(16, (5, 5), activation='relu', padding='same', input_shape=input_shape),
        Dropout(0.2),
        MaxPooling2D((2, 2)),
        Conv2D(32, (5, 5), activation='relu', padding='same'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    return model


In [None]:
# Instantiate the CNN with the notebook-wide image size and class count,
# then print its layer-by-layer summary
model = simple_cnn(
    input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3),
    num_classes=NUM_CLASSES,
)
model.summary()


In [None]:
# Compile with Adam and categorical cross-entropy (labels are one-hot vectors)
optimizer = Adam(learning_rate=0.0002)
# optimizer = SGD(learning_rate=0.05, momentum=0.9)
model.compile(
    loss=CategoricalCrossentropy(),
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Checkpoint file name depends on whether augmentation is enabled
check_point_filename = (
    'best_model_w_augmentation.h5'
    if APPLY_DATA_AUGMENTATION
    else 'best_model_wo_augmentation.h5'
)


In [None]:
# Stop when the validation loss has not improved for 3 epochs and restore the
# best weights; also save the best model (by validation loss) to disk.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)
best_checkpoint = ModelCheckpoint(
    check_point_filename, monitor='val_loss', save_best_only=True, verbose=1
)
# ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)

history = model.fit(
    train_ds,
    epochs=N_EPOCHS,
    validation_data=val_ds,
    callbacks=[early_stopping, best_checkpoint],
)


In [None]:
# Plot training vs. validation curves: loss on the left, accuracy on the right
panels = [('loss', 'Loss'), ('accuracy', 'Accuracy')]

plt.figure(figsize=(12, 6))
for position, (metric, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric], label=f'Training {title}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {title}')
    plt.title(f'Model {title}')
    plt.xlabel('Epochs')
    plt.ylabel(title)
    plt.legend()
plt.tight_layout()
plt.show()
