In [2]:
#installation of tensorflow_datasets (un-comment if needed)
%pip install tensorflow_datasets



In [3]:
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import pandas as pd
from tqdm.notebook import tqdm

In [4]:
# Load the PlantVillage dataset from TFDS instead of the new dataset (it performed data aug on the validation set, which is wrong)
(ds_train, ds_val, ds_test), ds_info = tfds.load(
    'plant_village',
    split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
    shuffle_files=True,
    as_supervised=True,  # returns (image, label) pairs
    with_info=True
)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/plant_village/1.0.2...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/plant_village/incomplete.B8H7Y6_1.0.2/plant_village-train.tfrecord*...:   …

Dataset plant_village downloaded and prepared to /root/tensorflow_datasets/plant_village/1.0.2. Subsequent calls will reuse this data.


In [7]:
#Number of elements per class:
class_counts = Counter(tf.argmax(tf.one_hot(label, depth=ds_info.features['label'].num_classes)).numpy() for _, label in ds_train)
class_names = ds_info.features['label'].names
class_counts_named = {class_names[i]: count for i, count in class_counts.items()}

for name, count in class_counts_named.items():
    print(f"{name}: {count}")

Tomato___Septoria_leaf_spot: 1409
Tomato___Early_blight: 807
Squash___Powdery_mildew: 1466
Peach___Bacterial_spot: 1838
Tomato___Leaf_Mold: 758
Orange___Haunglongbing_(Citrus_greening): 4399
Soybean___healthy: 4043
Tomato___Target_Spot: 1124
Tomato___Tomato_Yellow_Leaf_Curl_Virus: 4252
Cherry___Powdery_mildew: 851
Grape___Esca_(Black_Measles): 1111
Pepper,_bell___Bacterial_spot: 800
Tomato___healthy: 1261
Corn___healthy: 939
Strawberry___Leaf_scorch: 923
Apple___Apple_scab: 501
Blueberry___healthy: 1210
Tomato___Late_blight: 1535
Apple___Black_rot: 467
Tomato___Bacterial_spot: 1732
Grape___Black_rot: 956
Pepper,_bell___healthy: 1173
Raspberry___healthy: 302
Apple___healthy: 1306
Corn___Northern_Leaf_Blight: 788
Corn___Common_rust: 953
Potato___Early_blight: 792
Tomato___Tomato_mosaic_virus: 303
Grape___Leaf_blight_(Isariopsis_Leaf_Spot): 881
Apple___Cedar_apple_rust: 221
Tomato___Spider_mites Two-spotted_spider_mite: 1360
Potato___Late_blight: 801
Cherry___healthy: 659
Strawberry___hea

In [8]:
#setting the desired number of elements in each class after augmentation (target).
#In this run target = # of elements in the largest class
target = max(class_counts.values())
class_elements = Counter(class_counts)
num_classes = len(class_names)

data_augmentation = tf.keras.Sequential([
    # tf.keras.layers.RandomFlip("horizontal"),
    # tf.keras.layers.RandomFlip("vertical"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomContrast(0.1),
])

def preprocess(image, label):
    image = tf.image.resize(image, IMG_SIZE)
    image = tf.cast(image, tf.float32) / 255.0
    return image, tf.one_hot(label, ds_info.features['label'].num_classes)

def preprocess_with_aug(image, label):
    image = tf.image.resize(image, IMG_SIZE)
    image = data_augmentation(image)  # <-- augment here
    image = tf.cast(image, tf.float32) / 255.0
    return image, tf.one_hot(label, ds_info.features['label'].num_classes)



In [11]:
#this cell should apply a class-specific augmentation pipeline before saving the image

# Create output directory
os.makedirs('augmented_data', exist_ok=True)

# Create output folders
output_root = 'augmented_data'
os.makedirs(output_root, exist_ok=True)
for class_name in class_names:
    os.makedirs(os.path.join(output_root, class_name), exist_ok=True)

#Number of elements per class in the new dir:
#new_class_elements = Counter(tf.argmax(label).numpy() for _, label in ds_train)
new_class_elements = Counter({i: 0 for i in range(len(class_names))})

# Augmentation
for i, (image, label) in enumerate(ds_train.repeat()):  # .repeat() allows infinite looping

    label_index = label.numpy()
    label_name = class_names[label_index]

    # Saving the og images in the new empty folders
    save_path = os.path.join(output_root, label_name, f'img_{new_class_elements[label_index]}.png')
    tf.keras.preprocessing.image.save_img(save_path, image.numpy())
    new_class_elements[label_index] += 1


    # Stop augmenting if this NEW class is already balanced (= the desired number of images has been saved into this new folder)
    if new_class_elements[label_index] >= target:
        continue

    # Stop augmenting if this class is already balanced (= the original folder does not require data augmentation, but the new folder is still being filled)
    if class_elements[label_index] >= target:
        continue

    #the two previous lines are the ones that allow the duplication (and eventual augmentation) of the original classes:
    #if a class contains a number of images < target, then its images will get duplicated and augmented in the new respective class
    #and until the number of elements in that class reaches target

    #if a class contains a number of images >= target, then its images will get duplicated in the new class without
    #augmentation until the new class reaches a number of elements = target; in this way if target > max # of elements
    # the classes with # of elements > target will be trimmed down in the new dir and will contain less images


    # Apply augmentation
    aug_image = data_augmentation(image)

    # Save image
    save_path = os.path.join(output_root, label_name, f'aug_{new_class_elements[label_index]}.png')
    tf.keras.preprocessing.image.save_img(save_path, aug_image.numpy())

    # Update count
    class_elements[label_index] += 1
    new_class_elements[label_index] += 1


    # Stop once all classes are balanced
    #if all(class_elements[c] >= target for c in range(num_classes)):
    if all(new_class_elements[c] >= target for c in range(num_classes)):
        print(" Classes balanced!! ")
        break


OSError: [Errno 28] No space left on device

In [None]:
# useful constants
IMG_SIZE = (128, 128)
BATCH_SIZE = 64
APPLY_DATA_AUGMENTATION = False
N_EPOCHS = 30
NUM_CLASSES = ds_info.features['label'].num_classes
DROP_RATE = 0.3
L2_REGULARIZATION = 0.005

In [None]:
#load the new training set in the training notebook

from tensorflow.keras.utils import image_dataset_from_directory

# Define the path to the augmented data
augmented_data_path = os.path.abspath("augmented_data")

# Load the dataset from the new directory
ds_augmented_train = tf.keras.utils.image_dataset_from_directory(
    augmented_data_path,
    labels='inferred',
    label_mode='categorical',   # one-hot encoding
    batch_size=32,
    image_size=(224, 224),      # or whatever size your model expects
    shuffle=True,
    seed=123
)

# Get class names (sorted as image_dataset_from_directory does)
class_names = sorted(os.listdir("augmented_data"))
print("Class names:", class_names)

# Count images per class
class_counts = {cls: len(os.listdir(os.path.join("augmented_data", cls))) for cls in class_names}

print("Number of images per class:")
for cls in class_names:
    print(f"{cls}: {class_counts[cls]} images")



In [None]:
from preprocessing import preprocess

# Constants
IMG_SIZE = (224, 224)  # Or whatever size you're using
BATCH_SIZE = 32

# Load augmented training data from directory
ds_augmented_train = tf.keras.utils.image_dataset_from_directory(
    'augmented_data',
    labels='inferred',
    label_mode='categorical',  # or 'int' if you use sparse labels
    image_size=IMG_SIZE,
    batch_size=None,  # So we can map first, then batch
    shuffle=True
)

# Apply preprocessing and batching
train_ds = ds_augmented_train.map(
    lambda image, label: preprocess(image, label, None, IMG_SIZE),
    num_parallel_calls=tf.data.AUTOTUNE
).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Keep validation and test the same (assuming they come from TFDS)
val_ds = ds_val.map(
    lambda image, label: preprocess(image, label, ds_info, IMG_SIZE),
    num_parallel_calls=tf.data.AUTOTUNE
).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

test_ds = ds_test.map(
    lambda image, label: preprocess(image, label, ds_info, IMG_SIZE),
    num_parallel_calls=tf.data.AUTOTUNE
).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
from keras.models import Sequential, Model
from keras.layers import Input, Activation, BatchNormalization, Dense, Conv2D, MaxPooling2D, Dropout, Flatten, GlobalAveragePooling2D, ReLU, Rescaling, Add
from keras.optimizers.legacy import Adam, SGD
from keras.losses import CategoricalCrossentropy
from keras.regularizers import l2

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [None]:
def simple_cnn_v1(#input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3),
                num_classes=NUM_CLASSES):
    model = Sequential([

        Conv2D(16, (5, 5), padding='same', kernel_regularizer=l2(L2_REGULARIZATION)),
        BatchNormalization(),
        Activation('relu'),
        MaxPooling2D((2, 2)),
        Dropout(DROP_RATE),

        Conv2D(32, (5, 5), padding='same', kernel_regularizer=l2(L2_REGULARIZATION)),
        BatchNormalization(),
        Activation('relu'),
        MaxPooling2D((2, 2)),
        Dropout(DROP_RATE),

        Flatten(),

        Dense(64, activation='relu', kernel_regularizer=l2(L2_REGULARIZATION)),
        Dense(num_classes, activation='softmax')
    ])
    return model

In [None]:
optimizer = Adam(learning_rate=0.0002)
# optimizer = SGD(learning_rate=0.05, momentum=0.9)
model.compile(
    optimizer=optimizer,
    loss=CategoricalCrossentropy(),
    metrics=['accuracy']
)

n_epochs = 30

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=n_epochs,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
        ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)
    ]
)

In [None]:
# Plot ROC AUC for each class (one-vs-rest)
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Get true labels and predicted probabilities for the validation set
y_true = []
y_score = []

for images, labels in val_ds:
    y_true.append(labels.numpy())
    y_score.append(model.predict(images))

y_true = np.concatenate(y_true)
y_score = np.concatenate(y_score)

# Compute ROC AUC for each class
n_classes = y_true.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc[i] = roc_auc_score(y_true[:, i], y_score[:, i])

In [None]:
# Plot confusion matrix for the validation set
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Get true and predicted labels for the validation set
y_true = []
y_pred = []

for images, labels in val_ds:
    y_true.extend(np.argmax(labels.numpy(), axis=1))
    preds = model.predict(images)
    y_pred.extend(np.argmax(preds, axis=1))

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred, normalize='true')

In [None]:
plt.figure(figsize=(14, 12))
sns.heatmap(cm, annot=False, fmt='d', cmap='magma',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix (Validation Set)')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()