In [None]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_datasets as tfds
import os
import pandas as pd

from tqdm.notebook import tqdm


In [None]:
# Load PlantVillage straight from TFDS rather than the newer pre-augmented copy:
# that copy applied data augmentation to its validation set as well, which is wrong.
# Only a 'train' split is requested here, carved 80% / 10% / 10% into
# train / validation / test by percentage slices.
(ds_train, ds_val, ds_test), ds_info = tfds.load(
    name='plant_village',
    split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
    as_supervised=True,  # yield (image, label) tuples instead of feature dicts
    with_info=True,      # also return the DatasetInfo (label names, class count)
    shuffle_files=True,
)


2025-07-18 12:23:12.490997: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-07-18 12:23:12.491021: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-07-18 12:23:12.491025: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
2025-07-18 12:23:12.491049: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-07-18 12:23:12.491068: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
# Random augmentation pipeline, assembled layer by layer: a rotation, a zoom and
# a contrast jitter, each constructed with a factor of 0.1.
# Horizontal / vertical flips are intentionally left disabled:
#   tf.keras.layers.RandomFlip("horizontal")
#   tf.keras.layers.RandomFlip("vertical")
data_augmentation = tf.keras.Sequential()
data_augmentation.add(tf.keras.layers.RandomRotation(0.1))
data_augmentation.add(tf.keras.layers.RandomZoom(0.1))
data_augmentation.add(tf.keras.layers.RandomContrast(0.1))

def preprocess(image, label):
    """Resize and rescale an image and one-hot encode its label (no augmentation).

    Relies on two notebook-level names: ``IMG_SIZE`` (not defined in the cells
    shown here; it must exist before this function is called) and ``ds_info``
    (returned by ``tfds.load``).

    Args:
        image: Image tensor accepted by ``tf.image.resize``.
        label: Integer class index to be one-hot encoded.

    Returns:
        Tuple ``(image, one_hot_label)``: the image resized to ``IMG_SIZE``,
        cast to float32 and divided by 255.0; the label encoded with depth
        ``ds_info.features['label'].num_classes``.
    """
    num_classes = ds_info.features['label'].num_classes
    resized = tf.image.resize(image, IMG_SIZE)
    scaled = tf.cast(resized, tf.float32) / 255.0
    return scaled, tf.one_hot(label, num_classes)

def preprocess_with_aug(image, label):
    """Resize, randomly augment and rescale an image; one-hot encode its label.

    Same steps as the plain preprocessing, except that the resized image is
    passed through ``data_augmentation`` before it is divided by 255.0, so the
    augmentation layers see values on the pre-rescaling scale.

    Relies on notebook-level names ``IMG_SIZE`` (not defined in the cells shown
    here), ``data_augmentation`` and ``ds_info``.

    Args:
        image: Image tensor accepted by ``tf.image.resize``.
        label: Integer class index to be one-hot encoded.

    Returns:
        Tuple ``(image, one_hot_label)``: the augmented image as float32 divided
        by 255.0, and the label encoded with depth
        ``ds_info.features['label'].num_classes``.
    """
    num_classes = ds_info.features['label'].num_classes
    resized = tf.image.resize(image, IMG_SIZE)
    augmented = data_augmentation(resized)  # augmentation happens before rescaling
    scaled = tf.cast(augmented, tf.float32) / 255.0
    return scaled, tf.one_hot(label, num_classes)



In [None]:
import tensorflow as tf
import os

# Export a sample of augmented training images to disk, one sub-directory per
# class, so the labels can be recovered from the folder structure on reload.
# (Previously every image went into one flat folder and the label was dropped.)
output_dir = 'augmented_data'
class_names = ds_info.features['label'].names

# Create every class directory up front, including ones that stay empty for
# this sample, so the explicit `class_names` given at load time always matches
# the set of sub-directories.
for class_name in class_names:
    os.makedirs(os.path.join(output_dir, class_name), exist_ok=True)

for i, (image, label) in enumerate(ds_train.take(1000)):  # Limit for demo
    # Apply augmentation
    aug_image = data_augmentation(image)
    # Keep values inside the 8-bit range so the uint8 conversion on save cannot wrap
    aug_image = tf.clip_by_value(tf.cast(aug_image, tf.float32), 0.0, 255.0)
    class_dir = os.path.join(output_dir, class_names[int(label.numpy())])
    # scale=False: by default save_img min/max-stretches each image to [0, 255],
    # which would alter the pixel values produced by the augmentation.
    tf.keras.preprocessing.image.save_img(
        os.path.join(class_dir, f'img_{i}.png'),
        aug_image.numpy(),
        scale=False,
    )

# Later, load with inferred labels. Passing `class_names` keeps the label indices
# in the same order as the TFDS labels; 'categorical' yields one-hot labels.
# NOTE(review): image_size and batch_size are left at the Keras defaults and the
# images are not divided by 255 here -- adjust to match the training pipeline.
train_ds = tf.keras.utils.image_dataset_from_directory(
    output_dir,
    labels='inferred',
    label_mode='categorical',
    class_names=class_names,
)
