In [1]:
# 02_data_preprocessing.ipynb

import os
import pickle

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Dataset locations (relative paths)
RAW_DATA_PATH = "../data/raw/PlantVillage"
PROCESSED_PATH = "../data/processed"

# Input geometry fed to the network (sized for MobileNetV2 / ResNet) and batching
IMG_HEIGHT = 224
IMG_WIDTH = 224
BATCH_SIZE = 32

# ========================
# Step 1: Data Augmentation
# ========================
SEED = 42               # fixes shuffle order and random transforms so runs are repeatable
VALIDATION_SPLIT = 0.2  # 20% for validation; must be identical on both generators below

# Training pipeline: rescale pixels to [0, 1] and apply light random augmentation.
# NOTE(review): rescale=1/255 yields [0, 1] inputs; the keras.applications
# preprocess_input helpers for MobileNetV2 / ResNet use different scaling --
# confirm the model notebook expects [0, 1] inputs.
datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=False,
    validation_split=VALIDATION_SPLIT,
)

# Validation pipeline: rescale only. Random augmentation on held-out images
# makes validation metrics noisy and not comparable between epochs, so the
# validation subset gets its own generator without any random transforms.
# Using the same validation_split on the same directory keeps the
# train/validation partition the same as with a single generator.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT,
)

# ========================
# Step 2: Train / Validation Generator
# ========================
train_generator = datagen.flow_from_directory(
    RAW_DATA_PATH,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training',
    shuffle=True,
    seed=SEED,
)

# shuffle=False keeps sample order stable so predictions can be aligned with
# validation_generator.classes / .filenames during evaluation.
validation_generator = val_datagen.flow_from_directory(
    RAW_DATA_PATH,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation',
    shuffle=False,
    seed=SEED,
)

# Both generators read the same directory tree, so their label maps must agree.
assert train_generator.class_indices == validation_generator.class_indices, \
    "train/validation class index mappings differ"

# ========================
# Step 3: Save class indices
# ========================
LABEL_MAP_PATH = "../models/label_encoder.pkl"

# Create the output directory if it is missing; otherwise open() raises
# FileNotFoundError on a fresh checkout that has no ../models folder yet.
os.makedirs(os.path.dirname(LABEL_MAP_PATH), exist_ok=True)

# Persist the {class_name: index} mapping produced by flow_from_directory.
with open(LABEL_MAP_PATH, "wb") as f:
    pickle.dump(train_generator.class_indices, f)

print("Classes saved:", train_generator.class_indices)


Found 16516 images belonging to 15 classes.
Found 4122 images belonging to 15 classes.
Classes saved: {'Pepper__bell___Bacterial_spot': 0, 'Pepper__bell___healthy': 1, 'Potato___Early_blight': 2, 'Potato___Late_blight': 3, 'Potato___healthy': 4, 'Tomato_Bacterial_spot': 5, 'Tomato_Early_blight': 6, 'Tomato_Late_blight': 7, 'Tomato_Leaf_Mold': 8, 'Tomato_Septoria_leaf_spot': 9, 'Tomato_Spider_mites_Two_spotted_spider_mite': 10, 'Tomato__Target_Spot': 11, 'Tomato__Tomato_YellowLeaf__Curl_Virus': 12, 'Tomato__Tomato_mosaic_virus': 13, 'Tomato_healthy': 14}
