In [2]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
from sklearn.metrics import confusion_matrix, classification_report

import warnings
# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed for the rest of the kernel session.
# NOTE(review): this also hides deprecation/numerical warnings that may matter;
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

# Record the TensorFlow version in the cell output.
print(f"TensorFlow version: {tf.__version__}")

2026-02-24 13:12:35.159511: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.18.0


In [3]:
# Paths (relative to the notebook's working directory)
processed_dir = Path("../data/processed")
splits_dir = Path("../data/splits")
models_dir = Path("../models")
# Make sure the model output directory exists; no error if it already does
models_dir.mkdir(parents=True, exist_ok=True)

# Preprocessing config (provides the normalization stats and random seed read below)
config = json.loads((processed_dir / "preprocessing_config.json").read_text())

# Class mappings: name -> index as stored, and index -> name with the JSON
# string keys converted back to ints
class_to_index = json.loads((processed_dir / "class_to_index.json").read_text())

index_to_class = {
    int(idx): name
    for idx, name in json.loads(
        (processed_dir / "index_to_class.json").read_text()
    ).items()
}

# Class weights, keyed by integer class index (JSON keys are strings on disk)
class_weights = {
    int(idx): weight
    for idx, weight in json.loads(
        (processed_dir / "class_weights.json").read_text()
    ).items()
}

# Core config
# Fixed input-pipeline settings
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
# Number of classes = number of entries in the class-name -> index mapping
N_CLASSES = len(class_to_index)
# Normalization statistics and the random seed are taken from the loaded config
MEAN, STD = (config['normalization'][stat] for stat in ('mean', 'std'))
RANDOM_SEED = config['random_seed']

# Summarize the effective settings, one per line
print(
    f"Classes: {N_CLASSES}",
    f"Image size: {IMAGE_SIZE}",
    f"Batch size: {BATCH_SIZE}",
    f"Normalization mean: {MEAN}",
    f"Normalization std: {STD}",
    sep="\n",
)

Classes: 15
Image size: (224, 224)
Batch size: 32
Normalization mean: [0.46, 0.48, 0.42]
Normalization std: [0.21, 0.18, 0.22]


In [4]:
# Load split manifests: one CSV per split, read in train / val / test order
train_df, val_df, test_df = (
    pd.read_csv(splits_dir / manifest)
    for manifest in ("train.csv", "val.csv", "test.csv")
)

# Report how many rows (images) each split contains
print(
    f"Train: {len(train_df)} images",
    f"Val:   {len(val_df)} images",
    f"Test:  {len(test_df)} images",
    sep="\n",
)

Train: 14227 images
Val:   3049 images
Test:  3049 images


In [5]:
# Normalization
# Dataset statistics from the preprocessing config as float32 tensors.
# NOTE(review): presumably one value per colour channel, broadcast over the
# trailing axis of the image — confirm against the preprocessing step.
mean = tf.constant(MEAN, dtype=tf.float32)
std = tf.constant(STD, dtype=tf.float32)

def normalize(image):
    """Rescale pixel values by 1/255, then standardize with `mean` and `std`.

    Args:
        image: Image tensor; it is cast to float32 before any arithmetic.
            Assumes raw 0-255 pixel values — TODO confirm with the caller.

    Returns:
        float32 tensor equal to ``(image / 255.0 - mean) / std``.
    """
    scaled = tf.cast(image, tf.float32) / 255.0
    return (scaled - mean) / std

# Minority classes (< MINORITY_THRESHOLD training images) get aggressive augmentation
MINORITY_THRESHOLD = 500

# Count training images per class in a single pass instead of re-filtering the
# whole DataFrame once per class.
_train_class_counts = train_df['class_name'].value_counts()

# Iterate the class mapping (not the counts) so the result keeps the mapping's
# order and still includes classes that have no training images (count 0).
MINORITY_CLASSES = [
    cls for cls in class_to_index
    if _train_class_counts.get(cls, 0) < MINORITY_THRESHOLD
]
print(f"Minority classes (aggressive augmentation): {MINORITY_CLASSES}")

def augment_standard(image):
    """Standard augmentation for majority classes.

    Applies, in order: random left/right flip, random up/down flip, random
    brightness shift (max_delta=0.2) and random contrast scaling (0.8-1.2).

    NOTE(review): nothing clips the result afterwards, so values may leave the
    input's original range — confirm downstream code tolerates that.

    Args:
        image: Image tensor to augment.

    Returns:
        The augmented image tensor.
    """
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(image)
    )
    brightened = tf.image.random_brightness(flipped, max_delta=0.2)
    return tf.image.random_contrast(brightened, lower=0.8, upper=1.2)

def augment_aggressive(image):
    """Aggressive augmentation for minority classes.

    Applies, in order: random left/right flip, random up/down flip, random
    brightness shift (max_delta=0.4), random contrast (0.6-1.4), random
    saturation (0.6-1.4) and random hue shift (max_delta=0.1).

    NOTE(review): the saturation/hue ops run after unclipped brightness and
    contrast changes; for float images they presumably expect values in
    [0, 1] — confirm the input dtype/range at the call site.

    Args:
        image: RGB image tensor to augment.

    Returns:
        The augmented image tensor.
    """
    # Geometric changes first
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(image)
    )
    # Photometric changes: brightness, then contrast
    brightened = tf.image.random_brightness(flipped, max_delta=0.4)
    contrasted = tf.image.random_contrast(brightened, lower=0.6, upper=1.4)
    # Colour changes: saturation, then hue
    saturated = tf.image.random_saturation(contrasted, lower=0.6, upper=1.4)
    return tf.image.random_hue(saturated, max_delta=0.1)

def augment_denoise(image):
    """Standard augmentation followed by a light 3x3 smoothing pass.

    The smoothing is a stride-1 average pool with a 3x3 window and 'SAME'
    padding, i.e. a box (mean) filter rather than a true Gaussian kernel.

    NOTE(review): originally described as "weighted toward Tomato classes",
    but no class selection happens in this function — presumably the caller
    decides which samples receive it; verify.

    Args:
        image: Single image tensor without a batch axis.

    Returns:
        The augmented and smoothed image tensor, again without a batch axis.
    """
    augmented = augment_standard(image)
    # The pooling op works on batches: add a leading axis, pool, then drop it
    batched = tf.expand_dims(augmented, axis=0)
    smoothed = tf.nn.avg_pool2d(batched, ksize=3, strides=1, padding='SAME')
    return tf.squeeze(smoothed, axis=0)

Minority classes (aggressive augmentation): ['Potato___healthy', 'Tomato__Tomato_mosaic_virus']
