In [None]:
from google.colab import drive
import os
import json
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
import cv2

# Make the user's Google Drive visible under /content/drive (remounting if needed)
drive.mount('/content/drive', force_remount=True)

# Root folder of the raw dataset and its three split sub-folders
base_path = '/content/drive/MyDrive/Strawberry Diseases Detection/raw_data'
train_path, val_path, test_path = (
    os.path.join(base_path, split_name) for split_name in ('train', 'val', 'test')
)

# Echo the resolved paths, one per line, as a quick sanity check
print(train_path, val_path, test_path, sep='\n')

Mounted at /content/drive
/content/drive/MyDrive/Strawberry Diseases Detection/raw_data/train
/content/drive/MyDrive/Strawberry Diseases Detection/raw_data/val
/content/drive/MyDrive/Strawberry Diseases Detection/raw_data/test


In [None]:
# Disease name -> class index (7 classes); the index is the name's position in this list
disease_labels = {
    disease_name: class_index
    for class_index, disease_name in enumerate([
        "Angular Leafspot",
        "Anthracnose Fruit Rot",
        "Blossom Blight",
        "Gray Mold",
        "Leaf Spot",
        "Powdery Mildew Fruit",
        "Powdery Mildew Leaf",
    ])
}

In [None]:
# Step 1: Load paired data
def load_dataset(dataset_path,limit=None):
    """
    Load and pair images (.jpg) with JSON files in the given path.

    An image ``<name>.jpg`` is paired with ``<name>.json`` from the same
    directory; images without a matching JSON file are skipped.

    Args:
        dataset_path: Directory containing the ``.jpg`` and ``.json`` files.
        limit: Optional maximum number of pairs to return. A falsy value
            (``None`` or ``0``) means no limit.

    Returns:
        A list of ``(image_path, json_path)`` tuples, ordered by file name.
    """
    # Sort so the result (and the subset chosen by `limit`) is reproducible;
    # os.listdir returns entries in arbitrary order.
    files = sorted(os.listdir(dataset_path))

    # Set membership keeps the pairing loop linear in the number of files.
    json_files = {f for f in files if f.endswith('.json')}

    paired_data = []
    for img_file in files:
        if not img_file.endswith('.jpg'):
            continue

        # Swap only the trailing extension; str.replace would also rewrite
        # any '.jpg' that happens to appear earlier in the file name.
        json_file = img_file[:-len('.jpg')] + '.json'
        if json_file in json_files:
            paired_data.append((os.path.join(dataset_path, img_file),
                                os.path.join(dataset_path, json_file)))
        if limit and len(paired_data) >= limit:
            break

    print(f"Found {len(paired_data)} paired image and JSON files in {dataset_path}")
    return paired_data

# Build the (image, JSON) pair lists for the train, val and test splits, in that order
train_data, val_data, test_data = (
    load_dataset(split_dir) for split_dir in (train_path, val_path, test_path)
)

Found 1450 paired image and JSON files in /content/drive/MyDrive/Strawberry Diseases Detection/raw_data/train
Found 307 paired image and JSON files in /content/drive/MyDrive/Strawberry Diseases Detection/raw_data/val
Found 743 paired image and JSON files in /content/drive/MyDrive/Strawberry Diseases Detection/raw_data/test


In [None]:
# Step 2: Preprocessing Functions
def normalize_image(image):
    """
    Scale pixel values into the range [0, 1].

    The input is copied into a float32 array and divided by 255, so 8-bit
    pixel values 0..255 map onto 0.0..1.0. The input itself is not modified.
    """
    pixels = np.array(image, dtype=np.float32)
    return np.true_divide(pixels, 255.0)

In [None]:
def augment_image(image):
    """
    Apply random data augmentation to a single image.

    The transforms run in a fixed order: random horizontal flip, random
    vertical flip, random brightness shift (max_delta=0.2), and random
    contrast scaling (factor drawn from [0.8, 1.2]).
    """
    augmentations = (
        tf.image.random_flip_left_right,                                   # Random horizontal flip
        tf.image.random_flip_up_down,                                      # Random vertical flip
        lambda img: tf.image.random_brightness(img, max_delta=0.2),        # Adjust brightness
        lambda img: tf.image.random_contrast(img, lower=0.8, upper=1.2),   # Adjust contrast
    )
    for apply_augmentation in augmentations:
        image = apply_augmentation(image)
    return image

In [None]:
def preprocess_data_for_training(paired_data, target_size=(224, 224), augment=True,
                                 mean=None, std=None):
    """
    Preprocess paired image and JSON data for multi-label model training.

    Each image is read with OpenCV, converted from BGR to RGB, resized to
    ``target_size``, scaled to [0, 1], optionally augmented, and finally
    standardized per channel. Labels come from the ``label`` field of every
    entry in the annotation's ``shapes`` list; labels that are not keys of
    ``disease_labels`` are ignored.

    Args:
        paired_data: Iterable of ``(image_path, json_path)`` tuples.
        target_size: Size passed to ``cv2.resize`` (OpenCV expects
            ``(width, height)``).
        augment: When True, ``augment_image`` is applied to every image.
            When False, images are left un-augmented (use this for
            validation and test data).
        mean: Optional per-channel mean to standardize with. When None, it
            is computed from the images processed by this call.
        std: Optional per-channel standard deviation to standardize with.
            When None, it is computed from the images processed by this call.
            Pass the training split's ``mean``/``std`` here to put another
            split into the same standardized space.

    Returns:
        ``(images, multi_hot_labels, mean, std)`` where ``images`` is a
        float32 array of standardized images, ``multi_hot_labels`` is a
        float32 array of shape ``(num_images, len(disease_labels))``, and
        ``mean``/``std`` are the per-channel statistics that were applied.

    Raises:
        ValueError: If an image cannot be read or ``paired_data`` is empty.
    """
    images = []
    labels = []

    for image_path, json_path in paired_data:
        # cv2.imread signals failure by returning None instead of raising,
        # so fail here with the offending path rather than inside cvtColor.
        image = cv2.imread(image_path)  # Load in BGR format
        if image is None:
            raise ValueError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB
        image = cv2.resize(image, target_size)  # Resize to target size (e.g., 224x224 for ResNet)

        # Normalize image to [0, 1]
        processed_image = normalize_image(image)

        # Honor the `augment` flag so evaluation splits stay deterministic
        if augment:
            processed_image = np.asarray(augment_image(processed_image), dtype=np.float32)

        # Load and process JSON annotation
        with open(json_path, 'r') as f:
            annotation = json.load(f)

        # Unique, whitespace-stripped labels; a missing label becomes ''
        shape_labels = {
            (shape.get('label') or '').strip()
            for shape in annotation.get('shapes', [])
        }
        # Keep only labels that match exactly as declared in `disease_labels`
        disease_indices = sorted(
            disease_labels[name] for name in shape_labels if name in disease_labels
        )

        # Append preprocessed image and corresponding labels
        images.append(processed_image)
        labels.append(disease_indices)

    if not images:
        raise ValueError("paired_data is empty; nothing to preprocess")

    # Convert to NumPy arrays
    images = np.array(images, dtype=np.float32)

    # Per-channel statistics: use the caller's values when given, otherwise
    # compute them over every image and pixel of this batch.
    if mean is None:
        mean = np.mean(images, axis=(0, 1, 2))
    else:
        mean = np.asarray(mean, dtype=np.float32)
    if std is None:
        std = np.std(images, axis=(0, 1, 2))
    else:
        std = np.asarray(std, dtype=np.float32)

    # A constant channel has std == 0; divide by 1 instead to avoid NaN/inf
    std = np.where(std == 0, 1.0, std).astype(np.float32)

    # Standardize images using mean and std
    images = (images - mean) / std

    print(f"Dataset mean: {mean}, std: {std}")

    # Prepare labels for multi-label classification (multi-hot encoding)
    num_classes = len(disease_labels)
    multi_hot_labels = np.zeros((len(labels), num_classes), dtype=np.float32)
    for row, disease_indices in enumerate(labels):
        multi_hot_labels[row, disease_indices] = 1.0

    return images, multi_hot_labels, mean, std

# Preprocess every split. Augmentation is requested for training only.
# Each call also returns the per-channel mean/std that was applied to that split.
print("Preprocessing datasets...")
train_images, train_labels, train_mean, train_std = preprocess_data_for_training(
    paired_data=train_data,
    augment=True,
)
val_images, val_labels, val_mean, val_std = preprocess_data_for_training(
    paired_data=val_data,
    augment=False,
)
test_images, test_labels, test_mean, test_std = preprocess_data_for_training(
    paired_data=test_data,
    augment=False,
)

Preprocessing datasets...
Dataset mean: [0.24333906 0.2417085  0.2001999 ], std: [0.26875508 0.29034236 0.22288966]
Dataset mean: [0.43387204 0.49450576 0.27588496], std: [0.23676863 0.2198072  0.2396303 ]
Dataset mean: [0.38002133 0.45103166 0.28196567], std: [0.237067   0.2302046  0.22575405]


In [None]:
def investigate_preprocessing(images, labels, mean, std, num_samples=5):
    """
    Spot-check preprocessed data by plotting randomly chosen samples.

    For each of ``num_samples`` draws (indices are picked independently, so
    an image may repeat), the standardized image is mapped back with
    ``image * std + mean``, clipped to [0, 1], and shown with its active
    label indices as the title. The standardized image's shape and min/max
    pixel values are then printed.
    """
    for sample_number in range(1, num_samples + 1):
        # Pick a random sample and its multi-hot label row
        chosen = np.random.randint(0, len(images))
        image, label = images[chosen], labels[chosen]
        active_labels = np.where(label == 1)[0]

        # Undo the standardization so the picture is viewable
        viewable = np.clip(image * std + mean, 0, 1)

        # Show the image with its label indices
        fig, ax = plt.subplots(figsize=(5, 5))
        ax.imshow(viewable)
        ax.set_title(f"Label(s): {active_labels}")
        ax.axis('off')
        plt.show()

        # Report properties of the standardized (not de-normalized) image
        print(f"Image {sample_number}:")
        print(f"  Shape: {image.shape}")
        print(f"  Min pixel value: {image.min():.3f}")
        print(f"  Max pixel value: {image.max():.3f}")
        print(f"  Label indices: {active_labels}\n")

# Visually spot-check ten random training samples
investigate_preprocessing(
    images=train_images,
    labels=train_labels,
    mean=train_mean,
    std=train_std,
    num_samples=10,
)


In [None]:
# Save processed data to Google Drive
def save_processed_data(images, labels, directory, prefix):
    """
    Write one split's images and labels to ``directory`` as .npy files.

    The directory is created if it does not exist. Two files are written:
    ``<prefix>_images.npy`` and ``<prefix>_labels.npy``.
    """
    os.makedirs(directory, exist_ok=True)

    outputs = (
        (f"{prefix}_images.npy", images),
        (f"{prefix}_labels.npy", labels),
    )
    for filename, array in outputs:
        np.save(os.path.join(directory, filename), array)

    print(f"Saved {prefix} data to {directory}")

# Drive folder that receives the processed .npy files
save_directory = '/content/drive/MyDrive/Strawberry_Disease_Processed'

# Persist each split under its own file-name prefix
save_processed_data(images=train_images, labels=train_labels,
                    directory=save_directory, prefix="train")
save_processed_data(images=val_images, labels=val_labels,
                    directory=save_directory, prefix="val")
save_processed_data(images=test_images, labels=test_labels,
                    directory=save_directory, prefix="test")


Saved train data to /content/drive/MyDrive/Strawberry_Disease_Processed
Saved val data to /content/drive/MyDrive/Strawberry_Disease_Processed
Saved test data to /content/drive/MyDrive/Strawberry_Disease_Processed
