In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd


In [2]:
# List the GPU devices that TensorFlow can see.
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# IPython shell escape: print the NVIDIA driver's status table for the GPU.
!nvidia-smi


Sun Jun  1 12:50:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 573.24                 Driver Version: 573.24         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A2000 Laptop GPU  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   57C    P8              8W /   40W |    1183MiB /   4096MiB |     14%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [16]:
# TO REMOVE CORRUPTED IMAGES

import os
from PIL import Image

RAW_DATASET_DIR = 'raw_dataset'  # Root of the original dataset; scanned as one sub-folder per class

def _fails_verification(img_path):
    """Return True if Pillow cannot open or verify the file at ``img_path``."""
    try:
        with Image.open(img_path) as img:
            img.verify()  # Verify if image is corrupted
    except (IOError, SyntaxError):
        return True
    return False


def remove_corrupted_images(dataset_dir):
    """Delete image files that Pillow cannot open or verify.

    Scans each class sub-folder of ``dataset_dir`` (one level deep). Every
    regular file is opened with ``Image.open`` and checked with ``verify()``;
    files that raise ``IOError`` or ``SyntaxError`` are deleted from disk.

    Entries that are not regular files (for example nested folders) are
    skipped instead of being treated as corrupt images, and a deletion that
    fails is reported and skipped rather than aborting the whole scan.

    NOTE(review): ``verify()`` does not decode the pixel data, so some damaged
    files (e.g. truncated ones) may still pass this check.

    Args:
        dataset_dir: Root folder containing one sub-folder per class.

    Returns:
        List of the file paths that were actually deleted.
    """
    removed_files = []
    for class_name in os.listdir(dataset_dir):
        class_path = os.path.join(dataset_dir, class_name)
        if not os.path.isdir(class_path):
            continue
        for img_file in os.listdir(class_path):
            img_path = os.path.join(class_path, img_file)
            # A nested folder is not an image: opening it fails with an
            # OSError, and os.remove() on it would then raise and stop the scan.
            if not os.path.isfile(img_path):
                continue
            if not _fails_verification(img_path):
                continue
            print(f"Removing corrupted image: {img_path}")
            try:
                os.remove(img_path)
            except OSError as err:
                # Keep scanning; only successfully deleted files are reported.
                print(f"Could not remove {img_path}: {err}")
                continue
            removed_files.append(img_path)
    return removed_files

# Run the clean-up on the raw dataset and report how many files were deleted.
removed = remove_corrupted_images(RAW_DATASET_DIR)
print(f"Total corrupted images removed: {len(removed)}")


Total corrupted images removed: 0


In [17]:
# TO CLEAN LABELS
import re

def check_class_names(dataset_dir):
    """Return the class folder names under ``dataset_dir`` that fail the name check.

    Only sub-folders are inspected; plain files in ``dataset_dir`` are ignored.
    A name passes when it consists solely of regex word characters (``\\w``)
    and whitespace (``\\s``).

    Args:
        dataset_dir: Root folder containing one sub-folder per class.

    Returns:
        List of offending folder names, in the order ``os.listdir`` yields them.
    """
    # Allow only alphanumeric, underscore, and spaces
    allowed_name = re.compile(r'^[\w\s]+$')
    return [
        entry
        for entry in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, entry))
        and not allowed_name.match(entry)
    ]

# Check the raw dataset's class folders and list any whose names fail the check.
invalid_classes = check_class_names(RAW_DATASET_DIR)
if invalid_classes:
    print("Invalid class folder names detected:")
    for name in invalid_classes:
        print(f" - {name}")
else:
    print("All class folder names are clean!")


All class folder names are clean!


In [None]:
# Resizing Images and Converting to JPEG

from PIL import Image
import os

def _unique_jpeg_name(base_name, taken):
    """Return ``<base_name>.jpg``, or ``<base_name>_<n>.jpg`` if that name is already in ``taken``."""
    candidate = base_name + '.jpg'
    suffix = 0
    # normcase makes 'A.jpg' and 'a.jpg' count as the same name on
    # case-insensitive file systems; it is a no-op elsewhere.
    while os.path.normcase(candidate) in taken:
        suffix += 1
        candidate = f"{base_name}_{suffix}.jpg"
    return candidate


def resize_and_convert(dataset_dir, output_dir, target_size):
    """Resize every image to ``target_size`` and save it as an RGB JPEG.

    For each class sub-folder of ``dataset_dir`` a matching folder is created
    under ``output_dir`` (spaces in the class name replaced by underscores).
    Each entry in the class folder is opened with Pillow, converted to RGB,
    resized with LANCZOS resampling and written as ``<stem>.jpg`` at quality 95.
    Entries that raise any exception are reported and counted as skipped.

    Two sources that share a stem (e.g. ``leaf.png`` and ``leaf.jpg``) would
    map to the same output file; the later one is given a numeric suffix
    (``leaf_1.jpg``) so it does not silently overwrite the earlier one.
    Folders and files are visited in sorted order so that suffix assignment
    is the same on every run.

    Args:
        dataset_dir: Root folder containing one sub-folder per class.
        output_dir: Root folder the processed class folders are written to.
        target_size: ``(width, height)`` passed to ``Image.resize``.

    Returns:
        None. Per-class and overall totals are printed.
    """
    total_images = 0
    total_skipped = 0
    # Output names written during this run, keyed by output folder. Keyed by
    # folder because two class names can map to the same output folder once
    # spaces are replaced.
    names_by_folder = {}
    for class_name in sorted(os.listdir(dataset_dir)):
        class_path = os.path.join(dataset_dir, class_name)
        if not os.path.isdir(class_path):
            continue
        processed_class_path = os.path.join(output_dir, class_name.replace(' ', '_'))
        os.makedirs(processed_class_path, exist_ok=True)
        taken = names_by_folder.setdefault(processed_class_path, set())

        count = 0
        for img_name in sorted(os.listdir(class_path)):
            src_path = os.path.join(class_path, img_name)
            try:
                with Image.open(src_path) as src_img:
                    rgb_img = src_img.convert('RGB')
                    resized = rgb_img.resize(target_size, Image.Resampling.LANCZOS)
                    base_name = os.path.splitext(img_name)[0]
                    save_name = _unique_jpeg_name(base_name, taken)
                    save_path = os.path.join(processed_class_path, save_name)
                    resized.save(save_path, 'JPEG', quality=95)
                # Reserve the name only once the file has been written.
                taken.add(os.path.normcase(save_name))
                if save_name != base_name + '.jpg':
                    print(f"Name collision for {src_path}; saved as {save_name}")
                count += 1
                total_images += 1
            except Exception as e:
                # Best effort: one unreadable entry must not stop the batch.
                print(f"Skipping invalid image {src_path}: {e}")
                total_skipped += 1
        print(f"Processed {count} images in class '{class_name}'")

    print(f"Total images processed: {total_images}")
    print(f"Total images skipped: {total_skipped}")

# Resize every image under 'raw_dataset' to 224x224 and write the JPEG copies to 'processed_dataset'.
resize_and_convert('raw_dataset', 'processed_dataset', (224, 224))


Processed 997 images in class 'Pepper__bell___Bacterial_spot'
Processed 1478 images in class 'Pepper__bell___healthy'
Processed 1000 images in class 'Potato___Early_blight'
Processed 152 images in class 'Potato___healthy'
Processed 1000 images in class 'Potato___Late_blight'
Processed 2127 images in class 'Tomato_Bacterial_spot'
Processed 1000 images in class 'Tomato_Early_blight'
Processed 1591 images in class 'Tomato_healthy'
Processed 1909 images in class 'Tomato_Late_blight'
Processed 952 images in class 'Tomato_Leaf_Mold'
Processed 1771 images in class 'Tomato_Septoria_leaf_spot'
Processed 1676 images in class 'Tomato_Spider_mites_Two_spotted_spider_mite'
Processed 1404 images in class 'Tomato__Target_Spot'
Processed 373 images in class 'Tomato__Tomato_mosaic_virus'
Processed 3208 images in class 'Tomato__Tomato_YellowLeaf__Curl_Virus'
Total images processed: 20638
Total images skipped: 0


In [26]:
import os

def count_images(dataset_dir):
    """Print the number of image files per class folder and return the overall total.

    A file counts as an image when its lower-cased name ends in ``.jpg``,
    ``.jpeg`` or ``.png``. Entries of ``dataset_dir`` that are not folders
    are ignored.

    Args:
        dataset_dir: Root folder containing one sub-folder per class.

    Returns:
        Total number of matching files across all class folders.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    grand_total = 0
    for label in os.listdir(dataset_dir):
        label_dir = os.path.join(dataset_dir, label)
        if os.path.isdir(label_dir):
            matches = sum(
                1
                for entry in os.listdir(label_dir)
                if entry.lower().endswith(image_suffixes)
            )
            print(f"Class '{label}': {matches} images")
            grand_total += matches
    print(f"Total images in dataset: {grand_total}")
    return grand_total

# Count the images currently in the processed dataset.
# NOTE(review): the execution counts suggest this cell (In [26]) ran after the
# de-duplication cell (In [25]) shown further down, so this is presumably the
# post-dedup total -- confirm with a Restart & Run All.
count_images('processed_dataset')


Class 'Pepper__bell___Bacterial_spot': 997 images
Class 'Pepper__bell___healthy': 1478 images
Class 'Potato___Early_blight': 1000 images
Class 'Potato___healthy': 152 images
Class 'Potato___Late_blight': 1000 images
Class 'Tomato_Bacterial_spot': 2127 images
Class 'Tomato_Early_blight': 1000 images
Class 'Tomato_healthy': 1585 images
Class 'Tomato_Late_blight': 1901 images
Class 'Tomato_Leaf_Mold': 952 images
Class 'Tomato_Septoria_leaf_spot': 1771 images
Class 'Tomato_Spider_mites_Two_spotted_spider_mite': 1676 images
Class 'Tomato__Target_Spot': 1404 images
Class 'Tomato__Tomato_mosaic_virus': 373 images
Class 'Tomato__Tomato_YellowLeaf__Curl_Virus': 3208 images
Total images in dataset: 20624


20624

In [27]:
import os

def count_images(dataset_dir):
    """Print per-class image counts for ``dataset_dir`` and return the grand total.

    NOTE(review): this repeats the ``count_images`` definition from the
    previous cell; the later definition shadows the earlier one.

    Files are matched case-insensitively on the ``.jpg``, ``.jpeg`` and
    ``.png`` extensions; non-folder entries of ``dataset_dir`` are skipped.

    Args:
        dataset_dir: Root folder containing one sub-folder per class.

    Returns:
        Total number of matching files across all class folders.
    """
    def _looks_like_image(filename):
        return filename.lower().endswith(('.jpg', '.jpeg', '.png'))

    running_total = 0
    for folder_name in os.listdir(dataset_dir):
        folder_path = os.path.join(dataset_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue
        found = len(list(filter(_looks_like_image, os.listdir(folder_path))))
        print(f"Class '{folder_name}': {found} images")
        running_total = running_total + found
    print(f"Total images in dataset: {running_total}")
    return running_total

# Count the images in the raw dataset, for comparison with the processed dataset's total.
count_images('raw_dataset')

Class 'Pepper__bell___Bacterial_spot': 997 images
Class 'Pepper__bell___healthy': 1478 images
Class 'Potato___Early_blight': 1000 images
Class 'Potato___healthy': 152 images
Class 'Potato___Late_blight': 1000 images
Class 'Tomato_Bacterial_spot': 2127 images
Class 'Tomato_Early_blight': 1000 images
Class 'Tomato_healthy': 1591 images
Class 'Tomato_Late_blight': 1909 images
Class 'Tomato_Leaf_Mold': 952 images
Class 'Tomato_Septoria_leaf_spot': 1771 images
Class 'Tomato_Spider_mites_Two_spotted_spider_mite': 1676 images
Class 'Tomato__Target_Spot': 1404 images
Class 'Tomato__Tomato_mosaic_virus': 373 images
Class 'Tomato__Tomato_YellowLeaf__Curl_Virus': 3208 images
Total images in dataset: 20638


20638

In [25]:
import os
from PIL import Image

PROCESSED_DATASET_DIR = 'processed_dataset'  # Folder the resize/convert step wrote to; update the path if different

def dhash(image, hash_size=8):
    """Compute a difference hash of ``image`` as a lowercase hex string.

    The image is converted to greyscale and resized to
    ``(hash_size + 1) x hash_size`` pixels. Each pixel is compared with its
    right-hand neighbour, giving ``hash_size * hash_size`` bits (True when the
    left pixel is brighter). Bits are packed eight at a time, least
    significant bit first, and each byte is rendered as two hex digits.

    When ``hash_size * hash_size`` is not a multiple of 8 the final partial
    byte is still emitted, so no comparison bits are lost (previously they
    were dropped, e.g. ``hash_size=2`` produced an empty string for every
    image). Output for the default ``hash_size=8`` is unchanged.

    Args:
        image: Pillow image (anything providing ``convert``/``resize``/``getdata``).
        hash_size: Number of comparisons per row and number of rows.

    Returns:
        Hex string with two characters per (possibly partial) byte of the hash.
    """
    # Use LANCZOS resampling instead of deprecated ANTIALIAS
    gray = image.convert('L').resize((hash_size + 1, hash_size), Image.Resampling.LANCZOS)
    pixels = list(gray.getdata())
    row_width = hash_size + 1  # one extra column so every pixel has a right neighbour

    bits = [
        pixels[row * row_width + col] > pixels[row * row_width + col + 1]
        for row in range(hash_size)
        for col in range(hash_size)
    ]

    hex_bytes = []
    for start in range(0, len(bits), 8):
        # Slicing yields a short final chunk instead of discarding leftover bits.
        chunk = bits[start:start + 8]
        byte_value = sum(1 << offset for offset, bit in enumerate(chunk) if bit)
        hex_bytes.append(f"{byte_value:02x}")
    return ''.join(hex_bytes)

def remove_duplicates(dataset_dir):
    """Delete images whose ``dhash`` equals that of an image already seen.

    Walks every class sub-folder of ``dataset_dir``. The first image with a
    given hash is kept; any later image with the same hash is deleted from
    disk. The set of seen hashes is shared across all class folders, so a
    match in a different class is also removed.

    Folders and files are visited in sorted order so that the surviving copy
    of each duplicate is the same on every run (``os.listdir`` order is
    otherwise arbitrary).

    Any exception raised while processing an entry is printed and the entry
    is left untouched.

    Args:
        dataset_dir: Root folder containing one sub-folder per class.

    Returns:
        Number of files deleted.
    """
    hashes = set()
    removed_count = 0
    for class_name in sorted(os.listdir(dataset_dir)):
        class_path = os.path.join(dataset_dir, class_name)
        if not os.path.isdir(class_path):
            continue
        for img_name in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, img_name)
            try:
                # Hash inside the context manager, delete after it has closed
                # the file handle.
                with Image.open(img_path) as img:
                    h = dhash(img)
                if h in hashes:
                    print(f"Duplicate found, removing: {img_path}")
                    os.remove(img_path)
                    removed_count += 1
                else:
                    hashes.add(h)
            except Exception as e:
                # Best effort: report and continue with the next entry.
                print(f"Error processing image {img_path}: {e}")
    return removed_count

# Delete hash-duplicate images from the processed dataset (files are removed from disk) and report the count.
duplicates_removed = remove_duplicates(PROCESSED_DATASET_DIR)
print(f"Duplicates removed: {duplicates_removed}")


Duplicate found, removing: processed_dataset\Tomato_healthy\34c81c57-e1fa-49dd-a49d-34fe8b2385fe___GH_HL Leaf 466.1.jpg
Duplicate found, removing: processed_dataset\Tomato_healthy\505465db-407b-4e0a-8110-7479dad5261c___GH_HL Leaf 389.jpg
Duplicate found, removing: processed_dataset\Tomato_healthy\a5de43e7-fc2f-4a14-a8e6-c0f2f94c84f1___GH_HL Leaf 434.jpg
Duplicate found, removing: processed_dataset\Tomato_healthy\cfd491d6-4af5-4728-8f0e-0d330a07174a___GH_HL Leaf 482.2.jpg
Duplicate found, removing: processed_dataset\Tomato_healthy\d2ce7896-6fa4-45e6-96c5-d162da0e3e1c___GH_HL Leaf 220.jpg
Duplicate found, removing: processed_dataset\Tomato_healthy\e786ac89-29fe-47e3-b49e-b9a9ee7edd9d___GH_HL Leaf 342.1.jpg
Duplicate found, removing: processed_dataset\Tomato_Late_blight\2c47b891-3c97-48f1-a2cc-5aa53d3a1148___GHLB2 Leaf 9011.jpg
Duplicate found, removing: processed_dataset\Tomato_Late_blight\5de6da85-f8c4-48c4-b463-3e6bd78884cc___GHLB_PS Leaf 24 Day 16.jpg
Duplicate found, removing: proces