# Tomato Leaf Disease Detection - Data Exploration

This notebook provides comprehensive data exploration and analysis for the tomato leaf disease detection dataset.

## 1. Setup and Imports

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Add src to path
# Both entries are relative, so they resolve against the kernel's current
# working directory. NOTE(review): presumably the notebook is run from its own
# folder so that '..' is the repository root — confirm.
sys.path.append('../src')
sys.path.append('..')

# Project-local modules; these imports only succeed after the sys.path tweaks
# above. NOTE(review): neither module is visible here — presumably `config`
# lives in the repository root and `data_preprocessing` in src/; verify.
import config
from data_preprocessing import DataPreprocessor

# Set style
# NOTE(review): the 'seaborn-v0_8' style name only exists in newer Matplotlib
# releases (3.6+, to be confirmed against the pinned version).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Setup complete!")

## 2. Dataset Overview

In [None]:
# Initialize preprocessor
# NOTE(review): DataPreprocessor is project code not shown here; presumably it
# reads its data locations from `config` — confirm.
preprocessor = DataPreprocessor()

# Analyze dataset
# `df` is the per-class summary reused by the later cells, which read the
# columns 'Class', 'Total_Count', 'Train_Count', 'Val_Count' and
# 'Binary_Class' (with the values 'Healthy' and 'Diseased').
df = preprocessor.analyze_dataset()
# Bare last expression so the notebook renders the summary table.
df

## 3. Class Distribution Analysis

In [None]:
# Detailed class distribution: a 2x2 figure built from the per-class summary `df`
# (total counts, train/validation split, healthy-vs-diseased share, disease types).
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

# Short tick labels: drop the common 'Tomato___' prefix from the class names.
class_labels = [cls.replace('Tomato___', '') for cls in df['Class']]

# Total images per class
axes[0, 0].bar(range(len(df)), df['Total_Count'], color='skyblue')
axes[0, 0].set_xticks(range(len(df)))
axes[0, 0].set_xticklabels(class_labels, rotation=45, ha='right')
axes[0, 0].set_title('Total Images per Disease Class', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('Number of Images')
axes[0, 0].grid(axis='y', alpha=0.3)

# Train vs Validation split (two bars per class, offset by half a bar width)
x = np.arange(len(df))
width = 0.35
axes[0, 1].bar(x - width/2, df['Train_Count'], width, label='Train', color='lightcoral')
axes[0, 1].bar(x + width/2, df['Val_Count'], width, label='Validation', color='lightblue')
axes[0, 1].set_xticks(x)
axes[0, 1].set_xticklabels(class_labels, rotation=45, ha='right')
axes[0, 1].set_title('Train vs Validation Split', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('Number of Images')
axes[0, 1].legend()
axes[0, 1].grid(axis='y', alpha=0.3)

# Binary classification distribution
binary_counts = df.groupby('Binary_Class')['Total_Count'].sum()
# groupby returns its keys sorted ('Diseased' before 'Healthy'), so colours are
# looked up by label instead of by position; a positional list would paint the
# diseased wedge green and the healthy wedge red.
binary_palette = {'Healthy': 'lightgreen', 'Diseased': 'lightcoral'}
colors = [binary_palette.get(label, 'lightgray') for label in binary_counts.index]
wedges, texts, autotexts = axes[1, 0].pie(binary_counts.values, labels=binary_counts.index,
                                         autopct='%1.1f%%', colors=colors, startangle=90)
axes[1, 0].set_title('Healthy vs Diseased Distribution', fontsize=14, fontweight='bold')

# Disease types distribution (excluding healthy)
diseased_df = df[df['Binary_Class'] == 'Diseased']
axes[1, 1].bar(range(len(diseased_df)), diseased_df['Total_Count'], color='salmon')
axes[1, 1].set_xticks(range(len(diseased_df)))
axes[1, 1].set_xticklabels([cls.replace('Tomato___', '') for cls in diseased_df['Class']],
                           rotation=45, ha='right')
axes[1, 1].set_title('Disease Types Distribution', fontsize=14, fontweight='bold')
axes[1, 1].set_ylabel('Number of Images')
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Print statistics
total_images = df['Total_Count'].sum()
print(f"Total images: {total_images:,}")
print(f"Number of classes: {len(df)}")
for binary_label in ('Healthy', 'Diseased'):
    # .get() keeps the report working when one of the two groups is absent.
    group_count = binary_counts.get(binary_label, 0)
    group_share = group_count / total_images * 100 if total_images else 0.0
    print(f"{binary_label} images: {group_count:,} ({group_share:.1f}%)")
print(f"Average images per class: {df['Total_Count'].mean():.0f}")
print(f"Standard deviation: {df['Total_Count'].std():.0f}")

## 4. Sample Images Visualization

In [None]:
# Display sample images from each class
def display_sample_images(num_classes=10, images_per_class=3):
    """Show a grid of training images, one row per disease class.

    Args:
        num_classes: Maximum number of classes (rows) to show, taken from the
            start of ``config.DISEASE_CLASSES``.
        images_per_class: Number of images (columns) shown for each class.

    Images are read from ``<config.RAW_DATA_DIR>/train/<class_name>``. Files
    are taken in sorted name order so repeated runs show the same samples.
    Cells with no image to show are left blank; unreadable files and missing
    class folders are reported with a text placeholder in the grid.
    """
    class_names = list(config.DISEASE_CLASSES[:num_classes])
    if not class_names or images_per_class < 1:
        print("No classes or images requested; nothing to display.")
        return

    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so axes[i, j] indexing is always valid. The figure size scales
    # with the grid and equals (15, 30) for the default 10 x 3 layout.
    fig, axes = plt.subplots(len(class_names), images_per_class,
                             figsize=(5 * images_per_class, 3 * len(class_names)),
                             squeeze=False)

    # Hide every frame up front so unused cells do not show empty tick marks.
    for ax in axes.ravel():
        ax.axis('off')

    train_dir = os.path.join(config.RAW_DATA_DIR, 'train')

    for i, class_name in enumerate(class_names):
        class_dir = os.path.join(train_dir, class_name)

        if not os.path.isdir(class_dir):
            for j in range(images_per_class):
                axes[i, j].text(0.5, 0.5, f'Class not found\n{class_name}',
                                ha='center', va='center', transform=axes[i, j].transAxes)
            continue

        # Label the row on its first cell, whether or not that image loads.
        axes[i, 0].set_title(class_name.replace('Tomato___', ''),
                             fontsize=10, fontweight='bold')

        # os.listdir order is arbitrary; sort it for a reproducible selection.
        images = sorted(os.listdir(class_dir))[:images_per_class]

        for j, img_name in enumerate(images):
            img_path = os.path.join(class_dir, img_name)

            try:
                # The context manager releases the file handle once the pixel
                # data has been handed to imshow.
                with Image.open(img_path) as img:
                    axes[i, j].imshow(img)
            except Exception:
                # Best effort: one unreadable file must not abort the grid.
                axes[i, j].text(0.5, 0.5, f'Error loading\n{img_name}',
                                ha='center', va='center', transform=axes[i, j].transAxes)
            # imshow may re-enable the frame, so switch it off again.
            axes[i, j].axis('off')

    plt.suptitle('Sample Images from Each Disease Class', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Render the grid with the function's defaults (num_classes=10, images_per_class=3).
display_sample_images()

## 5. Image Properties Analysis

In [None]:
# Analyze image properties (size, format, etc.)
def analyze_image_properties(sample_size=100):
    """Collect basic properties for a sample of training images.

    Args:
        sample_size: Approximate total number of images to inspect. The quota
            per class is ``sample_size // len(config.DISEASE_CLASSES)``, so
            fewer images are read when a class folder holds less than that.

    Returns:
        A DataFrame with one row per readable image and the columns 'class',
        'filename', 'width', 'height', 'format', 'mode' and 'size_mb'. The
        frame has these columns even when no image could be read.

    Files are taken in sorted name order so repeated runs inspect the same
    sample. Unreadable files are reported on stdout and skipped.
    """
    columns = ['class', 'filename', 'width', 'height', 'format', 'mode', 'size_mb']

    class_names = list(config.DISEASE_CLASSES)
    if not class_names:
        # Avoids a ZeroDivisionError in the quota computation below.
        return pd.DataFrame(columns=columns)

    # Loop-invariant quota; clamped so a negative sample_size cannot turn the
    # slice below into "everything except the last k files".
    per_class = max(sample_size // len(class_names), 0)

    train_dir = os.path.join(config.RAW_DATA_DIR, 'train')

    image_info = []

    for class_name in class_names:
        class_dir = os.path.join(train_dir, class_name)

        if not os.path.isdir(class_dir):
            continue

        # os.listdir order is arbitrary; sort it for a reproducible sample.
        for img_name in sorted(os.listdir(class_dir))[:per_class]:
            img_path = os.path.join(class_dir, img_name)

            try:
                with Image.open(img_path) as img:
                    image_info.append({
                        'class': class_name,
                        'filename': img_name,
                        'width': img.width,
                        'height': img.height,
                        'format': img.format,
                        'mode': img.mode,
                        'size_mb': os.path.getsize(img_path) / (1024 * 1024)
                    })
            except Exception as e:
                # Best effort: report the unreadable file and keep sampling.
                print(f"Error processing {img_path}: {e}")

    return pd.DataFrame(image_info, columns=columns)

# Analyze sample images
img_df = analyze_image_properties(sample_size=500)

if img_df.empty:
    print("No image data could be analyzed.")
else:
    # Text summary of the sampled images.
    print("Image Properties Analysis:")
    print(f"Sample size: {len(img_df)} images")

    print("\nImage dimensions:")
    for dim_label, dim_col in (('Width', 'width'), ('Height', 'height')):
        dim_values = img_df[dim_col]
        print(f"{dim_label} - Mean: {dim_values.mean():.0f}, Std: {dim_values.std():.0f}")

    print("\nFile size:")
    file_sizes = img_df['size_mb']
    print(f"Mean: {file_sizes.mean():.3f} MB, Std: {file_sizes.std():.3f} MB")

    print("\nImage formats:")
    print(img_df['format'].value_counts())
    print("\nColor modes:")
    print(img_df['mode'].value_counts())

    # Derived column used by the fourth histogram.
    img_df['aspect_ratio'] = img_df['width'] / img_df['height']

    # One histogram per property: (column, title, x-axis label, colour),
    # listed in the row-major order of the 2x2 grid.
    histogram_panels = [
        ('width', 'Image Width Distribution', 'Width (pixels)', 'skyblue'),
        ('height', 'Image Height Distribution', 'Height (pixels)', 'lightcoral'),
        ('size_mb', 'File Size Distribution', 'Size (MB)', 'lightgreen'),
        ('aspect_ratio', 'Aspect Ratio Distribution', 'Aspect Ratio (Width/Height)', 'gold'),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    for panel_ax, (panel_col, panel_title, panel_xlabel, panel_color) in zip(axes.flat, histogram_panels):
        panel_ax.hist(img_df[panel_col], bins=30, alpha=0.7, color=panel_color)
        panel_ax.set_title(panel_title)
        panel_ax.set_xlabel(panel_xlabel)
        panel_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

## 6. Data Quality Assessment

In [None]:
# Check for potential data quality issues
def assess_data_quality(dataset_df=None, cv_threshold=0.5, min_samples_threshold=100,
                        healthy_ratio_bounds=(0.2, 0.8)):
    """Flag class-balance problems in a per-class summary table.

    Args:
        dataset_df: DataFrame with the columns 'Total_Count' and
            'Binary_Class'. Defaults to the notebook-level ``df``.
        cv_threshold: Class imbalance is reported when the coefficient of
            variation (std / mean) of the per-class counts exceeds this value.
        min_samples_threshold: A warning is issued when the smallest class has
            fewer images than this.
        healthy_ratio_bounds: ``(low, high)`` range for the share of images in
            the 'Healthy' group; a share outside it is reported.

    Returns:
        A list of human-readable issue descriptions, empty when none of the
        checks fires.
    """
    summary = df if dataset_df is None else dataset_df
    issues = []

    class_counts = summary['Total_Count'].to_numpy()
    total_images = class_counts.sum() if len(class_counts) else 0
    if total_images == 0:
        # Every ratio below would divide by zero; report the cause instead.
        return ["Dataset summary contains no images"]

    # Check class balance
    cv = np.std(class_counts) / np.mean(class_counts)  # Coefficient of variation

    if cv > cv_threshold:
        issues.append(f"High class imbalance detected (CV: {cv:.2f})")

    # Check minimum samples per class
    min_samples = class_counts.min()
    if min_samples < min_samples_threshold:
        issues.append(f"Some classes have very few samples (min: {min_samples})")

    # Check binary class balance
    binary_counts = summary.groupby('Binary_Class')['Total_Count'].sum()
    binary_total = binary_counts.sum()
    if binary_total > 0:
        # .get() treats a missing 'Healthy' group as zero images instead of
        # raising KeyError.
        healthy_ratio = binary_counts.get('Healthy', 0) / binary_total
        lower_bound, upper_bound = healthy_ratio_bounds

        if healthy_ratio < lower_bound or healthy_ratio > upper_bound:
            issues.append(f"Binary classification imbalance (Healthy: {healthy_ratio:.1%})")

    return issues

# Run the checks and report the findings, followed by fixed recommendations.
quality_issues = assess_data_quality()

print("Data Quality Assessment:")
if not quality_issues:
    print("✅ No major data quality issues detected")
else:
    print("⚠️  Issues found:")
    for issue in quality_issues:
        print(f"  - {issue}")

# Recommendations
quality_recommendations = (
    "Use data augmentation to balance classes",
    "Consider stratified sampling for train/val/test splits",
    "Monitor for overfitting due to potential class imbalance",
    "Use appropriate evaluation metrics (F1-score, balanced accuracy)",
)

print("\n📋 Recommendations:")
for recommendation in quality_recommendations:
    print(f"  - {recommendation}")

## 7. Preprocessing Recommendations

In [None]:
# Printed summary of the suggested preprocessing pipeline. Values that exist in
# `config` are read from it so the text cannot drift from the configuration.
print("🔧 Preprocessing Recommendations:")
print("\n1. Image Resizing:")
print(f"   - Target size: {config.IMAGE_SIZE}")
print("   - Maintains aspect ratio compatibility with pre-trained models")

print("\n2. Data Augmentation:")
print("   - Rotation, shifting, shearing, zooming")
print("   - Horizontal flipping (appropriate for plant leaves)")
print("   - Brightness and contrast adjustments")

print("\n3. Normalization:")
print("   - Pixel values scaled to [0, 1] range")
print("   - Compatible with ImageNet pre-trained models")

print("\n4. Data Splitting:")
# NOTE(review): the split ratios are hard-coded here; confirm they match the
# values the preprocessing code actually uses.
print("   - Train: 70%, Validation: 20%, Test: 10%")
print("   - Stratified splitting to maintain class distribution")

print("\n5. Class Handling:")
# Class count comes from the configured class list instead of a literal 10.
print(f"   - Multi-class: All {len(config.DISEASE_CLASSES)} disease categories")
print("   - Binary: Healthy vs Diseased classification")

## 8. Next Steps

In [None]:
# Roadmap shown to the reader: each numbered heading is followed by its lines,
# which are printed with a three-space indent.
next_steps = [
    ("1. Run data preprocessing:", [
        "preprocessor.create_processed_dataset()",
    ]),
    ("2. Train models:", [
        "- Start with EfficientNet for best accuracy",
        "- Try MobileNet for faster inference",
        "- Compare with custom CNN",
    ]),
    ("3. Evaluate performance:", [
        "- Use comprehensive metrics",
        "- Analyze confusion matrices",
        "- Test on unseen data",
    ]),
    ("4. Deploy model:", [
        "- Create prediction interface",
        "- Test with new images",
        "- Consider mobile deployment",
    ]),
]

print("🚀 Next Steps:")
for step_heading, step_lines in next_steps:
    print(f"\n{step_heading}")
    for step_line in step_lines:
        print(f"   {step_line}")