# üåæ CAPSTONE-LAZARUS: Plant Disease Exploration & Training

## üî¨ **Comprehensive Plant Disease Detection & Training Pipeline**

### **Mission**: Professional EDA and training on all 52,266+ plant disease images

This notebook provides a **complete exploration and training system** with:
- üìä **Interactive visualizations** with Plotly and Seaborn
- üîç **Deep dataset analysis** across 19 disease classes
- üéØ **Multi-architecture model training** (EfficientNet, ResNet)
- üìà **Real-time performance tracking** with comprehensive metrics
- üî• **Professional training pipeline** with augmentation and optimization

### **Data Overview**:
- **üåΩ Corn diseases**: 4 classes (Cercospora, Common Rust, Northern Leaf Blight, Healthy)
- **ü•î Potato diseases**: 3 classes (Early Blight, Late Blight, Healthy)
- **üçÖ Tomato diseases**: 10+ classes (Various bacterial, viral, fungal diseases)

---
**üöÄ Ready to explore and train on ALL your plant disease images!**

In [1]:
# üîß **COMPREHENSIVE IMPORTS & SETUP**
# ====================================

# Suppress warnings for clean output
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import random
from datetime import datetime

# Interactive visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Image processing
from PIL import Image

# Deep learning
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers, callbacks
from tensorflow.keras.applications import EfficientNetB0, ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img

# Machine learning utilities
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Add project modules
sys.path.append('../src')
from data_utils import PlantDiseaseDataLoader
from model_factory import ModelFactory
from inference import PlantDiseaseInference

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("Set2")

print("üåæ CAPSTONE-LAZARUS: Plant Disease Detection System")
print("=" * 65)
print(f"üñ•Ô∏è  TensorFlow Version: {tf.__version__}")
print(f"üéÆ GPU Available: {len(tf.config.list_physical_devices('GPU'))} devices")
print(f"üïê Session Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 65)

üåæ CAPSTONE-LAZARUS: Plant Disease Detection System
üñ•Ô∏è  TensorFlow Version: 2.20.0
üéÆ GPU Available: 0 devices
üïê Session Started: 2025-09-20 18:55:15


In [3]:
# üìä **COMPREHENSIVE DATASET EXPLORATION**
# ========================================

print("üîç LOADING & ANALYZING PLANT DISEASE DATASET...")
print("=" * 60)

# Initialize data loader
data_loader = PlantDiseaseDataLoader(data_dir='../data')

# Get comprehensive dataset statistics
print("üìà Gathering dataset statistics...")
dataset_stats = data_loader.get_dataset_stats()

print(f"\nüå± DATASET OVERVIEW:")
print(f"   üìÅ Total Images: {dataset_stats['total_images']:,}")
print(f"   üè∑Ô∏è  Disease Classes: {dataset_stats['num_classes']}")
print(f"   üìä Avg Images/Class: {dataset_stats['total_images']//dataset_stats['num_classes']:,}")

# Get class information and distribution
class_names = data_loader.get_class_names()
class_distribution = data_loader.analyze_class_distribution()

print(f"\nüåø PLANT DISEASE CLASSES ({len(class_names)}):")
for i, (class_name, count) in enumerate(class_distribution.items()):
    print(f"   {i+1:2d}. {class_name}: {count:,} images")

print("\n‚úÖ Dataset exploration complete!")

üîç LOADING & ANALYZING PLANT DISEASE DATASET...
üìà Gathering dataset statistics...
üîç Scanning dataset for comprehensive statistics...
‚úÖ Dataset statistics complete:
   üìä Total Images: 52,266
   ‚úÖ Valid Images: 52,266
   ‚ùå Corrupted Images: 0
   üè∑Ô∏è  Classes: 19
   ‚öñÔ∏è  Imbalance Ratio: 35.24

üå± DATASET OVERVIEW:
   üìÅ Total Images: 52,266
   üè∑Ô∏è  Disease Classes: 19
   üìä Avg Images/Class: 2,750


AttributeError: 'PlantDiseaseDataLoader' object has no attribute 'analyze_class_distribution'

In [None]:
# üìä **INTERACTIVE VISUALIZATIONS**
# =================================

print("üé® CREATING INTERACTIVE VISUALIZATIONS...")

# Create DataFrame for visualizations
viz_data = []
for class_name, count in class_distribution.items():
    # Extract plant type
    if 'Corn' in class_name or 'maize' in class_name:
        plant_type = 'Corn'
    elif 'Potato' in class_name:
        plant_type = 'Potato'
    elif 'Tomato' in class_name:
        plant_type = 'Tomato'
    else:
        plant_type = 'Other'
    
    viz_data.append({
        'Class_Name': class_name,
        'Plant_Type': plant_type,
        'Image_Count': count,
        'Percentage': (count / dataset_stats['total_images']) * 100
    })

viz_df = pd.DataFrame(viz_data)

# 1. Class Distribution Bar Chart
fig_bar = px.bar(
    viz_df,
    x='Class_Name',
    y='Image_Count',
    color='Plant_Type',
    title='üåæ Plant Disease Dataset Distribution',
    labels={'Image_Count': 'Number of Images', 'Class_Name': 'Disease Classes'}
)
fig_bar.update_layout(height=600, xaxis_tickangle=-45)
fig_bar.show()

# 2. Plant Type Distribution
plant_summary = viz_df.groupby('Plant_Type')['Image_Count'].sum().reset_index()
fig_pie = px.pie(
    plant_summary,
    values='Image_Count',
    names='Plant_Type',
    title='ü•ß Plant Type Distribution'
)
fig_pie.show()

print("‚úÖ Interactive visualizations created!")

In [None]:
# üñºÔ∏è **SAMPLE IMAGE DISPLAY**
# ============================

def display_sample_images(data_dir, class_names, max_classes=6, samples_per_class=3):
    """Display sample images from each class"""
    
    print(f"üñºÔ∏è DISPLAYING SAMPLE IMAGES...")
    
    fig, axes = plt.subplots(max_classes, samples_per_class, figsize=(15, 3*max_classes))
    
    for class_idx in range(min(max_classes, len(class_names))):
        class_name = class_names[class_idx]
        class_dir = Path(data_dir) / class_name
        
        if class_dir.exists():
            # Get sample images
            image_files = list(class_dir.glob('*.jpg')) + list(class_dir.glob('*.JPG'))
            if len(image_files) >= samples_per_class:
                sample_files = random.sample(image_files, samples_per_class)
                
                for img_idx, img_path in enumerate(sample_files):
                    img = load_img(img_path, target_size=(224, 224))
                    axes[class_idx, img_idx].imshow(img)
                    axes[class_idx, img_idx].set_title(f'{class_name}', fontsize=10)
                    axes[class_idx, img_idx].axis('off')
    
    plt.suptitle('üå± Sample Images from Plant Disease Classes', fontsize=16)
    plt.tight_layout()
    plt.show()

# Display sample images
display_sample_images('../data', class_names)
print("‚úÖ Sample images displayed!")

In [None]:
# üèóÔ∏è **MODEL TRAINING SETUP**
# ============================

# Training configuration
TRAINING_CONFIG = {
    'image_size': (224, 224),
    'batch_size': 32,
    'epochs': 30,
    'learning_rate': 1e-3,
    'validation_split': 0.2
}

print("‚öôÔ∏è TRAINING CONFIGURATION:")
for key, value in TRAINING_CONFIG.items():
    print(f"   {key}: {value}")

# Data preparation
print("\nüîÑ PREPARING DATA FOR TRAINING...")

train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    validation_split=TRAINING_CONFIG['validation_split']
)

# Create generators
train_generator = train_datagen.flow_from_directory(
    '../data',
    target_size=TRAINING_CONFIG['image_size'],
    batch_size=TRAINING_CONFIG['batch_size'],
    class_mode='categorical',
    subset='training',
    shuffle=True
)

validation_generator = train_datagen.flow_from_directory(
    '../data',
    target_size=TRAINING_CONFIG['image_size'],
    batch_size=TRAINING_CONFIG['batch_size'],
    class_mode='categorical',
    subset='validation',
    shuffle=False
)

print(f"‚úÖ Training samples: {train_generator.samples:,}")
print(f"‚úÖ Validation samples: {validation_generator.samples:,}")
print(f"‚úÖ Classes: {train_generator.num_classes}")

In [None]:
# üöÄ **COMPREHENSIVE MODEL TRAINING**
# ==================================

def create_and_train_model(model_name='EfficientNetB0'):
    """Create and train a plant disease detection model"""
    
    print(f"üèóÔ∏è BUILDING {model_name} MODEL...")
    
    # Create model using ModelFactory
    model_factory = ModelFactory(
        input_shape=(*TRAINING_CONFIG['image_size'], 3),
        num_classes=train_generator.num_classes
    )
    
    if model_name == 'EfficientNetB0':
        model = model_factory.create_efficientnet_v2(variant='B0')
    else:
        # Fallback to simple CNN if EfficientNet fails
        model = tf.keras.Sequential([
            layers.Conv2D(32, 3, activation='relu', input_shape=(*TRAINING_CONFIG['image_size'], 3)),
            layers.MaxPooling2D(),
            layers.Conv2D(64, 3, activation='relu'),
            layers.MaxPooling2D(),
            layers.Conv2D(128, 3, activation='relu'),
            layers.MaxPooling2D(),
            layers.Flatten(),
            layers.Dense(256, activation='relu'),
            layers.Dropout(0.5),
            layers.Dense(train_generator.num_classes, activation='softmax')
        ])
    
    print(f"üìä Model parameters: {model.count_params():,}")
    
    # Compile model
    model.compile(
        optimizer=optimizers.Adam(learning_rate=TRAINING_CONFIG['learning_rate']),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    # Callbacks
    callbacks_list = [
        callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5),
        callbacks.ModelCheckpoint(f'../models/{model_name}_best.h5', monitor='val_accuracy', save_best_only=True)
    ]
    
    # Train model
    print(f"üî• TRAINING {model_name} ON ALL PLANT DISEASE IMAGES...")
    
    history = model.fit(
        train_generator,
        epochs=TRAINING_CONFIG['epochs'],
        validation_data=validation_generator,
        callbacks=callbacks_list,
        verbose=1
    )
    
    return model, history

# Train the model
print("üåæ STARTING TRAINING ON ALL PLANT DISEASE IMAGES")
print("=" * 60)

model, history = create_and_train_model('EfficientNetB0')

# Evaluate model
val_loss, val_accuracy = model.evaluate(validation_generator, verbose=0)

print("\nüéâ TRAINING COMPLETED!")
print(f"üèÜ Final Validation Accuracy: {val_accuracy:.4f}")
print(f"üèÜ Best Validation Accuracy: {max(history.history['val_accuracy']):.4f}")

print("\nüöÄ YOUR PLANT DISEASE DETECTION MODEL IS READY!")
print(f"üìä Trained on {train_generator.samples:,} plant disease images!")
print(f"üéØ Can classify {train_generator.num_classes} different plant diseases!")