# 🌱 CAPSTONE-LAZARUS: Plant Disease EDA

## Comprehensive Exploratory Data Analysis
**Immersive data exploration with Tableau-level visualizations using Plotly**

### 🎯 Objectives:
- **Dataset Overview**: 26K+ images across 19 classes (healthy and diseased)
- **Class Distribution Analysis**: Identify imbalances and patterns
- **Image Quality Assessment**: Resolution, brightness, sharpness metrics
- **Crop-wise Disease Mapping**: Corn, Potato, Tomato health analysis
- **Data Preprocessing Strategy**: Augmentation and balancing approaches

In [None]:
# 📦 Import Essential Libraries
import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# 🖼️ Image Processing
import cv2
from PIL import Image, ImageStat
import matplotlib.pyplot as plt
import seaborn as sns

# 📊 Interactive Visualizations (Tableau-level)
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# 🧮 Statistics & ML
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter, defaultdict
import random
from typing import Dict, List, Tuple, Any

# Add src to path
# '../src' is resolved against the current working directory at import time,
# so this cell expects to run from the notebook's own folder. The membership
# check keeps sys.path free of duplicate entries when the cell is re-run.
_src_dir = '../src'
if _src_dir not in sys.path:
    sys.path.append(_src_dir)
from data_utils import PlantDiseaseDataLoader, visualize_class_distribution, analyze_image_quality

# 🎨 Configure Plotly Theme
# Session-wide defaults picked up by every plotly.express figure created below.
px.defaults.template = "plotly_white"
px.defaults.color_continuous_scale = px.colors.sequential.Viridis

print("🚀 Libraries loaded successfully!")
# Echo the working directory so a wrong launch location (which would break
# the relative '../src' and '../data' paths) is easy to spot.
print(f"📂 Working directory: {os.getcwd()}")

## 🔍 Dataset Discovery & Overview

In [None]:
# 📂 Point the loader at the dataset root (relative to the working directory)
data_dir = "../data"
loader = PlantDiseaseDataLoader(data_dir, img_size=(224, 224), batch_size=32)

# 🔬 Scan once; the returned mapping is read below through the keys
# 'total_images', 'num_classes', 'imbalance_ratio' and 'crop_types'.
dataset_stats = loader.scan_dataset()

# Headline figures, framed by a 60-character rule
print("\n" + "="*60)
print("🌱 CAPSTONE-LAZARUS DATASET OVERVIEW")
print("="*60)
print(f"📊 Total Images: {dataset_stats['total_images']:,}")
print(f"🏷️  Disease Classes: {dataset_stats['num_classes']}")
print(f"⚖️  Imbalance Ratio: {dataset_stats['imbalance_ratio']:.2f}:1")

# Per-crop and per-health-status totals share one format, so drive them from
# a (label, key) table instead of repeating the print statement.
for label, key in (
    ("🌽 Corn Images", 'corn'),
    ("🥔 Potato Images", 'potato'),
    ("🍅 Tomato Images", 'tomato'),
    ("✅ Healthy Images", 'healthy'),
    ("🦠 Diseased Images", 'diseased'),
):
    print(f"{label}: {dataset_stats['crop_types'][key]:,}")
print("="*60)

## 📊 Interactive Class Distribution Analysis

In [None]:
# 📈 Interactive class distribution: one bar per class, coloured by health status


def _class_record(class_name, image_count):
    """Build one table row for a class directory name and its image count.

    The crop is the text before the '___' separator, with underscores turned
    into spaces and title-cased; a class counts as 'Healthy' when its name
    contains 'healthy' (case-insensitive), otherwise 'Diseased'.
    """
    crop_label = class_name.split('___')[0].replace('_', ' ').title()
    status = 'Healthy' if 'healthy' in class_name.lower() else 'Diseased'
    return {'class': class_name, 'count': image_count, 'crop': crop_label,
            'condition': status}


# Largest class first so the bar chart reads left-to-right by size
class_df = (
    pd.DataFrame([
        _class_record(name, n_images)
        for name, n_images in dataset_stats['class_distribution'].items()
    ])
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)

# 🎨 Bar chart; the crop name is surfaced in the hover box
fig1 = px.bar(
    class_df,
    x='class',
    y='count',
    color='condition',
    color_discrete_map={'Healthy': '#28a745', 'Diseased': '#dc3545'},
    hover_data=['crop'],
    labels={'count': 'Number of Images', 'class': 'Disease Classes'},
    title="🌱 Plant Disease Class Distribution - Interactive Overview",
)
fig1.update_layout(
    title_x=0.5,
    height=600,
    xaxis_tickangle=45,
    hovermode='x unified',
    showlegend=True,
)
fig1.show()

# 📊 Summary statistics; the frame is sorted descending, so the first and
# last rows are the largest and smallest classes respectively.
top_row = class_df.iloc[0]
bottom_row = class_df.iloc[-1]
print("\n📈 Class Distribution Summary:")
print(f"   • Largest class: {top_row['class']} ({top_row['count']:,} images)")
print(f"   • Smallest class: {bottom_row['class']} ({bottom_row['count']:,} images)")
print(f"   • Mean images per class: {class_df['count'].mean():.0f}")
print(f"   • Standard deviation: {class_df['count'].std():.0f}")

In [None]:
# 🥧 Donut chart of how the images split across crops
# Total images per crop, flattened back to a two-column frame (crop, count)
images_per_crop = class_df.groupby('crop')['count'].sum()
crop_counts = images_per_crop.reset_index()

fig2 = px.pie(
    crop_counts,
    names='crop',
    values='count',
    hole=0.4,  # a non-zero hole turns the pie into a donut
    color_discrete_sequence=px.colors.qualitative.Set3,
    title="🌾 Crop Distribution Across Dataset",
)

# Labels sit inside the slices; the hover box shows the raw count as well as
# the share. Both update calls return the figure, so they can be chained.
fig2.update_traces(
    textinfo='percent+label',
    textposition='inside',
    hovertemplate='<b>%{label}</b><br>Count: %{value:,}<br>Percentage: %{percent}<extra></extra>',
).update_layout(title_x=0.5, height=500)
fig2.show()

In [None]:
# 🎯 Imbalance Analysis with Heatmap
# Total images per (crop, condition) cell. aggfunc='sum' is spelled out
# because pivot_table defaults to the mean, which would average the per-class
# counts of a crop's several diseased classes instead of totalling them and
# so misstate both the heatmap and the ratios printed below.
imbalance_matrix = class_df.pivot_table(
    index='crop',
    columns='condition',
    values='count',
    aggfunc='sum',
    fill_value=0,
)
# Make sure both condition columns exist (zero-filled) even when the dataset
# has no healthy or no diseased class at all, so the lookups below cannot
# raise KeyError.
imbalance_matrix = imbalance_matrix.reindex(columns=['Diseased', 'Healthy'], fill_value=0)

fig3 = px.imshow(
    imbalance_matrix,
    aspect="auto",
    title="🔥 Crop vs Health Status - Heatmap Analysis",
    color_continuous_scale='RdYlGn',
    text_auto=True
)

fig3.update_layout(height=400, title_x=0.5)
fig3.show()

# Calculate imbalance ratios per crop (larger side : smaller side)
print("\n⚖️  Imbalance Ratios by Crop:")
for crop in imbalance_matrix.index:
    healthy = imbalance_matrix.loc[crop, 'Healthy']
    diseased = imbalance_matrix.loc[crop, 'Diseased']
    # The ratio is undefined when either side is empty, so such crops are
    # left out of the listing.
    if healthy > 0 and diseased > 0:
        ratio = max(healthy, diseased) / min(healthy, diseased)
        if diseased > healthy:
            skew = 'Diseased-heavy'
        elif healthy > diseased:
            skew = 'Healthy-heavy'
        else:
            # Equal totals: neither side dominates
            skew = 'Balanced'
        print(f"   • {crop}: {ratio:.2f}:1 ({skew})")