In [None]:
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Root folder of the raw PlantVillage images, relative to this notebook.
# Each immediate sub-directory is treated as one class by the cells below.
data_dir = Path("..") / "data" / "raw" / "PlantVillage"

In [3]:
# Discover class folders: every immediate sub-directory of data_dir is one class.
classes = sorted(d.name for d in data_dir.iterdir() if d.is_dir())
print(f"Total classes found: {len(classes)}")
print("\nClasses:")
for i, cls in enumerate(classes, 1):
    print(f"{i}. {cls}")

# Count images per class.
# Files are selected by lower-cased suffix in a single directory pass instead of
# globbing "*.jpg" and "*.JPG" (and "*.png"/"*.PNG") separately: where glob
# matching is case-insensitive (e.g. Windows), the separate patterns return the
# same file twice and inflate every count. The suffix check also accepts
# mixed-case extensions such as ".Jpg".
IMAGE_SUFFIXES = {".jpg", ".png"}

class_counts = {}  # class name -> number of image files found
all_images = []    # every image path, grouped by class in sorted class order

for class_name in classes:
    class_path = data_dir / class_name
    # is_file() skips any sub-directory whose name happens to end in .jpg/.png;
    # sorting makes the order of all_images independent of filesystem order.
    images = sorted(
        p for p in class_path.iterdir()
        if p.is_file() and p.suffix.lower() in IMAGE_SUFFIXES
    )
    class_counts[class_name] = len(images)
    all_images.extend(images)

Total classes found: 15

Classes:
1. Pepper__bell___Bacterial_spot
2. Pepper__bell___healthy
3. Potato___Early_blight
4. Potato___Late_blight
5. Potato___healthy
6. Tomato_Bacterial_spot
7. Tomato_Early_blight
8. Tomato_Late_blight
9. Tomato_Leaf_Mold
10. Tomato_Septoria_leaf_spot
11. Tomato_Spider_mites_Two_spotted_spider_mite
12. Tomato__Target_Spot
13. Tomato__Tomato_YellowLeaf__Curl_Virus
14. Tomato__Tomato_mosaic_virus
15. Tomato_healthy


In [5]:
# Train/Val/Test Split Feasibility Check: filter settings and inputs

# Minimum blur_score per class; classes without their own entry use 'default'.
# NOTE(review): the lower value for the YellowLeaf Curl Virus class is presumably
# a deliberate class-specific relaxation -- confirm against the quality analysis.
BLUR_THRESHOLDS = {
    'Tomato__Tomato_YellowLeaf__Curl_Virus': 25,
    'default': 100
}
# Minimum brightness and quality_score an image needs to be kept.
BRIGHTNESS_THRESHOLD = 50
QUALITY_THRESHOLD = 0.40

# Per-image quality metrics (one row per image).
quality_df = pd.read_csv('../data/processed/quality_analysis.csv')

# Duplicate reports whose image paths are excluded from the clean set.
exact_df = pd.read_csv('../data/processed/exact_duplicates.csv')
cross_df = pd.read_csv('../data/processed/cross_class_duplicates.csv')

# Every path listed in either report, including both members of each
# cross-class pair, is dropped.
excluded_paths = {
    *exact_df['image_path'].tolist(),
    *cross_df['image1'].tolist(),
    *cross_df['image2'].tolist(),
}

# Apply all filters
def get_blur_threshold(class_name):
    """Return the minimum blur score required for images of ``class_name``.

    Looks the class up in the module-level ``BLUR_THRESHOLDS`` mapping and
    falls back to its ``'default'`` entry when the class has no own value.
    """
    fallback = BLUR_THRESHOLDS['default']
    return BLUR_THRESHOLDS.get(class_name, fallback)

# Keep only the images that pass every filter: not a known duplicate, sharp
# enough for their class, bright enough, and above the overall quality cut-off.
#
# This is a vectorized boolean mask rather than an iterrows() loop. Besides being
# much faster, it preserves column dtypes and -- importantly -- keeps all columns
# when no row passes, so later cells that access clean_df['class_name'] do not
# fail with a KeyError on an empty, column-less frame.
blur_thresholds = quality_df['class_name'].map(get_blur_threshold)

keep_mask = (
    ~quality_df['image_path'].isin(excluded_paths)
    & (quality_df['blur_score'] >= blur_thresholds)
    & (quality_df['brightness'] >= BRIGHTNESS_THRESHOLD)
    & (quality_df['quality_score'] >= QUALITY_THRESHOLD)
)

# .copy() detaches the result from quality_df so later edits to clean_df cannot
# trigger SettingWithCopy warnings or touch the source frame.
clean_df = quality_df.loc[keep_mask].copy()

# Target split fractions, and the smallest val/test size a class may end up with
TRAIN, VAL, TEST = 0.70, 0.15, 0.15
MIN_VAL_TEST = 15

# Table header
print(f"{'Class':<50} {'Total':>8} {'Train':>8} {'Val':>6} {'Test':>6} {'Status':>12}")
print("-" * 95)

all_feasible = True
for class_name in sorted(clean_df['class_name'].unique()):
    # Number of clean images belonging to this class
    class_total = int((clean_df['class_name'] == class_name).sum())

    # Each split size is truncated independently, so the three need not sum
    # to class_total.
    train_n, val_n, test_n = (int(class_total * frac) for frac in (TRAIN, VAL, TEST))

    # A class is feasible only if both held-out splits reach the minimum size
    feasible = min(val_n, test_n) >= MIN_VAL_TEST
    status = 'TOO SMALL' if not feasible else 'feasible'
    all_feasible = all_feasible and feasible

    print(f"{class_name:<50} {class_total:>8} {train_n:>8} {val_n:>6} {test_n:>6} {status:>12}")

print()
print(
    "All classes have sufficient images for a 70/15/15 split."
    if all_feasible
    else "Some classes may be too small. Consider reducing val/test ratio or augmenting before splitting."
)

Class                                                 Total    Train    Val   Test       Status
-----------------------------------------------------------------------------------------------
Pepper__bell___Bacterial_spot                           994      695    149    149     feasible
Pepper__bell___healthy                                 1474     1031    221    221     feasible
Potato___Early_blight                                  1000      700    150    150     feasible
Potato___Late_blight                                    999      699    149    149     feasible
Potato___healthy                                        152      106     22     22     feasible
Tomato_Bacterial_spot                                  2109     1476    316    316     feasible
Tomato_Early_blight                                     990      693    148    148     feasible
Tomato_Late_blight                                     1723     1206    258    258     feasible
Tomato_Leaf_Mold                        

## Train/Val/Test Split Feasibility Key Findings

- All 15 classes pass the feasibility check with sufficient images for a clean 70/15/15 stratified split.
- `Potato___healthy` is the most vulnerable class: only 152 images remain after filtering, which yields 22 images each in val and test. That is above the 15-image minimum (`MIN_VAL_TEST`) but leaves little headroom, so the class is still viable.
- `Tomato__Tomato_mosaic_virus` is also a small class at 373 images, with 55 each in val and test; it passes but should be monitored closely for per-class validation accuracy after training.
- `Tomato__Tomato_YellowLeaf__Curl_Virus` correctly reflects the class-specific blur threshold and duplicate removals, coming down from 3208 to 3137 clean images.
- Total clean dataset across all classes is **18,303 images** ready for splitting.

> **Action:** Proceed to preprocessing with a confirmed 70/15/15 stratified split using a fixed random seed. Flag `Potato___healthy` and `Tomato__Tomato_mosaic_virus` for per-class metric tracking post-training. Apply augmentation-based oversampling to minority classes during training to compensate for their smaller split sizes.