In [1]:
import pandas as pd
import numpy as np
import json
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split

import warnings
# Suppress every warning for the remainder of the session.
# NOTE(review): with no category/module argument this hides all warnings,
# including deprecation notices from pandas and scikit-learn — consider
# narrowing the filter to the specific warning that is noisy.
warnings.filterwarnings("ignore")

In [2]:
# Locations used by this notebook (relative to the notebook's working directory)
data_dir = Path("../data/raw/PlantVillage")
processed_dir = Path("../data/processed")
output_dir = Path("../data/splits")
output_dir.mkdir(parents=True, exist_ok=True)  # safe to re-run: no error if it already exists

# Artifacts written by the previous notebooks, loaded in a fixed order:
# per-image quality metrics, exact-duplicate groups, cross-class duplicate pairs.
quality_df, exact_df, cross_df = (
    pd.read_csv(processed_dir / csv_name)
    for csv_name in (
        "quality_analysis.csv",
        "exact_duplicates.csv",
        "cross_class_duplicates.csv",
    )
)

# Row counts, as a quick sanity check on what was loaded
for label, frame in (
    ("Quality records", quality_df),
    ("Exact duplicate records", exact_df),
    ("Cross-class duplicate pairs", cross_df),
):
    print(f"{label}: {len(frame)}")

Quality records: 20637
Exact duplicate records: 28
Cross-class duplicate pairs: 9


In [3]:
# Build the set of image paths to drop before splitting. Three independent
# sources are combined: exact duplicates, hand-reviewed cross-class
# near-duplicates, and low-quality images.

# --- Exact duplicates: keep the first image of each group, exclude the rest ---
# Rows are ordered by (group_id, image_index), so "first" is the lowest
# image_index in the group. duplicated() marks every row after the first of a
# group, which keeps exactly one image per group regardless of whether
# image_index is 0- or 1-based (the previous `image_index > 1` test only works
# for 1-based numbering).
exact_df_sorted = exact_df.sort_values(['group_id', 'image_index'])
exact_exclude = set(
    exact_df_sorted.loc[
        exact_df_sorted.duplicated(subset='group_id', keep='first'),
        'image_path',
    ]
)

# --- Cross-class near-duplicates: remove image2 from pairs 1, 2, 5, 9 ---
# Positions are 0-based row offsets into cross_df.
# NOTE(review): these positions are hand-picked; if cross_class_duplicates.csv
# is regenerated with a different row order they will point at other pairs.
pairs_to_remove = [0, 1, 4, 8]
assert max(pairs_to_remove) < len(cross_df), (
    f"pairs_to_remove refers to row {max(pairs_to_remove)} but cross_df "
    f"only has {len(cross_df)} rows"
)
cross_exclude = set(cross_df.iloc[pairs_to_remove]['image2'].tolist())

# --- Quality-based exclusions with class-specific blur thresholds ---
BLUR_THRESHOLDS = {
    'Tomato__Tomato_YellowLeaf__Curl_Virus': 25,
    'default': 100
}
BRIGHTNESS_THRESHOLD = 50
QUALITY_THRESHOLD = 0.40

# Per-row blur threshold: the class override when one exists, else the default.
blur_limit = (
    quality_df['class_name']
    .map(BLUR_THRESHOLDS)
    .fillna(BLUR_THRESHOLDS['default'])
)

# An image is excluded when it fails ANY of the three checks. Comparisons
# against NaN evaluate to False, same as in a row-by-row Python comparison.
is_low_quality = (
    (quality_df['blur_score'] < blur_limit)
    | (quality_df['brightness'] < BRIGHTNESS_THRESHOLD)
    | (quality_df['quality_score'] < QUALITY_THRESHOLD)
)
quality_exclude = set(quality_df.loc[is_low_quality, 'image_path'])

# Combine all exclusions (a path flagged by several rules is counted once)
all_excluded = exact_exclude | cross_exclude | quality_exclude

# Count survivors with the same membership test used to build the clean
# dataset, rather than by subtraction, so the figure stays correct even if an
# excluded path is absent from (or repeated in) quality_df.
n_clean_remaining = int((~quality_df['image_path'].isin(all_excluded)).sum())

print(f"Exact duplicate exclusions:       {len(exact_exclude)}")
print(f"Cross-class duplicate exclusions: {len(cross_exclude)}")
print(f"Quality-based exclusions:         {len(quality_exclude)}")
print(f"Total unique exclusions:          {len(all_excluded)}")
print(f"Clean images remaining:           {n_clean_remaining}")

Exact duplicate exclusions:       14
Cross-class duplicate exclusions: 4
Quality-based exclusions:         299
Total unique exclusions:          312
Clean images remaining:           20325


In [4]:
# Clean dataset: every row of quality_df whose path is not excluded,
# reduced to the two columns needed downstream and re-indexed from 0.
keep_mask = ~quality_df['image_path'].isin(all_excluded)
clean_df = (
    quality_df.loc[keep_mask, ['image_path', 'class_name']]
    .reset_index(drop=True)
)

print(f"Total clean images: {len(clean_df)}")
print("\nPer-class counts after filtering:")
per_class_counts = clean_df['class_name'].value_counts()
print(per_class_counts.to_string())

Total clean images: 20325

Per-class counts after filtering:
class_name
Tomato__Tomato_YellowLeaf__Curl_Virus          3138
Tomato_Bacterial_spot                          2111
Tomato_Septoria_leaf_spot                      1761
Tomato_Late_blight                             1727
Tomato_Spider_mites_Two_spotted_spider_mite    1673
Tomato_healthy                                 1583
Pepper__bell___healthy                         1475
Tomato__Target_Spot                            1402
Potato___Early_blight                          1000
Potato___Late_blight                            999
Pepper__bell___Bacterial_spot                   995
Tomato_Early_blight                             991
Tomato_Leaf_Mold                                945
Tomato__Tomato_mosaic_virus                     373
Potato___healthy                                152


In [5]:
# Label encoding: class names sorted alphabetically get indices 0..n-1
classes = sorted(clean_df['class_name'].unique())
index_to_class = dict(enumerate(classes))
class_to_index = {name: position for position, name in index_to_class.items()}

clean_df['class_index'] = clean_df['class_name'].map(class_to_index)

# Persist both directions of the mapping.
# Note: JSON object keys are always strings, so the integer keys of
# index_to_class are written as "0", "1", ... in the file.
for filename, mapping in (
    ('class_to_index.json', class_to_index),
    ('index_to_class.json', index_to_class),
):
    with open(processed_dir / filename, 'w') as handle:
        json.dump(mapping, handle, indent=2)

print("Class mappings saved.")
print("\nClass index mapping:")
for position, name in index_to_class.items():
    print(f"  {position}: {name}")

Class mappings saved.

Class index mapping:
  0: Pepper__bell___Bacterial_spot
  1: Pepper__bell___healthy
  2: Potato___Early_blight
  3: Potato___Late_blight
  4: Potato___healthy
  5: Tomato_Bacterial_spot
  6: Tomato_Early_blight
  7: Tomato_Late_blight
  8: Tomato_Leaf_Mold
  9: Tomato_Septoria_leaf_spot
  10: Tomato_Spider_mites_Two_spotted_spider_mite
  11: Tomato__Target_Spot
  12: Tomato__Tomato_YellowLeaf__Curl_Virus
  13: Tomato__Tomato_mosaic_virus
  14: Tomato_healthy


In [6]:
# Stratified train/val/test split.
# The split is done in two steps because train_test_split produces two parts:
#   1) train vs. a temporary hold-out (val + test)
#   2) the hold-out is divided into val and test
# Both steps stratify on class_name so every split keeps the class proportions.
RANDOM_SEED = 42
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Guard against a ratio edit that no longer describes a full partition.
assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9, (
    "TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1"
)

# First split: train vs temp (val + test)
holdout_ratio = VAL_RATIO + TEST_RATIO
train_df, temp_df = train_test_split(
    clean_df,
    test_size=holdout_ratio,
    stratify=clean_df['class_name'],
    random_state=RANDOM_SEED
)

# Second split: val vs test from temp.
# The test share of the hold-out is derived from the configured ratios
# (0.15 / 0.30 = 0.5 with the current values) instead of being hard-coded,
# so changing VAL_RATIO / TEST_RATIO above is honoured here as well.
val_df, test_df = train_test_split(
    temp_df,
    test_size=TEST_RATIO / holdout_ratio,
    stratify=temp_df['class_name'],
    random_state=RANDOM_SEED
)

# Drop the shuffled original indices so each split is indexed 0..len-1
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Train: {len(train_df)} images")
print(f"Val:   {len(val_df)} images")
print(f"Test:  {len(test_df)} images")

Train: 14227 images
Val:   3049 images
Test:  3049 images


In [7]:
# Inverse-frequency class weights, computed from the training split only:
#     weight(c) = n_train / (n_classes * n_train_in_class_c)
class_counts = train_df['class_name'].value_counts()
total_train = len(train_df)
n_classes = len(classes)

# Keys are the integer class indices; values are rounded to 4 decimals.
# NOTE(review): a class absent from train falls back to a count of 1, which
# yields a very large weight rather than an error — confirm this is intended.
class_weights = {
    class_to_index[name]: round(
        total_train / (n_classes * class_counts.get(name, 1)), 4
    )
    for name in classes
}

# JSON turns the integer keys into strings ("0", "1", ...) on disk.
with open(processed_dir / 'class_weights.json', 'w') as handle:
    json.dump(class_weights, handle, indent=2)

print("Class weights (higher = rarer class):")
ranked_weights = sorted(
    class_weights.items(), key=lambda pair: pair[1], reverse=True
)
for class_idx, class_weight in ranked_weights:
    print(f"  {index_to_class[class_idx]:<50} {class_weight:.4f}")

Class weights (higher = rarer class):
  Potato___healthy                                   8.9478
  Tomato__Tomato_mosaic_virus                        3.6340
  Tomato_Leaf_Mold                                   1.4327
  Tomato_Early_blight                                1.3667
  Pepper__bell___Bacterial_spot                      1.3627
  Potato___Late_blight                               1.3569
  Potato___Early_blight                              1.3550
  Tomato__Target_Spot                                0.9668
  Pepper__bell___healthy                             0.9191
  Tomato_healthy                                     0.8560
  Tomato_Spider_mites_Two_spotted_spider_mite        0.8100
  Tomato_Late_blight                                 0.7845
  Tomato_Septoria_leaf_spot                          0.7692
  Tomato_Bacterial_spot                              0.6417
  Tomato__Tomato_YellowLeaf__Curl_Virus              0.4317


In [8]:
# Write one CSV manifest per split into output_dir (row index not saved)
manifest_targets = (
    (output_dir / 'train.csv', train_df),
    (output_dir / 'val.csv', val_df),
    (output_dir / 'test.csv', test_df),
)
for manifest_path, split_frame in manifest_targets:
    split_frame.to_csv(manifest_path, index=False)

print("Manifests saved:")
for manifest_path, _ in manifest_targets:
    print(f"  {manifest_path}")

Manifests saved:
  ../data/splits/train.csv
  ../data/splits/val.csv
  ../data/splits/test.csv


In [9]:
# Save preprocessing config: a JSON record of the seed, split ratios,
# filtering thresholds and resulting dataset sizes produced by this notebook.
config = {
    'random_seed': RANDOM_SEED,
    'split_ratios': {'train': TRAIN_RATIO, 'val': VAL_RATIO, 'test': TEST_RATIO},
    # NOTE(review): image_size and the normalization statistics are literals
    # that are not computed anywhere in this notebook — confirm they match
    # the values used by the training pipeline.
    'image_size': [224, 224],
    'normalization': {
        'mean': [0.46, 0.48, 0.42],
        'std': [0.21, 0.18, 0.22]
    },
    'filtering_thresholds': {
        # Read the blur thresholds from BLUR_THRESHOLDS (the dict the filter
        # actually used) instead of repeating the numbers, so the saved config
        # cannot drift from the applied filter.
        'blur_default': BLUR_THRESHOLDS['default'],
        'blur_overrides': {
            class_name: threshold
            for class_name, threshold in BLUR_THRESHOLDS.items()
            if class_name != 'default'
        },
        'brightness_min': BRIGHTNESS_THRESHOLD,
        'quality_score_min': QUALITY_THRESHOLD
    },
    'exclusions': {
        'exact_duplicates': len(exact_exclude),
        'cross_class_duplicates': len(cross_exclude),
        'quality_filtered': len(quality_exclude),
        # Unique paths; can be smaller than the sum of the three counts above
        # because one image may be flagged by more than one rule.
        'total_excluded': len(all_excluded)
    },
    'final_dataset': {
        'total_clean': len(clean_df),
        'train': len(train_df),
        'val': len(val_df),
        'test': len(test_df),
        'n_classes': n_classes
    }
}

with open(processed_dir / 'preprocessing_config.json', 'w') as f:
    json.dump(config, f, indent=2)

print("Preprocessing config saved.")

Preprocessing config saved.


In [10]:
# Validation checks: each assertion stops the notebook with a message if an
# invariant of the exported splits does not hold.
print("Running validation checks...\n")

# 1. The three splits share no image path
train_paths = set(train_df['image_path'])
val_paths = set(val_df['image_path'])
test_paths = set(test_df['image_path'])

assert train_paths.isdisjoint(val_paths), "LEAK: train/val overlap"
assert train_paths.isdisjoint(test_paths), "LEAK: train/test overlap"
assert val_paths.isdisjoint(test_paths), "LEAK: val/test overlap"
print("✅ No path overlap across splits")

# 2. None of the excluded paths made it into a split
assert train_paths.isdisjoint(all_excluded), "Excluded files found in train"
assert val_paths.isdisjoint(all_excluded), "Excluded files found in val"
assert test_paths.isdisjoint(all_excluded), "Excluded files found in test"
print("✅ No excluded files remain in any split")

# 3. Class indices are exactly 0..n_classes-1
indices = sorted(index_to_class)
assert indices == list(range(n_classes)), "Class indices are not contiguous"
print("✅ Class indices are contiguous")

# 4. Every path in the clean dataset resolves to an existing file
#    (paths are checked as stored, i.e. relative to the current directory)
missing = [
    image_path
    for image_path in clean_df['image_path']
    if not Path(image_path).exists()
]
assert not missing, f"{len(missing)} files missing from disk"
print("✅ All files exist on disk")

# Final summary
banner = "=" * 50
print("\n" + banner)
print("FINAL DATASET SUMMARY")
print(banner)
print(f"Total clean images : {len(clean_df)}")
print(f"Train              : {len(train_df)}")
print(f"Val                : {len(val_df)}")
print(f"Test               : {len(test_df)}")
print(f"Classes            : {n_classes}")
print(f"Excluded total     : {len(all_excluded)}")

Running validation checks...

✅ No path overlap across splits
✅ No excluded files remain in any split
✅ Class indices are contiguous
✅ All files exist on disk

FINAL DATASET SUMMARY
Total clean images : 20325
Train              : 14227
Val                : 3049
Test               : 3049
Classes            : 15
Excluded total     : 312
