In [2]:
from pathlib import Path
import cv2
import pandas as pd

In [3]:

# Locate the dataset relative to the repository root.
# The repo root is taken to be the current working directory, so the notebook
# has to be launched from the repository root for these paths to resolve.
REPO_ROOT = Path.cwd()
DATASET_ROOT = REPO_ROOT / "data" / "sesame-plant-detection"

# Image and label folders for the train and validation splits.
TRAIN_IMAGES = DATASET_ROOT / "train" / "images"
TRAIN_LABELS = DATASET_ROOT / "train" / "labels"
VALID_IMAGES = DATASET_ROOT / "valid" / "images"
VALID_LABELS = DATASET_ROOT / "valid" / "labels"



In [4]:
#Utility functions

# Find images that have no matching label file
def check_missing_labels(images_dir, labels_dir, pattern="*.jpg"):
    """Return the names of images that lack a same-stem ``.txt`` label file.

    Args:
        images_dir: ``pathlib.Path`` of the directory scanned for images.
        labels_dir: ``pathlib.Path`` of the directory expected to contain one
            ``<image stem>.txt`` entry per image.
        pattern: Glob pattern selecting the image files inside ``images_dir``.
            Defaults to ``"*.jpg"``; pass e.g. ``"*.png"`` for other formats.

    Returns:
        A sorted list of image file names (names only, not full paths) for
        which ``labels_dir / "<stem>.txt"`` does not exist.
    """
    # Sorting makes the result independent of filesystem enumeration order.
    return sorted(
        image_path.name
        for image_path in images_dir.glob(pattern)
        if not (labels_dir / f"{image_path.stem}.txt").exists()
    )

# Find images that OpenCV cannot decode
def check_corrupted_images(images_dir):
    """Return the names of ``*.jpg`` files in ``images_dir`` that fail to load.

    A file counts as corrupted when ``cv2.imread`` returns ``None`` for it or
    returns an array whose ``size`` is 0.

    Args:
        images_dir: ``pathlib.Path`` of the directory scanned for ``*.jpg``.

    Returns:
        A list of file names (names only, not full paths).
    """
    def _fails_to_decode(image_path):
        # cv2.imread is given a plain string path rather than a Path object.
        decoded = cv2.imread(str(image_path))
        return decoded is None or decoded.size == 0

    return [
        image_path.name
        for image_path in images_dir.glob("*.jpg")
        if _fails_to_decode(image_path)
    ]

# Tally annotations per class id
def count_classes(labels_dir):
    """Count how often each class id appears across the ``*.txt`` label files.

    Every non-blank line contributes one count; the class id is the first
    whitespace-separated token of the line, parsed with ``int``.

    Args:
        labels_dir: ``pathlib.Path`` of the directory holding the label files.

    Returns:
        A dict mapping class id (int) to its number of occurrences.
    """
    tally = {}
    for label_file in labels_dir.glob("*.txt"):
        with label_file.open() as handle:
            for row in handle:
                tokens = row.split()
                if not tokens:
                    # Blank or whitespace-only line: nothing to count.
                    continue
                key = int(tokens[0])
                tally[key] = tally.get(key, 0) + 1
    return tally

In [5]:

# Run validation
# Each check runs on the train split first, then the valid split; the pair of
# results is unpacked into the train_* / valid_* names.

# Images without a label file
train_missing, valid_missing = map(
    check_missing_labels,
    (TRAIN_IMAGES, VALID_IMAGES),
    (TRAIN_LABELS, VALID_LABELS),
)

# Images that could not be decoded
train_corrupted, valid_corrupted = map(
    check_corrupted_images, (TRAIN_IMAGES, VALID_IMAGES)
)

# Per-class annotation counts
train_classes, valid_classes = map(count_classes, (TRAIN_LABELS, VALID_LABELS))


# Print report

# Every class id seen in either split; also used when building the summary.
all_classes = set(train_classes).union(valid_classes)

# Header
print("\nDATASET VALIDATION REPORT", "-" * 40, sep="\n")

# Image counts per split
print(
    f"Train images: {len(list(TRAIN_IMAGES.glob('*.jpg')))}",
    f"Valid images: {len(list(VALID_IMAGES.glob('*.jpg')))}",
    sep="\n",
)

# Images without a label file
print(
    "\nMissing labels:",
    f"  Train: {len(train_missing)}",
    f"  Valid: {len(valid_missing)}",
    sep="\n",
)

# Images that failed to decode
print(
    "\nCorrupted images:",
    f"  Train: {len(train_corrupted)}",
    f"  Valid: {len(valid_corrupted)}",
    sep="\n",
)

# One line per class id, in ascending order; a class absent from a split
# is reported with a count of 0.
print("\nClass distribution:")
for cls in sorted(all_classes):
    print(
        f"Class {cls}: "
        f"train={train_classes.get(cls,0)}, "
        f"valid={valid_classes.get(cls,0)}"
    )

print(f"\nTotal classes detected: {len(all_classes)}")



DATASET VALIDATION REPORT
----------------------------------------
Train images: 262
Valid images: 31

Missing labels:
  Train: 0
  Valid: 0

Corrupted images:
  Train: 0
  Valid: 0

Class distribution:
Class 0: train=4566, valid=511

Total classes detected: 1


In [6]:

# Save summary

# One-row table: image counts, problem counts and number of classes.
summary = pd.DataFrame(
    [
        {
            "train_images": sum(1 for _ in TRAIN_IMAGES.glob("*.jpg")),
            "valid_images": sum(1 for _ in VALID_IMAGES.glob("*.jpg")),
            "missing_train_labels": len(train_missing),
            "missing_valid_labels": len(valid_missing),
            "corrupted_train_images": len(train_corrupted),
            "corrupted_valid_images": len(valid_corrupted),
            "total_classes": len(all_classes),
        }
    ]
)

# Output folder under the repo root; created (with parents) if it is missing.
VALIDATION_PATH = REPO_ROOT.joinpath("01-data-management", "01-validation")
VALIDATION_PATH.mkdir(parents=True, exist_ok=True)

summary_path = VALIDATION_PATH.joinpath("dataset_summary.csv")

# Write the table as CSV without the DataFrame index column.
summary.to_csv(summary_path, index=False)
print(f"\nSummary saved to {summary_path}")



Summary saved to /home/user/end-to-end-computer-vision-mlops/01-data-management/01-validation/dataset_summary.csv
