In [2]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.decomposition import PCA

# --- Define Paths and Directories ---
# The input path points to your raw data
dataset_path = "../data/raw/Maize"
# The output path will be a new folder for the cleaned data
output_path = "../data/processed/IT24103044_clean"
os.makedirs(output_path, exist_ok=True)

# Track stats for EDA
class_valid_counts = {}
class_outlier_counts = {}
brightness_means = []

print("Starting outlier detection and removal...")

class_folders = [f for f in os.listdir(input_path) if os.path.isdir(os.path.join(input_path, f))]

for class_name in class_folders:
    class_path = os.path.join(input_path, class_name)
    output_class_path = os.path.join(output_path, class_name)
    os.makedirs(output_class_path, exist_ok=True)

    valid_count = 0
    outlier_count = 0

    for img_file in os.listdir(class_path):
        if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        img_path = os.path.join(class_path, img_file)
        try:
            # Try to load image
            img = cv2.imread(img_path)
            if img is None:
                print(f"⚠ Outlier removed (corrupted): {img_file}")
                outlier_count += 1
                continue

            # Check image size (too small = outlier)
            h, w, _ = img.shape
            if h < 50 or w < 50:
                print(f"⚠ Outlier removed (too small): {img_file}")
                outlier_count += 1
                continue

            # Check brightness (too dark or too bright = outlier)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            mean_brightness = np.mean(gray)
            brightness_means.append(mean_brightness)

            if mean_brightness < 20 or mean_brightness > 235:  # almost black or white
                print(f"⚠ Outlier removed (abnormal brightness): {img_file}")
                outlier_count += 1
                continue

            # ✅ Passed all checks → save valid image
            cv2.imwrite(os.path.join(output_class_path, img_file), img)
            valid_count += 1

        except Exception as e:
            print(f"⚠ Error reading {img_file} in {class_name}: {e}")
            outlier_count += 1

    class_valid_counts[class_name] = valid_count
    class_outlier_counts[class_name] = outlier_count
    print(f"Class '{class_name}': {valid_count} valid, {outlier_count} outliers removed")

print("\n✅ Completed outlier removal.")

# --- EDA Visualizations ---

# 1. Brightness distribution
plt.figure(figsize=(8, 5))
plt.hist(brightness_means, bins=50, color="purple")
plt.title("Brightness Distribution of Valid Images")
plt.xlabel("Mean Brightness (0–255)")
plt.ylabel("Frequency")
plt.show()

# 2. Valid vs Outliers per class
labels = list(class_valid_counts.keys())
valids = list(class_valid_counts.values())
outliers = list(class_outlier_counts.values())

x = np.arange(len(labels))
width = 0.35

plt.figure(figsize=(12, 6))
plt.bar(x - width/2, valids, width, label="Valid", color="green")
plt.bar(x + width/2, outliers, width, label="Outliers", color="red")
plt.xticks(x, labels, rotation=45, ha="right")
plt.ylabel("Image Count")
plt.title("Valid vs Outlier Images per Class")
plt.legend()
plt.tight_layout()
plt.show()


Starting outlier detection and removal...


NameError: name 'input_path' is not defined