In [1]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# ============================
# CONFIGURATION
# ============================
# Images are resized to IMG_SIZE x IMG_SIZE during preprocessing.
IMG_SIZE = 224 # Safe for MobileNet / ResNet

# Root folder of the project. Set the CV_PROJECT_ROOT environment variable to
# run the notebook on another machine without editing the paths below.
PROJECT_ROOT = os.environ.get("CV_PROJECT_ROOT", "/home/devuser/CV-project")

# Raw input datasets (one sub-directory per class).
PUBLIC_DATASET_DIR = os.path.join(PROJECT_ROOT, "Public_dataset")
SELF_DATASET_DIR = os.path.join(PROJECT_ROOT, "Dataset")

# Output folders for the preprocessed images.
PREP_PUBLIC_DIR = os.path.join(PROJECT_ROOT, "preprocessed_Public")
PREP_SELF_DIR = os.path.join(PROJECT_ROOT, "preprocessed_self")

# Split ratios for the public dataset.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15 # Public only

# The val/test split is derived from these values, so they must cover the
# whole dataset (tolerance absorbs floating-point rounding).
assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9, \
    "TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1"

In [None]:
# ============================
# PREPROCESSING FUNCTIONS
# ============================
def apply_clahe(img, clip_limit=2.0, tile_grid_size=(8, 8)):
    """
    Apply Contrast Limited Adaptive Histogram Equalization (CLAHE)
    to enhance image contrast.

    Technique category:
    - Image Enhancement (Contrast enhancement)

    Rationale:
    - Improves visibility of surface texture and material details
    - Reduces illumination differences across images

    Only the L (lightness) channel of the LAB representation is equalized;
    the A and B channels are merged back unchanged.

    Args:
        img: Image in BGR channel order (it is converted with COLOR_BGR2LAB).
        clip_limit: Value passed to cv2.createCLAHE as clipLimit.
        tile_grid_size: Value passed to cv2.createCLAHE as tileGridSize.

    Returns:
        The contrast-enhanced image, converted back to BGR.
    """
    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    l_channel, a_channel, b_channel = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    l_equalized = clahe.apply(l_channel)
    merged = cv2.merge((l_equalized, a_channel, b_channel))
    return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

def preprocess_image(img_path):
    """
    Apply the full preprocessing pipeline to the image stored at `img_path`.

    Techniques applied, in order:
    - Resize to IMG_SIZE x IMG_SIZE (standardization)
    - CLAHE (contrast enhancement)
    - Gaussian blur with a 3x3 kernel (noise filtering)
    - Canny edge map and morphological closing, both computed from the
      blurred image and blended back into it

    Args:
        img_path: Path of the image file to load.

    Returns:
        The blended image: 0.6 * blurred + 0.2 * edges + 0.2 * morphology.

    Raises:
        ValueError: If OpenCV cannot read the file (missing, unreadable, or
            not a supported image format).
    """
    img = cv2.imread(img_path)
    # cv2.imread reports failure by returning None instead of raising; fail
    # here with the offending path rather than with an opaque resize error.
    if img is None:
        raise ValueError(
            f"Could not read image (missing or unsupported file): {img_path}"
        )
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

    # Step 1: contrast enhancement
    img = apply_clahe(img)

    # Step 2: noise filtering
    img = cv2.GaussianBlur(img, (3, 3), 0)

    # Step 3: edge detection
    edges = apply_edge_detection(img)

    # Step 4: morphological transformation
    morph = apply_morphology(img)

    # Combine original + edge + morphology.
    # Two chained weighted sums give 0.6*img + 0.2*edges + 0.2*morph.
    combined = cv2.addWeighted(img, 0.6, edges, 0.2, 0)
    combined = cv2.addWeighted(combined, 1.0, morph, 0.2, 0)

    return combined

def augment_image(img, angles=(-10, 10)):
    """
    Apply geometric transformations for data augmentation.

    Technique category:
    - Image transformation (rotation, flipping)

    Used ONLY for:
    - Public dataset training images

    Args:
        img: Image array; only its first two dimensions (height, width)
            are read.
        angles: Rotation angles in degrees, one rotated copy per angle.
            Defaults to (-10, 10).

    Returns:
        A list with the horizontally flipped image first, followed by one
        rotated copy per entry in `angles` (same width/height as the input).
    """
    # Size and rotation centre are identical for every angle, so compute once.
    h, w = img.shape[:2]
    center = (w // 2, h // 2)

    augmented = [cv2.flip(img, 1)]  # Horizontal flip

    for angle in angles:
        rotation = cv2.getRotationMatrix2D(center, angle, 1.0)
        augmented.append(cv2.warpAffine(img, rotation, (w, h)))

    return augmented

def apply_edge_detection(img):
    """
    Compute a Canny edge map of a BGR image and return it with three
    channels, so it has the same layout as the BGR image it came from.
    """
    low_threshold, high_threshold = 50, 150
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edge_mask = cv2.Canny(
        grayscale, threshold1=low_threshold, threshold2=high_threshold
    )
    # Expand the single-channel mask back to three channels.
    return cv2.cvtColor(edge_mask, cv2.COLOR_GRAY2BGR)


def apply_morphology(img):
    """
    Run a morphological closing (3x3 rectangular kernel) on the grayscale
    version of a BGR image and return the result with three channels.
    """
    structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    closed_gray = cv2.morphologyEx(
        grayscale, cv2.MORPH_CLOSE, structuring_element
    )
    # Expand the single-channel result back to three channels.
    return cv2.cvtColor(closed_gray, cv2.COLOR_GRAY2BGR)


In [4]:
# ============================
# DIRECTORY CREATION
# ============================

def make_dirs(base_dir, splits, classes):
    """
    Create the folder ``base_dir/<split>/<class>`` for every combination of
    split and class. Folders that already exist are left untouched.
    """
    for split_name in splits:
        for class_name in classes:
            target_dir = os.path.join(base_dir, split_name, class_name)
            os.makedirs(target_dir, exist_ok=True)

In [5]:
# ============================
# PROCESS PUBLIC DATASET
# ============================
# Stratified train/val/test split of the public dataset. Every image goes
# through preprocess_image(); only training images are additionally augmented.
print("\nüì¶ Processing PUBLIC dataset...")


def _write_image(out_path, image):
    """Write `image` to `out_path`; raise if OpenCV reports that the write failed."""
    # cv2.imwrite signals failure through its boolean return value; ignoring
    # it would silently leave gaps in the output dataset.
    if not cv2.imwrite(out_path, image):
        raise OSError(f"Failed to write image: {out_path}")


# Only visible sub-directories count as classes. This skips stray files and
# hidden folders (e.g. Jupyter's ".ipynb_checkpoints"), which would otherwise
# become bogus classes or make os.listdir() fail below.
classes = sorted(
    name for name in os.listdir(PUBLIC_DATASET_DIR)
    if not name.startswith(".")
    and os.path.isdir(os.path.join(PUBLIC_DATASET_DIR, name))
)


# `images` is never populated; it is kept so the cell defines the same names as before.
images, labels, paths = [], [], []


for cls in classes:
    cls_dir = os.path.join(PUBLIC_DATASET_DIR, cls)
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # seeded split below independent of the filesystem.
    for file in sorted(os.listdir(cls_dir)):
        file_path = os.path.join(cls_dir, file)
        if file.startswith(".") or not os.path.isfile(file_path):
            continue  # skip hidden entries and nested folders
        paths.append(file_path)
        labels.append(cls)


train_paths, temp_paths, train_labels, temp_labels = train_test_split(
    paths, labels, test_size=(1 - TRAIN_RATIO), stratify=labels, random_state=42
)


# Share of the held-out part that goes to validation; the rest becomes test.
val_size = VAL_RATIO / (VAL_RATIO + TEST_RATIO)
val_paths, test_paths, val_labels, test_labels = train_test_split(
    temp_paths, temp_labels, test_size=(1 - val_size), stratify=temp_labels, random_state=42
)


# Files written by an earlier run are never removed, so a changed split could
# leave the same image in two different split folders.
if os.path.isdir(PREP_PUBLIC_DIR) and os.listdir(PREP_PUBLIC_DIR):
    print(
        f"WARNING: {PREP_PUBLIC_DIR} is not empty. Delete it before re-running "
        "if the input data or split settings changed, otherwise stale files "
        "may leak between train/val/test."
    )

make_dirs(PREP_PUBLIC_DIR, ["train", "val", "test"], classes)


# Save TRAIN (with augmentation)
for path, cls in tqdm(zip(train_paths, train_labels), total=len(train_paths)):
    img = preprocess_image(path)
    filename = os.path.basename(path)
    train_cls_dir = os.path.join(PREP_PUBLIC_DIR, "train", cls)
    _write_image(os.path.join(train_cls_dir, filename), img)

    # Insert the suffix before the extension only. A plain str.replace('.')
    # would rewrite every dot in the name and, for names without a dot,
    # produce the original filename and overwrite the image saved above.
    stem, ext = os.path.splitext(filename)
    for idx, aug in enumerate(augment_image(img)):
        aug_name = f"{stem}_aug{idx}{ext}"
        _write_image(os.path.join(train_cls_dir, aug_name), aug)


# Save VAL / TEST (NO augmentation)
for split_name, split_paths, split_labels in [
    ("val", val_paths, val_labels),
    ("test", test_paths, test_labels)
]:
    for path, cls in tqdm(zip(split_paths, split_labels), total=len(split_paths)):
        img = preprocess_image(path)
        filename = os.path.basename(path)
        _write_image(os.path.join(PREP_PUBLIC_DIR, split_name, cls, filename), img)


print("‚úÖ Public dataset preprocessing completed")


üì¶ Processing PUBLIC dataset...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1878/1878 [00:05<00:00, 344.34it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 403/403 [00:00<00:00, 656.92it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 403/403 [00:00<00:00, 673.48it/s]

‚úÖ Public dataset preprocessing completed





In [None]:
# ============================
# PROCESS SELF-COLLECTED DATASET (TEST ONLY)
# ============================
# Every self-collected image is preprocessed and written to a single "test"
# split; no train/val split and no augmentation is applied here.
print("\nüì¶ Processing SELF-COLLECTED dataset...")


# Only visible sub-directories count as classes; stray files and hidden
# folders (e.g. ".ipynb_checkpoints") are ignored.
classes = sorted(
    name for name in os.listdir(SELF_DATASET_DIR)
    if not name.startswith(".")
    and os.path.isdir(os.path.join(SELF_DATASET_DIR, name))
)
make_dirs(PREP_SELF_DIR, ["test"], classes)


for cls in classes:
    cls_dir = os.path.join(SELF_DATASET_DIR, cls)
    # Visible regular files only, in a stable order.
    class_files = [
        name for name in sorted(os.listdir(cls_dir))
        if not name.startswith(".")
        and os.path.isfile(os.path.join(cls_dir, name))
    ]
    for file in tqdm(class_files):
        img_path = os.path.join(cls_dir, file)
        img = preprocess_image(img_path)
        out_path = os.path.join(PREP_SELF_DIR, "test", cls, file)
        # cv2.imwrite returns False on failure instead of raising.
        if not cv2.imwrite(out_path, img):
            raise OSError(f"Failed to write image: {out_path}")


print("‚úÖ Self-collected dataset preprocessing completed")


# ============================
# SUMMARY
# ============================
print("\nüìä FINAL OUTPUT STRUCTURE")
print("preprocessed_Public/")
print("‚îú‚îÄ‚îÄ train/")
print("‚îú‚îÄ‚îÄ val/")
print("‚îî‚îÄ‚îÄ test/")
print("preprocessed_self/")
print(" ‚îî‚îÄ‚îÄ test/")


üì¶ Processing SELF-COLLECTED dataset...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 127/127 [00:00<00:00, 379.22it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 151/151 [00:00<00:00, 397.45it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 234/234 [00:00<00:00, 351.83it/s]

‚úÖ Self-collected dataset preprocessing completed

üìä FINAL OUTPUT STRUCTURE
dataset/
‚îú‚îÄ‚îÄ preprocessed_Public/
‚îÇ ‚îú‚îÄ‚îÄ train/
‚îÇ ‚îú‚îÄ‚îÄ val/
‚îÇ ‚îî‚îÄ‚îÄ test/
‚îî‚îÄ‚îÄ preprocessed_self/
 ‚îî‚îÄ‚îÄ test/



