# In [2]:
import os
import cv2
import numpy as np
from sklearn.cluster import MiniBatchKMeans

def load_images_from_folder(folder):
    """Read every decodable image found directly inside ``folder``.

    Each directory entry is handed to ``cv2.imread``; entries for which it
    returns ``None`` are skipped.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        A list of the images returned by ``cv2.imread``, in
        ``os.listdir`` order.
    """
    decoded = (
        cv2.imread(os.path.join(folder, entry)) for entry in os.listdir(folder)
    )
    return [image for image in decoded if image is not None]

def preprocess_images(images, target_size=(100, 100)):
    """Resize every image to ``target_size`` and scale its values by 1/255.

    Args:
        images: Iterable of image arrays accepted by ``cv2.resize``.
        target_size: Size tuple forwarded to ``cv2.resize`` as ``dsize``
            (OpenCV interprets it as ``(width, height)``).

    Returns:
        A ``float32`` NumPy array stacking the resized images, with each
        value divided by 255 (i.e. in ``[0, 1]`` for 8-bit input).
    """
    resized_images = [cv2.resize(img, target_size) for img in images]
    # Stay in floating point: casting the scaled values back to uint8 would
    # truncate everything below 1.0 to 0 and wipe out the image content.
    return np.array(resized_images, dtype=np.float32) / 255.0

# Source directories; each one is scanned on its own (non-recursively),
# which is why parent folders and their sub-folders are both listed.
folder_paths = [
    './raffaello',
    './COAP',
    './COAP/bag',
    './COAP/dress',
    './COAP/outers',
    './COAP/pants',
    './COAP/shoes',
    './COAP/top',
    './GABABA/outers',
    './GABABA/pants',
    './GABABA/tops',
    './ZARA_men/bag',
    './ZARA_men/outer',
    './ZARA_men/pants',
    './ZARA_men/shoes',
    './ZARA_men/top',
]

# Gather the images of every folder into one flat list
all_images = []
for folder_path in folder_paths:
    all_images += load_images_from_folder(folder_path)

# Preprocess, then flatten each image into a single feature row
processed_images = preprocess_images(all_images)
flattened_images = processed_images.reshape(len(processed_images), -1)

# Cluster the flattened images into a fixed number of groups.
num_clusters = 5
# fit() returns the estimator itself, so construction and fitting can be chained.
kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42).fit(
    flattened_images
)

# Write every original (un-preprocessed) image into the folder of its cluster.
output_dir = "./clustered_data"
# exist_ok avoids the check-then-create race and makes re-runs a no-op.
os.makedirs(output_dir, exist_ok=True)

for i in range(num_clusters):
    cluster_dir = os.path.join(output_dir, f"cluster_{i}")
    os.makedirs(cluster_dir, exist_ok=True)
    # Positions in labels_ follow the row order of the fitted data, which was
    # built from all_images in order, so they index all_images directly.
    cluster_indices = np.where(kmeans.labels_ == i)[0]
    for idx in cluster_indices:
        img = all_images[idx]
        filename = f"image_{idx}.jpg"
        output_path = os.path.join(cluster_dir, filename)
        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(output_path, img):
            print(f"Warning: could not write {output_path}")