# Image Clustering & Retrieval - Training Pipeline

This notebook performs the heavy lifting for the Image Clustering project:
1.  **Setup**: Install dependencies.
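2.  **Data Loading**: Download the STL-10 dataset (train split) and apply the preprocessing transforms.
3.  **Feature Extraction**: Use a pretrained ResNet50 with its classification layer removed to extract embeddings.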
4.  **Clustering**: Apply K-Means to cluster the images.
5.  **Evaluation**: Calculate Silhouette Score.
6.  **Export**: Save embeddings and models for the local Streamlit app.

In [None]:
# 1. Setup
!pip install torch torchvision scikit-learn numpy matplotlib tqdm

In [None]:
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from torch.utils.data import DataLoader
from torchvision.datasets import STL10
from tqdm import tqdm

In [None]:
# 2. Data Loading
# STL-10 serves as the example dataset; swap in your own ImageFolder if you prefer.

# Preprocessing applied to every image, in order:
# resize to 224x224 -> convert to tensor -> normalize each of the three channels.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(preprocessing_steps)

# Downloads the train split into ./data on first use.
dataset = STL10(
    root='./data',
    split='train',
    download=True,
    transform=transform,
)

# shuffle=False keeps batches in dataset order, so row i of anything computed
# from this loader corresponds to dataset item i.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    num_workers=2,
)

print(f"Dataset size: {len(dataset)}")

In [None]:
# 3. Feature Extraction
# Runs every image through a pretrained ResNet50 (classification layer removed)
# and collects one flat feature vector per image, plus the dataset labels.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load Pretrained ResNet50
# `pretrained=True` is deprecated since torchvision 0.13 in favour of `weights=`.
# IMAGENET1K_V1 is the weight set that `pretrained=True` selected, so the
# resulting embeddings are unchanged.
if hasattr(models, "ResNet50_Weights"):
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
else:
    # Older torchvision releases that predate the weights enum.
    model = models.resnet50(pretrained=True)
model = torch.nn.Sequential(*list(model.children())[:-1]) # Remove classification layer
model = model.to(device)
model.eval()  # inference behaviour for layers such as batch norm

features_list = []
labels_list = []

print("Extracting features...")
with torch.no_grad():  # no gradients needed for pure inference
    # The loop variable is named `batch_labels` so it cannot be confused with
    # the concatenated `labels` array built after the loop.
    for inputs, batch_labels in tqdm(dataloader):
        inputs = inputs.to(device)
        features = model(inputs)
        # Collapse everything after the batch dimension into one vector per image.
        features = torch.flatten(features, start_dim=1)
        features_list.append(features.cpu().numpy())
        labels_list.append(batch_labels.numpy())

# Stack the per-batch arrays; rows follow the order in which the loader yielded them.
embeddings = np.concatenate(features_list)
labels = np.concatenate(labels_list)
print(f"Embeddings shape: {embeddings.shape}")

In [None]:
# 4. Clustering (K-Means)
k = 10 # Number of classes in STL-10
print(f"Clustering into {k} clusters...")

# n_init is set explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto', so leaving it unset makes the clustering depend on the installed
# version. random_state fixes the centroid initialisation for reproducibility.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
# Fits the model and returns the cluster index assigned to each embedding row.
cluster_labels = kmeans.fit_predict(embeddings)

# 5. Evaluation
# Silhouette score ranges from -1 (worst) to 1 (best); values near 0 indicate
# overlapping clusters.
sil_score = silhouette_score(embeddings, cluster_labels)
print(f"Silhouette Score: {sil_score:.4f}")

In [None]:
# 6. Export Artifacts
# Writes embeddings.npy, kmeans_model.pkl and filenames.json to the working
# directory. Copy these files into the 'data/' folder of the local app.

# For filenames, since we used STL-10, we don't have paths.
# In a real scenario with ImageFolder, you would save dataset.samples
# Here we save one placeholder name per dataset item for demonstration.
filenames = [f"Image_{i}" for i in range(len(dataset))]

# Check alignment before writing anything: a stale kernel (e.g. the dataset cell
# re-run without re-extracting features) would otherwise export mismatched files.
assert len(filenames) == len(embeddings), (
    f"filenames ({len(filenames)}) and embeddings ({len(embeddings)}) are out of sync; "
    "re-run the notebook from the top"
)

np.save('embeddings.npy', embeddings)

# NOTE: a pickled scikit-learn model should be loaded with a compatible
# scikit-learn version, and pickle files must only be loaded from trusted sources.
with open('kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)

with open('filenames.json', 'w') as f:
    json.dump(filenames, f)

print("Artifacts saved: embeddings.npy, kmeans_model.pkl, filenames.json")