# Clustering Images with ImageBind Embeddings

This Colab notebook clusters images using embeddings produced by the ImageBind model.

# Select an Appropriate Image Dataset

For this exercise, the CIFAR-10 dataset is a suitable choice. It contains 60,000 color images across 10 classes, each 32x32 pixels, making it lightweight and manageable for clustering tasks. This notebook uses only the 10,000-image test split (`train=False`).
# Set Up the Environment

Begin by cloning the ImageBind repository and installing the required dependencies:

In [None]:
!git clone https://github.com/facebookresearch/ImageBind.git
!pip install git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
!pip install timm==0.6.7 ftfy regex einops fvcore decord==0.6.0

Cloning into 'ImageBind'...
remote: Enumerating objects: 146, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 146 (delta 60), reused 39 (delta 39), pack-reused 58 (from 1)[K
Receiving objects: 100% (146/146), 2.64 MiB | 25.08 MiB/s, done.
Resolving deltas: 100% (68/68), done.
Collecting git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
  Cloning https://github.com/facebookresearch/pytorchvideo.git (to revision 28fe037d212663c6a24f373b94cc5d478c8c1a1d) to /tmp/pip-req-build-d9jwkzpz
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/pytorchvideo.git /tmp/pip-req-build-d9jwkzpz
  Running command git rev-parse -q --verify 'sha^28fe037d212663c6a24f373b94cc5d478c8c1a1d'
  Running command git fetch -q https://github.com/facebookresearch/pytorchvideo.git 28fe037d212663c6a24f373b94cc5d478c8c1a1d
  Running command git checkout -q 28

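# Load and Preprocess the Dataset

Before the dataset can be loaded, Colab's preinstalled PyTorch is replaced with pinned versions (torch 2.1.1, torchaudio 2.1.1 and torchvision 0.16.1, all CUDA 12.1 builds). The CIFAR-10 images themselves are loaded and preprocessed in the first code cell of the next section.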

In [None]:
!pip uninstall -y torch torchaudio

Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121


In [None]:
!pip install torch==2.1.1+cu121 torchaudio==2.1.1+cu121 torchvision==0.16.1+cu121 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==2.1.1+cu121
  Downloading https://download.pytorch.org/whl/cu121/torch-2.1.1%2Bcu121-cp310-cp310-linux_x86_64.whl (2200.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 GB[0m [31m695.6 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.1.1+cu121
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.1.1%2Bcu121-cp310-cp310-linux_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.16.1+cu121
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.16.1%2Bcu121-cp310-cp310-linux_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.1.0 (from torch==2.1.1+cu121)
  Downloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_6

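# Extract Image Embeddings Using ImageBind

Load the CIFAR-10 test split with its preprocessing transforms, load the pretrained ImageBind model, and then extract an embedding for each image, saving the results to disk one batch at a time: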

In [None]:
# Import necessary libraries
import torch
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.datasets as datasets

# ImageBind's vision branch is fed CLIP-style preprocessing in the reference
# implementation (bicubic resize to 224 and the channel statistics below, as
# used by ImageBind's `load_and_transform_vision_data`). The ImageNet mean/std
# would shift the input distribution away from what the model was trained on.
IMAGEBIND_MEAN = (0.48145466, 0.4578275, 0.40821073)
IMAGEBIND_STD = (0.26862954, 0.26130258, 0.27577711)
IMAGE_SIZE = 224
BATCH_SIZE = 64

transform = transforms.Compose([
    # CIFAR-10 images are square, so resizing straight to (224, 224) keeps the aspect ratio
    transforms.Resize(
        (IMAGE_SIZE, IMAGE_SIZE),
        interpolation=transforms.InterpolationMode.BICUBIC,
    ),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGEBIND_MEAN, std=IMAGEBIND_STD),
])

# Load the CIFAR-10 test split (train=False), downloading it to ./data if needed
dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
# shuffle=False keeps batches in dataset order, so row i of the extracted
# embeddings can be traced back to dataset[i]
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:12<00:00, 13148239.24it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data


In [None]:
# Make the cloned ImageBind repository importable
import sys

IMAGEBIND_REPO = '/content/ImageBind'
if IMAGEBIND_REPO not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.append(IMAGEBIND_REPO)
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained ImageBind model and switch it to inference mode
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
# Assign the result so the cell does not end on an expression that prints the
# entire module tree as its output
model = model.to(device)




Downloading imagebind weights to .checkpoints/imagebind_huge.pth ...


100%|██████████| 4.47G/4.47G [00:34<00:00, 139MB/s]


ImageBindModel(
  (modality_preprocessors): ModuleDict(
    (vision): RGBDTPreprocessor(
      (cls_token): tensor((1, 1, 1280), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Sequential(
          (0): PadIm2Video()
          (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
        )
      )
      (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
        (pos_embed): tensor((1, 257, 1280), requires_grad=True)
        
      )
    )
    (text): TextPreprocessor(
      (pos_embed): tensor((1, 77, 1024), requires_grad=True)
      (mask): tensor((77, 77), requires_grad=False)
      
      (token_embedding): Embedding(49408, 1024)
    )
    (audio): AudioPreprocessor(
      (cls_token): tensor((1, 1, 768), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
        (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=

In [None]:
# Function to save embeddings batch by batch
import os

def extract_and_save_embeddings(dataloader, save_dir="embeddings"):
    """Embed each batch with the ImageBind vision branch and save one file per batch.

    Relies on the notebook-level names `model`, `device` and `ModalityType`.

    Args:
        dataloader: Iterable yielding `(images, labels)` pairs; the labels are
            ignored. Images are presumably already preprocessed for the model
            (confirm against the transform used to build the dataset).
        save_dir: Directory that receives `embeddings_batch_<i>.pt` files. It is
            created if missing.

    Side effects:
        Deletes any `embeddings_batch_*.pt` files already in `save_dir` before
        writing, then writes one file per batch and prints a line for each.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Drop batch files left behind by an earlier run. Without this, a run that
    # yields fewer batches (e.g. a larger batch size) would leave stale files
    # that a later "load every embeddings_batch_*.pt" step silently mixes in.
    for name in os.listdir(save_dir):
        if name.startswith("embeddings_batch_") and name.endswith(".pt"):
            os.remove(os.path.join(save_dir, name))

    # Inference only: no autograd graph is needed for any batch
    with torch.no_grad():
        for batch_idx, (images, _) in enumerate(dataloader):
            outputs = model({ModalityType.VISION: images.to(device)})
            # Move to CPU before saving so the files do not hold GPU tensors
            vision_embeddings = outputs[ModalityType.VISION].cpu()

            # Save embeddings for this batch
            save_path = os.path.join(save_dir, f"embeddings_batch_{batch_idx}.pt")
            torch.save(vision_embeddings, save_path)
            print(f"Saved batch {batch_idx} embeddings to {save_path}")

# Embed every batch from `dataloader` and write the per-batch files to the default "embeddings" directory
extract_and_save_embeddings(dataloader)


# Load and Combine Saved Embeddings

First, load the saved per-batch embedding files from disk and concatenate them into a single tensor.

Code to Load and Combine:

In [None]:
import torch
import glob
import os

def load_saved_embeddings(save_dir="/content/embeddings"):
    """Load every `embeddings_batch_<i>.pt` file in `save_dir` and concatenate them.

    Files are ordered by their numeric batch index, so the rows of the result
    follow the order in which the batches were written. A plain lexicographic
    sort would place `embeddings_batch_10.pt` before `embeddings_batch_2.pt`
    and scramble that order.

    Args:
        save_dir: Directory containing the per-batch `.pt` files.

    Returns:
        A single tensor made by concatenating all batches along dimension 0.

    Raises:
        FileNotFoundError: If no matching batch file exists in `save_dir`.
        ValueError: If a matching file name does not end in an integer index.
    """
    def _batch_index(path):
        # ".../embeddings_batch_12.pt" -> 12
        stem = os.path.splitext(os.path.basename(path))[0]
        return int(stem.rsplit("_", 1)[1])

    pattern = os.path.join(save_dir, "embeddings_batch_*.pt")
    batch_files = sorted(glob.glob(pattern), key=_batch_index)
    if not batch_files:
        # Fail with a clear message instead of torch.cat's error on an empty list
        raise FileNotFoundError(f"No embedding batch files matching {pattern!r}")

    # map_location="cpu" keeps the files loadable on a machine without a GPU
    all_embeddings = [torch.load(path, map_location="cpu") for path in batch_files]
    return torch.cat(all_embeddings)  # Combine all batches

# Load every saved batch from the default directory and concatenate them into one tensor
image_embeddings = load_saved_embeddings()
# Quick sanity check on the size of the combined tensor
print("Combined embeddings shape:", image_embeddings.shape)

# Prepare Data for Clustering

Convert the embeddings tensor into a NumPy array, the input format used by the scikit-learn clustering and visualization steps below.

Convert to NumPy:

In [None]:
import numpy as np

# Tensor.numpy() needs a CPU tensor; the embeddings were moved to the CPU before being saved
embeddings_array = image_embeddings.numpy()
# Sanity check: the array shape should match the tensor shape printed earlier
print("Embeddings as NumPy array:", embeddings_array.shape)

# Apply Clustering Algorithms
You can now use any clustering algorithm (e.g., K-Means, Hierarchical Clustering, etc.) on the combined embeddings.

Example: K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to fit
n_clusters = 10

# Perform K-Means clustering.
# n_init is set explicitly because scikit-learn's default changed between
# releases (10 restarts -> 'auto'); pinning it keeps the result, together with
# random_state, reproducible regardless of the installed version.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(embeddings_array)

# Cluster labels for each data point (one integer in [0, n_clusters) per row)
labels = kmeans.labels_
print("Cluster labels:", labels)


# Visualize Clusters
For visualization, you can reduce the dimensionality of the embeddings using techniques like PCA or t-SNE.

Example: t-SNE Visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Reduce dimensionality to 2D (fixed random_state for a repeatable layout)
tsne = TSNE(n_components=2, random_state=42)
reduced_embeddings = tsne.fit_transform(embeddings_array)

# Cluster ids are categories, not a continuous quantity, so use a qualitative
# colormap and centre each integer id in its own colour bin.
n_labels = int(labels.max()) + 1

# Plot the clusters
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=labels,
    cmap='tab10',
    vmin=-0.5,
    vmax=n_labels - 0.5,
    s=10,
)
# One tick per cluster id instead of a continuous scale
fig.colorbar(points, ax=ax, ticks=list(range(n_labels)), label="Cluster")
ax.set_title("t-SNE Visualization of Clusters")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
plt.show()

# Save Cluster Assignments
You can save the cluster labels for further use.

Save Labels:

In [None]:
# Write the cluster assignments to cluster_labels.npy in the current working directory
np.save("cluster_labels.npy", labels)
print("Cluster labels saved.")


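# Use Clusters for Insights

You can map the cluster labels back to the original data to analyze each cluster. The example below collects the row positions that belong to each cluster and reports how many items each one contains.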

Example:

In [None]:
# Collect, for every cluster id, the row positions assigned to that cluster,
# then report the size of each cluster
members_by_cluster = {
    cid: np.where(labels == cid)[0] for cid in range(n_clusters)
}
for cluster_id, cluster_indices in members_by_cluster.items():
    print(f"Cluster {cluster_id}: {len(cluster_indices)} items")