In [83]:
import clip
import scipy
import torch
from torchvision import *
from torch.utils.data import Dataset, DataLoader
import numpy as np
from PIL import Image
# Dataset loading is disabled in this cell; Caltech101 is downloaded/loaded in its own cell below.
# data = datasets.Caltech101(root = "./data", target_type = "category", download = True)

In [10]:
# List the names of the CLIP models available in the installed `clip` package.
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [11]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [12]:
# Caltech101 dataset rooted at ./data (download requested via download=True).
# target_type="category" asks for the category as each sample's target.
data = datasets.Caltech101(root = "./data", target_type = "category", download = True)

In [8]:
# Sorted copy of the dataset's category names, displayed for inspection.
categories = sorted(data.categories)  # Sort for consistency
categories

['Faces',
 'Faces_easy',
 'Leopards',
 'Motorbikes',
 'accordion',
 'airplanes',
 'anchor',
 'ant',
 'barrel',
 'bass',
 'beaver',
 'binocular',
 'bonsai',
 'brain',
 'brontosaurus',
 'buddha',
 'butterfly',
 'camera',
 'cannon',
 'car_side',
 'ceiling_fan',
 'cellphone',
 'chair',
 'chandelier',
 'cougar_body',
 'cougar_face',
 'crab',
 'crayfish',
 'crocodile',
 'crocodile_head',
 'cup',
 'dalmatian',
 'dollar_bill',
 'dolphin',
 'dragonfly',
 'electric_guitar',
 'elephant',
 'emu',
 'euphonium',
 'ewer',
 'ferry',
 'flamingo',
 'flamingo_head',
 'garfield',
 'gerenuk',
 'gramophone',
 'grand_piano',
 'hawksbill',
 'headphone',
 'hedgehog',
 'helicopter',
 'ibis',
 'inline_skate',
 'joshua_tree',
 'kangaroo',
 'ketch',
 'lamp',
 'laptop',
 'llama',
 'lobster',
 'lotus',
 'mandolin',
 'mayfly',
 'menorah',
 'metronome',
 'minaret',
 'nautilus',
 'octopus',
 'okapi',
 'pagoda',
 'panda',
 'pigeon',
 'pizza',
 'platypus',
 'pyramid',
 'revolver',
 'rhino',
 'rooster',
 'saxophone',
 'sc

In [106]:
# Target (second element of each sample) for every item, in dataset order.
# NOTE(review): data[i] returns the whole (image, target) pair, so this presumably
# loads every image just to read its target - confirm whether that cost matters.
labels = [data[i][1] for i in range(len(data))]
labels[0]

0

In [107]:
# Load the CLIP ViT-B/32 model onto `device`, together with the `preprocess`
# callable that is applied to each dataset image before encoding.
clip_model, preprocess = clip.load("ViT-B/32", device=device)


In [98]:
# Encode every dataset image with CLIP in mini-batches and collect the
# per-batch embedding tensors (moved to the CPU) in the `image_embeddings` list.
batch_size = 32
num_images = len(data)
image_embeddings = []
for start in range(0, num_images, batch_size):
    # Report progress once every ten batches.
    if start % (10 * batch_size) == 0:
        print(f"{start}/{num_images}")
    stop = min(start + batch_size, num_images)
    # A separate index name keeps the batch offset from being shadowed.
    preprocessed_images = [preprocess(data[j][0]) for j in range(start, stop)]
    image_batch = torch.stack(preprocessed_images).to(device)
    with torch.no_grad():
        # CLIP may run in half precision on CUDA; cast to float32 so the
        # embeddings match the dtype of the float32 classifier trained on them.
        image_emb_batch = clip_model.encode_image(image_batch).float().cpu()
    image_embeddings.append(image_emb_batch)

0/8677
320/8677
640/8677
960/8677
1280/8677
1600/8677
1920/8677
2240/8677
2560/8677
2880/8677
3200/8677
3520/8677
3840/8677
4160/8677
4480/8677
4800/8677
5120/8677
5440/8677
5760/8677
6080/8677
6400/8677
6720/8677
7040/8677
7360/8677
7680/8677
8000/8677
8320/8677
8640/8677


In [108]:
# Concatenate the per-batch tensors into one tensor holding all embeddings.
# NOTE: this rebinds `image_embeddings` from the list built by the extraction
# loop to a tensor, so this cell is meant to run once after a fresh run of that loop.
image_embeddings = torch.cat(image_embeddings)
image_embeddings.shape

torch.Size([8677, 512])

In [109]:
class ClipEmbeddingsDataset(Dataset):
    """Dataset pairing precomputed image embeddings with their labels.

    Args:
        image_embeddings: Indexable, sized collection with one embedding per sample.
        labels: Indexable, sized collection of labels aligned with
            ``image_embeddings`` (same length, same order).

    Raises:
        ValueError: If the two collections do not have the same length.
    """

    def __init__(self, image_embeddings, labels):
        # Fail early: a length mismatch would otherwise only surface as an
        # IndexError (or silently unused samples) while iterating.
        if len(image_embeddings) != len(labels):
            raise ValueError(
                "image_embeddings and labels must have the same length, "
                f"got {len(image_embeddings)} and {len(labels)}"
            )
        self.image_embeddings = image_embeddings
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_embeddings)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair stored at position ``idx``."""
        return self.image_embeddings[idx], self.labels[idx]

In [110]:
# Wrap the embeddings tensor and the label list in a single indexable dataset.
embeddings_dataset = ClipEmbeddingsDataset(image_embeddings, labels)

In [111]:
# Split into train (80%) and test (remaining samples) subsets.
train_size = int(0.8 * len(embeddings_dataset))
test_size = len(embeddings_dataset) - train_size
# A seeded generator makes the split identical on every run, so reported
# accuracies are comparable across kernel restarts.
split_generator = torch.Generator().manual_seed(0)
train_dataset, test_dataset = torch.utils.data.random_split(
    embeddings_dataset, [train_size, test_size], generator=split_generator
)
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
# Accuracy does not depend on sample order, so the test loader is not shuffled.
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=False)

In [112]:
# Sanity check: the batch size the training loader was configured with.
train_dataloader.batch_size

10

In [113]:
import torch.nn as nn
import torch.nn.functional as F

class NeuralNet(nn.Module):
    """Three-layer fully connected classifier applied to embedding vectors.

    Args:
        in_features: Size of each input vector (default 512).
        hidden_features: Width of the two hidden layers (default 256).
        num_classes: Number of output scores per sample (default 101).
    """

    def __init__(self, in_features=512, hidden_features=256, num_classes=101):
        super().__init__()
        self.layer1 = nn.Linear(in_features, hidden_features)
        self.layer2 = nn.Linear(hidden_features, hidden_features)
        self.layer3 = nn.Linear(hidden_features, num_classes)
        self._init_weights()

    # NOTE: `__call__` is intentionally not overridden. `nn.Module.__call__`
    # dispatches to `forward` and also runs registered hooks; an override that
    # calls `forward` directly would silently skip those hooks.

    def forward(self, x):
        """Return unnormalized class scores (logits) for a batch ``x``.

        ``x`` has ``in_features`` as its last dimension; the result has
        ``num_classes`` as its last dimension. No softmax is applied.
        """
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x

    def _init_weights(self):
        """Xavier-uniform initialize every linear weight and zero every bias."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)  # Applies Xavier initialization
                if m.bias is not None:
                    nn.init.zeros_(m.bias)  # Sets biases to zero


In [115]:
import torch.optim as optim

# Train a fresh classifier on the embedding batches with plain SGD and
# cross-entropy loss, printing the mean batch loss after every epoch.
model = NeuralNet()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr = 0.01)
criterion = nn.CrossEntropyLoss()
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0
    # Batch-specific names keep the notebook-level `labels` list intact, so the
    # dataset-construction cell can still be re-run after training.
    for batch_inputs, batch_labels in train_dataloader:
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        output = model(batch_inputs)
        loss = criterion(output, batch_labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()  # Add the loss value for monitoring
    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_dataloader):.4f}")


cpu
Epoch [1/10], Loss: 3.1393
Epoch [2/10], Loss: 1.4804
Epoch [3/10], Loss: 0.7074
Epoch [4/10], Loss: 0.4396
Epoch [5/10], Loss: 0.3227
Epoch [6/10], Loss: 0.2562
Epoch [7/10], Loss: 0.2116
Epoch [8/10], Loss: 0.1818
Epoch [9/10], Loss: 0.1581
Epoch [10/10], Loss: 0.1419


In [70]:
# Pull one batch from the training loader to inspect its shape and labels.
# NOTE: the name `input` shadows the Python builtin of the same name in this kernel.
input, label = next(iter(train_dataloader))
input.shape, label

(torch.Size([10, 512]), tensor([94,  0, 45, 50,  1, 92, 71,  5,  3, 53]))

In [76]:
# Raw class scores for the inspected batch; gradients are not needed here.
with torch.no_grad():
    # The batch comes straight from the DataLoader, so move it to the device
    # the model lives on before the forward pass.
    output = model(input.to(device))
output

tensor([[ -6.3651,  -2.2537,  -3.9694,  ...,  -4.5904,   6.2325,   3.2399],
        [ 19.9324,   8.4823,  -1.3478,  ...,  -2.6515,   4.0993,  -1.7972],
        [ -3.3025,  -3.6270,  -6.5164,  ...,   2.1753,  -3.0322,   1.5346],
        ...,
        [  1.4780,  -6.7455,   1.7283,  ..., -11.5951,   2.0453,  -4.8570],
        [ -2.1178,  -5.4750,   1.2461,  ...,  -1.3860,  -3.4950,  -7.7456],
        [  3.1898,  -4.4681,   2.1757,  ...,  -1.0387,  -1.9317,  -3.8632]])

In [117]:
# Classify one image from disk: CLIP embedding -> trained classifier logits.
image = Image.open("sample.jpg")
image2 = data[0][0]  # first dataset image; not used in this cell
# Add a batch dimension and place the tensor on the same device as the models.
preprocessed = preprocess(image).unsqueeze(0).to(device)
model.eval()
with torch.no_grad():
    # Inference only: skip autograd bookkeeping, and cast to float32 so the
    # embedding dtype matches the float32 classifier.
    image_embedding = clip_model.encode_image(preprocessed).float()
    sample_logits = model(image_embedding)
sample_logits

tensor([[-3.1272e+00, -1.9285e+00, -3.7517e+00, -4.6694e+00, -5.5210e+00,
         -6.2097e+00,  4.7161e+00,  9.9283e+00, -1.9095e+00,  2.4524e+00,
          2.5349e+00, -7.4745e-01, -9.7745e-01,  2.8726e+00,  3.2155e-01,
         -1.5648e+00,  2.1812e+00, -3.5097e+00, -3.8766e+00, -2.8040e+00,
          1.5049e+00, -6.6107e-01, -3.2324e+00, -4.8463e+00, -1.0110e+00,
         -4.9893e-01,  4.0377e+00,  4.8096e+00,  4.6827e-01,  2.1204e+00,
         -2.9925e+00,  2.9240e+00,  1.1393e+00,  1.3170e-01,  3.3412e+00,
         -1.9973e+00,  1.4258e+00,  2.3205e-01, -2.3907e+00, -3.7426e+00,
         -2.8767e+00, -2.0958e+00,  4.9416e+00, -1.5762e+00,  4.2071e+00,
         -6.3230e+00, -6.9147e+00,  3.0733e-02, -3.4945e+00,  7.9555e-01,
         -7.4709e-01,  2.4367e+00, -3.5454e-01,  1.0377e+00,  3.2801e+00,
         -4.8946e+00, -2.6122e+00, -2.2284e-01, -9.0631e-01,  4.4204e+00,
          1.3482e+00, -1.7854e+00,  6.8808e+00, -3.6229e+00, -5.9012e+00,
         -1.6516e+00,  1.0400e+00,  2.

In [78]:
# Run inference on test_dataloader and count correct top-1 predictions.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Batch-specific names keep the notebook-level `labels` list intact.
    for batch_inputs, batch_labels in test_dataloader:
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
        outputs = model(batch_inputs)
        # The index of the largest score along dim 1 is the predicted class.
        predicted = torch.argmax(outputs, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

In [81]:
# Fraction of correctly predicted test samples, then the number of samples evaluated.
print(correct/total)
print(total)

0.9585253456221198
1736
