In [None]:
import numpy as np
import torch
from pkg_resources import packaging
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image
from tqdm import tqdm
from chromadb.config import Settings
from annoy import AnnoyIndex
import json
import numpy as np
# print("Torch version:", torch.__version__)

import clip

clip.available_models()


['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [4]:
model, preprocess = clip.load("ViT-B/32")
# model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [None]:
def get_image_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    preprocessed = preprocess(image)
    image_input = torch.tensor(preprocessed).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        image_features = model.encode_image(image_input).float()
    image_features /= image_features.norm(dim=-1, keepdim=True)
    return image_features

def get_text_embedding(text, model=model, device=device):
    input_token = clip.tokenize(text).to(device)
    with torch.no_grad():
        text_features = model.encode_text(input_token).float()
    text_features /= text_features.norm(dim=-1, keepdim=True)
    return text_features

def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [None]:
with open('output.json', 'r') as file:
    images = json.load(file)
print(len(images))

20


In [None]:
import os
from PIL import Image

# Specify the folder path
combined_dict = {}

for folder in images:
    folder_path = f'/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/{str(folder)}'
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            image_path = os.path.join(folder_path, filename)
            img = Image.open(image_path)
            # print(filename[18:-4])
            id = int(folder + filename[18:-4])
            # # print(image_path)
            # # (type(id))
            combined_dict[id] = image_path

In [82]:
combined_dict

{552666373119: '/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/552666/train-image_000000373119.jpg',
 552666424812: '/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/552666/train-image_000000424812.jpg',
 552666474934: '/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/552666/train-image_000000474934.jpg',
 552666441009: '/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/552666/train-image_000000441009.jpg',
 552666068120: '/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/552666/train-image_000000068120.jpg',
 552666445512: '/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/552666/train-image_000000445512.jpg',
 552666322

In [83]:
embeddings_combined_dict = {}

for folder in images:
    folder_path = f'/Users/kaushalpatil/Development/USC MS CSAI Program/Deep Learning and its Applications/image retrieval dataset/{str(folder)}'
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
            image_path = os.path.join(folder_path, filename)
            img = Image.open(image_path)
            id = int(folder + filename[18:-4])
            embeddings_combined_dict[id] = get_image_embedding(image_path)

  image_input = torch.tensor(preprocessed).unsqueeze(0)  # Add batch dimension


In [84]:
# len(list(embeddings_combined_dict))
# len(set(list(embeddings_combined_dict)))
alternate_keys = {}
start = 1
for i in embeddings_combined_dict:
    alternate_keys[i] = start
    start += 1

In [85]:
reverse_alternate_keys = {}

for i, j in alternate_keys.items():
    reverse_alternate_keys[j] = i

In [86]:
len(embeddings_combined_dict[552666373119][0])

512

In [87]:
f = 512 # Number of Dimensions
t = AnnoyIndex(f)
for i, j in embeddings_combined_dict.items():
    t.add_item(alternate_keys[i], j[0]) # Adding the key-value pair of the AnnoyIndex

t.build(f) # Building 1024 trees for AnnoyIndex, more the number of trees, more the memory consumed, better are the results of ANN algorithm
t.save('image-search-tree-rn50.ann') # Saving the AnnoyIndex for faster reading

  t = AnnoyIndex(f)


True

In [89]:
search_space = AnnoyIndex(512)
search_space.load('./image-search-tree-rn50.ann')

  search_space = AnnoyIndex(512)


True

In [90]:
def text_image_search(query: str, num : int = 10):
    query_vector = get_text_embedding(query) 
    ans = search_space.get_nns_by_vector(query_vector[0], num)
    return ans

In [91]:
def recall_at_k(actual, predicted, k):
    actual_set = set(actual)
    top_k_predicted = set(predicted[:k])
    relevant_hits = len(actual_set & top_k_predicted)
    if len(actual_set) == 0:
        return 0.0
    return round(relevant_hits / len(actual_set), 2)

In [92]:
with open('captions.json', 'r') as file:
    captions = json.load(file)
    
with open('output.json', 'r') as file:
    actual_images = json.load(file)
    
answers = {}
finals = {1: [], 2: [], 5: [], 10: []}
for i, caption in captions.items():
    ans = text_image_search(caption)
    predicted = []
    for p in ans:
        key = reverse_alternate_keys[p]
        predicted.append(key)
    ls = actual_images[i]
    actual = [int(i+k[-6:]) for k in ls]
    answers[i] = []
    for val in [1, 2, 5, 10]:
        temp = recall_at_k(actual, predicted, val)
        finals[val].append(temp)
        answers[i].append(temp)

In [93]:
ekdum_final = {}
for i, ls in finals.items():
    ekdum_final[i] = round(sum(ls) / len(ls), 5)
    # ekdum_final[i] = % .4f % ekdum_final[i]
print(ekdum_final)

{1: 0.106, 2: 0.1985, 5: 0.45, 10: 0.711}


In [94]:
answers

{'552666': [0.0, 0.0, 0.15, 0.31],
 '687618': [0.11, 0.22, 0.56, 0.89],
 '405058': [0.1, 0.2, 0.5, 1.0],
 '703860': [0.09, 0.18, 0.36, 0.45],
 '776132': [0.14, 0.29, 0.71, 0.86],
 '287571': [0.1, 0.2, 0.5, 0.9],
 '67000': [0.17, 0.33, 0.5, 0.67],
 '137494': [0.06, 0.06, 0.19, 0.38],
 '427130': [0.11, 0.22, 0.56, 1.0],
 '285328': [0.0, 0.12, 0.25, 0.5],
 '549270': [0.17, 0.33, 0.83, 0.83],
 '270030': [0.12, 0.25, 0.62, 1.0],
 '377603': [0.33, 0.33, 0.33, 1.0],
 '22348': [0.1, 0.2, 0.5, 0.9],
 '196658': [0.1, 0.2, 0.5, 0.5],
 '231576': [0.17, 0.33, 0.67, 0.67],
 '491426': [0.08, 0.15, 0.38, 0.77],
 '455622': [0.0, 0.1, 0.2, 0.4],
 '452624': [0.08, 0.17, 0.42, 0.83],
 '587826': [0.09, 0.09, 0.27, 0.36]}