In [22]:
import torch
import torchvision.models as models
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import clip
import numpy as np

In [46]:
import subprocess

def monitor_gpu_memory():
    result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    output = result.stdout.decode('utf-8')
    return output

In [23]:
with open('classes.txt', 'r') as file:
    lines = file.readlines()
names = []
for line in lines:
    name = line.split(',')[0].strip()[4:]
    if name[-1] == "'":
        name = name[:-1]
    if name[0] == " ":
        name = name[1:]
    if name[0] == "'":
        name = name[1:]
    names.append(name)
print(names)

['tench', 'goldfish', 'great white shark', 'tiger shark', 'hammerhead', 'electric ray', 'stingray', 'cock', 'hen', 'ostrich', 'brambling', 'goldfinch', 'house finch', 'junco', 'indigo bunting', 'robin', 'bulbul', 'jay', 'magpie', 'chickadee', 'water ouzel', 'kite', 'bald eagle', 'vulture', 'great grey owl', 'European fire salamander', 'common newt', 'eft', 'spotted salamander', 'axolotl', 'bullfrog', 'tree frog', 'tailed frog', 'loggerhead', 'leatherback turtle', 'mud turtle', 'terrapin', 'box turtle', 'banded gecko', 'common iguana', 'American chameleon', 'whiptail', 'agama', 'frilled lizard', 'alligator lizard', 'Gila monster', 'green lizard', 'African chameleon', 'Komodo dragon', 'African crocodile', 'American alligator', 'triceratops', 'thunder snake', 'ringneck snake', 'hognose snake', 'green snake', 'king snake', 'garter snake', 'water snake', 'vine snake', 'night snake', 'boa constrictor', 'rock python', 'Indian cobra', 'green mamba', 'sea snake', 'horned viper', 'diamondback', 

In [24]:
resnet_imagenet = models.resnet50(weights=True)



In [47]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("RN50", device=device)

def preprocess_image(image_path):
    image = Image.open(image_path)
    image_input = preprocess(image).unsqueeze(0).to(device)
    return image_input

def preprocess_text(captions):
    text_input = clip.tokenize(captions).to(device)
    return text_input

In [32]:
imagenet_classes = np.array(names)
class_embeddings = []
for class_name in imagenet_classes:
  text_input = preprocess_text(class_name)
  with torch.no_grad():
    class_embedding = model.encode_text(text_input)
  class_embeddings.append(class_embedding.squeeze())
class_embeddings = torch.stack(class_embeddings)

In [38]:
import time
total_time = 0
image_paths = ["./images/n01440764_tench.jpg", "./images/n01532829_house_finch.jpg", "./images/n01632777_axolotl.jpg", "./images/n01644900_tailed_frog.jpg", "./images/n02676566_acoustic_guitar.jpg"]
for i in range(20):
  for image_path in image_paths:
    image = preprocess_image(image_path)
    with torch.no_grad():
      start = time.time()
      image_feature = model.encode_image(image).squeeze()
      end = time.time()
      total_time += end - start
    similarities = []
    for embedding in class_embeddings:
      dot_product = torch.dot(image_feature, embedding)
      magnitude1 = torch.norm(image_feature)
      magnitude2 = torch.norm(embedding)
      similarity = dot_product / (magnitude1 * magnitude2)
      similarities.append(similarity)
    # print(similarities)
    list_of_tensors_cpu = [tensor.cpu() for tensor in similarities]
    numpy_arrays = [tensor.numpy() for tensor in list_of_tensors_cpu]
    numpy_array = np.stack(numpy_arrays)
    # print(numpy_array)
    sorted_indices = np.argsort(numpy_array)
    top5_indices = sorted_indices[-1:-6:-1]
    # print("Indices of the top 5 values:", top5_indices)
    # print("Top 5 classes:", imagenet_classes[top5_indices])
print("Time taken:", total_time/100, "seconds")

Time taken: 0.010677425861358643 seconds


In [45]:
import time
total_time = 0
image_paths = ["./images/n01440764_tench.jpg", "./images/n01532829_house_finch.jpg", "./images/n01632777_axolotl.jpg", "./images/n01644900_tailed_frog.jpg", "./images/n02676566_acoustic_guitar.jpg"]
for i in range(1):
  for image_path in image_paths:
    image = preprocess_image(image_path)
    with torch.no_grad():
      start = time.time()
      image_feature = model.encode_image(image).squeeze()
      end = time.time()
      total_time += end - start
    similarities = []
    for embedding in class_embeddings:
      dot_product = torch.dot(image_feature, embedding)
      magnitude1 = torch.norm(image_feature)
      magnitude2 = torch.norm(embedding)
      similarity = dot_product / (magnitude1 * magnitude2)
      similarities.append(similarity)
    # print(similarities)
    list_of_tensors_cpu = [tensor.cpu() for tensor in similarities]
    numpy_arrays = [tensor.numpy() for tensor in list_of_tensors_cpu]
    numpy_array = np.stack(numpy_arrays)
    # print(numpy_array)
    sorted_indices = np.argsort(numpy_array)
    top5_indices = sorted_indices[-1:-6:-1]
    # print("Indices of the top 5 values:", top5_indices)
    print("Top 5 classes:", imagenet_classes[top5_indices])
    print("Top 5 probabilities:", numpy_array[top5_indices])
print("Time taken:", total_time/100, "seconds")

Top 5 classes: ['tench' 'goldfish' 'puffer' 'sturgeon' 'gar']
Top 5 probabilities: [0.2556 0.2352 0.2344 0.2202 0.2152]
Top 5 classes: ['house finch' 'coucal' 'bulbul' 'junco' 'hen']
Top 5 probabilities: [0.2908 0.2435 0.2399 0.2303 0.2278]
Top 5 classes: ['common newt' 'ringneck snake' 'European fire salamander'
 'spotted salamander' 'rhinoceros beetle']
Top 5 probabilities: [0.291  0.2615 0.2598 0.2576 0.235 ]
Top 5 classes: ['tailed frog' 'bullfrog' 'amphibian' 'tree frog' 'spotted salamander']
Top 5 probabilities: [0.2426 0.2274 0.2184 0.2043 0.1985]
Top 5 classes: ['acoustic guitar' 'electric guitar' 'banjo' 'violin' 'cello']
Top 5 probabilities: [0.2363 0.2112 0.2057 0.1932 0.1908]
Time taken: 0.00076005220413208 seconds


In [48]:
torch.cuda.empty_cache()
image_paths = ["./images/n01440764_tench.jpg", "./images/n01532829_house_finch.jpg", "./images/n01632777_axolotl.jpg", "./images/n01644900_tailed_frog.jpg", "./images/n02676566_acoustic_guitar.jpg"]
for i in range(1):
  for image_path in image_paths:
    image = preprocess_image(image_path)
    with torch.no_grad():
      start = time.time()
      image_feature = model.encode_image(image).squeeze()
      end = time.time()
      total_time += end - start
    similarities = []
    for embedding in class_embeddings:
      dot_product = torch.dot(image_feature, embedding)
      magnitude1 = torch.norm(image_feature)
      magnitude2 = torch.norm(embedding)
      similarity = dot_product / (magnitude1 * magnitude2)
      similarities.append(similarity)
    list_of_tensors_cpu = [tensor.cpu() for tensor in similarities]
    numpy_arrays = [tensor.numpy() for tensor in list_of_tensors_cpu]
    numpy_array = np.stack(numpy_arrays)
    sorted_indices = np.argsort(numpy_array)
    top5_indices = sorted_indices[-1:-6:-1]
print(monitor_gpu_memory())

Fri Apr 26 14:22:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 546.26                 Driver Version: 546.26       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...  WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   44C    P5              19W / 143W |    912MiB /  8192MiB |     45%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [52]:
model, preprocess = clip.load("RN50", device=device)
model = model.half()

In [53]:
def preprocess_image(image_path):
    image = Image.open(image_path)
    image_input = preprocess(image).unsqueeze(0).to(device)
    return image_input

def preprocess_text(captions):
    text_input = clip.tokenize(captions).to(device)
    return text_input

In [54]:
import time
total_time = []
image_paths = ["./images/n01440764_tench.jpg", "./images/n01532829_house_finch.jpg", "./images/n01632777_axolotl.jpg", "./images/n01644900_tailed_frog.jpg", "./images/n02676566_acoustic_guitar.jpg"]
for i in range(20):
  for image_path in image_paths:
    image = preprocess_image(image_path)
    with torch.no_grad():
      start = time.time()
      image_feature = model.encode_image(image).squeeze()
      end = time.time()
      total = end - start
      total_time.append(total)
    similarities = []
    for embedding in class_embeddings:
      dot_product = torch.dot(image_feature, embedding)
      magnitude1 = torch.norm(image_feature)
      magnitude2 = torch.norm(embedding)
      similarity = dot_product / (magnitude1 * magnitude2)
      similarities.append(similarity)
    # print(similarities)
    list_of_tensors_cpu = [tensor.cpu() for tensor in similarities]
    numpy_arrays = [tensor.numpy() for tensor in list_of_tensors_cpu]
    numpy_array = np.stack(numpy_arrays)
    # print(numpy_array)
    sorted_indices = np.argsort(numpy_array)
    top5_indices = sorted_indices[-1:-6:-1]
    # print("Indices of the top 5 values:", top5_indices)
    # print("Top 5 classes:", imagenet_classes[top5_indices])
print("Time taken:", np.sum(np.array(total_time))/100, "seconds")
print(np.std(np.array(total_time)))

Time taken: 0.011531620025634766 seconds
0.0019128007460889886


In [43]:
import time
total_time = 0
image_paths = ["./images/n01440764_tench.jpg", "./images/n01532829_house_finch.jpg", "./images/n01632777_axolotl.jpg", "./images/n01644900_tailed_frog.jpg", "./images/n02676566_acoustic_guitar.jpg"]
for i in range(1):
  for image_path in image_paths:
    image = preprocess_image(image_path)
    with torch.no_grad():
      start = time.time()
      image_feature = model.encode_image(image).squeeze()
      end = time.time()
      total_time += end - start
    similarities = []
    for embedding in class_embeddings:
      dot_product = torch.dot(image_feature, embedding)
      magnitude1 = torch.norm(image_feature)
      magnitude2 = torch.norm(embedding)
      similarity = dot_product / (magnitude1 * magnitude2)
      similarities.append(similarity)
    # print(similarities)
    list_of_tensors_cpu = [tensor.cpu() for tensor in similarities]
    numpy_arrays = [tensor.numpy() for tensor in list_of_tensors_cpu]
    numpy_array = np.stack(numpy_arrays)
    # print(numpy_array)
    sorted_indices = np.argsort(numpy_array)
    top5_indices = sorted_indices[-1:-6:-1]
    # print("Indices of the top 5 values:", top5_indices)
    print("Top 5 classes:", imagenet_classes[top5_indices])
    print("Top 5 probabilities:", numpy_array[top5_indices])
print("Time taken:", total_time/100, "seconds")

Top 5 classes: ['tench' 'goldfish' 'puffer' 'sturgeon' 'gar']
Top 5 probabilities: [0.2556 0.2356 0.235  0.2203 0.2155]
Top 5 classes: ['house finch' 'coucal' 'bulbul' 'junco' 'hen']
Top 5 probabilities: [0.291  0.2439 0.2399 0.2302 0.2281]
Top 5 classes: ['common newt' 'ringneck snake' 'European fire salamander'
 'spotted salamander' 'rhinoceros beetle']
Top 5 probabilities: [0.2908 0.2612 0.26   0.2578 0.2345]
Top 5 classes: ['tailed frog' 'bullfrog' 'amphibian' 'tree frog' 'spotted salamander']
Top 5 probabilities: [0.2426 0.2272 0.2183 0.2042 0.1987]
Top 5 classes: ['acoustic guitar' 'electric guitar' 'banjo' 'violin' 'cello']
Top 5 probabilities: [0.2365 0.211  0.2058 0.1932 0.1908]
Time taken: 0.0007296371459960938 seconds


In [51]:
torch.cuda.empty_cache()
image_paths = ["./images/n01440764_tench.jpg", "./images/n01532829_house_finch.jpg", "./images/n01632777_axolotl.jpg", "./images/n01644900_tailed_frog.jpg", "./images/n02676566_acoustic_guitar.jpg"]
for i in range(1):
  for image_path in image_paths:
    image = preprocess_image(image_path)
    with torch.no_grad():
      start = time.time()
      image_feature = model.encode_image(image).squeeze()
      end = time.time()
      total_time += end - start
    similarities = []
    for embedding in class_embeddings:
      dot_product = torch.dot(image_feature, embedding)
      magnitude1 = torch.norm(image_feature)
      magnitude2 = torch.norm(embedding)
      similarity = dot_product / (magnitude1 * magnitude2)
      similarities.append(similarity)
    list_of_tensors_cpu = [tensor.cpu() for tensor in similarities]
    numpy_arrays = [tensor.numpy() for tensor in list_of_tensors_cpu]
    numpy_array = np.stack(numpy_arrays)
    sorted_indices = np.argsort(numpy_array)
    top5_indices = sorted_indices[-1:-6:-1]
print(monitor_gpu_memory())

Fri Apr 26 14:23:19 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 546.26                 Driver Version: 546.26       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...  WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   45C    P0              25W / 120W |    794MiB /  8192MiB |     25%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Observations: 


ResNet-50 (RN50) initialized with ImageNet pretraining and OpenAI's CLIP use similar base architectures but have different modifications to suit their respective tasks.

(a) ImageNet Pretraining (torchvision.models):
The ResNet-50 model initialized with ImageNet pretraining typically consists of 50 layers and is trained on the ImageNet dataset for image classification. It follows the standard ResNet architecture, which includes residual blocks with skip connections to alleviate the vanishing gradient problem. Each residual block contains several convolutional layers followed by batch normalization and ReLU activation.

(b) OpenAI's CLIP:
The ResNet-50 used in OpenAI's CLIP framework is modified to serve a dual-purpose: image understanding and natural language understanding. Therefore, it combines convolutional layers for image processing and transformer layers for text processing. The architecture includes alternating layers of convolutional and transformer layers, where the convolutional layers process the image input, and the transformer layers process the text input. Additionally, CLIP's ResNet-50 is trained on a diverse dataset that includes images and associated text, which is different from ImageNet pretraining.

Differences:

Text Processing Layers: OpenAI's CLIP ResNet-50 incorporates transformer layers for processing textual inputs, allowing the model to understand and process both images and text. This is a significant difference compared to the standard ResNet-50 used for ImageNet pretraining, which focuses solely on image inputs.
Training Data: The ResNet-50 model pretrained on ImageNet is trained solely on images for image classification tasks. In contrast, CLIP's ResNet-50 is trained on a diverse dataset that includes both images and associated text, allowing it to understand and associate images with textual descriptions.
Objective: The objective of pretraining ResNet-50 on ImageNet is to learn image representations that are useful for various downstream image-related tasks. On the other hand, CLIP's ResNet-50 is trained to understand and associate images with text, enabling it to perform tasks like image-text retrieval and zero-shot image classification.




(i) The ImageNet challenge dataset organizes its labels hierarchically according to the WordNet hierarchy. WordNet is a lexical database of English, where nouns, verbs, adjectives, and adverbs are grouped into sets of cognitive synonyms (synsets), each expressing a distinct concept.

(ii) In ImageNet, a synset refers to a set of synonymous words that represent a concept. For instance, the synset "dog" might include words like "pooch," "hound," and "canine." Synsets in ImageNet are often used as labels for object recognition tasks.

(iii) Grouping objects based on synsets could indeed lead to problems for visual recognition, especially in cases where objects within the same synset exhibit significant visual variations. For example, the synset "dog" could include various dog breeds, each with distinct appearances. Additionally, objects from different synsets might share visual similarities, leading to potential confusion.

(iv) Three types of visual differences we can expect to see in images with objects corresponding to the same synset are:

Appearance Variation: Objects within the same synset may vary in appearance due to factors like color, size, shape, and texture. For example, different breeds of dogs (e.g., Labrador Retriever, Poodle) may exhibit diverse physical characteristics despite belonging to the same synset ("dog").
Viewpoint Variation: Objects may appear differently in images depending on the viewpoint from which they are captured. For instance, a chair photographed from the front may look different from the same chair photographed from the side, even though both images depict the same object belonging to the synset "chair."
Contextual Variation: The appearance of objects can be influenced by their surrounding context. For example, a "car" photographed on a city street may look different from a "car" photographed in a desert landscape, even though both images depict objects belonging to the same synset.


There are some differences in the probabilities of the f32 and f16 model as the f16 model can store lesser information abd thus it performes worser as it can get confused easily. While since the most important information is stored in the f16, it performs good enough.