In [None]:
# Install dependencies
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m41.0/54.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m798.8 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.2.0
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-kxubz5e1
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-kxubz5e1
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_n

In [None]:
# Mount drive
from google.colab import drive

# Google Drive is attached under this directory for the rest of the session.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Import libraries
import torch
import clip
from PIL import Image
import pandas as pd
from tqdm.notebook import tqdm
import os
import numpy as py

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

In [None]:
# Load model and preprocess
# clip.load hands back a pair: the ViT-B/32 network placed on `device`, and the
# image transform that is applied to every picture before it is encoded.
clip_model_and_transform = clip.load("ViT-B/32", device=device)
model, preprocess = clip_model_and_transform

100%|███████████████████████████████████████| 338M/338M [00:08<00:00, 41.5MiB/s]


In [None]:
# Load dataset
# The table is read from the mounted Drive folder; the scoring loop reads its
# 'image', 'label1', 'label2' and 'label3' columns.
dataset_csv_path = '/content/drive/MyDrive/Project2_4/dataset.csv'
dataset = pd.read_csv(dataset_csv_path)

In [None]:
# Function for scores with clip
def get_label_scores(image_path, labels):
  """Score candidate text labels against a single image with CLIP.

  Both the image and the labels are encoded with the module-level `model`,
  the embeddings are L2-normalised, and the 100x-scaled cosine similarities
  are passed through a softmax over the labels.

  Args:
    image_path: Path of the image file to open.
    labels: Sequence of candidate label strings.

  Returns:
    Dict mapping each label to its softmax score (a Python float). Scores sum
    to 1 across the supplied labels. Duplicate labels collapse into one key.
    An empty `labels` yields an empty dict.
  """
  # Nothing to compare against: avoid a softmax over an empty axis.
  if len(labels) == 0:
    return {}

  # The context manager releases the file handle as soon as the pixels have
  # been turned into a tensor, instead of leaving it to the garbage collector.
  with Image.open(image_path) as img:
    image = preprocess(img).unsqueeze(0).to(device)
  text_inputs = clip.tokenize(labels).to(device)

  # Inference only: keep the whole computation out of autograd.
  with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_inputs)

    # Normalise along the embedding axis so the dot product is a cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

  # Row 0 is the only image in the batch; pair its scores with the labels in order.
  return dict(zip(labels, similarity[0].tolist()))

# One record per dataset row: the image, its three labels and each label's CLIP rank.
results = []

for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
  image_path = row['image']
  labels = [row['label1'], row['label2'], row['label3']]

  # Softmax score of every label for this image
  scores = get_label_scores(image_path, labels)

  # Order (label, score) pairs from highest to lowest score; the 1-based
  # position in that order is the rank (1 = best, 3 = worst).
  ranked_labels = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
  ranked_dict = {label: rank for rank, (label, _score) in enumerate(ranked_labels, start=1)}

  # Assemble the row in column order: image, the labels, then their ranks.
  record = {'image': image_path}
  record.update(zip(('label1', 'label2', 'label3'), labels))
  record.update(zip(
      ('label1_rank', 'label2_rank', 'label3_rank'),
      (ranked_dict[label] for label in labels),
  ))
  results.append(record)

# Convert results to DataFrame
results_df = pd.DataFrame(results)


  0%|          | 0/36 [00:00<?, ?it/s]

In [None]:
# Write the per-image label ranks to Drive, without the DataFrame index column.
output_csv_path = '/content/drive/MyDrive/Project2_4/CLIP_labels.csv'
results_df.to_csv(output_csv_path, index=False)