## CLIP directional similarities

In [1]:
import torch
from transformers import (
    CLIPTokenizer,
    CLIPTextModelWithProjection,
    CLIPVisionModelWithProjection,
    CLIPImageProcessor,
)

# Checkpoint shared by the tokenizer, the image processor and both encoders.
clip_id = "openai/clip-vit-large-patch14"

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing components (tokenizer for captions, processor for images).
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
image_processor = CLIPImageProcessor.from_pretrained(clip_id)

# Projection-head encoders, moved onto the selected device.
text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device)
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

In [2]:
import torch.nn as nn
import torch.nn.functional as F


class DirectionalSimilarity(nn.Module):
    """CLIP directional similarity between an image edit and a caption edit.

    Both images and both captions are embedded with the supplied encoders,
    every embedding is scaled to unit L2 norm, and the score is the cosine
    similarity between the image-space change (``image_two - image_one``) and
    the text-space change (``caption_two - caption_one``).

    Inputs are moved to the device that holds the corresponding encoder's
    parameters, so the class does not depend on a notebook-level ``device``
    variable. The cosine similarity is reached through ``nn.functional`` rather
    than the alias ``F``, because later cells of this notebook rebind ``F`` to
    ``torchvision.transforms.functional``.

    Args:
        tokenizer: Callable tokenizer exposing ``model_max_length`` whose
            result exposes ``input_ids``.
        text_encoder: Module whose output exposes ``text_embeds``.
        image_processor: Callable returning a mapping with ``"pixel_values"``.
        image_encoder: Module whose output exposes ``image_embeds``.
    """

    def __init__(self, tokenizer, text_encoder, image_processor, image_encoder):
        super().__init__()
        self.tokenizer = tokenizer
        self.text_encoder = text_encoder
        self.image_processor = image_processor
        self.image_encoder = image_encoder

    @staticmethod
    def _device_of(module):
        """Return the device of ``module``'s first parameter, or ``"cpu"`` if it has none."""
        try:
            return next(module.parameters()).device
        except (AttributeError, StopIteration):
            return "cpu"

    def preprocess_image(self, image):
        """Run the image processor and return ``{"pixel_values": tensor}`` on the image encoder's device."""
        pixel_values = self.image_processor(image, return_tensors="pt")["pixel_values"]
        return {"pixel_values": pixel_values.to(self._device_of(self.image_encoder))}

    def tokenize_text(self, text):
        """Tokenize ``text`` (padded/truncated to ``model_max_length``) and return ``{"input_ids": tensor}``
        on the text encoder's device."""
        inputs = self.tokenizer(
            text,
            max_length=self.tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {"input_ids": inputs.input_ids.to(self._device_of(self.text_encoder))}

    def encode_image(self, image):
        """Return the image embedding, scaled to unit L2 norm along ``dim=1``."""
        preprocessed_image = self.preprocess_image(image)
        image_features = self.image_encoder(**preprocessed_image).image_embeds
        return image_features / image_features.norm(dim=1, keepdim=True)

    def encode_text(self, text):
        """Return the text embedding, scaled to unit L2 norm along ``dim=1``."""
        tokenized_text = self.tokenize_text(text)
        text_features = self.text_encoder(**tokenized_text).text_embeds
        return text_features / text_features.norm(dim=1, keepdim=True)

    def compute_directional_similarity(self, img_feat_one, img_feat_two, text_feat_one, text_feat_two):
        """Cosine similarity (over ``dim=1``) between the image change and the text change.

        Returns one value per row of the feature tensors.
        """
        image_direction = img_feat_two - img_feat_one
        text_direction = text_feat_two - text_feat_one
        # `nn.functional` is used explicitly: the notebook-level alias `F` is
        # rebound to torchvision's functional module by later cells.
        return nn.functional.cosine_similarity(image_direction, text_direction, dim=1)

    def forward(self, image_one, image_two, caption_one, caption_two):
        """Embed both images and both captions and return their directional similarity.

        No ``torch.no_grad()`` is applied here; callers that only evaluate can
        wrap the call themselves.
        """
        img_feat_one = self.encode_image(image_one)
        img_feat_two = self.encode_image(image_two)
        text_feat_one = self.encode_text(caption_one)
        text_feat_two = self.encode_text(caption_two)
        return self.compute_directional_similarity(
            img_feat_one, img_feat_two, text_feat_one, text_feat_two
        )


In [1]:
from google.colab import drive
# Mount Google Drive so the dataset and images under /content/drive are readable.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import json
import os

# The scoring loop further down calls this object as `dir_similarity`, so it
# has to be bound under that exact name (it was previously bound only as
# `ir_similarity`, which made the loop fail with a NameError on a fresh kernel).
dir_similarity = DirectionalSimilarity(tokenizer, text_encoder, image_processor, image_encoder)
# Kept so that anything still using the old spelling keeps working.
ir_similarity = dir_similarity

dataset_path = "/content/drive/MyDrive/counterfactual_dataset.json"
images_folder = "/content/drive/MyDrive/project_ii_images"

# Explicit encoding so the JSON is decoded the same way regardless of locale.
with open(dataset_path, 'r', encoding="utf-8") as f:
    dataset = json.load(f)

# Flat list of directional-similarity scores, one per dataset entry.
scores = []
grouped_scores = []

In [11]:
from PIL import Image

# Start from an empty list so that re-running this cell does not append a
# second copy of every score to the results of a previous run.
scores = []

for obj in dataset:
    original_prompt = obj['original_prompt']
    counterfactual_prompt = obj['counterfactual_prompt']
    original_path = f"{images_folder}/{obj['original_image']}"
    counterfactual_path = f"{images_folder}/{obj['counterfactual_image']}"

    # The context managers close both image files once the pair is scored;
    # no_grad avoids building an autograd graph for a pure evaluation pass.
    with Image.open(original_path) as original_image, \
            Image.open(counterfactual_path) as counterfactual_image, \
            torch.no_grad():
        similarity_score = dir_similarity(
            original_image, counterfactual_image, original_prompt, counterfactual_prompt
        )

    scores.append(float(similarity_score.detach().cpu()))

In [15]:
# Number of consecutive dataset entries that form one set.
SET_SIZE = 5

# "set1" holds scores[0:5], "set2" holds scores[5:10], and so on. The number
# of sets follows len(scores) instead of being fixed at ten hand-written
# slices, so no score is dropped and no empty set is produced.
scores_grouped = {
    f"set{start // SET_SIZE + 1}": scores[start:start + SET_SIZE]
    for start in range(0, len(scores), SET_SIZE)
}

In [16]:
# Destination on the mounted Drive for the grouped directional-similarity scores.
output_json_path = "/content/drive/MyDrive/grouped_scores.json"

# Serialise with a four-space indent and write the text in one go.
with open(output_json_path, 'w') as json_file:
    json_file.write(json.dumps(scores_grouped, indent=4))

print(f"Grouped scores saved to {output_json_path}")

Grouped scores saved to /content/drive/MyDrive/grouped_scores.json


In [17]:
# One output line per set, in the order the sets were inserted.
for set_scores in scores_grouped.values():
    print(set_scores)

[0.4349178075790405, 0.00859120488166809, 0.2400635927915573, 0.0570552721619606, 0.1728382408618927]
[-0.06462213397026062, -0.04752476513385773, 0.13901932537555695, 0.04639345407485962, 0.051717568188905716]
[-0.027728775516152382, 0.08802482485771179, 0.10400983691215515, 0.03137911856174469, 0.5458086729049683]
[0.03711457923054695, -0.04878167808055878, 0.059051595628261566, -0.0006553810089826584, -0.07174277305603027]
[0.12630991637706757, 0.2263745814561844, 0.014275670051574707, 0.05010939761996269, 0.35616564750671387]
[0.2717835605144501, 0.5363991856575012, 0.35293954610824585, 0.1686299741268158, -0.036774616688489914]
[0.04942314326763153, 0.0705825686454773, 0.22698244452476501, 0.12720085680484772, 0.23067256808280945]
[0.12711021304130554, 0.06796368956565857, 0.1053372174501419, 0.21666809916496277, 0.15829628705978394]
[0.028721347451210022, 0.033698998391628265, 0.15327632427215576, 0.04858868941664696, 0.0004078540951013565]
[0.021281398832798004, 0.03560439869761

## CLIP Score

In [19]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.9 torchmetrics-1.6.0


In [20]:
from diffusers import StableDiffusionPipeline
import torch
from torchmetrics.functional.multimodal import clip_score
from functools import partial
from PIL import Image
import numpy as np
import json
from torchvision.transforms import functional as F

model_ckpt = "CompVis/stable-diffusion-v1-4"

# Use the GPU with half precision when available; otherwise fall back to the
# CPU with float32 instead of failing on a hard-coded .to("cuda").
sd_device = "cuda" if torch.cuda.is_available() else "cpu"
sd_dtype = torch.float16 if sd_device == "cuda" else torch.float32
# NOTE(review): `sd_pipeline` is not referenced by any other cell visible in
# this notebook - confirm it is still needed before paying for the download.
sd_pipeline = StableDiffusionPipeline.from_pretrained(model_ckpt, torch_dtype=sd_dtype).to(sd_device)

# `clip_score` with the CLIP checkpoint pre-bound, so callers only pass
# (images, prompts).
clip_score_fn = partial(clip_score, model_name_or_path="openai/clip-vit-base-patch16")

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

safety_checker/config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

(…)kpoints/scheduler_config-checkpoint.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [23]:
def preprocess_image(image_path):
    """Load an image file and return it as a numpy array with channels last.

    The file is opened with PIL, converted to RGB, turned into a tensor with
    ``F.to_tensor`` and permuted with ``(1, 2, 0)`` before the numpy
    conversion.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The permuted tensor as a numpy array.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    channels_first = F.to_tensor(rgb_image)
    channels_last = channels_first.permute(1, 2, 0)
    return channels_last.numpy()

def calculate_clip_score(images, prompts):
    """Return a single CLIP score for a batch of images and prompts.

    Args:
        images: Numpy array indexed as (batch, height, width, channel); it is
            multiplied by 255 and cast to ``uint8`` before scoring.
        prompts: Prompts passed through unchanged to ``clip_score_fn``.

    Returns:
        The score as a Python float rounded to four decimals.
    """
    uint8_images = (images * 255).astype("uint8")
    # Move the channel axis in front of height and width.
    batch = torch.from_numpy(uint8_images).permute(0, 3, 1, 2)
    score = clip_score_fn(batch, prompts).detach()
    return round(float(score), 4)

In [24]:
# Locations of the dataset description and the image files on the mounted Drive.
dataset_path = "/content/drive/MyDrive/counterfactual_dataset.json"
images_folder = "/content/drive/MyDrive/project_ii_images"

# Read the whole file and parse it as JSON.
with open(dataset_path, 'r') as f:
    dataset = json.loads(f.read())

In [25]:
# Counterfactual prompt of every dataset entry, in dataset order.
counterfactual_prompts = [obj['counterfactual_prompt'] for obj in dataset]

# Matching counterfactual images, preprocessed in the same order.
counterfactual_images = [
    preprocess_image(f"{images_folder}/{obj['counterfactual_image']}")
    for obj in dataset
]

In [26]:
def calculate_clip_scores_per_image(images, prompts):
    """Score each image against its own prompt, one pair at a time.

    Args:
        images: Iterable of numpy arrays indexed as (height, width, channel);
            each is multiplied by 255 and cast to ``uint8`` before scoring.
        prompts: Iterable of prompts, paired with ``images`` by position.

    Returns:
        List of Python floats rounded to four decimals, one per pair.
    """
    def score_pair(image, prompt):
        image_int = (image * 255).astype("uint8")
        # Channel axis first, then a leading batch axis of size one.
        batch = torch.from_numpy(image_int).permute(2, 0, 1).unsqueeze(0)
        value = clip_score_fn(batch, [prompt]).detach().cpu().item()
        return round(value, 4)

    return [score_pair(image, prompt) for image, prompt in zip(images, prompts)]

# Score every counterfactual image against its own counterfactual prompt.
clip_scores = calculate_clip_scores_per_image(
    images=counterfactual_images,
    prompts=counterfactual_prompts,
)

config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [28]:
# Chunk the flat score list into consecutive groups of five, keyed
# "group_1", "group_2", ...
grouped_clip_scores = {}
for group_number, start in enumerate(range(0, len(clip_scores), 5), start=1):
    grouped_clip_scores[f"group_{group_number}"] = clip_scores[start:start + 5]

# Save grouped scores to JSON
output_grouped_scores_path = "/content/drive/MyDrive/grouped_clip_scores.json"
with open(output_grouped_scores_path, 'w') as f:
    f.write(json.dumps(grouped_clip_scores, indent=4))

print(f"Grouped CLIP scores saved to {output_grouped_scores_path}")

Grouped CLIP scores saved to /content/drive/MyDrive/grouped_clip_scores.json


In [29]:
# Bare expression: the notebook renders the grouped CLIP scores as this cell's output.
grouped_clip_scores

{'group_1': [34.9135, 33.6042, 32.1737, 33.7121, 34.3911],
 'group_2': [31.3322, 32.6462, 30.3954, 29.8675, 30.4573],
 'group_3': [27.9168, 29.6904, 28.3834, 27.5439, 29.6012],
 'group_4': [24.6436, 26.4958, 25.4787, 26.1111, 26.2127],
 'group_5': [29.687, 28.0719, 28.3613, 29.0289, 31.0683],
 'group_6': [27.173, 29.4813, 29.4949, 28.9573, 29.8417],
 'group_7': [30.5266, 31.3319, 33.2682, 30.8828, 35.1859],
 'group_8': [27.7283, 27.2512, 27.1531, 26.3749, 25.5819],
 'group_9': [32.4138, 32.5684, 31.7888, 32.2023, 32.5475],
 'group_10': [26.8266, 27.6283, 29.0633, 29.5216, 27.8748]}

In [32]:
# Flatten the per-group lists back into one list, keeping group order.
all_scores = []
for group in grouped_clip_scores.values():
    all_scores.extend(group)

# Arithmetic mean over every individual score.
average_score = sum(all_scores) / len(all_scores)

print(f"Average CLIP Score: {average_score}")

Average CLIP Score: 29.689132000000004


## Minimality (CLD)

In [33]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# One checkpoint name for both objects so the processor always matches the model.
model_name = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(model_name)
clip_model = CLIPModel.from_pretrained(model_name)

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [34]:
# Locations of the image files and the dataset description on the mounted Drive.
images_folder = "/content/drive/MyDrive/project_ii_images"
dataset_path = "/content/drive/MyDrive/counterfactual_dataset.json"

# Read the whole file and parse it as JSON.
with open(dataset_path, 'r') as f:
    dataset = json.loads(f.read())

def preprocess_image(image_path):
    """Open an image, convert it to RGB and run it through ``clip_processor``.

    NOTE: this rebinds the name ``preprocess_image`` that an earlier cell of
    this notebook defines with a different return value.

    Args:
        image_path: Path of the image file to read.

    Returns:
        Whatever ``clip_processor`` returns for ``return_tensors="pt"``.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    model_inputs = clip_processor(images=rgb_image, return_tensors="pt")
    return model_inputs

# Accumulators for the two latent-distance variants, one entry per dataset item.
l2_cld_scores, cosine_cld_scores = [], []

In [35]:
# Reset the accumulators here so that re-running this cell cannot append a
# second copy of every score to the results of a previous run.
l2_cld_scores = []
cosine_cld_scores = []

for obj in dataset:
    original_image_path = f"{images_folder}/{obj['original_image']}"
    counterfactual_image_path = f"{images_folder}/{obj['counterfactual_image']}"

    original_inputs = preprocess_image(original_image_path)
    counterfactual_inputs = preprocess_image(counterfactual_image_path)

    # Evaluation only: no autograd graph is needed for the image embeddings.
    with torch.no_grad():
        original_latent = clip_model.get_image_features(**original_inputs)
        counterfactual_latent = clip_model.get_image_features(**counterfactual_inputs)

    # Euclidean distance between the two (un-normalised) image embeddings.
    l2_cld = torch.dist(original_latent, counterfactual_latent, p=2).item()
    l2_cld_scores.append(round(l2_cld, 4))

    # Cosine distance = 1 - cosine similarity of the same two embeddings.
    cosine_cld = 1 - torch.nn.functional.cosine_similarity(original_latent, counterfactual_latent, dim=1).item()
    cosine_cld_scores.append(round(cosine_cld, 4))

In [37]:
# Consecutive groups of five cosine-distance scores, keyed "group_1", "group_2", ...
cosine_cld_grouped = {
    f"group_{number}": cosine_cld_scores[start:start + 5]
    for number, start in enumerate(range(0, len(cosine_cld_scores), 5), start=1)
}

# Same chunking for the L2-distance scores.
l2_cld_grouped = {
    f"group_{number}": l2_cld_scores[start:start + 5]
    for number, start in enumerate(range(0, len(l2_cld_scores), 5), start=1)
}

In [38]:
# Show the grouped cosine-distance scores as one plain-text dict.
print(cosine_cld_grouped)

{'group_1': [0.0265, 0.0094, 0.0772, 0.0271, 0.0376], 'group_2': [0.0381, 0.0917, 0.0467, 0.0334, 0.0498], 'group_3': [0.0052, 0.0432, 0.0421, 0.025, 0.2311], 'group_4': [0.0091, 0.0179, 0.0136, 0.0222, 0.0154], 'group_5': [0.0111, 0.0411, 0.002, 0.0048, 0.1366], 'group_6': [0.021, 0.0054, 0.0415, 0.0019, 0.0009], 'group_7': [0.1207, 0.074, 0.2209, 0.115, 0.1278], 'group_8': [0.0159, 0.0046, 0.0083, 0.008, 0.0035], 'group_9': [0.0063, 0.0148, 0.0139, 0.0147, 0.013], 'group_10': [0.0251, 0.021, 0.0189, 0.0128, 0.0083]}


In [39]:
# Show the grouped L2-distance scores as one plain-text dict.
print(l2_cld_grouped)

{'group_1': [2.554, 1.5286, 4.3998, 2.6271, 3.0396], 'group_2': [3.0965, 4.9027, 3.4568, 2.9359, 3.5346], 'group_3': [1.1351, 3.2401, 3.2045, 2.4853, 7.3967], 'group_4': [1.631, 2.2874, 1.9762, 2.5455, 2.1074], 'group_5': [1.6457, 3.1614, 0.7068, 1.0886, 5.726], 'group_6': [2.1665, 1.093, 3.168, 0.6257, 0.4313], 'group_7': [5.1045, 3.9586, 6.7043, 4.9905, 5.2729], 'group_8': [1.9768, 1.0587, 1.4153, 1.3945, 0.9192], 'group_9': [1.2066, 1.8486, 1.7867, 1.8512, 1.728], 'group_10': [2.469, 2.2224, 2.1125, 1.7402, 1.407]}


In [40]:
# Destinations on the mounted Drive for the two grouped distance tables.
output_l2_path = "/content/drive/MyDrive/l2_cld_grouped.json"
output_cosine_path = "/content/drive/MyDrive/cosine_cld_grouped.json"

# Write the L2 table first, then the cosine table, both with a four-space indent.
for destination, grouped in (
    (output_l2_path, l2_cld_grouped),
    (output_cosine_path, cosine_cld_grouped),
):
    with open(destination, 'w') as f:
        json.dump(grouped, f, indent=4)

print(f"L2 CLD scores saved to {output_l2_path}")
print(f"Cosine CLD scores saved to {output_cosine_path}")

L2 CLD scores saved to /content/drive/MyDrive/l2_cld_grouped.json
Cosine CLD scores saved to /content/drive/MyDrive/cosine_cld_grouped.json


## Realism (FID)

In [2]:
from torchmetrics.image.fid import FrechetInceptionDistance
from PIL import Image
import torch
import json
from torchvision.transforms import functional as F

# Locations of the image files and the dataset description on the mounted Drive.
images_folder = "/content/drive/MyDrive/project_ii_images"
dataset_path = "/content/drive/MyDrive/counterfactual_dataset.json"

# Read the whole file and parse it as JSON.
with open(dataset_path, 'r') as f:
    dataset = json.loads(f.read())

def preprocess_image(image_path):
    """Load an image as a tensor with a leading batch axis of size one.

    The file is opened with PIL, converted to RGB, passed through
    ``F.to_tensor`` and then given an extra first dimension.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The ``to_tensor`` result with ``unsqueeze(0)`` applied.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    return F.to_tensor(rgb_image).unsqueeze(0)

def calculate_fid(real_images, fake_images):
    """Compute the FID between one batch of real and one batch of fake images.

    A fresh ``FrechetInceptionDistance(normalize=True)`` is created on every
    call, updated once with each batch, and its result is returned.

    Args:
        real_images: Batch registered with ``real=True``.
        fake_images: Batch registered with ``real=False``.

    Returns:
        The computed distance as a Python float.
    """
    metric = FrechetInceptionDistance(normalize=True)
    # Real batch first, then the fake batch.
    for batch, is_real in ((real_images, True), (fake_images, False)):
        metric.update(batch, real=is_real)
    return float(metric.compute())

In [3]:
# Consecutive slices of up to five dataset entries each.
grouped_jsons = []
for start in range(0, len(dataset), 5):
    grouped_jsons.append(dataset[start:start + 5])

# Filled by the FID loop: "group_<n>" -> FID value.
grouped_fid_scores = {}

In [46]:
!pip install torchmetrics[image]



In [4]:
for group_idx, group in enumerate(grouped_jsons, start=1):
    real_images = []
    fake_images = []

    # A group is never empty, so the only way images can be "missing" is a
    # file that does not exist; catch that here so the skip message below is
    # actually reachable instead of the loop aborting with an exception.
    try:
        for obj in group:
            original_image_path = f"{images_folder}/{obj['original_image']}"
            counterfactual_image_path = f"{images_folder}/{obj['counterfactual_image']}"

            real_images.append(preprocess_image(original_image_path))
            fake_images.append(preprocess_image(counterfactual_image_path))
    except FileNotFoundError:
        print(f"Skipping group {group_idx} due to missing images.")
        continue

    # torchmetrics refuses to compute FID from fewer than two samples per
    # side, which happens for a trailing group holding a single entry.
    if len(real_images) < 2:
        print(f"Skipping group {group_idx}: FID needs at least two image pairs, got {len(real_images)}.")
        continue

    # Stack the per-image tensors along the batch axis.
    # NOTE(review): torch.cat presumably requires all images in a group to
    # have the same height and width - confirm against the dataset.
    real_images = torch.cat(real_images, dim=0)
    fake_images = torch.cat(fake_images, dim=0)

    fid_score = calculate_fid(real_images, fake_images)
    grouped_fid_scores[f"group_{group_idx}"] = fid_score
    print(f"FID for group {group_idx}: {fid_score}")

Downloading: "https://github.com/toshas/torch-fidelity/releases/download/v0.2.0/weights-inception-2015-12-05-6726825d.pth" to /root/.cache/torch/hub/checkpoints/weights-inception-2015-12-05-6726825d.pth
100%|██████████| 91.2M/91.2M [00:00<00:00, 300MB/s]


FID for group 1: 100.74525451660156
FID for group 2: 62.56085968017578
FID for group 3: 62.456298828125
FID for group 4: 70.9094009399414
FID for group 5: 67.02835083007812
FID for group 6: 63.2408561706543
FID for group 7: 186.8501434326172
FID for group 8: 13.913700103759766
FID for group 9: 10.46253776550293
FID for group 10: 28.100801467895508


In [5]:
# Destination on the mounted Drive for the per-group FID values.
output_fid_path = "/content/drive/MyDrive/grouped_fid_scores.json"

# Serialise with a four-space indent and write the text in one go.
with open(output_fid_path, 'w') as f:
    f.write(json.dumps(grouped_fid_scores, indent=4))

print(f"Grouped FID scores saved to {output_fid_path}")

Grouped FID scores saved to /content/drive/MyDrive/grouped_fid_scores.json
