<a href="https://colab.research.google.com/github/MustiCankan/MustiCankan/blob/main/ClipScore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchvision ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-1hv07ymx
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-1hv07ymx
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369489 sha256=9bd5eca54991b1b14fc5b1bffeb7ed923a5fe55bc3b47f614b2b051e563e37a8
  Stored in directory: /tmp/pip-ephem-w

CLIP Score

In [8]:
import torch
import clip
from PIL import Image

# Load the CLIP model and preprocess function
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Function to compute CLIP score
def compute_clip_score(image_path, text_prompt):
    # Load and preprocess the image
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)

    # Encode the image and text
    text = clip.tokenize([text_prompt]).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

    # Normalize features to unit vectors
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Compute cosine similarity (CLIP score)
    similarity = (image_features @ text_features.T).item()

    return similarity

# Example Usage
image_path = "/content/Unknown-2.png"  # Path to your image file
text_prompt = "a photograph of an astronaut riding a horse"  # Your text description
clip_score = compute_clip_score(image_path, text_prompt)
print(f"CLIP Score: {clip_score:.4f}")

CLIP Score: 0.3706


FID Score Creation

In [1]:
!pip install torch torchvision scipy numpy pillow tqdm
!pip install git+https://github.com/mseitzer/pytorch-fid.git

Collecting git+https://github.com/mseitzer/pytorch-fid.git
  Cloning https://github.com/mseitzer/pytorch-fid.git to /tmp/pip-req-build-jngaqsuv
  Running command git clone --filter=blob:none --quiet https://github.com/mseitzer/pytorch-fid.git /tmp/pip-req-build-jngaqsuv
  Resolved https://github.com/mseitzer/pytorch-fid.git to commit b9c18118d082cbd263c1b8963fc4221dc1cbb659
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [20]:
import torch
from pytorch_fid import fid_score

# Function to compute FID for text-to-image generation
def compute_text_to_image_fid(real_images_dir, generated_images_dir, batch_size=16):
    """
    real_images_dir: Path to directory containing real images corresponding to text prompts.
    generated_images_dir: Path to directory containing images generated from text prompts.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Compute FID between real and generated image sets
    fid_value = fid_score.calculate_fid_given_paths([real_images_dir, generated_images_dir],
    batch_size=batch_size,
    device=device,
    dims=2048)
    print(f"FID Score (Text-to-Image): {fid_value:.4f}")
    return fid_value

# Example Usage
real_images_path = "/content/real"  # Real images (e.g., from MS COCO)
generated_images_path = "/content/real"  # Model-generated images
fid_score_value = compute_text_to_image_fid(real_images_path, generated_images_path,64)



100%|██████████| 1/1 [00:00<00:00,  8.71it/s]




100%|██████████| 1/1 [00:00<00:00,  8.91it/s]

FID Score (Text-to-Image): 0.0000





R-Precession

In [22]:
import torch
import clip
from PIL import Image
import numpy as np

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Function to compute R-Precision
def compute_r_precision(text_prompts, image_paths, top_k=1):
    """
    Args:
    text_prompts: List of text prompts
    image_paths: List of corresponding generated image paths (including distractor images)
    top_k: The number of relevant images to include in the precision calculation.

    Returns:
    R-Precision score (mean percentage of correct retrievals)
    """
    correct_retrievals = 0
    total_samples = len(text_prompts)

    for idx, prompt in enumerate(text_prompts):
        # Load and preprocess images
        images = [preprocess(Image.open(img_path)).unsqueeze(0).to(device) for img_path in image_paths[idx]]
        images = torch.cat(images, dim=0)

        # Tokenize text prompt
        text = clip.tokenize([prompt]).to(device)

        # Encode images and text
        with torch.no_grad():
            image_features = model.encode_image(images)
            text_features = model.encode_text(text)

        # Normalize features
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Compute cosine similarities
        similarities = (image_features @ text_features.T).squeeze()

        # Sort by similarity
        sorted_indices = torch.argsort(similarities, descending=True)

        # Check if the correct image is within the top-k predictions
        if 0 in sorted_indices[:top_k]:  # Assuming the first image is the correct one
            correct_retrievals += 1

    r_precision = correct_retrievals / total_samples
    print(f"R-Precision (Top-{top_k}): {r_precision:.4f}")
    return r_precision

# Example Usage:
text_prompts = [
    "a photograph of an astronaut riding a horse",
]

# List of lists of images (each list should contain one correct image + distractors)
image_paths = [
    ["/content/fake2/Unknown.png", "/content/real/Unknown-2.png"]

]

# Compute R-Precision
compute_r_precision(text_prompts, image_paths, top_k=1)

R-Precision (Top-1): 0.0000


0.0

Diversity Score

In [23]:
import torch
import clip
from PIL import Image
import numpy as np
from itertools import combinations

# Load CLIP model and preprocessing
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Function to compute Diversity Score
def compute_diversity_score(image_paths):
    """
    Compute diversity score (DS) for a set of generated images.

    Args:
    - image_paths: List of paths to generated images.

    Returns:
    - diversity_score: The average pairwise cosine distance between image embeddings.
    """
    image_embeddings = []

    # Extract embeddings for each image
    for img_path in image_paths:
        image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = model.encode_image(image)
            image_embedding /= image_embedding.norm(dim=-1, keepdim=True)  # Normalize embeddings
            image_embeddings.append(image_embedding.cpu().numpy().squeeze())

    # Convert list to numpy array
    image_embeddings = np.array(image_embeddings)

    # Compute pairwise cosine distances
    pairwise_distances = []
    for (img1, img2) in combinations(image_embeddings, 2):
        cosine_similarity = np.dot(img1, img2) / (np.linalg.norm(img1) * np.linalg.norm(img2))
        distance = 1 - cosine_similarity  # Cosine distance
        pairwise_distances.append(distance)

    # Average pairwise distance as diversity score
    diversity_score = np.mean(pairwise_distances)
    print(f"Diversity Score (DS): {diversity_score:.4f}")
    return diversity_score

# Example Usage
generated_image_paths = [
    "/content/fake2/Unknown.png",
    "/content/real/Unknown-2.png",
]

compute_diversity_score(generated_image_paths)

Diversity Score (DS): 0.2021


0.2021484375

Attentional Consistency Matrix

In [28]:
import torch
import clip
from PIL import Image

# Load CLIP model and preprocessing
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Function to compute attentional consistency
def compute_attention_consistency(image_path, text_prompt, sub_phrases):
    """
    Compute attentional consistency between the image and parts of the text prompt.

    Args:
    - image_path: Path to the generated image.
    - text_prompt: The full text prompt.
    - sub_phrases: List of sub-phrases (object descriptions, relationships).

    Returns:
    - consistency_score: Average cosine similarity between sub-phrase embeddings and the full text/image embeddings.
    """
    # Load and preprocess the image
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)

    # Encode full text prompt and image
    with torch.no_grad():
        text_embedding = model.encode_text(clip.tokenize([text_prompt]).to(device))
        image_embedding = model.encode_image(image)
        text_embedding /= text_embedding.norm(dim=-1, keepdim=True)
        image_embedding /= image_embedding.norm(dim=-1, keepdim=True)

    # Sub-phrase embeddings
    sub_phrase_embeddings = []
    for phrase in sub_phrases:
        with torch.no_grad():
            phrase_embedding = model.encode_text(clip.tokenize([phrase]).to(device))
            phrase_embedding /= phrase_embedding.norm(dim=-1, keepdim=True)
            sub_phrase_embeddings.append(phrase_embedding)

    # Compute cosine similarities between sub-phrases and image
    consistency_scores = []
    for sub_embedding in sub_phrase_embeddings:
        similarity = torch.cosine_similarity(sub_embedding, image_embedding).item()
        consistency_scores.append(similarity)

    # Average consistency score
    average_consistency = sum(consistency_scores) / len(consistency_scores)
    print(f"Attentional Consistency Score: {average_consistency:.4f}")
    return average_consistency

# Example Usage
image_path = "/content/fake2/Unknown.png"
text_prompt = "an astronaut riding a horse"
sub_phrases = [" an astronaut", "horse"]

compute_attention_consistency(image_path, text_prompt, sub_phrases)

Attentional Consistency Score: 0.2476


0.24761962890625