In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import torch
from datasets import load_dataset
from matplotlib import pyplot as plt
from tqdm import tqdm

from pkgs.openai.clip import load as load_model

# Load Winoground

In [3]:
# Loading Winoground needs a Hugging Face access token, which you can create from your
# huggingface account: Profile -> Settings -> Access Tokens -> New Token.
# SECURITY: never hardcode the token in the notebook. A token that has been committed
# should be considered leaked and revoked. Provide it through the environment instead,
# e.g. `export HF_TOKEN=hf_...` before starting Jupyter.
# When HF_TOKEN is unset, `True` asks `datasets` to use the token cached by
# `huggingface-cli login`.
auth_token = os.environ.get("HF_TOKEN") or True
winoground = load_dataset("facebook/winoground", use_auth_token=auth_token)["test"]

Found cached dataset winoground (C:/Users/dipti/.cache/huggingface/datasets/facebook___winoground/default/0.0.0/72585f4d9cd5a28790bb9bc2adbdd45633f36dfbf85df529e0756e114e134285)


  0%|          | 0/1 [00:00<?, ?it/s]

# Load CLIP Model

In [4]:
# Device that `get_inputs` moves every input tensor to.
device = "cpu"

In [5]:
# `pretrained=True` loads the original OpenAI CLIP model trained on 400M image-text pairs.
clip_model, clip_processor = load_model(
    name="ViT-B/32",
    pretrained=True,
    keep_positional=True,
    rotate=False,
)

Positional encoding -  True
Rotary embedding -  True


In [6]:
# Show the module tree of the model returned by `load_model` (rich repr of the last expression).
clip_model

CLIP(
  (visual): VisualTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            (embed_rotary_positions): SinusoidalPositionalEmbedding(77, 64)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamical

In [8]:
# Location of the checkpoint to evaluate -- point this at your local copy.
# Checkpoints can be downloaded from:
# https://drive.google.com/drive/u/0/folders/1K0kPJZ3MA4KAdx3Fpq25dgW59wIf7M-x
checkpoint = "../checkpoints/cyclip-3M.pt/best.pt"

In [9]:
# NOTE(security): torch.load unpickles the file, and unpickling can execute arbitrary
# code. Only load checkpoints obtained from a source you trust.
state_dict = torch.load(checkpoint, map_location = device)["state_dict"]

# Some checkpoints store every key under a "module." prefix (presumably because the
# model was wrapped in DataParallel/DistributedDataParallel when saved -- confirm).
# Strip the prefix per key, and only where it is actually present, so unprefixed keys
# are never truncated and an empty state dict does not raise.
module_prefix = "module."
state_dict = {
    (key[len(module_prefix):] if key.startswith(module_prefix) else key): value
    for key, value in state_dict.items()
}

# strict=False tolerates missing/unexpected keys; without inspecting the result a
# checkpoint that does not match the architecture would load "successfully" while
# leaving the affected weights untouched. Report any mismatch instead of hiding it.
load_result = clip_model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"WARNING: {len(load_result.missing_keys)} missing keys, e.g. {load_result.missing_keys[:5]}")
    print(f"WARNING: {len(load_result.unexpected_keys)} unexpected keys, e.g. {load_result.unexpected_keys[:5]}")

# Switch to inference mode; as the cell's last expression this also displays the model.
clip_model.eval()

CLIP(
  (visual): ModifiedResNet(
    (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (relu): ReLU(inplace=True)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn

In [7]:
def get_inputs(image, caption):
    """Preprocess one image-caption pair with ``clip_processor``.

    The image is converted to RGB before being processed. Returns the tuple
    ``(input_ids, attention_mask, pixel_values)`` with every element moved to
    ``device``; ``pixel_values`` additionally gets a new leading dimension via
    ``unsqueeze(0)``.
    """
    text_features = clip_processor.process_text(caption)
    image_tensor = clip_processor.process_image(image.convert("RGB"))

    input_ids = text_features['input_ids'].to(device)
    attention_mask = text_features['attention_mask'].to(device)
    pixel_values = image_tensor.to(device).unsqueeze(0)
    return input_ids, attention_mask, pixel_values

In [8]:
def clipscore(model, output):
    """Return the scaled image-text similarity of ``output`` as a Python float.

    The image embeddings are multiplied by ``exp(model.logit_scale)`` and then
    matrix-multiplied with the transposed text embeddings. The result is read
    out with ``.item()``, so it must contain exactly one element.
    """
    scale = model.logit_scale.exp()
    similarity = (scale * output.image_embeds) @ output.text_embeds.t()
    return similarity.item()

# Look at an example from Winoground and get the image-caption scores from CLIP

In [9]:
# Note that some images in winoground are RGBA and some are RGB; `get_inputs` converts every image to RGB.
# Note that we could run this example through CLIP as a batch, but I want to drive the point home that we get four independent image-caption scores for each example

# Fetch the example once instead of re-indexing the dataset for each of the eight fields.
example_155 = winoground[155]

input_c0_i0 = get_inputs(example_155["image_0"], example_155["caption_0"])
input_c1_i0 = get_inputs(example_155["image_0"], example_155["caption_1"])
input_c0_i1 = get_inputs(example_155["image_1"], example_155["caption_0"])
input_c1_i1 = get_inputs(example_155["image_1"], example_155["caption_1"])

# Inference only: disable gradient tracking so no autograd graph is built for the forward passes.
with torch.no_grad():
    output_c0_i0 = clip_model(input_ids = input_c0_i0[0], attention_mask = input_c0_i0[1], pixel_values = input_c0_i0[2])
    output_c1_i0 = clip_model(input_ids = input_c1_i0[0], attention_mask = input_c1_i0[1], pixel_values = input_c1_i0[2])
    output_c0_i1 = clip_model(input_ids = input_c0_i1[0], attention_mask = input_c0_i1[1], pixel_values = input_c0_i1[2])
    output_c1_i1 = clip_model(input_ids = input_c1_i1[0], attention_mask = input_c1_i1[1], pixel_values = input_c1_i1[2])

    clip_score_c0_i0 = clipscore(clip_model, output_c0_i0)
    clip_score_c1_i0 = clipscore(clip_model, output_c1_i0)
    clip_score_c0_i1 = clipscore(clip_model, output_c0_i1)
    clip_score_c1_i1 = clipscore(clip_model, output_c1_i1)

print()
print("CLIP image-text match scores:")
print("image_0, caption_0:", clip_score_c0_i0)
print("image_0, caption_1:", clip_score_c1_i0)
print("image_1, caption_0:", clip_score_c0_i1)
print("image_1, caption_1:", clip_score_c1_i1)


CLIP image-text match scores:
image_0, caption_0: 28.252960205078125
image_0, caption_1: 24.553625106811523
image_1, caption_0: 24.56781578063965
image_1, caption_1: 23.062685012817383


# Get CLIP image-caption scores from the whole dataset

In [11]:
# One record per Winoground example: its id plus the four image-caption scores,
# keyed "c{caption}_i{image}".
winoground_clip_scores = []

# (caption index, image index) pairs, in the key order of each stored record.
caption_image_pairs = ((0, 0), (0, 1), (1, 0), (1, 1))

# Inference only: disable gradient tracking so no autograd graph is built for the forward passes.
with torch.no_grad():
    for example in tqdm(winoground):
        # Note that some images in winoground are RGBA and some are RGB; `get_inputs` converts every image to RGB.
        # Note that we could run this example through CLIP as a batch, but I want to drive the point home that we get four independent image-caption scores for each example
        scores = {"id": example["id"]}
        for caption_idx, image_idx in caption_image_pairs:
            input_ids, attention_mask, pixel_values = get_inputs(
                example[f"image_{image_idx}"], example[f"caption_{caption_idx}"]
            )
            output = clip_model(input_ids = input_ids, attention_mask = attention_mask, pixel_values = pixel_values)
            scores[f"c{caption_idx}_i{image_idx}"] = clipscore(clip_model, output)

        winoground_clip_scores.append(scores)

  0%|                                                                                                                                    | 0/9 [00:00<?, ?it/s]


TypeError: string indices must be integers

In [14]:
def text_correct(result):
    """Check whether each image scores higher with its own caption than with the other one.

    ``result`` maps ``"c{caption}_i{image}"`` keys to scores. The check passes
    when ``c0_i0 > c1_i0`` and ``c1_i1 > c0_i1``.
    """
    image_0_prefers_caption_0 = result["c0_i0"] > result["c1_i0"]
    if not image_0_prefers_caption_0:
        # Short-circuit exactly like `and`: hand back the failed comparison.
        return image_0_prefers_caption_0
    return result["c1_i1"] > result["c0_i1"]

def image_correct(result):
    """Check whether each caption scores higher with its own image than with the other one.

    ``result`` maps ``"c{caption}_i{image}"`` keys to scores. The check passes
    when ``c0_i0 > c0_i1`` and ``c1_i1 > c1_i0``.
    """
    caption_0_prefers_image_0 = result["c0_i0"] > result["c0_i1"]
    if not caption_0_prefers_image_0:
        # Short-circuit exactly like `and`: hand back the failed comparison.
        return caption_0_prefers_image_0
    return result["c1_i1"] > result["c1_i0"]

def group_correct(result):
    """Check whether ``result`` passes both ``image_correct`` and ``text_correct``."""
    # Evaluated in the same order as before; `all` stops at the first failing check.
    return all(check(result) for check in (image_correct, text_correct))

# Aggregate the per-example checks into the three Winoground metrics.
denominator = len(winoground_clip_scores)
if denominator == 0:
    # Without this guard an empty list (e.g. after the scoring loop failed) surfaces
    # as an unhelpful ZeroDivisionError below.
    raise ValueError("winoground_clip_scores is empty; run the scoring loop above before computing metrics")

text_correct_count = sum(1 for result in winoground_clip_scores if text_correct(result))
image_correct_count = sum(1 for result in winoground_clip_scores if image_correct(result))
group_correct_count = sum(1 for result in winoground_clip_scores if group_correct(result))

print("text score:", text_correct_count/denominator)
print("image score:", image_correct_count/denominator)
print("group score:", group_correct_count/denominator)

text score: 0.2025
image score: 0.095
group score: 0.05
