In [1]:
#revised to use open_clip instead of clip
import csv
import os
from PIL import Image
import torch
#from clip import load
import open_clip
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import argparse
import os
import csv
from tqdm import tqdm

def benchmark_model(model_name, benchmark_dir,pretrained='openai',checkpoint_dir=None, device = "cpu"):
    """Score an open_clip model on a paired image/statement benchmark.

    ``benchmark_dir`` must contain ``Questions.csv`` (one header row, then
    ``qid, qtype, statement`` rows, two consecutive rows forming a pair) and the
    image tree ``MLLM_VLM Images/<qtype>/<qid>.jpg``. Both images of a pair are
    looked up under the ``qtype`` of the pair's first row.

    For each pair, every statement (prefixed with ``'a photo of '``) is compared
    with both images; the image with softmax probability above 0.5 is the
    prediction. The ground truth is the first image for an odd ``qid`` and the
    second image for an even ``qid``. A pair is correct only when both
    statements are matched correctly.

    Pairs are assigned to categories by position: the first 15 pairs belong to
    the first category, the next 15 to the second, and so on.

    Side effect: per-pair results are written to ``output.csv`` in the current
    working directory (overwriting any existing file).

    Args:
        model_name: open_clip architecture name; also selects the tokenizer.
        benchmark_dir: Root directory of the benchmark.
        pretrained: Pretrained tag passed to
            ``open_clip.create_model_and_transforms``.
        checkpoint_dir: Optional path to a checkpoint file. Either a dict with
            a ``"state_dict"`` entry or a plain state dict is accepted; a
            leading ``'module.'`` on the keys is stripped.
        device: Device the model and inputs are moved to.

    Returns:
        ``(pair_accuracies, text_similarities, img_similarities)``: three dicts
        keyed by category name. Accuracies are percentages; similarities are
        per-category means of the cosine similarity between the two statement
        embeddings / the two image embeddings (0-d tensors for categories that
        received at least one pair, ``0.0`` otherwise).

    Raises:
        IndexError: If the CSV holds more pairs than 15 per listed category.
    """
    pairs_per_category = 15  # the question file is ordered category by category

    model, _, preprocess = open_clip.create_model_and_transforms(
        model_name, pretrained=pretrained, force_quick_gelu=True)
    tokenizer = open_clip.get_tokenizer(model_name)
    if checkpoint_dir is not None:
        # NOTE: torch.load unpickles the file - only load checkpoints you trust.
        checkpoint = torch.load(checkpoint_dir, map_location='cpu')
        if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
            sd = checkpoint["state_dict"]
        else:
            sd = checkpoint
        # Checkpoints saved from a wrapped model prefix every key with 'module.'.
        if next(iter(sd.items()))[0].startswith('module.'):
            sd = {k[len('module.'):]: v for k, v in sd.items()}
        model.load_state_dict(sd)

    model.to(device)
    # open_clip returns models in train mode; evaluation must run in eval mode.
    model.eval()

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    text_similarities = {category: 0 for category in categories}
    img_similarities = {category: 0 for category in categories}
    pair_counts = {category: 0 for category in categories}
    num_pairs = 0

    # Both files are managed by the same `with`, so they are closed even when
    # an image is missing or the model raises half-way through.
    with open(csv_file, 'r', newline='') as f, \
            open('output.csv', 'w', newline='') as csv_outfile:
        csv_writer = csv.writer(csv_outfile)
        csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score','img_sim','text_sim'])  # header

        reader = csv.reader(f)
        next(reader, None)  # skip header
        for row in tqdm(reader):
            if not row:
                continue  # tolerate blank lines between pairs
            qid1, qtype1, statement1 = row

            # The second row of the pair is pulled from the same iterator.
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            path1 = os.path.join(image_dir, qtype1, f'{qid1}.jpg')
            path2 = os.path.join(image_dir, qtype1, f'{qid2}.jpg')
            # Close the image files as soon as they have been turned into tensors.
            with Image.open(path1) as img1, Image.open(path2) as img2:
                imgs = torch.stack([preprocess(img1), preprocess(img2)]).to(device)

            text1 = tokenizer(['a photo of ' + statement1]).to(device)
            text2 = tokenizer(['a photo of ' + statement2]).to(device)

            with torch.no_grad():
                # Image embeddings are shared by both statements: encode once.
                image_features = model.encode_image(imgs)
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                text_features1 = model.encode_text(text1)
                text_features1 = text_features1 / text_features1.norm(dim=-1, keepdim=True)
                text_features2 = model.encode_text(text2)
                text_features2 = text_features2 / text_features2.norm(dim=-1, keepdim=True)

                sim_img = (image_features @ image_features.T)[0][1]
                sim_text = (text_features1 @ text_features2.T)[0][0]

                # softmax over dim 0 = over the two images, for a single statement
                probs1 = (100.0 * image_features @ text_features1.T).softmax(dim=0)
                probs2 = (100.0 * image_features @ text_features2.T).softmax(dim=0)

            # Plain floats so the CSV holds numbers rather than tensor reprs.
            img1_score1 = probs1[0][0].item()
            img1_score2 = probs2[0][0].item()

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2,
                                 img1_score1, img1_score2, sim_img.item(), sim_text.item()])

            current_category = categories[num_pairs // pairs_per_category]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            text_similarities[current_category] += sim_text
            img_similarities[current_category] += sim_img
            pair_counts[current_category] += 1
            num_pairs += 1

    # Normalise by the number of pairs each category actually received, so a
    # truncated or empty question file cannot divide by zero or skew the means.
    for category in categories:
        count = pair_counts[category]
        if count == 0:
            pair_accuracies[category] = 0.0
            text_similarities[category] = 0.0
            img_similarities[category] = 0.0
            continue
        pair_accuracies[category] = (pair_accuracies[category] / count) * 100
        text_similarities[category] = text_similarities[category] / count
        img_similarities[category] = img_similarities[category] / count

    return pair_accuracies,text_similarities,img_similarities







In [None]:
# Evaluate one fine-tuned ViT-L-14 checkpoint. Expects a benchmark_model with the
# (model_name, benchmark_dir, ...) signature to have been defined already.
model='ViT-L-14' 
MMVP_dir='./MMVP_VLM'
checkpoint_dir='./weights/ViT_stage_1_epoch_20.pt'
# Use the first GPU when there is one; fall back to CPU instead of crashing.
device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
result,_,_=benchmark_model(model,MMVP_dir,pretrained='openai',device=device,checkpoint_dir=checkpoint_dir)
print(result)

In [1]:
#revised to use longclip instead of clip
from model import longclip
import csv
import os
from PIL import Image
import torch
import open_clip
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import argparse
import os
import csv
from tqdm import tqdm

def benchmark_model(model_name, benchmark_dir,pretrained='openai',checkpoint_dir=None, device = "cpu"):
    """Score a Long-CLIP model on a paired image/statement benchmark.

    ``benchmark_dir`` must contain ``Questions.csv`` (one header row, then
    ``qid, qtype, statement`` rows, two consecutive rows forming a pair) and the
    image tree ``MLLM_VLM Images/<qtype>/<qid>.jpg``. Both images of a pair are
    looked up under the ``qtype`` of the pair's first row.

    For each pair, every statement (prefixed with ``'a photo of '``) is compared
    with both images; the image with softmax probability above 0.5 is the
    prediction. The ground truth is the first image for an odd ``qid`` and the
    second image for an even ``qid``. A pair is correct only when both
    statements are matched correctly.

    Pairs are assigned to categories by position: the first 15 pairs belong to
    the first category, the next 15 to the second, and so on.

    Side effect: per-pair results are written to ``output.csv`` in the current
    working directory (overwriting any existing file).

    Args:
        model_name: Unused; kept so the signature matches the open_clip variant.
        benchmark_dir: Root directory of the benchmark.
        pretrained: Unused; kept so the signature matches the open_clip variant.
        checkpoint_dir: Passed as the first argument to ``longclip.load``.
            NOTE(review): presumably a checkpoint path is required despite the
            ``None`` default - confirm against ``longclip.load``.
        device: Device handed to ``longclip.load`` and used for all inputs.

    Returns:
        ``(pair_accuracies, text_similarities, img_similarities)``: three dicts
        keyed by category name. Accuracies are percentages; similarities are
        per-category means of the cosine similarity between the two statement
        embeddings / the two image embeddings (0-d tensors for categories that
        received at least one pair, ``0.0`` otherwise).

    Raises:
        IndexError: If the CSV holds more pairs than 15 per listed category.
    """
    pairs_per_category = 15  # the question file is ordered category by category

    model, preprocess = longclip.load(checkpoint_dir, device=device)

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    text_similarities = {category: 0 for category in categories}
    img_similarities = {category: 0 for category in categories}
    pair_counts = {category: 0 for category in categories}
    num_pairs = 0

    # Both files are managed by the same `with`, so they are closed even when
    # an image is missing or the model raises half-way through.
    with open(csv_file, 'r', newline='') as f, \
            open('output.csv', 'w', newline='') as csv_outfile:
        csv_writer = csv.writer(csv_outfile)
        csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score','img_sim','text_sim'])  # header

        reader = csv.reader(f)
        next(reader, None)  # skip header
        for row in tqdm(reader):
            if not row:
                continue  # tolerate blank lines between pairs
            qid1, qtype1, statement1 = row

            # The second row of the pair is pulled from the same iterator.
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            path1 = os.path.join(image_dir, qtype1, f'{qid1}.jpg')
            path2 = os.path.join(image_dir, qtype1, f'{qid2}.jpg')
            # Close the image files as soon as they have been turned into tensors.
            with Image.open(path1) as img1, Image.open(path2) as img2:
                imgs = torch.stack([preprocess(img1), preprocess(img2)]).to(device)

            text1 = longclip.tokenize(['a photo of ' + statement1]).to(device)
            text2 = longclip.tokenize(['a photo of ' + statement2]).to(device)

            with torch.no_grad():
                # Image embeddings are shared by both statements: encode once.
                image_features = model.encode_image(imgs)
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                text_features1 = model.encode_text(text1)
                text_features1 = text_features1 / text_features1.norm(dim=-1, keepdim=True)
                text_features2 = model.encode_text(text2)
                text_features2 = text_features2 / text_features2.norm(dim=-1, keepdim=True)

                sim_img = (image_features @ image_features.T)[0][1]
                sim_text = (text_features1 @ text_features2.T)[0][0]

                # softmax over dim 0 = over the two images, for a single statement
                probs1 = (100.0 * image_features @ text_features1.T).softmax(dim=0)
                probs2 = (100.0 * image_features @ text_features2.T).softmax(dim=0)

            # Plain floats so the CSV holds numbers rather than tensor reprs.
            img1_score1 = probs1[0][0].item()
            img1_score2 = probs2[0][0].item()

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2,
                                 img1_score1, img1_score2, sim_img.item(), sim_text.item()])

            current_category = categories[num_pairs // pairs_per_category]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            text_similarities[current_category] += sim_text
            img_similarities[current_category] += sim_img
            pair_counts[current_category] += 1
            num_pairs += 1

    # Normalise by the number of pairs each category actually received, so a
    # truncated or empty question file cannot divide by zero or skew the means.
    for category in categories:
        count = pair_counts[category]
        if count == 0:
            pair_accuracies[category] = 0.0
            text_similarities[category] = 0.0
            img_similarities[category] = 0.0
            continue
        pair_accuracies[category] = (pair_accuracies[category] / count) * 100
        text_similarities[category] = text_similarities[category] / count
        img_similarities[category] = img_similarities[category] / count

    return pair_accuracies,text_similarities,img_similarities







In [None]:
# evaluate based on the concatenation of the representations from a list of clip models

import csv
import os
from PIL import Image
import torch
#from clip import load

import open_clip
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import argparse
import os
import csv
from tqdm import tqdm

def benchmark_model(benchmark_dir,model_name,pretrained='openai',checkpoint_dirs=None, device = "cpu"):
    """Score an ensemble of open_clip models on a paired image/statement benchmark.

    One model is built per entry of ``checkpoint_dirs``. For every input, the
    embeddings of all models are concatenated along the feature dimension and
    the concatenated vector is L2-normalised as a whole.

    ``benchmark_dir`` must contain ``Questions.csv`` (one header row, then
    ``qid, qtype, statement`` rows, two consecutive rows forming a pair) and the
    image tree ``MLLM_VLM Images/<qtype>/<qid>.jpg``. Both images of a pair are
    looked up under the ``qtype`` of the pair's first row.

    For each pair, every statement (prefixed with ``'a photo of '``) is compared
    with both images; the image with softmax probability above 0.5 is the
    prediction. The ground truth is the first image for an odd ``qid`` and the
    second image for an even ``qid``. A pair is correct only when both
    statements are matched correctly.

    Pairs are assigned to categories by position: the first 15 pairs belong to
    the first category, the next 15 to the second, and so on.

    Side effect: per-pair results are written to ``output.csv`` in the current
    working directory (overwriting any existing file).

    Args:
        benchmark_dir: Root directory of the benchmark.
        model_name: open_clip architecture name shared by all ensemble members;
            also selects the tokenizer.
        pretrained: Pretrained tag passed to
            ``open_clip.create_model_and_transforms``.
        checkpoint_dirs: Iterable of checkpoint paths, one per ensemble member.
            A ``None`` entry keeps the pretrained weights. When omitted or
            empty, a single pretrained model is evaluated.
        device: Device the models and inputs are moved to.

    Returns:
        ``(pair_accuracies, text_similarities, img_similarities)``: three dicts
        keyed by category name. Accuracies are percentages; similarities are
        per-category means of the cosine similarity between the two statement
        embeddings / the two image embeddings (0-d tensors for categories that
        received at least one pair, ``0.0`` otherwise).

    Raises:
        IndexError: If the CSV holds more pairs than 15 per listed category.
    """
    pairs_per_category = 15  # the question file is ordered category by category

    # An empty ensemble would leave `preprocess` unbound further down, so fall
    # back to the plain pretrained model in that case.
    checkpoint_list = list(checkpoint_dirs) if checkpoint_dirs else [None]

    # The tokenizer only depends on the architecture: build it once.
    tokenizer = open_clip.get_tokenizer(model_name)

    models = []
    for checkpoint_dir in checkpoint_list:
        model, _, preprocess = open_clip.create_model_and_transforms(
            model_name, pretrained=pretrained, force_quick_gelu=True)
        if checkpoint_dir is not None:
            # NOTE: torch.load unpickles the file - only load checkpoints you trust.
            checkpoint = torch.load(checkpoint_dir, map_location='cpu')
            if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
                sd = checkpoint["state_dict"]
            else:
                sd = checkpoint
            # Checkpoints saved from a wrapped model prefix every key with 'module.'.
            if next(iter(sd.items()))[0].startswith('module.'):
                sd = {k[len('module.'):]: v for k, v in sd.items()}
            model.load_state_dict(sd)
        model.to(device)
        # open_clip returns models in train mode; evaluation must run in eval mode.
        model.eval()
        models.append(model)

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    text_similarities = {category: 0 for category in categories}
    img_similarities = {category: 0 for category in categories}
    pair_counts = {category: 0 for category in categories}
    num_pairs = 0

    # Both files are managed by the same `with`, so they are closed even when
    # an image is missing or a model raises half-way through.
    with open(csv_file, 'r', newline='') as f, \
            open('output.csv', 'w', newline='') as csv_outfile:
        csv_writer = csv.writer(csv_outfile)
        csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score','img_sim','text_sim'])  # header

        reader = csv.reader(f)
        next(reader, None)  # skip header
        for row in tqdm(reader):
            if not row:
                continue  # tolerate blank lines between pairs
            qid1, qtype1, statement1 = row

            # The second row of the pair is pulled from the same iterator.
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            path1 = os.path.join(image_dir, qtype1, f'{qid1}.jpg')
            path2 = os.path.join(image_dir, qtype1, f'{qid2}.jpg')
            # Close the image files as soon as they have been turned into tensors.
            with Image.open(path1) as img1, Image.open(path2) as img2:
                imgs = torch.stack([preprocess(img1), preprocess(img2)]).to(device)

            text1 = tokenizer(['a photo of ' + statement1]).to(device)
            text2 = tokenizer(['a photo of ' + statement2]).to(device)

            with torch.no_grad():
                # Concatenate the members' embeddings, then normalise the joint vector.
                image_features = torch.cat([m.encode_image(imgs) for m in models], dim=1)
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                text_features1 = torch.cat([m.encode_text(text1) for m in models], dim=1)
                text_features1 = text_features1 / text_features1.norm(dim=-1, keepdim=True)
                text_features2 = torch.cat([m.encode_text(text2) for m in models], dim=1)
                text_features2 = text_features2 / text_features2.norm(dim=-1, keepdim=True)

                sim_img = (image_features @ image_features.T)[0][1]
                sim_text = (text_features1 @ text_features2.T)[0][0]

                # softmax over dim 0 = over the two images, for a single statement
                probs1 = (100.0 * image_features @ text_features1.T).softmax(dim=0)
                probs2 = (100.0 * image_features @ text_features2.T).softmax(dim=0)

            # Plain floats so the CSV holds numbers rather than tensor reprs.
            img1_score1 = probs1[0][0].item()
            img1_score2 = probs2[0][0].item()

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2,
                                 img1_score1, img1_score2, sim_img.item(), sim_text.item()])

            current_category = categories[num_pairs // pairs_per_category]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            text_similarities[current_category] += sim_text
            img_similarities[current_category] += sim_img
            pair_counts[current_category] += 1
            num_pairs += 1

    # Normalise by the number of pairs each category actually received, so a
    # truncated or empty question file cannot divide by zero or skew the means.
    for category in categories:
        count = pair_counts[category]
        if count == 0:
            pair_accuracies[category] = 0.0
            text_similarities[category] = 0.0
            img_similarities[category] = 0.0
            continue
        pair_accuracies[category] = (pair_accuracies[category] / count) * 100
        text_similarities[category] = text_similarities[category] / count
        img_similarities[category] = img_similarities[category] / count

    return pair_accuracies,text_similarities,img_similarities







In [None]:


# Evaluate an ensemble of ViT-L-14 models. Expects a benchmark_model with the
# (benchmark_dir, model_name, ..., checkpoint_dirs=...) signature to have been
# defined already.
model='ViT-L-14' 
MMVP_dir='./MMVP_VLM'
# A None entry is an ensemble member that keeps the pretrained weights.
checkpoint_dirs=[None,
    './weights/ViT_stage_1_epoch_20.pt',
    './weights/ViT_stage_2_epoch_20.pt',
    './weights/ViT_stage_3_epoch_20.pt']

# Use the first GPU when there is one; fall back to CPU instead of crashing.
device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
result,_,_=benchmark_model(MMVP_dir,model_name=model,device=device,checkpoint_dirs=checkpoint_dirs)
print(result)