In [None]:
import os
import sys  
import json
import torch 
import argparse
import numpy as np
from PIL import Image  
from tqdm import tqdm
from utils import model_gen, load_jsonl
from transformers import AutoModelForCausalLM, AutoTokenizer  

# POPE question files for the adversarial / popular / random splits. Download them from
# "https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco"
adv_path = 'data/json_files/coco_pope_adversarial.json'
pop_path = 'data/json_files/coco_pope_popular.json'
rand_path = 'data/json_files/coco_pope_random.json'

# Path to the COCO 2014 val set; left empty here and must be set before running.
image_path = ''

# The same checkpoint identifier is used for both the tokenizer and the model.
ckpt_path = 'internlm/internlm-xcomposer2-vl-7b'
tokenizer = AutoTokenizer.from_pretrained(ckpt_path, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    ckpt_path,
    device_map="cuda",
    trust_remote_code=True,
)
# Switch to eval mode, move to CUDA and cast the weights to half precision.
model = model.eval().cuda().half()
# Attach the tokenizer to the model object.
# NOTE(review): presumably model_gen reads model.tokenizer -- confirm in utils.
model.tokenizer = tokenizer

In [None]:
def eval_func(pred_list, label_list):
    """Print binary yes/no classification metrics and return the F1 score.

    Predictions and labels are compared position by position. A pair only
    contributes to the confusion matrix when both values are 0 or 1; the
    "yes ratio" is the share of 1s among all predictions.

    Args:
        pred_list: list of predictions, 1 for "yes" and 0 for "no".
        label_list: ground-truth labels in the same 0/1 encoding.

    Returns:
        The F1 score of the positive (1) class. Any metric whose denominator
        is zero (empty input, no positive predictions, no positive labels)
        is reported as 0.0 instead of raising ZeroDivisionError.
    """
    pos = 1
    neg = 0

    def safe_div(num, den):
        # Degenerate splits (e.g. the model never answers "yes") would
        # otherwise abort the whole evaluation with ZeroDivisionError.
        return num / den if den else 0.0

    yes_ratio = safe_div(pred_list.count(pos), len(pred_list))

    TP, TN, FP, FN = 0, 0, 0, 0
    for pred, label in zip(pred_list, label_list):
        if pred == pos and label == pos:
            TP += 1
        elif pred == pos and label == neg:
            FP += 1
        elif pred == neg and label == neg:
            TN += 1
        elif pred == neg and label == pos:
            FN += 1

    print('TP\tFP\tTN\tFN\t')
    print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))

    precision = safe_div(float(TP), float(TP + FP))
    recall = safe_div(float(TP), float(TP + FN))
    f1 = safe_div(2*precision*recall, precision + recall)
    acc = safe_div(TP + TN, TP + TN + FP + FN)
    print('Accuracy: {}'.format(acc))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1 score: {}'.format(f1))
    print('Yes ratio: {}'.format(yes_ratio))
    # Compact one-line summary: f1, accuracy, precision, recall, yes ratio.
    print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) )
    return f1

In [None]:
# Adversarial split: query the model once per question, then score the answers.
samples = load_jsonl(adv_path)

# Chat-style prompt; the question text is substituted for the {} placeholder.
prompt_template = '[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'

pred_list, label_list = [], []
for sample in tqdm(samples):
    img_file = os.path.join(image_path, sample['image'])
    prompt = prompt_template.format(sample['text'])
    with torch.cuda.amp.autocast():
        reply = model_gen(model, prompt, img_file)
    gold = sample['label']
    # Only the exact string 'no' maps to 0; every other value is encoded as 1.
    # NOTE(review): a reply such as 'No' or 'no.' would count as "yes" -- confirm
    # that model_gen returns a normalized answer.
    pred_list.append(int(reply != 'no'))
    label_list.append(int(gold != 'no'))

adversarial_f1 = eval_func(pred_list, label_list)

In [None]:
# Popular split: query the model once per question, then score the answers.
samples = load_jsonl(pop_path)

# Chat-style prompt; the question text is substituted for the {} placeholder.
prompt_template = '[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'

pred_list, label_list = [], []
for sample in tqdm(samples):
    img_file = os.path.join(image_path, sample['image'])
    prompt = prompt_template.format(sample['text'])
    with torch.cuda.amp.autocast():
        reply = model_gen(model, prompt, img_file)
    gold = sample['label']
    # Only the exact string 'no' maps to 0; every other value is encoded as 1.
    # NOTE(review): a reply such as 'No' or 'no.' would count as "yes" -- confirm
    # that model_gen returns a normalized answer.
    pred_list.append(int(reply != 'no'))
    label_list.append(int(gold != 'no'))

popular_f1 = eval_func(pred_list, label_list)

In [None]:
# Random split: query the model once per question, then score the answers.
samples = load_jsonl(rand_path)

# Chat-style prompt; the question text is substituted for the {} placeholder.
prompt_template = '[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'

pred_list, label_list = [], []
for sample in tqdm(samples):
    img_file = os.path.join(image_path, sample['image'])
    prompt = prompt_template.format(sample['text'])
    with torch.cuda.amp.autocast():
        reply = model_gen(model, prompt, img_file)
    gold = sample['label']
    # Only the exact string 'no' maps to 0; every other value is encoded as 1.
    # NOTE(review): a reply such as 'No' or 'no.' would count as "yes" -- confirm
    # that model_gen returns a normalized answer.
    pred_list.append(int(reply != 'no'))
    label_list.append(int(gold != 'no'))

random_f1 = eval_func(pred_list, label_list)

In [None]:
# Unweighted mean of the F1 scores from the random, popular and adversarial splits.
mean_f1 = (random_f1 + popular_f1 + adversarial_f1) / 3
print(mean_f1)