In [1]:
import sys
import torch
from PIL import Image
import requests
from lavis.models import load_model_and_preprocess
import pandas as pd

  warn(f"Failed to load image Python extension: {e}")
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import numpy as np
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider import cider
from pycocoevalcap.spice import spice
from bert_score import score
import pandas as pd
import numpy as np

In [3]:
from minigpt4 import MiniGPT4
from blip_processor import Blip2ImageEvalProcessor
from conversation import Chat, CONV_VISION

import torch
import time

In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

In [5]:
def split_words(sentence):
    return sentence.split(' ')

def Rouge(GT_caption, generated_caption): # returns list of all rouge_n needed in (precision,recall,fmeasure) for each
    res = []
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
    output = scorer.score(GT_caption, generated_caption)
    for r in output:
        score = output[r]
        res.append((score.precision, score.recall, score.fmeasure))
    return res[0][2], res[1][2], res[2][2]

def Bleu_4(GT_caption, generated_caption):
    GT_words = split_words(GT_caption)
    gen_words = split_words(generated_caption)
    weights = (1./4., 1./4., 1./4., 1./4.)
    return corpus_bleu([[GT_words]], [gen_words], weights)

def METEOR(GT_caption, generated_caption):
    GT_words = split_words(GT_caption)
    gen_words = split_words(generated_caption)
    return meteor_score([GT_words], gen_words)

def CIDEr(GT_caption, generated_caption):
    scorer = cider.Cider()
    return scorer.compute_score({0:[GT_caption]}, {0:[generated_caption]})[0]

def SPICE(GT_caption, generated_caption):
    scorer = spice.Spice()
    return scorer.compute_score({0:[GT_caption]}, {0:[generated_caption]})[0]

def BertScore(GT_caption, generated_caption):
    P, R, F1 = score([generated_caption], [GT_caption], lang="en", verbose=True)
    #return P.tolist()[0], R.tolist()[0], F1.tolist()[0]
    return F1.tolist()[0]

def acc_full(GT_caption, generated_caption):
    GT_caption = GT_caption.lower()
    generated_caption = generated_caption.lower()
    return int(GT_caption in generated_caption)

def acc_part(GT_caption, generated_caption):
    GT_words = split_words(GT_caption.lower())
    generated_caption = generated_caption.lower()
    
    in_generated = 0
    for word in GT_words:
        if word in generated_caption:
            in_generated += 1
            
    return in_generated / len(GT_words)

In [6]:
def answer_evaluation(GT_caption, generated_caption):
    rouge1, rouge2, rougeL = Rouge(GT_caption, generated_caption)
    bleu4 = Bleu_4(GT_caption, generated_caption)
    meteor = METEOR(GT_caption, generated_caption)
    cider= CIDEr(GT_caption, generated_caption)
    spice = SPICE(GT_caption, generated_caption)
    bertscore = BertScore(GT_caption, generated_caption)
    
    return rouge1, rouge2, rougeL, bleu4, meteor, cider, spice, bertscore

## BLIP

In [8]:
blip_model, blip_vis_processors, _ = load_model_and_preprocess(
    name="blip2_t5", model_type="pretrain_flant5xxl", is_eval=True, device=device
)
blip_vis_processors.keys()

Loading checkpoint shards: 100%|██████████| 5/5 [00:05<00:00,  1.18s/it]


dict_keys(['train', 'eval'])

In [9]:
def blip_get_result(image_path, question):
    

    raw_image = Image.open(image_path).convert('RGB')  
    image = blip_vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    generated_answer = blip_model.generate({"image": image, "prompt": "%s"%question})
    
    return generated_answer[0]

## MiniGPT4

In [10]:
minigpt4_model = MiniGPT4(
    vision_model_path="models/eva_vit_g.pth",
    llama_model="models/vicuna13b_v0/",
    q_former_model="models/blip2_pretrained_flant5xxl.pth",
)

ckpt_path = "models/pretrained_minigpt4.pth"

print("Load BLIP2-LLM Checkpoint: {}".format(ckpt_path))
ckpt = torch.load(ckpt_path, map_location="cpu")
minigpt4_model.load_state_dict(ckpt['model'], strict=False)

torch.compile(minigpt4_model)

minigpt4_vis_processor = Blip2ImageEvalProcessor()

chat = Chat(minigpt4_model, minigpt4_vis_processor, device='cuda:0')

Loading VIT: vision_model_path=models/eva_vit_g.pth
Loading VIT Done
Loading Q-Former
Loading Q-Former Done
Loading LLAMA

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


Loading checkpoint shards: 100%|██████████| 3/3 [04:26<00:00, 88.89s/it] 


Loading LLAMA Done
Load BLIP2-LLM Checkpoint: models/pretrained_minigpt4.pth


In [11]:
def minigpt4_get_result(image_path, question):
    
    chat_state = CONV_VISION.copy()
    img_list = []
    chat.upload_img("%s"%image_path, chat_state, img_list)


    num_beams = 1
    temperature = 0.01

    chat.ask("%s"%question, chat_state)

    # Callback for each word generated by the LLM
    def callback_function(word):
        print(word, end='', flush=True)

    #print("Live output: ", end='', flush=True)

    output_text = chat.answer_async(conv=chat_state,
                                    img_list=img_list,
                                    num_beams=num_beams,
                                    temperature=temperature,
                                    max_new_tokens=1024,
                                    max_length=2048,
                                    text_callback=callback_function)


    #print("LLM response: {}".format(output_text))
    
    return output_text

## Open-Flamingo

In [12]:
from PIL import Image
import requests
import torch
from huggingface_hub import hf_hub_download
import pandas as pd
from open_flamingo import create_model_and_transforms

openflamingo_model, openflamingo_image_processor, openflamingo_tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
    cross_attn_every_n_layers=1
)


Using pad_token, but it is not set yet.


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.
Flamingo model initialized with 1046992944 trainable parameters


In [13]:
checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
openflamingo_model.load_state_dict(torch.load(checkpoint_path), strict=False)

_IncompatibleKeys(missing_keys=['vision_encoder.class_embedding', 'vision_encoder.positional_embedding', 'vision_encoder.proj', 'vision_encoder.conv1.weight', 'vision_encoder.ln_pre.weight', 'vision_encoder.ln_pre.bias', 'vision_encoder.transformer.resblocks.0.ln_1.weight', 'vision_encoder.transformer.resblocks.0.ln_1.bias', 'vision_encoder.transformer.resblocks.0.attn.in_proj_weight', 'vision_encoder.transformer.resblocks.0.attn.in_proj_bias', 'vision_encoder.transformer.resblocks.0.attn.out_proj.weight', 'vision_encoder.transformer.resblocks.0.attn.out_proj.bias', 'vision_encoder.transformer.resblocks.0.ln_2.weight', 'vision_encoder.transformer.resblocks.0.ln_2.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.bias', 'vision_encoder.transformer.resblocks.1.ln_1.weight', 'vision_encoder.transformer.resbloc

In [14]:
def openflamingo_get_result(image_path, question):
    
    

    """
    Step 1: Load images
    """
    demo_image_one = Image.open(
        requests.get(
            "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True
        ).raw
    )

    demo_image_two = Image.open(
        requests.get(
            "http://images.cocodataset.org/test-stuff2017/000000028137.jpg",
            stream=True
        ).raw
    )

    query_image = Image.open(image_path)


    """
    Step 2: Preprocessing images
    Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
     batch_size x num_media x num_frames x channels x height x width. 
     In this case batch_size = 1, num_media = 3, num_frames = 1,
     channels = 3, height = 224, width = 224.
    """
    vision_x = [openflamingo_image_processor(demo_image_one).unsqueeze(0), openflamingo_image_processor(demo_image_two).unsqueeze(0), openflamingo_image_processor(query_image).unsqueeze(0)]
    vision_x = torch.cat(vision_x, dim=0)
    vision_x = vision_x.unsqueeze(1).unsqueeze(0)

    """
    Step 3: Preprocessing text
    Details: In the text we expect an <image> special token to indicate where an image is.
     We also expect an <|endofchunk|> special token to indicate the end of the text 
     portion associated with an image.
    """
    openflamingo_tokenizer.padding_side = "left" # For generation padding tokens should be on the left
    lang_x = openflamingo_tokenizer(
        ["<image>An image of two cats.<|endofchunk|><image>An image of a bathroom sink.<|endofchunk|><image>%s"%question],
        return_tensors="pt",
    )


    """
    Step 4: Generate text
    """
    generated_text = openflamingo_model.generate(
        vision_x=vision_x,
        lang_x=lang_x["input_ids"],
        attention_mask=lang_x["attention_mask"],
        max_new_tokens=20,
        num_beams=3,
    )

    #print("Generated text: ", tokenizer.decode(generated_text[0]))
    
    answer = openflamingo_tokenizer.decode(generated_text[0])[98:-15]
    
    return answer

In [16]:
old_tsv = pd.read_csv("landmark_all.tsv", sep='\t', encoding='utf-8')

df_results = pd.DataFrame(columns=['GT_answer', 'blip_generated_answer','minigpt4_generated_answer', 'openflamingo_generated_answer', \
                                   'rouge1_blip', 'rouge1_minigpt4', 'rouge1_openflamingo', \
                                   'rouge2_blip', 'rouge2_minigpt4', 'rouge2_openflamingo', \
                                   'rougeL_blip', 'rougeL_minigpt4', 'rougeL_openflamingo', \
                                   'bleu4_blip', 'bleu4_minigpt4', 'bleu4_openflamingo', \
                                   'meteor_blip', 'meteor_minigpt4', 'meteor_openflamingo', \
                                   'cider_blip', 'cider_minigpt4', 'cider_openflamingo', \
                                   'spice_blip', 'spice_minigpt4', 'spice_openflamingo', \
                                   'bertscore_blip', 'bertscore_minigpt4', 'bertscore_openflamingo'])

## Run 

In [None]:
for i in range(0,101,50):
    
    print(i)
    GT_answer = old_tsv.loc[i,'answer']
    
    ##BLIP
    blip_generated_answer = blip_get_result(old_tsv.loc[i,'image_path'], old_tsv.loc[i,'question'])
    
    blip_rouge1_res, blip_rouge2_res, blip_rougeL_res, blip_bleu4_res, blip_meteor_res, blip_cider_res, blip_spice_res, blip_bertscore_res = answer_evaluation(GT_answer, blip_generated_answer)
    
    ##minigpt4
    minigpt4_generated_answer = minigpt4_get_result(old_tsv.loc[i,'image_path'], old_tsv.loc[i,'question'])
    
    minigpt4_rouge1_res, minigpt4_rouge2_res, minigpt4_rougeL_res, minigpt4_bleu4_res, minigpt4_meteor_res, minigpt4_cider_res, minigpt4_spice_res, minigpt4_bertscore_res = answer_evaluation(GT_answer, minigpt4_generated_answer)
    
    ##open-flamingo
    openflamingo_generated_answer = openflamingo_get_result(old_tsv.loc[i,'image_path'], old_tsv.loc[i,'question'])
    
    openflamingo_rouge1_res, openflamingo_rouge2_res, openflamingo_rougeL_res, openflamingo_bleu4_res, openflamingo_meteor_res, openflamingo_cider_res, openflamingo_spice_res, openflamingo_bertscore_res = answer_evaluation(GT_answer, openflamingo_generated_answer)
    
    
    df_results = df_results.append({'GT_answer':GT_answer, \
                                    'blip_generated_answer': blip_generated_answer, 'minigpt4_generated_answer': minigpt4_generated_answer, 'openflamingo_generated_answer': openflamingo_generated_answer,\
                                    'rouge1_blip':blip_rouge1_res, 'rouge1_minigpt4':minigpt4_rouge1_res, 'rouge1_openflamingo':openflamingo_rouge1_res, \
                                    'rouge2_blip':blip_rouge2_res, 'rouge2_minigpt4':minigpt4_rouge2_res, 'rouge2_openflamingo':openflamingo_rouge2_res, \
                                    'rougeL_blip':blip_rougeL_res, 'rougeL_minigpt4':minigpt4_rougeL_res, 'rougeL_openflamingo':openflamingo_rougeL_res, \
                                    'bleu4_blip':blip_bleu4_res, 'bleu4_minigpt4':minigpt4_bleu4_res, 'bleu4_openflamingo':openflamingo_bleu4_res, \
                                    'meteor_blip':blip_meteor_res, 'meteor_minigpt4':minigpt4_meteor_res, 'meteor_openflamingo':openflamingo_meteor_res, \
                                    'cider_blip':blip_cider_res, 'cider_minigpt4':minigpt4_cider_res, 'cider_openflamingo':openflamingo_cider_res, \
                                    'spice_blip':blip_spice_res, 'spice_minigpt4':minigpt4_spice_res, 'spice_openflamingo':openflamingo_spice_res, \
                                    'bertscore_blip':blip_bertscore_res, 'bertscore_minigpt4':minigpt4_bertscore_res, 'bertscore_openflamingo':openflamingo_bertscore_res}, ignore_index=True)
    
df_results.to_csv('all_baselines_landmark.tsv', sep='\t', encoding='utf-8', index= False)

0


Parsing reference captions
Parsing test captions


SPICE evaluation took: 759.9 ms
calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 61.50it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 530.66it/s]

done in 0.02 seconds, 45.90 sentences/sec





<s>The architectural style of the building in the image is Gothic. The building has a tall, pointed roof, large arched windows, and intricate carvings on the facade. The style is characterized by the use of pointed arches, ribbed vaults, and large windows.###

Parsing reference captions
Parsing test captions


SPICE evaluation took: 915.4 ms
calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 56.45it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 624.80it/s]

done in 0.02 seconds, 42.34 sentences/sec



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions


SPICE evaluation took: 819.1 ms
calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 58.11it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 568.64it/s]

done in 0.02 seconds, 43.82 sentences/sec
50



  df_results = df_results.append({'GT_answer':GT_answer, \
Parsing reference captions
Parsing test captions


SPICE evaluation took: 802.5 ms
calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 60.41it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 618.36it/s]

done in 0.02 seconds, 45.50 sentences/sec





<s>The architectural style of this image is Gothic Revival. The image shows a large, ornate altar with intricate carvings and a stained glass window behind it. The altar is made of stone and has a wooden top. The stained glass window is a beautiful, colorful depiction of a religious scene. The walls and floor are made of stone, and the room is dimly lit by candles.###

Parsing reference captions
Parsing test captions


SPICE evaluation took: 757.9 ms
calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 55.63it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 89.81it/s]

done in 0.03 seconds, 30.34 sentences/sec



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions


SPICE evaluation took: 762.1 ms
