In [1]:
from minigpt4 import MiniGPT4
from blip_processor import Blip2ImageEvalProcessor
from conversation import Chat, CONV_VISION

import torch
import time

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")


In [2]:
import os
import numpy as np
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider import cider
from pycocoevalcap.spice import spice
from bert_score import score
import pandas as pd
import numpy as np

In [3]:
model = MiniGPT4(
    vision_model_path="models/eva_vit_g.pth",
    llama_model="models/vicuna13b_v0/",
    q_former_model="models/blip2_pretrained_flant5xxl.pth",
)

ckpt_path = "models/pretrained_minigpt4.pth"

print("Load BLIP2-LLM Checkpoint: {}".format(ckpt_path))
ckpt = torch.load(ckpt_path, map_location="cpu")
model.load_state_dict(ckpt['model'], strict=False)

torch.compile(model)

vis_processor = Blip2ImageEvalProcessor()

chat = Chat(model, vis_processor, device='cuda:0')

Loading VIT: vision_model_path=models/eva_vit_g.pth
Loading VIT Done
Loading Q-Former


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565, and set the legacy attribute accordingly.


Loading Q-Former Done
Loading LLAMA

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


Loading checkpoint shards: 100%|██████████| 3/3 [00:19<00:00,  6.66s/it]


Loading LLAMA Done
Load BLIP2-LLM Checkpoint: models/pretrained_minigpt4.pth


In [4]:
def get_result(image_path, question):
    
    chat_state = CONV_VISION.copy()
    img_list = []
    chat.upload_img("%s"%image_path, chat_state, img_list)


    num_beams = 1
    temperature = 0.01

    chat.ask("%s"%question, chat_state)

    # Callback for each word generated by the LLM
    def callback_function(word):
        print(word, end='', flush=True)

    #print("Live output: ", end='', flush=True)

    output_text = chat.answer_async(conv=chat_state,
                                    img_list=img_list,
                                    num_beams=num_beams,
                                    temperature=temperature,
                                    max_new_tokens=1024,
                                    max_length=2048,
                                    text_callback=callback_function)


    #print("LLM response: {}".format(output_text))
    
    return output_text

In [5]:
def split_words(sentence):
    return sentence.split(' ')

def Rouge(GT_caption, generated_caption): # returns list of all rouge_n needed in (precision,recall,fmeasure) for each
    res = []
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
    output = scorer.score(GT_caption, generated_caption)
    for r in output:
        score = output[r]
        res.append((score.precision, score.recall, score.fmeasure))
    return res[0][2], res[1][2], res[2][2]

def Bleu_4(GT_caption, generated_caption):
    GT_words = split_words(GT_caption)
    gen_words = split_words(generated_caption)
    weights = (1./4., 1./4., 1./4., 1./4.)
    return corpus_bleu([[GT_words]], [gen_words], weights)

def METEOR(GT_caption, generated_caption):
    GT_words = split_words(GT_caption)
    gen_words = split_words(generated_caption)
    return meteor_score([GT_words], gen_words)

def CIDEr(GT_caption, generated_caption):
    scorer = cider.Cider()
    return scorer.compute_score({0:[GT_caption]}, {0:[generated_caption]})[0]

def SPICE(GT_caption, generated_caption):
    scorer = spice.Spice()
    return scorer.compute_score({0:[GT_caption]}, {0:[generated_caption]})[0]

def BertScore(GT_caption, generated_caption):
    P, R, F1 = score([generated_caption], [GT_caption], lang="en", verbose=True)
    #return P.tolist()[0], R.tolist()[0], F1.tolist()[0]
    return F1.tolist()[0]

def acc_full(GT_caption, generated_caption):
    GT_caption = GT_caption.lower()
    generated_caption = generated_caption.lower()
    return int(GT_caption in generated_caption)

def acc_part(GT_caption, generated_caption):
    GT_words = split_words(GT_caption.lower())
    generated_caption = generated_caption.lower()
    
    in_generated = 0
    for word in GT_words:
        if word in generated_caption:
            in_generated += 1
            
    return in_generated / len(GT_words)

In [6]:
def answer_evaluation(GT_caption, generated_caption):
    rouge1, rouge2, rougeL = Rouge(GT_caption, generated_caption)
    bleu4 = Bleu_4(GT_caption, generated_caption)
    meteor = METEOR(GT_caption, generated_caption)
    cider= CIDEr(GT_caption, generated_caption)
    spice = SPICE(GT_caption, generated_caption)
    bertscore = BertScore(GT_caption, generated_caption)
    
    return rouge1, rouge2, rougeL, bleu4, meteor, cider, spice, bertscore

In [7]:
old_tsv = pd.read_csv("landmark_all.tsv", sep='\t', encoding='utf-8')

df_results = pd.DataFrame(columns=['GT_answer', 'generated_answer','rouge1', 'rouge2', 'rougeL', 'bleu4', 'meteor', 'cider', 'spice', 'bertscore'])

In [8]:
for i in range(0,2001,50):
    
    print(i)
    GT_answer = old_tsv.loc[i,'answer']
    generated_answer = get_result(old_tsv.loc[i,'image_path'], old_tsv.loc[i,'question'])
    rouge1_res, rouge2_res, rougeL_res, bleu4_res, meteor_res, cider_res, spice_res, bertscore_res = answer_evaluation(GT_answer, generated_answer)
    
    df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \
                                        'bleu4':bleu4_res, 'meteor':meteor_res, \
                                        'cider':cider_res, 'spice':spice_res, 'bertscore':bertscore_res}, ignore_index=True)
    
df_results.to_csv('minigpt4_landmark.tsv', sep='\t', encoding='utf-8', index= False)

0
<s>The architectural style of the building in the image is Gothic. The building has a tall, pointed roof, large arched windows, and intricate carvings on the facade. The style is characterized by the use of pointed arches, ribbed vaults, and large windows.###Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.3 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.7 sec].
Loading classif

SPICE evaluation took: 6.290 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 14.24it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 35.85it/s]

done in 0.10 seconds, 9.80 sentences/sec
50



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The architectural style of this image is Gothic Revival. The image shows a large, ornate altar with intricate carvings and a stained glass window behind it. The altar is made of stone and has a wooden top. The stained glass window is a beautiful, colorful depiction of a religious scene. The walls and floor are made of stone, and the room is dimly lit by candles.###

Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.9 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6

SPICE evaluation took: 5.273 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 61.07it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 574.01it/s]

done in 0.02 seconds, 46.63 sentences/sec
100



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The image shows a beach with a body of water in the foreground and trees and mountains in the background.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.Stanfo

SPICE evaluation took: 5.305 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 66.38it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 643.00it/s]

done in 0.02 seconds, 49.85 sentences/sec
150



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The beach is located on the coast of New Zealand.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialize

SPICE evaluation took: 4.911 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 67.98it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 700.69it/s]

done in 0.02 seconds, 51.49 sentences/sec
200



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Abu Simbel Temple is located in the southern part of Egypt, near the Nile River. It is a large temple complex that was built during the reign of Ramses II, around 1250 BC. The temple is famous for its large statues of Ramses II and his queen Nefertiti, which are carved into the rock face of the temple. The temple is a popular tourist attraction and is considered one of the most impressive ancient sites in Egypt.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading cl

SPICE evaluation took: 5.792 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 61.47it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 588.51it/s]

done in 0.02 seconds, 46.87 sentences/sec
250



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Temple of Abu Simbel is located in the city of Abu Simbel, in the Nile Valley of Egypt. It is one of the most famous temples in the world, known for its large statues of pharaohs and other ancient Egyptian figures. The temple was built during the reign of Ramses II, around 1250 BC, and is considered one of the most impressive examples of ancient Egyptian architecture.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialize

SPICE evaluation took: 5.543 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 62.81it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 602.72it/s]

done in 0.02 seconds, 47.99 sentences/sec
300



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The first European settlers arrived in what is now the United States in the early 17th century. They came to the Americas in search of new land and resources, and to establish colonies. The first European settlers were primarily English, Dutch, and French. They established colonies in what is now the eastern seaboard of the United States, including the Massachusetts Bay Colony, the Virginia Colony, and the Plymouth Colony.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading cl

SPICE evaluation took: 6.355 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 61.01it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 570.34it/s]

done in 0.02 seconds, 46.25 sentences/sec
350



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The first European settlers arrived in what is now the United States in the early 17th century. They were primarily English, Dutch, and French colonizers who came to the Americas in search of economic opportunities and religious freedom. The English established the first permanent settlement at Jamestown, Virginia in 1607, while the Dutch established New Amsterdam (now New York City) in 1626. The French established their first settlement at Quebec City in 1608.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding an

SPICE evaluation took: 5.555 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 57.68it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 532.00it/s]

done in 0.02 seconds, 42.29 sentences/sec
400



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The city of Lagos is known for its bustling streets, vibrant culture, and beautiful architecture. It is also known for its beaches and water sports, as well as its nightlife and entertainment options.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nl

SPICE evaluation took: 5.355 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 65.50it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 665.02it/s]

done in 0.02 seconds, 49.83 sentences/sec
450



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The statue is known for being a large, white structure with a clock on top. It is located in a public park and is a popular landmark in the city.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding an

SPICE evaluation took: 4.921 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 66.18it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 635.21it/s]

done in 0.02 seconds, 50.43 sentences/sec
500



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Parthenon is an example of the Doric order, which is characterized by simple, sturdy columns and a plain, unadorned facade. The temple was built in the 5th century BC in Athens, Greece, and is considered one of the most important examples of ancient Greek architecture.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.Stanfo

SPICE evaluation took: 5.487 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 57.61it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 436.50it/s]

done in 0.02 seconds, 43.08 sentences/sec
550



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The architectural style of the Parthenon is Doric. The Doric order is characterized by a simple, sturdy design with a large, heavy base and a smaller, more delicate capital. The columns are made of stone and have a plain, unadorned shaft with a simple, round capital. The frieze is decorated with a continuous band of carvings, usually depicting scenes from Greek mythology. The pediment is a triangular gable end, often decorated with sculptures. The temple is a rectangular building with a pitched roof and a porch or portico in front.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanfo

SPICE evaluation took: 5.535 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 47.65it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 564.28it/s]

done in 0.03 seconds, 38.38 sentences/sec
600



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The museum opened to the public on January 1, 2018.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.Stanfo

SPICE evaluation took: 5.127 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 66.25it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 627.89it/s]

done in 0.02 seconds, 50.15 sentences/sec
650



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The building in the image is the National Museum of China, which opened to the public on September 1, 2019.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanfo

SPICE evaluation took: 4.940 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 68.12it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 669.59it/s]

done in 0.02 seconds, 51.41 sentences/sec
700



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Acropolis of Athens is an ancient citadel located on a rocky outcropping in the center of the city of Athens, Greece. It was built in the 5th century BC as a fortress and religious center for the goddess Athena. The Acropolis is known for its impressive architecture and the many ancient ruins that can be seen there, including the Parthenon, the Erechtheion, and the Temple of Athena Nike. The Acropolis is a popular tourist destination and a symbol of Greek culture and history.###

Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.9 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.5 sec].
Loading classif

SPICE evaluation took: 12.01 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 51.93it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 475.87it/s]

done in 0.03 seconds, 39.23 sentences/sec
750



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Parthenon is a temple in the city of Athens, Greece, built between 447 and 438 BC. It was dedicated to the goddess Athena, the goddess of wisdom and war, and was one of the most important temples in ancient Greece. The Parthenon was built using marble and limestone, and its design is considered one of the most perfect examples of Doric architecture. It was used as a treasury and a place of worship for the people of Athens. The Parthenon was damaged during the invasion of the Persians in 480 BC and was later converted into a Christian church. It was later destroyed by the Ottoman Turks in 1687 and its ruins remain a popular tourist attraction today.###

Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.9 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.5 sec].
Loading classif

SPICE evaluation took: 12.26 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 48.16it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 530.19it/s]

done in 0.03 seconds, 38.70 sentences/sec
800



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The significance of the statue is to commemorate the soldiers who fought in the war and to honor their sacrifice.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nl

SPICE evaluation took: 5.353 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 64.64it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 485.85it/s]

done in 0.02 seconds, 47.65 sentences/sec
850



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The significance of the statue is to honor and remember those who have served in the military. It is a symbol of respect and appreciation for the sacrifices made by those who have served in the military.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding an

SPICE evaluation took: 5.019 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 64.18it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 624.71it/s]

done in 0.02 seconds, 48.17 sentences/sec
900



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Taj Mahal was built in the early 17th century by the Mughal emperor Shah Jahan.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.Stanfo

SPICE evaluation took: 5.027 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 67.09it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 666.61it/s]

done in 0.02 seconds, 50.53 sentences/sec
950



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The building in the image was built in the 18th century.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanfo

SPICE evaluation took: 4.808 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 68.95it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 636.75it/s]

done in 0.02 seconds, 51.61 sentences/sec
1000



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The image shows a large military aircraft with people standing on the tarmac in front of it. The aircraft appears to be a C-17 Globemaster III, which is a military transport aircraft used by the United States Air Force. It is likely that the aircraft is stationed at a military base or airport.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nl

SPICE evaluation took: 5.432 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 63.90it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 628.93it/s]

done in 0.02 seconds, 48.70 sentences/sec
1050



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The aircraft is located at an airport.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding an

SPICE evaluation took: 4.650 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 64.47it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 478.69it/s]

done in 0.02 seconds, 47.46 sentences/sec
1100



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Kingdom of It was a powerful and prosperous kingdom in ancient times. It was known for its advanced technology, strong army, and rich culture. The kingdom was ruled by a wise and just king, who was respected by his people. The kingdom was also known for its beautiful landscapes and rich wildlife. The kingdom was a major center of trade and commerce, attracting traders and merchants from far and wide. The kingdom's prosperity was due to its strategic location on the trade routes and its fertile land. The kingdom was known for its advanced agriculture techniques and irrigation systems, which enabled it to produce bountiful crops. The kingdom's prosperity also attracted the attention of neighboring kingdoms, which often tried to conquer it. However, the kingdom's strong army and wise king were able to repel these attacks and maintain its independence. The kingdom's prosperity and independence lasted for many years, making it a major power in the region.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.5 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading cl

SPICE evaluation took: 7.699 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 43.83it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 512.50it/s]

done in 0.03 seconds, 35.51 sentences/sec
1150



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Kingdom of It was a powerful and prosperous kingdom that existed in the ancient world. It was known for its advanced technology, strong military, and rich culture. The kingdom was ruled by a powerful king who was known for his wisdom and leadership. The kingdom was also known for its trade and commerce, which helped to make it one of the wealthiest and most powerful kingdoms in the ancient world.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding an

SPICE evaluation took: 5.347 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 59.54it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 592.08it/s]

done in 0.02 seconds, 45.88 sentences/sec
1200



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Alamo is a historic site in San Antonio, Texas, where the Battle of the Alamo took place in 1836. The Alamo is a symbol of Texan independence and is a popular tourist attraction.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading cl

SPICE evaluation took: 6.844 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 62.25it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 588.67it/s]

done in 0.02 seconds, 46.60 sentences/sec
1250



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Alamo is a historical site in San Antonio, Texas, that was built in the late 18th century. It was originally used as a fortress and later served as a military hospital during the Texas Revolution. The Alamo is significant because it was the site of the Battle of the Alamo in 1836, which was a pivotal event in the Texas Revolution. The battle resulted in the deaths of many Texan soldiers and is remembered as a symbol of Texan independence. The Alamo is now a popular tourist attraction and a symbol of Texan history and culture.###

Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.5 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.9 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.5

SPICE evaluation took: 5.804 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 53.46it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 556.57it/s]

done in 0.02 seconds, 42.03 sentences/sec
1300



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The center is home to a variety of animals, including polar bears, grizzly bears, black bears, wolves, and moose. Visitors can also see a variety of birds, including eagles, hawks, and owls. The center also has a variety of marine animals, including seals, sea lions, and dolphins.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nl

SPICE evaluation took: 5.386 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 66.03it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 610.52it/s]

done in 0.02 seconds, 49.79 sentences/sec
1350



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The aquarium has a variety of fish species, including tropical fish, freshwater fish, and marine fish. Some of the species that visitors can see include angelfish, clownfish, and seahorses. There are also sharks, rays, and other marine animals on display. The aquarium also has a touch tank where visitors can interact with sea stars, sea urchins, and other marine animals.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialize

SPICE evaluation took: 5.282 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 61.80it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 554.14it/s]

done in 0.02 seconds, 46.02 sentences/sec
1400



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Albuquerque International Balloon Fiesta takes place in Albuquerque, New Mexico. It is a yearly event that attracts thousands of people from all over the world to see the beautiful hot air balloons take off and fly in the sky. The event includes a variety of activities, such as balloon rides, live music, food vendors, and other entertainment. It is a great opportunity to see the beauty of hot air balloons and enjoy the festivities.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.Stanfo

SPICE evaluation took: 5.570 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 55.74it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 586.70it/s]

done in 0.02 seconds, 43.66 sentences/sec
1450



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Albuquerque International Balloon Fiesta takes place in Albuquerque, New Mexico. It is a yearly event that attracts thousands of people from all over the world to see the hot air balloons take off and fly in the sky. The event includes a variety of activities such as balloon rides, live music, food and craft vendors, and a variety of other entertainment. It is a great opportunity to see the beautiful hot air balloons up close and enjoy the festivities.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanfo

SPICE evaluation took: 5.725 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 61.16it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 591.41it/s]

done in 0.02 seconds, 46.97 sentences/sec
1500



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The total area of Alcatraz Island is approximately 22 acres.###

Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.9 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.5 sec].
Loading classif

SPICE evaluation took: 5.083 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 69.01it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 679.57it/s]

done in 0.02 seconds, 52.02 sentences/sec
1550



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The total area of the island is approximately 100 square miles.###

Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.9 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.5

SPICE evaluation took: 4.639 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 63.94it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 484.61it/s]

done in 0.02 seconds, 47.21 sentences/sec
1600



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The architectural style of the Saint it is Baroque.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.Stanfo

SPICE evaluation took: 5.044 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 66.82it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 656.49it/s]

done in 0.02 seconds, 50.75 sentences/sec
1650



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The architectural style of the Saint Basil's Cathedral in Moscow is Byzantine.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanfo

SPICE evaluation took: 4.947 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 67.85it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 656.90it/s]

done in 0.02 seconds, 51.34 sentences/sec
1700



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Alhambra is a palace and fortress complex located in Granada, Spain. It was built in the mid-13th century by the Nasrid dynasty and is considered one of the most important examples of Islamic architecture in the world. The Alhambra is a UNESCO World Heritage Site and is visited by millions of tourists each year. It is known for its intricate decorations, beautiful gardens, and stunning views of the surrounding mountains. The palace is a testament to the beauty and grandeur of Islamic architecture and is a must-see for anyone visiting Granada.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.Stanfo

SPICE evaluation took: 5.747 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 54.18it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 581.81it/s]

done in 0.02 seconds, 42.81 sentences/sec
1750



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Alhambra is a palace and fortress complex located in Granada, Spain. It was built in the mid-13th century by the Nasrid dynasty and is considered one of the most important examples of Islamic architecture in the world. The Alhambra is a UNESCO World Heritage Site and is visited by millions of tourists each year. It is known for its intricate decorations, beautiful gardens, and stunning architecture. The palace is made up of several buildings, including the Alcazaba, the Mexuar, and the Harem. The most famous part of the Alhambra is the Court of the Lions, which is a large open courtyard with a fountain in the center and 12 marble columns with lion statues on top. The Alhambra is a must-see attraction for anyone visiting Granada, and it is a testament to the beauty and rich history of the city.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialize

SPICE evaluation took: 6.267 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 45.03it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 525.14it/s]

done in 0.03 seconds, 36.41 sentences/sec
1800



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The image shows a large, ornate building with a large courtyard in front of it. The building appears to be a palace or government building, with a large dome on top and several smaller domes on the sides. The courtyard is surrounded by a large wall with several arches and gates. There are several people walking around the courtyard and some sitting on benches. The building is likely a historical landmark or government building in the city.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nl

SPICE evaluation took: 10.53 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 53.55it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 571.82it/s]

done in 0.02 seconds, 42.06 sentences/sec
1850



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The building in the image is a palace, which was built in the 18th century. It was the residence of the ruling family and was used for official purposes. The palace has a green dome, which is a common feature of many palaces in Europe. The palace is an important historical landmark and is a popular tourist attraction.###

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nl

SPICE evaluation took: 10.34 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 52.42it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 562.01it/s]

done in 0.02 seconds, 40.81 sentences/sec
1900



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Amazon rainforest is located in South America, primarily in Brazil, Peru, Colombia, and Venezuela. It covers an area of about 2.5 million square miles and is the largest rainforest in the world. The Amazon rainforest is home to a vast array of plant and animal species, including the giant arapaima, the anaconda, the jaguar, and the howler monkey. It is also home to many indigenous tribes, including the Yanomami, the Guarani, and the Tupi.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.5 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading cl

SPICE evaluation took: 5.961 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 54.94it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 399.19it/s]

done in 0.02 seconds, 41.25 sentences/sec
1950



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The Amazon rainforest is located in South America, specifically in Brazil, Peru, Colombia, and Venezuela. It covers an area of approximately 2.5 million square miles and is the largest rainforest in the world. The Amazon rainforest is home to a diverse range of plant and animal species, including the giant otter, the jaguar, the anaconda, and the piranha. It is also home to many indigenous tribes, including the Yanomami, the Guarani, and the Tupi.###

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding an

SPICE evaluation took: 5.604 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 57.19it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 514.89it/s]

done in 0.02 seconds, 43.11 sentences/sec
2000



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \


<s>The location of the river is in the Amazon rainforest.###

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.Stanfo

SPICE evaluation took: 5.321 s


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 63.90it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 493.74it/s]

done in 0.02 seconds, 46.71 sentences/sec



  df_results = df_results.append({'GT_answer':GT_answer, 'generated_answer': generated_answer, 'rouge1':rouge1_res, 'rouge2':rouge2_res,'rougeL':rougeL_res, \
