In [None]:
CHECKPOINT_PATH = "./checkpoint_9.pth"
VALIDATION_JSON = "/Users/jagathkumarreddyk/Documents/GitHub/BLIP/annotations_trainval2017/annotations/captions_val2017.json"
VALIDATION_IMAGE_ROOT = "/Users/jagathkumarreddyk/Documents/GitHub/BLIP/val2017/val2017"

In [1]:
import torch
import json
import os
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, AutoModelForCausalLM
from datasets import load_dataset
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3,[0.5]*3)
])
class COCOCaptionDataset(Dataset):
    """COCO Captions Dataset"""
    def __init__(self, json_path, image_root, transform, tokenizer, max_length=50):
        with open(json_path, 'r') as f:
            self.data = json.load(f)
        
        self.image_root = image_root
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Flatten annotations
        self.samples = []
        for item in tqdm(self.data['annotations']):
            img_id = item['image_id']
            if len(self.samples)==200: break
            # Find image filename
            img_info = next(img for img in self.data['images'] if img['id'] == img_id)
            self.samples.append({
                'image': os.path.join(image_root, img_info['file_name']),
                'caption': item['caption']
            })
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sample = self.samples[idx]
        
        # Load and transform image
        image = Image.open(sample['image']).convert('RGB')
        image = self.transform(image)
        
        # Tokenize caption for encoder
        caption = sample['caption']
        text_encoding = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # Create decoder inputs (shifted right)
        decoder_input_ids = text_encoding['input_ids'].clone()
        labels = text_encoding['input_ids'].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return {
            'image': image,
            'image_path': sample['image'],
            'text_input_ids': text_encoding['input_ids'].squeeze(0),
            'text_attention_mask': text_encoding['attention_mask'].squeeze(0),
            'decoder_input_ids': decoder_input_ids.squeeze(0),
            'decoder_attention_mask': text_encoding['attention_mask'].squeeze(0),
            'labels': labels.squeeze(0)
        }




# =======================================================
# 7. Load tokenizer + model
# =======================================================
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# Add special <img> token
if '<img>' not in tokenizer.get_vocab():
    tokenizer.add_special_tokens({'additional_special_tokens':['<img>']})
img_token_id = tokenizer.convert_tokens_to_ids('<img>')

# Load DistilGPT2 via AutoModelForCausalLM
gpt2 = AutoModelForCausalLM.from_pretrained("distilgpt2")
gpt2.resize_token_embeddings(len(tokenizer))
gpt2.eval()

# =======================================================
# 8. Q-Former
# =======================================================
from transformers import ViTModel
vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
vit.eval()

class QFormer(nn.Module):
    def __init__(self, image_emb_dim, prompt_len=16, hidden_dim=768):
        super().__init__()
        self.query_tokens = nn.Parameter(torch.randn(prompt_len, image_emb_dim))
        self.cross_attn = nn.MultiheadAttention(embed_dim=image_emb_dim, num_heads=8)
        self.mlp = nn.Linear(image_emb_dim, hidden_dim)

    def forward(self, image_embeds):
        batch_size = image_embeds.size(0)
        query = self.query_tokens.unsqueeze(1).repeat(1,batch_size,1)
        attn_out,_ = self.cross_attn(query, image_embeds.transpose(0,1), image_embeds.transpose(0,1))
        prompt = self.mlp(attn_out).transpose(0,1)
        return prompt


q_former = QFormer(image_emb_dim=vit.config.hidden_size, prompt_len=16, hidden_dim=gpt2.config.n_embd)

# =======================================================
# 9. DataLoader
# =======================================================
# val_dataset = COCOCaptionDataset(json_path=VALIDATION_JSON, image_root=VALIDATION_IMAGE_ROOT, transform=transform,tokenizer=tokenizer, max_length=50)
# val_loader = DataLoader(val_dataset, batch_size=4, shuffle=True)

# =======================================================
# 10. Device setup
# =======================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vit.to(device)
gpt2.to(device)
q_former.to(device)

QFormer(
  (cross_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (mlp): Linear(in_features=768, out_features=768, bias=True)
)

In [11]:
trainable_params = sum(
    p.numel() for p in q_former.parameters() if p.requires_grad
)

print(f"Trainable parameters: {trainable_params}")

Trainable parameters: 2965248


In [None]:
def load_from_checkpoint(CHECKPOINT_PATH):
    # with open(CHECKPOINT_PATH, 'r') as f:
    checkpoint_obj = torch.load(CHECKPOINT_PATH)
    gpt2.load_state_dict(checkpoint_obj["gpt2_state"])
    q_former.load_state_dict(checkpoint_obj["qformer"])
    



In [9]:
tokenizer.bos_token_id
gpt2.transformer.wte(torch.tensor([[50256]])).shape

torch.Size([1, 1, 768])

In [12]:
sum(p.numel() for p in gpt2.parameters())

81913344

In [10]:
sum(p.numel() for p in q_former.parameters())

2965248

In [None]:
def generate_caption(pil_image, vit, q_former, gpt2, tokenizer, device, max_length=30, top_k=50, top_p=0.95):
    vit.eval()
    q_former.eval()
    gpt2.eval()
    image = transform(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        image_embeds = vit(image).last_hidden_state
        prompts = q_former(image_embeds)
        input_ids = torch.tensor([[tokenizer.bos_token_id]], device=device)
        img_token_emb = gpt2.transformer.wte(torch.tensor([[img_token_id]], device=device))
        generated = []

        for _ in range(max_length):
            gpt2_inputs = gpt2.transformer.wte(input_ids)
            gpt2_inputs = torch.cat([img_token_emb, prompts, gpt2_inputs], dim=1)

            outputs = gpt2(inputs_embeds=gpt2_inputs)
            logits = outputs.logits[0, -1, :]

            # Top-k + top-p sampling
            filtered_logits = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(filtered_logits, num_samples=1)

            if next_token.item() == tokenizer.eos_token_id:
                break
            generated.append(next_token.item())
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)

        caption = tokenizer.decode(generated, skip_special_tokens=True)
        return caption

In [None]:
def generate_output(num_samples, val_dataset):
    for i in range(num_samples):
        img = (((val_dataset[0]['image'] + 1)/2).mul(255)).byte()
        img = img.permute(1,2,0)
        print(img.shape)
        img = Image.fromarray(img.numpy())
        gt_caption = tokenizer.decode(val_dataset[i]['text_input_ids'])
        gen_caption = generate_caption(img, vit, q_former, gpt2, tokenizer, device)

        print("GT caption: " + gt_caption)
        print("Generated caption: " + gen_caption)

In [None]:
generate_output(10, val_dataset)

## LEt s checkout the structure of karpathy coco

In [None]:
import json

# Replace with your file name
with open("/Users/jagathkumarreddyk/Documents/GitHub/BLIP/dataset_coco.json", "r") as f:
    data = json.load(f)

# print(data)

In [None]:
data.keys()

In [None]:
len(data["images"])

In [None]:
import json

with open("/Users/jagathkumarreddyk/Documents/GitHub/BLIP/dataset_coco.json",'r') as f:
    data = json.load(f)

train = [img for img in data["images"] if img["split"] == "train"]
val = [img for img in data["images"] if img["split"] == "val"]
test = [img for img in data["images"] if img["split"] == "test"]

print(len(train), len(val), len(test))
print(len(train) + len(val) + len(test))


In [None]:
val[0]

### We are not going to use karpathy split, Since it requires val2014, which we dont have

SO VAL2017 it is!!

In [None]:
sample_output_json = []

for i in range(5):
    this_dict = {}
    img = (((val_dataset[i]['image'] + 1)/2).mul(255)).byte()
    img = img.permute(1,2,0)
    print(img.shape)
    img = Image.fromarray(img.numpy())
    gt_caption = remove_padding(val_dataset[i]['text_input_ids'])
    gt_caption = tokenizer.decode(gt_caption)
    gen_caption = generate_caption(img, vit, q_former, gpt2, tokenizer, device)
    
    this_dict['img_id'] = int(str(val_dataset[0]['image_path']).split(".")[0].split("/")[-1])
    this_dict["image_path"] = val_dataset[i]['image_path']
    this_dict["captions"] = gt_caption
    this_dict["generated_output"]  = gen_caption
    
    sample_output_json.append(this_dict)

    print("IMAGE_PATH",val_dataset[i]['image_path'])
    print("GT caption: " + gt_caption)
    print("Generated caption: " + gen_caption,"\n\n")

In [None]:
sample_output_json[-1]

In [None]:
# len(gt_caption)
for i in range()

gt_caption

In [None]:
len("<|endoftext|>")

dict 
```
{
    id : Image_id,
    path: file_path/fileName,
    output: MODEL's GENERATED OUTPUT,
    human_cpations: ["humancaption_1","humancaption_2","humancaption_3"]
}
```

In [None]:
with open(VALIDATION_JSON, 'r') as f:
    val_dat = json.load(f)


In [None]:
val_dat.keys()

In [None]:
val_dat["images"]

In [None]:
val_dat["annotations"]

In [None]:
val_dataset[0]['text_input_ids']

In [None]:
# val_dataset[0]['text_input_ids']
def remove_padding(padded_tokens):
    for i in range(len(padded_tokens)-1):
        if (padded_tokens[i] == 50256) and (padded_tokens[i+1] == 50256):
            break

    return padded_tokens[:i]



In [None]:
remove_padding(val_dataset[0]['text_input_ids'])

In [None]:
len(val_dataset)

In [None]:
with open("./output_json.json", 'w') as f:
    json.dump(sample_output_json,f)

In [None]:
!pwd

In [None]:
with open("/Users/jagathkumarreddyk/Documents/GitHub/ICapGPT/output_json_1.json", 'r') as f:
    d = json.load(f)

In [None]:

len(d)

In [None]:
image_root = "/Users/jagathkumarreddyk/Documents/GitHub/BLIP/val2017/val2017"
for img_file in os.listdir(image_root):
    if str(179765) in img_file:
        print(img_file)

In [None]:
os.path.join(image_root,img_file)

In [None]:
def get_json(output_json_path):
    with open(output_json_path, 'r') as f:
        data = json.load(f)
    return data

json_path = "/Users/jagathkumarreddyk/Downloads/CV/BLIP2-Output/output_json_3epoch_model.json"
d = get_json(json_path)

In [None]:
br_path = d[3]['image_path']

In [None]:
int((br_path).split("/")[-1].split(".")[0])

In [None]:
checkpoint_obj = torch.load("./checkpoint_batch_srun2999_epoch12.pth", map_location=torch.device('cpu'))

In [None]:
# checkpoint_obj["gpt2_state"]
checkpoint_obj = torch.load(CHECKPOINT_PATH, map_location=torch.device("cpu"))
gpt2.load_state_dict(checkpoint_obj["gpt2_state"])
q_former.load_state_dict(checkpoint_obj["qformer"])

## Lets incorporate generate output from Image

In [None]:
import json
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

def calculate_metrics(gt_json_path, results_json_path):
    """
    Calculates standard COCO captioning metrics (BLEU, METEOR, ROUGE-L, CIDEr, SPICE).

    Args:
        gt_json_path (str): Path to the COCO Ground Truth JSON file (e.g., 'captions_val2017.json').
        results_json_path (str): Path to the Generated Captions JSON file.
    """
    # Initialize COCO for ground truth annotations
    coco = COCO(gt_json_path)
    
    # Initialize COCOEvalCap
    # The 'resFile' argument specifies the path to the generated captions
    coco_eval = COCOEvalCap(coco, results_json_path)

    coco_eval.evaluators = [
        (str('Bleu'), Bleu(4)), # Bleu(4) calculates BLEU-1, 2, 3, 4
        (str('Rouge'), Rouge()),
        (str('Cider'), Cider()),
        # Skip ('Meteor', Meteor()) and ('Spice', Spice())
    ]

    # Note: image IDs need to be a subset of the image IDs in the GT file
    # This automatically sets the list of image IDs to evaluate based on your results file
    # For val2017, this should be all 5k images.
    
    print(f"Starting evaluation on image IDs in: {results_json_path}...")
    
    # Perform the evaluation
    # This will run all standard metrics (BLEU-1 to 4, METEOR, ROUGE-L, CIDEr, SPICE)
    coco_eval.evaluate()

    # Print the resulting scores
    print("\n--- Evaluation Scores ---")
    
    # The result is a dictionary: {'Bleu_1': score, 'Bleu_2': score, ...}
    for metric, score in coco_eval.eval.items():
        # Format the score to a standard paper-reporting format (e.g., 2 decimal places)
        print(f"{metric}: {score:.3f}")
        
    return coco_eval.eval

# --- Configuration ---
# You need to replace these paths with your actual file locations
# 1. Path to the official COCO val2017 GT Captions file
#    You must download this file from the COCO website (e.g., annotations/captions_val2017.json)
GT_JSON_FILE = '/Users/jagathkumarreddyk/Documents/GitHub/BLIP/annotations_trainval2017/annotations/captions_val2017.json'
# GT_JSON_FILE = "./GROUND_TRUTH_JSON.json"
# 2. Path to your model's generated captions JSON file (in the format specified in Section 1B)
RESULTS_JSON_FILE = './new_format_json'

# # --- Run Evaluation ---

try:
    metrics_scores = calculate_metrics(GT_JSON_FILE, RESULTS_JSON_FILE)
except FileNotFoundError as e:
    print(f"\n[ERROR] File not found. Please check your paths: {e}")
except Exception as e:
    print(f"\n[ERROR] An error occurred during evaluation. Did you install Java and all required packages (pycocotools, pycocoevalcap)? Error: {e}")

In [None]:
with open("/Users/jagathkumarreddyk/Downloads/CV/BLIP2-Output/output_json_12_epoch_model_nov29.json", 'r') as f:
    data = json.load(f)

In [None]:
data[0].keys()

In [None]:
#"image_id": 391895, "caption": "A man
GROUND_TRUTH = []
LIST_right_format = []
for d in data:
    temp = {}
    gt_d  = {}
    temp['image_id'] = d["img_id"]
    gt_d['image_id'] = d["img_id"]
    temp["caption"] = d["generated_output"]
    gt_d["caption"] = d["captions"]
    GROUND_TRUTH.append(gt_d)
    LIST_right_format.append(temp)


In [None]:
with open("./new_format_json.json", 'w') as f:
    json.dump(LIST_right_format, f)
          

In [None]:
with open("./GROUND_TRUTH_JSON.json", 'w') as f:
    json.dump(GROUND_TRUTH, f)
          

In [None]:
import json
from pycocotools.coco import COCO
# Import COCOResult for the generated captions
# from pycocotools.cocoeval import COCOResult 
from pycocoevalcap.eval import COCOEvalCap
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

def calculate_metrics_no_java_v2(gt_json_path, results_json_path):
    """
    Calculates COCO captioning metrics (BLEU, ROUGE-L, CIDEr) by explicitly 
    loading and formatting both GT and results for robustness.

    Args:
        gt_json_path (str): Path to the COCO Ground Truth JSON file.
        results_json_path (str): Path to the Generated Captions JSON file.
    """
    print("--- Loading Ground Truth (GT) Data ---")
    # Initialize COCO for ground truth annotations (This should work fine)
    # The COCO constructor expects the path to the GT annotations file
    cocoGt = COCO(gt_json_path)
    
    # --- Load and Format Generated Captions ---
    print("--- Loading Generated Captions ---")
    # 1. Load the generated results JSON (a list of dictionaries)
    with open(results_json_path, 'r') as f:
        # Load your generated captions list: [{"image_id": id, "caption": "..."}]
        results_list = json.load(f)

    # 2. Initialize the COCOResult object using the list.
    # We pass the COCO GT object (cocoGt) and the loaded results list (results_list).
    # This ensures the results are correctly associated with the GT images and format.
    # Note: If you don't have COCOResult, you can try passing the path string again.
    try:
        cocoRes = cocoGt.loadRes(results_json_path)
        # If loadRes fails, fall back to the path string (older pycocotools)
    except:
        cocoRes = results_json_path


    # --- Initialize COCOEvalCap ---
    # The standard way to initialize COCOEvalCap is with the GT object and the path/result object.
    # Since loadRes often returns the path string back, we should check which is needed.
    
    # Let's try the most robust way based on typical usage:
    print("--- Initializing Evaluation ---")
    coco_eval = COCOEvalCap(cocoGt, cocoRes)

    # --- Metrics Setup (No Java) ---
    coco_eval.evaluators = [
        (str('Bleu'), Bleu(4)), 
        (str('Rouge'), Rouge()),
        (str('Cider'), Cider()),
    ]
    
    print(f"Starting evaluation (excluding METEOR/SPICE) on {len(cocoRes.imgs)} images...")
    
    # Perform the evaluation
    coco_eval.evaluate()

    # Print the resulting scores
    print("\n--- Evaluation Scores (No Java Required) ---")
    
    for metric, score in coco_eval.eval.items():
        print(f"{metric}: {score:.3f}")
        
    return coco_eval.eval

# --- Example Configuration (Update these paths) ---
GT_JSON_FILE = '/Users/jagathkumarreddyk/Documents/GitHub/BLIP/annotations_trainval2017/annotations/captions_val2017.json'
RESULTS_JSON_FILE = './new_format_json.json' # Ensure this path is correct

# if __name__ == '__main__':
calculate_metrics_no_java_v2(GT_JSON_FILE, RESULTS_JSON_FILE)

### Madman Protocol

In [None]:
import json
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

# --- Configuration (Keep your paths here) ---
GT_JSON_FILE = '/Users/jagathkumarreddyk/Documents/GitHub/BLIP/annotations_trainval2017/annotations/captions_val2017.json'
RESULTS_JSON_FILE = './new_format_json.json' # Ensure this path is correct

def calculate_metrics_no_java_fixed(gt_json_path, results_json_path):
    """
    Calculates COCO captioning metrics (BLEU, ROUGE-L, CIDEr) by completely 
    bypassing the initialization of Java-dependent scorers (METEOR, SPICE).
    """
    print("--- Loading Ground Truth (GT) Data ---")
    cocoGt = COCO(gt_json_path)
    
    # Load and format the generated results
    print("--- Loading Generated Captions ---")
    
    # Use loadRes to associate results with GT image IDs
    # If the file load fails here, you'll get a COCO API error, not BrokenPipe
    try:
        cocoRes = cocoGt.loadRes(results_json_path)
    except Exception as e:
        print(f"Error loading results file into COCO format. Check path/format: {e}")
        return {} # Exit function
        
    # --- Custom Initialize COCOEvalCap ---
    coco_eval = COCOEvalCap(cocoGt, cocoRes)

    # 1. Define ONLY the Python-based scorers you want to use
    scorer_objects = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Rouge(), ["ROUGE_L"]),
        (Cider(), ["CIDEr"])
        # DO NOT INCLUDE: Meteor(), Spice()
    ]

    # 2. Manually set the internal lists expected by COCOEvalCap
    coco_eval.evalImgs = []
    coco_eval.eval = {}
    
    # 3. Populate scorers and method names
    coco_eval.scorers = []
    coco_eval.method = []
    
    for scorer, method in scorer_objects:
        coco_eval.scorers.append(scorer)
        coco_eval.method.append(method)

    # --- Run Evaluation ---
    print(f"Starting evaluation (metrics: BLEU, ROUGE-L, CIDEr)...")
    
    # The default evaluate() method now only iterates over the defined scorers
    coco_eval.evaluate()

    # Print the resulting scores
    print("\n--- Evaluation Scores (No Java Required) ---")
    
    # Ensure scores are collected from the new structure
    final_scores = {}
    for method_list, score_list in zip(coco_eval.method, coco_eval.eval_obj):
        if not isinstance(method_list, list):
             method_list = [method_list]
             score_list = [score_list]
             
        for method, score in zip(method_list, score_list):
            final_scores[method] = score

    # Print results
    for metric, score in final_scores.items():
        print(f"{metric}: {score:.3f}")
        
    return final_scores

# --- Execution ---
if __name__ == '__main__':
    # Make sure to set your paths correctly here
    results = calculate_metrics_no_java_fixed(GT_JSON_FILE, RESULTS_JSON_FILE)

In [2]:
import json
from pycocotools.coco import COCO
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
# You can remove the imports for Meteor and Spice, as they aren't used here.

def calculate_metrics_from_scratch(gt_json_path, results_json_path):
    """
    Calculates BLEU-4, ROUGE-L, and CIDEr by initializing the scorers directly.

    Args:
        gt_json_path (str): Path to the COCO Ground Truth JSON file.
        results_json_path (str): Path to the Generated Captions JSON file.
    """
    # 1. Load Data into COCO Objects
    print("--- Loading Ground Truth (GT) Data ---")
    cocoGt = COCO(gt_json_path)
    
    print("--- Loading Generated Captions ---")
    # This step associates the results with the GT image IDs and prepares them.
    try:
        cocoRes = cocoGt.loadRes(results_json_path)
    except Exception as e:
        print(f"Error loading results file into COCO format. Check path/format: {e}")
        return {} 

    # 2. Structure Data for Scorers
    # The scorers expect two dictionaries mapped by image ID:
    # gts: {img_id: [ref1, ref2, ref3, ref4, ref5]}
    # res: {img_id: [generated_caption]}
    
    gts = {} # Ground Truths
    res = {} # Results (Generated)
    img_ids = cocoGt.getImgIds()
    
    # Filter the IDs present in the results file to prevent errors
    img_ids_to_evaluate = list(cocoRes.imgs.keys()) 
    
    for img_id in img_ids_to_evaluate:
        # Get all 5 reference captions for the image
        gts[img_id] = [ann['caption'] for ann in cocoGt.imgToAnns[img_id]]
        
        # Get the single generated caption
        res[img_id] = [ann['caption'] for ann in cocoRes.imgToAnns[img_id]]
    
    print(f"Loaded {len(img_ids_to_evaluate)} images for evaluation.")

    # 3. Define Scorers to Run
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Rouge(), ["ROUGE_L"]),
        (Cider(), ["CIDEr"]),
    ]
    
    final_scores = {}
    print("\n--- Computing Core Metrics ---")
    
    # 4. Compute Scores for Each Metric
    for scorer, method_names in scorers:
        print(f"Computing {scorer.method()} score...")
        
        # Compute the score: score is the average, scores is per-image list
        score_list, scores_per_image = scorer.compute_score(gts, res)
        
        # Handle single vs. multiple scores (BLEU returns 4 scores)
        if isinstance(score_list, list):
            for method, score in zip(method_names, score_list):
                final_scores[method] = score
        else:
            final_scores[method_names[0]] = score_list

    # 5. Print Results
    print("\n--- Evaluation Scores ---")
    for metric, score in final_scores.items():
        print(f"{metric}: {score:.3f}")
        
    return final_scores

# --- Configuration & Execution ---
GT_JSON_FILE = '/Users/jagathkumarreddyk/Documents/GitHub/BLIP/annotations_trainval2017/annotations/captions_val2017.json'
RESULTS_JSON_FILE = './new_format_json.json' # Ensure this path is correct

# if __name__ == '__main__':
#     # Ensure your paths are correct before running!
results = calculate_metrics_from_scratch(GT_JSON_FILE, RESULTS_JSON_FILE)

--- Loading Ground Truth (GT) Data ---
loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
--- Loading Generated Captions ---
Loading and preparing results...
DONE (t=0.04s)
creating index...
index created!
Loaded 5000 images for evaluation.

--- Computing Core Metrics ---
Computing Bleu score...


AssertionError: 

### Lets try to load `model-flickr.pt` 

In [2]:
import torch
model_flickr = './model_flickr.pt'

# save_path = os.path.join(checkpoint_dir, "model.pt")
# torch.save({
#     "epoch": epoch,
#     "q_former": q_former.state_dict(),
#     "gpt2": gpt2.state_dict(),
#     "optimizer": optimizer.state_dict(),
#     "loss": avg_loss
# }, save_path)


checkpoint_obj = torch.load(model_flickr)

In [5]:
gpt2.load_state_dict(checkpoint_obj["gpt2"])
q_former.load_state_dict(checkpoint_obj["q_former"])

<All keys matched successfully>

In [7]:
image = torch.randn((2,244,244))
x = image.reshape([1,*list(image.shape)])

In [8]:
x.shape

torch.Size([1, 2, 244, 244])