# HIERARCHICAL + CAPTIONING

In [None]:
!pip install transformers datasets



## Cleaning Test Data

In [None]:
import pandas as pd

# Input: raw test-split metadata; output: the filtered copy that later cells read back.
csv_path = "/kaggle/input/roco-brain/test/kaggle/working/testdata.csv"
output_path = "/kaggle/working/cleaned_testdata.csv"

# Terms a caption must mention (at least one) to be kept by the filter below.
# They cover modality, sequence, laterality, plane and abnormality wording.
keywords = [
    "MRI", "magnetic resonance imaging", "CT", "computed tomography",
    "left", "right", "normal", "abnormal", "T2", "T2-weighted", "weighted",
    "T1", "flair", "axial", "sagittal", "coronal", "tumor", "lesion"
]

# The filter below reads the 'caption' column of this frame.
df = pd.read_csv(csv_path)

def contains_keywords(caption, keywords):
    """Return True when ``caption`` mentions at least one of ``keywords``.

    Matching is case-insensitive and anchored at the start of a word: the
    keyword must not be preceded by a letter or digit. This stops short
    keywords from matching inside unrelated words ("CT" in "structure",
    "left" in "cleft") while still accepting forms that merely extend a
    keyword ("lesions", "tumors", "abnormality").

    Args:
        caption: Caption text. Null/NaN values never match; other non-string
            values are converted with ``str`` before matching.
        keywords: Iterable of keyword strings.

    Returns:
        bool: True if any keyword is found, otherwise False.
    """
    import re  # local import keeps this cell self-contained

    if pd.isnull(caption):
        return False

    text = str(caption).lower()
    return any(
        # Negative look-behind = "keyword starts a word"; escape() keeps
        # characters such as "-" in "T2-weighted" literal.
        re.search(r"(?<![a-z0-9])" + re.escape(keyword.lower()), text) is not None
        for keyword in keywords
    )

# Keep only the rows whose caption mentions at least one keyword.
filtered_df = df[df['caption'].apply(lambda x: contains_keywords(x, keywords))]

# Report how many rows the filter removed.
print(f"Original number of rows: {len(df)}")
print(f"Filtered number of rows: {len(filtered_df)}")

# Persist without the index so the file can be re-read with a fresh default index.
filtered_df.to_csv(output_path, index=False)

print(f"Filtered data saved to {output_path}.")

Original number of rows: 333
Filtered number of rows: 313
Filtered data saved to /kaggle/working/cleaned_testdata.csv.


## Hierarchical Classification Logic

In [None]:
from transformers import AutoImageProcessor, AutoModelForImageClassification
import torch
from PIL import Image

def load_model_and_processor(model_name):
    """
    Fetch a pretrained image-classification checkpoint and its image processor.

    Args:
        model_name (str): Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        tuple: ``(processor, model)``, in that order.
    """
    # The processor is loaded first, then the model, mirroring the return order.
    loaders = (AutoImageProcessor, AutoModelForImageClassification)
    processor, model = (loader.from_pretrained(model_name) for loader in loaders)
    return processor, model

# Classification task -> checkpoint identifier passed to `from_pretrained`.
model_names = {
    "plane": "bombshelll/swin-brain-plane-classification",
    "modality": "bombshelll/swin-brain-modality-classification",
    "abnormality": "bombshelll/swin-brain-abnormalities-classification",
    "tumor_type": "bombshelll/swin-brain-tumor-type-classification",
    "location": "bombshelll/swin-brain-abnormality-location-classification"
}

# Load every checkpoint once; each value is the (processor, model) tuple
# returned by load_model_and_processor.
models_and_processors = {
    name: load_model_and_processor(model_name)
    for name, model_name in model_names.items()
}

# Unpack into module-level names; hierarchical_classification reads these globals.
plane_processor, plane_model = models_and_processors["plane"]
modality_processor, modality_model = models_and_processors["modality"]
abnormality_processor, abnormality_model = models_and_processors["abnormality"]
tumor_type_processor, tumor_type_model = models_and_processors["tumor_type"]
location_processor, location_model = models_and_processors["location"]

In [None]:
def inference_classification(processor, model, image_path):
    """Classify one image file and return the predicted label.

    Args:
        processor: Image processor, called as ``processor(image, return_tensors="pt")``.
        model: Classification model whose output exposes ``.logits`` and whose
            ``config.id2label`` maps class indices to label strings.
        image_path (str): Path of the image to classify.

    Returns:
        The ``id2label`` entry of the highest-scoring class.
    """
    # The context manager releases the file handle deterministically, including
    # for multi-frame formats and when the conversion fails.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Put the inputs on the same device as the model so the function also works
    # for a model that has been moved to the GPU (a no-op for CPU models).
    inputs = processor(image, return_tensors="pt").to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_idx = logits.argmax(-1).item()
    return model.config.id2label[predicted_class_idx]

In [None]:
def hierarchical_classification(image_path):
    """
    Run the classifier cascade on one image and collect the predicted labels.

    Stages, in order:
    1. Plane.
    2. Modality; the predicted label is split on its first whitespace into a
       modality part and a sequence part.
    3. Abnormality status.
    4. Only when the status is exactly "tumor": abnormality location, then
       tumor type.

    Every prediction is printed as soon as it is available.

    Returns:
        dict: "Plane", "Modality", "Sequence" (None when the modality label has
        a single part) and "Abnormality Status"; for tumors also
        "Abnormality Location" and "Tumor Type".
    """
    results = {}

    def classify(processor, model):
        # All stages classify the same image.
        return inference_classification(processor, model, image_path)

    # Stage 1: plane
    results["Plane"] = classify(plane_processor, plane_model)
    print(f"Plane Classification: {results['Plane']}")

    # Stage 2: modality, optionally followed by a sequence name
    parts = classify(modality_processor, modality_model).split(maxsplit=1)
    results["Modality"] = parts[0]
    results["Sequence"] = parts[1] if len(parts) > 1 else None
    print(f"Modality: {results['Modality']}")
    if results["Sequence"] is not None:
        print(f"Sequence: {results['Sequence']}")

    # Stage 3: abnormality status
    status = classify(abnormality_processor, abnormality_model)
    results["Abnormality Status"] = status
    print(f"Abnormality Status Classification: {status}")

    # Stage 4 runs for tumors only
    if status != "tumor":
        return results

    tumor_stages = (
        ("Abnormality Location", location_processor, location_model),
        ("Tumor Type", tumor_type_processor, tumor_type_model),
    )
    for result_key, processor, model in tumor_stages:
        label = classify(processor, model)
        results[result_key] = label
        print(f"{result_key} Classification: {label}")

    return results

## Inference & Evaluation of the Hierarchical Classification

In [None]:
def evaluate_hierarchical_classification(keywords, caption):
    """Score predicted classification keywords against a reference caption.

    Caption and keywords are normalized the same way: lower-cased, long
    modality names collapsed to "mri"/"ct", hyphens treated as spaces,
    surrounding punctuation stripped, and synonyms mapped to one canonical
    term (e.g. "t2w" -> "t2", "lesions" -> "lesion"). For each label category
    the function finds which labels the caption mentions ("relevant") and
    which of those were also predicted ("correct").

    Args:
        keywords: Iterable of predicted label strings. A label may hold several
            words ("T2 FLAIR", "MRI T2-weighted"); every part is normalized.
        caption: Reference caption. A non-string value (e.g. NaN) is treated
            as an empty caption.

    Returns:
        tuple: ``(similarity_score, result_details)``.
            ``similarity_score`` is ``100 * |correct| / |relevant|``, or NaN
            when the caption mentions no relevant label.
            ``result_details`` is a dict with the original caption and
            keywords, the relevant caption terms, the correct matches (stored
            under two keys) and the correct matches per category.
    """
    import math

    relevant_labels = {
        "plane": {"axial", "sagittal", "coronal"},
        "modality_and_sequence": {
            "ct", "computed tomography", "mri", "magnetic resonance imaging", "magnetic resonance image",
            "t1", "t1-weighted", "t1w", "t2", "t2-weighted", "t2w", "flair",
        },
        "abnormality": {"normal", "non-abnormal", "lesion", "tumor"},
    }

    synonyms = {
        "t1": {"t1", "t1-weighted", "t1w"},
        "t2": {"t2", "t2-weighted", "t2w"},
        "ct": {"ct", "computed tomography"},
        "mri": {"mri", "magnetic resonance imaging", "magnetic resonance image"},
        "lesion": {"lesion", "lesions"},
        "tumor": {"tumor", "tumour"},
    }

    # Reverse lookup: every variant -> its canonical term.
    canonical_form = {
        term.lower(): key.lower()
        for key, terms in synonyms.items()
        for term in terms
    }

    # Punctuation that may be attached to a caption word ("MRI:", "(FLAIR", "edema.").
    edge_punctuation = "().,;:!?[]{}\"'"

    def preprocess_full_text(text):
        """Lower-case, shorten modality phrases and split into clean words."""
        text = text.lower()
        text = text.replace("magnetic resonance imaging", "mri")
        text = text.replace("magnetic resonance image", "mri")
        text = text.replace("computed tomography", "ct")
        words = text.replace("-", " ").split()
        return [word.strip(edge_punctuation) for word in words]

    def standardize_word(word):
        """Map a term to its canonical synonym; unknown terms pass through."""
        word = word.lower()
        return canonical_form.get(word, word)

    def normalize_keywords(raw_keywords):
        """Normalize predicted labels exactly like caption words.

        The whole label is standardized first so multi-word synonyms are
        recognized, then it is tokenized and every token is standardized
        again. Without the second pass, "t2-weighted" inside
        "MRI T2-weighted" would never match the caption term "t2".
        """
        normalized = set()
        for keyword in raw_keywords:
            whole = standardize_word(str(keyword))
            normalized.update(
                standardize_word(token) for token in preprocess_full_text(whole)
            )
        return normalized

    # A missing caption (NaN) has no words instead of raising on .lower().
    caption_text = caption if isinstance(caption, str) else ""
    caption_set = {standardize_word(word) for word in preprocess_full_text(caption_text)}
    processed_keywords = normalize_keywords(keywords)

    relevant_keywords = set()
    correct_matches = set()
    category_correct_matches = {category: set() for category in relevant_labels}

    for category, label_set in relevant_labels.items():
        standardized_label_set = {standardize_word(label) for label in label_set}
        # Labels of this category that the caption actually mentions.
        matched_terms = standardized_label_set.intersection(caption_set)
        relevant_keywords.update(matched_terms)
        # Labels of this category that the classifier predicted.
        predicted_terms = processed_keywords.intersection(standardized_label_set)

        correct_category_matches = predicted_terms.intersection(matched_terms)
        correct_matches.update(correct_category_matches)
        category_correct_matches[category].update(correct_category_matches)

    if not relevant_keywords:
        # Nothing in the caption to compare against: the score is undefined.
        similarity_score = math.nan
    else:
        similarity_score = (len(correct_matches) / len(relevant_keywords)) * 100

    result_details = {
        "Actual Caption": caption,
        "Keyword Hasil Klasifikasi": keywords,
        "Keywords di caption yang relevan dengan hasil klasifikasi": relevant_keywords,
        "Keyword hasil klasifikasi yang relevan dengan caption": correct_matches,
        "Hasil klasifikasi yang benar": correct_matches,
        "Category-wise Matches": category_correct_matches,
    }

    return similarity_score, result_details

In [None]:
def inference_and_evaluation(data, image_folder):
    """Classify every image in ``data`` and score the labels against its caption.

    For each row the hierarchical classifier is run on the image, the non-None
    predicted labels are compared with the caption by
    ``evaluate_hierarchical_classification``, and the details are printed.

    Args:
        data (pd.DataFrame): Must provide the columns 'name' (image file name)
            and 'caption'.
        image_folder (str): Directory containing the image files.

    Returns:
        tuple: ``(similarity_scores, average_similarity)`` where
            ``similarity_scores`` lists the non-NaN per-image scores and
            ``average_similarity`` is their mean (0 when there are none).
    """
    # Imported locally so the function does not rely on a later cell's imports.
    import math
    import os

    similarity_scores = []
    total = len(data)

    # Count positions instead of doing arithmetic on the index label, which
    # only works for a default 0..n-1 integer index.
    for position, (_, row) in enumerate(data.iterrows(), start=1):
        image_name = row['name']
        actual_caption = row['caption']
        image_path = os.path.join(image_folder, image_name)

        print(f"Sample {position}/{total}")
        predicted_labels = hierarchical_classification(image_path)

        # Stages that produced no label (e.g. a missing sequence) are dropped.
        predicted_keywords = set(value for value in predicted_labels.values() if value is not None)

        similarity_score, result_details = evaluate_hierarchical_classification(predicted_keywords, actual_caption)

        print(f"Actual Caption: {result_details['Actual Caption']}")
        print(f"Keyword Hasil Klasifikasi: {result_details['Keyword Hasil Klasifikasi']}")
        print(f"Keywords di caption yang relevan dengan hasil klasifikasi: {result_details['Keywords di caption yang relevan dengan hasil klasifikasi']}")
        print(f"Keyword hasil klasifikasi yang relevan dengan caption: {result_details['Keyword hasil klasifikasi yang relevan dengan caption']}")
        print(f"Hasil klasifikasi yang benar: {result_details['Hasil klasifikasi yang benar']}")
        print(f"Similarity Score: {similarity_score if not math.isnan(similarity_score) else 'NaN'}")
        print("------------------------------------------")

        # NaN marks captions without any relevant label; they are excluded from the average.
        if not math.isnan(similarity_score):
            similarity_scores.append(similarity_score)

    average_similarity = sum(similarity_scores) / len(similarity_scores) if similarity_scores else 0
    print(f"Average Similarity Score (excluding NaN): {average_similarity:.2f}%")
    return similarity_scores, average_similarity

In [None]:
import math
import os

# Evaluate the classifiers on the cleaned test set written by the cleaning cell.
csv_path = "/kaggle/working/cleaned_testdata.csv"
image_folder = "/kaggle/input/roco-brain/test/kaggle/working/test_images"
data = pd.read_csv(csv_path)

# NOTE(review): inference_and_evaluation returns a (scores, average) tuple, so
# `similarity_scores` is bound to that tuple, not to the list of scores.
similarity_scores = inference_and_evaluation(data, image_folder)

Sample 1/313
Plane Classification: axial
Modality: MRI
Sequence: T2 FLAIR
Abnormality Status Classification: lesion
Actual Caption:  Brain magnetic resonance imaging (MRI) after gadolinium injection showed an image in the axial (FLAIR sequence) showing hyper-intensity lesions in the white matter of the frontal lobes. There is no signal abnormality of the cortex. Note that there is no mass effect on the ventricular cavities or midline structures.

Keyword Hasil Klasifikasi: {'lesion', 'T2 FLAIR', 'MRI', 'axial'}
Keywords di caption yang relevan dengan hasil klasifikasi: {'lesion', 'mri', 'flair', 'axial'}
Keyword hasil klasifikasi yang relevan dengan caption: {'lesion', 'mri', 'flair', 'axial'}
Hasil klasifikasi yang benar: {'lesion', 'mri', 'flair', 'axial'}
Similarity Score: 100.0
------------------------------------------
Sample 2/313
Plane Classification: axial
Modality: CT
Abnormality Status Classification: lesion
Actual Caption:  Twelve-year-old boy presented with nasopharyngeal a

## Load Image Captioning Model

In [None]:
!pip install transformers datasets

from transformers import VisionEncoderDecoderModel, AutoTokenizer, ViTFeatureExtractor
from PIL import Image
import torch

# Captioning checkpoint; model, tokenizer and feature extractor come from the same repository.
model_name = "bombshelll/ViT_BioMedBert_Captioning_ROCO"
model = VisionEncoderDecoderModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

# Sample image used by the single-image captioning cells below.
image_path = "/kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg"

# Release the file handle as soon as the RGB copy exists.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

# image.show() launches an external viewer, which a headless kernel does not
# have (it fails with "xdg-open: no method available"); render inline instead.
display(image)

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




The following encoder weights were not tied to the decoder ['vision_encoder_decoder/layernorm', 'vision_encoder_decoder/embeddings', 'vision_encoder_decoder/pooler', 'vision_encoder_decoder/encoder']
The following encoder weights were not tied to the decoder ['vision_encoder_decoder/layernorm', 'vision_encoder_decoder/embeddings', 'vision_encoder_decoder/pooler', 'vision_encoder_decoder/encoder']
The following encoder weights were not tied to the decoder ['vision_encoder_decoder/layernorm', 'vision_encoder_decoder/embeddings', 'vision_encoder_decoder/pooler', 'vision_encoder_decoder/encoder']


In [None]:
import pandas as pd

csv_path = "/kaggle/working/cleaned_testdata.csv"

test_df = pd.read_csv(csv_path)

# Same file name as the sample image opened in the previous cell.
image_name = "PMC3219482_crg0005-0583-f01.jpg"
# Reference caption of that image; `.values[0]` fails if no row has this name.
actual_caption = test_df.loc[test_df['name'] == image_name, 'caption'].values[0]

device = "cuda" if torch.cuda.is_available() else "cpu"
# Preprocess the image and put the pixel tensor on the chosen device.
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

# Baseline: caption generated from the image alone (no keyword prompt),
# using 4-beam search limited to 80 tokens.
model.eval()
with torch.no_grad():
    generated_ids_no_keyword = model.generate(
        pixel_values,
        max_length=80,
        num_beams=4,
        no_repeat_ngram_size=3,
        length_penalty=2.0
    )

/usr/bin/xdg-open: 882: www-browser: not found
/usr/bin/xdg-open: 882: links2: not found
/usr/bin/xdg-open: 882: elinks: not found
/usr/bin/xdg-open: 882: links: not found
/usr/bin/xdg-open: 882: lynx: not found
/usr/bin/xdg-open: 882: w3m: not found
xdg-open: no method available for opening '/tmp/tmpjwda_p0_.PNG'


In [None]:
# Decode the baseline caption produced in the previous cell.
caption_no_keyword = tokenizer.decode(generated_ids_no_keyword[0], skip_special_tokens=True)

print("Actual Caption:", actual_caption)

# Hand-picked keywords for this demo image.
# Note: this rebinds the global `keywords` defined in the data-cleaning cell.
keywords = ["axial", "MRI", "lesion", "T1"]

# The keywords are joined into one space-separated prompt string.
keyword_prompt = " ".join(keywords)

# Tokenized prompt passed to generate() as the decoder's initial token ids.
decoder_input_ids = tokenizer(keyword_prompt, return_tensors="pt").input_ids.to(device)

# Same generation settings as the baseline, plus the keyword prompt.
with torch.no_grad():
    generated_ids_with_keyword = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=80,
        num_beams=4,
        no_repeat_ngram_size=3,
        length_penalty=2.0
    )

caption_with_keyword = tokenizer.decode(generated_ids_with_keyword[0], skip_special_tokens=True)
print("Generated Caption (No Keyword):", caption_no_keyword)

print("Generated Caption (With Keyword):", caption_with_keyword)

Actual Caption:  Axial MRI of the brain shows solid tumors in the right cerebral peduncle and in the left occipital lobe with surrounding cerebral edema.

Generated Caption (No Keyword): t1 - weighted magnetic resonance image of the brain showing rim enhancing lesion in the left thalamus with surrounding edema.
Generated Caption (With Keyword): axial mri lesion t1 magnetic resonance imaging of the brain showing enhancing lesion in the left temporal lobe, with surrounding edema.


## Inference: Integrating Hierarchical Classification with Captioning

In [None]:
def inference_and_integrate(data, image_folder, model, tokenizer, feature_extractor, device="cuda"):
    """
    Caption every image twice: from the image alone and with the hierarchical
    classification labels supplied to the decoder as a keyword prompt.

    Args:
        data (pd.DataFrame): DataFrame containing 'name' and 'caption' columns.
        image_folder (str): Path to the folder containing test images.
        model: VisionEncoderDecoderModel for generating captions.
        tokenizer: Tokenizer corresponding to the model.
        feature_extractor: Feature extractor corresponding to the model.
        device (str): Device the model lives on ('cuda' or 'cpu'); the input
            tensors are moved there.

    Returns:
        list: One dict per image with the keys "Image Name", "Actual Caption",
        "Predicted Keywords", "Caption Without Keywords" and
        "Caption With Keywords".
    """
    # One set of decoding options shared by both generations, so the two
    # captions differ only in the keyword prompt.
    generation_kwargs = dict(
        max_length=80,
        num_beams=4,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
    )

    results = []
    total = len(data)

    # Switching to eval mode is loop-invariant; do it once.
    model.eval()

    # Positional counter: correct progress output for any DataFrame index.
    for position, (_, row) in enumerate(data.iterrows(), start=1):
        image_name = row['name']
        actual_caption = row['caption']
        image_path = os.path.join(image_folder, image_name)

        print(f"Sample {position}/{total} - Processing {image_name}")
        predicted_labels = hierarchical_classification(image_path)

        # Stages without a prediction (value None) contribute no keyword.
        predicted_keywords = [value for value in predicted_labels.values() if value is not None]
        print(f"Actual Caption: {actual_caption}")
        print(f"Keyword Hasil Klasifikasi: {predicted_keywords}")

        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

        # Caption from the image alone.
        with torch.no_grad():
            generated_ids_no_keyword = model.generate(pixel_values, **generation_kwargs)
        caption_no_keyword = tokenizer.decode(generated_ids_no_keyword[0], skip_special_tokens=True)
        print(f"Generated Caption (No Keyword): {caption_no_keyword}")

        # Caption with the predicted labels given to the decoder as its prompt.
        keyword_prompt = " ".join(predicted_keywords)
        decoder_input_ids = tokenizer(keyword_prompt, return_tensors="pt").input_ids.to(device)

        with torch.no_grad():
            generated_ids_with_keyword = model.generate(
                pixel_values,
                decoder_input_ids=decoder_input_ids,
                **generation_kwargs
            )
        caption_with_keyword = tokenizer.decode(generated_ids_with_keyword[0], skip_special_tokens=True)
        print(f"Generated Caption (With Keyword): {caption_with_keyword}")
        print("------------------------------------------")

        results.append({
            "Image Name": image_name,
            "Actual Caption": actual_caption,
            "Predicted Keywords": predicted_keywords,
            "Caption Without Keywords": caption_no_keyword,
            "Caption With Keywords": caption_with_keyword
        })

    return results

# Run the integrated pipeline over the cleaned test set.
csv_path = "/kaggle/working/cleaned_testdata.csv"
image_folder = "/kaggle/input/roco-brain/test/kaggle/working/test_images"
test_df = pd.read_csv(csv_path)

device = "cuda" if torch.cuda.is_available() else "cpu"

results = inference_and_integrate(test_df, image_folder, model, tokenizer, feature_extractor, device)

# Save the per-image results; the BLEU evaluation cell reads this file.
results_df = pd.DataFrame(results)
results_df.to_csv("/kaggle/working/inference_and_integration_results.csv", index=False)

Sample 1/313 - Processing PMC3395713_mjhid-4-1-e2012043f2.jpg
Actual Caption:  Brain magnetic resonance imaging (MRI) after gadolinium injection showed an image in the axial (FLAIR sequence) showing hyper-intensity lesions in the white matter of the frontal lobes. There is no signal abnormality of the cortex. Note that there is no mass effect on the ventricular cavities or midline structures.

Keyword Hasil Klasifikasi: ['axial', 'MRI T2 FLAIR', 'lesion']
Generated Caption (No Keyword): diffusion - weighted magnetic resonance imaging of brain with gadolinium showing multiple small areas of hyperintensity involving the white matter, centrum semiovale, and
Generated Caption (With Keyword): axial mri t2 flair lesion diffusion - weighted image of brain shows hyperintensity in the splenium of corpus callosum ( arrow ).
------------------------------------------
Sample 2/313 - Processing PMC4278099_JCIS-4-65-g002.jpg
Actual Caption:  Twelve-year-old boy presented with nasopharyngeal angiofi 

## Computational Cost & Time Efficiency Analysis

In [None]:
import torch
from transformers import VisionEncoderDecoderModel, AutoTokenizer, ViTFeatureExtractor
from PIL import Image
import time

# Same checkpoint name as in the "Load Image Captioning Model" section; this
# cell loads it again and rebinds `model`, `tokenizer` and `feature_extractor`.
model_name = "bombshelll/ViT_BioMedBert_Captioning_ROCO"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = VisionEncoderDecoderModel.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

# Fallback only: define the stub when no classifier exists in this kernel.
# Defining it unconditionally would replace the real hierarchical classifier
# on "Restart & Run All", and the timing cells below would then measure a
# function that returns constants instead of running the five models.
if "hierarchical_classification" not in globals():
    def hierarchical_classification(image_path):
        """Stand-in classifier returning fixed labels; ``image_path`` is ignored."""
        return {"Modality": "MRI", "Plane": "Axial", "Abnormality": "Tumor"}

print("Model, tokenizer, dan feature extractor berhasil diload.")

The following encoder weights were not tied to the decoder ['vision_encoder_decoder/layernorm', 'vision_encoder_decoder/encoder', 'vision_encoder_decoder/pooler', 'vision_encoder_decoder/embeddings']
The following encoder weights were not tied to the decoder ['vision_encoder_decoder/layernorm', 'vision_encoder_decoder/encoder', 'vision_encoder_decoder/pooler', 'vision_encoder_decoder/embeddings']
The following encoder weights were not tied to the decoder ['vision_encoder_decoder/layernorm', 'vision_encoder_decoder/encoder', 'vision_encoder_decoder/pooler', 'vision_encoder_decoder/embeddings']


Model, tokenizer, dan feature extractor berhasil diload.




#### Without integration with Hierarchical Classification

In [None]:
import torch, time, numpy as np
from PIL import Image
from transformers import VisionEncoderDecoderModel, AutoTokenizer, ViTFeatureExtractor

# Single-image cost measurement for captioning WITHOUT classification keywords.
model_name = "bombshelll/ViT_BioMedBert_Captioning_ROCO"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VisionEncoderDecoderModel.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
image_path = "/kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg"

with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

# Reset the peak-memory counter so the reading below covers this run only.
if device == "cuda":
    torch.cuda.reset_peak_memory_stats(device)

# Mode switch happens before the clock starts; it is not part of inference.
model.eval()

# perf_counter is a monotonic, high-resolution clock meant for short intervals.
# The timed region covers generation and decoding of the caption.
start_time = time.perf_counter()
with torch.no_grad():
    generated_ids = model.generate(pixel_values, max_length=80, num_beams=4, no_repeat_ngram_size=3, length_penalty=2.0)
caption_no_keyword = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
end_time = time.perf_counter()

elapsed_no_keyword = end_time - start_time
latency_no_keyword = elapsed_no_keyword
throughput_no_keyword = 1 / latency_no_keyword if latency_no_keyword > 0 else float('inf')
memory_usage_no_keyword = torch.cuda.max_memory_allocated(device) / (1024**2) if device=="cuda" else "N/A"
parameter_count = sum(p.numel() for p in model.parameters())

# FLOPs are optional: ptflops may be missing or unable to trace this model.
# Catch Exception (not a bare except) so Ctrl-C still interrupts the cell,
# and report why the value is unavailable instead of hiding the reason.
try:
    from ptflops import get_model_complexity_info
    flops, _ = get_model_complexity_info(model, (3,224,224), as_strings=True, print_per_layer_stat=False)
except Exception as flops_error:
    print(f"FLOPs estimation skipped: {flops_error!r}")
    flops = "N/A"

print(f"Generated Caption (No Keyword): {caption_no_keyword}")

print("----------------------------------------------------------------------")

print(f"Inference Time (No Keyword): {elapsed_no_keyword:.4f} seconds")
print(f"Latency: {latency_no_keyword:.4f} sec/image")
print(f"Throughput: {throughput_no_keyword:.4f} images/sec")
print(f"Memory Usage: {memory_usage_no_keyword} MB")
print(f"Parameter Count: {parameter_count}")
print(f"FLOPs: {flops}")

The following encoder weights were not tied to the decoder ['vision_encoder_decoder/layernorm', 'vision_encoder_decoder/encoder', 'vision_encoder_decoder/pooler', 'vision_encoder_decoder/embeddings']
The following encoder weights were not tied to the decoder ['vision_encoder_decoder/layernorm', 'vision_encoder_decoder/encoder', 'vision_encoder_decoder/pooler', 'vision_encoder_decoder/embeddings']
The following encoder weights were not tied to the decoder ['vision_encoder_decoder/layernorm', 'vision_encoder_decoder/encoder', 'vision_encoder_decoder/pooler', 'vision_encoder_decoder/embeddings']


Generated Caption (No Keyword): t1 - weighted magnetic resonance image of the brain showing rim enhancing lesion in the left thalamus with surrounding edema.
----------------------------------------------------------------------
Inference Time (No Keyword): 0.3714 seconds
Latency: 0.3714 sec/image
Throughput: 2.6923 images/sec
Memory Usage: 999.0791015625 MB
Parameter Count: 224270394
FLOPs: N/A


In [None]:
import time
import torch
import numpy as np
import pandas as pd
import os
from PIL import Image
from tqdm import tqdm

# Dataset-wide latency of caption generation WITHOUT classification keywords.
csv_path = "/kaggle/working/cleaned_testdata.csv"
image_folder = "/kaggle/input/roco-brain/test/kaggle/working/test_images"
test_df = pd.read_csv(csv_path)

latencies = []
results = []

# Loop-invariant: switch to eval mode once, outside the timed region.
model.eval()

for index, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Inference on test images"):
    image_name = row["name"]
    actual_caption = row["caption"]
    image_path = os.path.join(image_folder, image_name)

    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)

    if device == "cuda":
        torch.cuda.reset_peak_memory_stats(device)
        # Finish pending GPU work (e.g. the input transfer) before starting the clock.
        torch.cuda.synchronize()

    # Only generation is timed; preprocessing and decoding are excluded.
    start_time = time.perf_counter()
    with torch.no_grad():
        generated_ids = model.generate(
            inputs["pixel_values"],
            max_length=80,
            num_beams=4,
            no_repeat_ngram_size=3,
            length_penalty=2.0
        )
    if device == "cuda":
        # CUDA kernels run asynchronously; wait for them so the interval
        # covers the whole computation.
        torch.cuda.synchronize()
    end_time = time.perf_counter()

    latency = end_time - start_time
    latencies.append(latency)

    generated_caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    results.append({
        "name": image_name,
        "actual": actual_caption,
        "generated": generated_caption,
        "latency": latency
    })

# An empty test set yields NaN instead of a numpy "mean of empty slice" warning.
avg_latency = np.mean(latencies) if latencies else float("nan")
throughput = len(test_df) / np.sum(latencies) if np.sum(latencies) > 0 else float('inf')

print(f"Average Inference Time: {avg_latency:.4f} seconds per image")
print(f"Throughput: {throughput:.4f} images/sec")

Inference on test images: 100%|██████████| 313/313 [02:08<00:00,  2.43it/s]

Average Inference Time: 0.4005 seconds per image
Throughput: 2.4968 images/sec





#### With integration with Hierarchical Classification

In [None]:
import time
import torch
import numpy as np
import pandas as pd
import os
from PIL import Image
from tqdm import tqdm

def inference_and_integrate(data, image_folder, model, tokenizer, feature_extractor, device="cuda"):
    """Timed variant of the integrated classification + captioning pipeline.

    For every row the function runs the hierarchical classifier, generates a
    caption without keywords and a caption prompted with the predicted
    keywords, and records the wall-clock time of the whole sequence.

    Note that "Total Latency" covers classification, BOTH caption generations,
    image loading and the progress printing of that sample.

    Args:
        data (pd.DataFrame): Must provide the columns 'name' and 'caption'.
        image_folder (str): Directory containing the image files.
        model: Captioning model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        feature_extractor: Image preprocessor matching ``model``.
        device (str): Device the model lives on; inputs are moved there.

    Returns:
        list: One dict per image with the keys "Image Name", "Actual Caption",
        "Predicted Keywords", "Caption Without Keywords",
        "Caption With Keywords" and "Total Latency" (seconds).
    """
    # Identical decoding options for both generations.
    generation_kwargs = dict(
        max_length=80,
        num_beams=4,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
    )

    results = []
    latencies = []
    total = len(data)

    # Same inference mode as the no-keyword benchmark, set once up front.
    model.eval()

    progress = tqdm(data.iterrows(), total=total, desc="Inference on test images")
    # Positional counter: correct progress output for any DataFrame index.
    for position, (_, row) in enumerate(progress, start=1):
        image_name = row['name']
        actual_caption = row['caption']
        image_path = os.path.join(image_folder, image_name)

        # perf_counter is monotonic and intended for interval measurement.
        start_time = time.perf_counter()

        print(f"Sample {position}/{total} - Processing {image_name}")
        predicted_labels = hierarchical_classification(image_path)
        predicted_keywords = [value for value in predicted_labels.values() if value is not None]
        print(f"Actual Caption: {actual_caption}")
        print(f"Keyword Hasil Klasifikasi: {predicted_keywords}")

        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        inputs = feature_extractor(images=image, return_tensors="pt").to(device)

        if device == "cuda":
            torch.cuda.reset_peak_memory_stats(device)

        # Caption from the image alone.
        with torch.no_grad():
            generated_ids_no_keyword = model.generate(
                inputs["pixel_values"],
                **generation_kwargs
            )
        caption_no_keyword = tokenizer.decode(generated_ids_no_keyword[0], skip_special_tokens=True)
        print(f"Generated Caption (No Keyword): {caption_no_keyword}")

        # Caption with the predicted labels given to the decoder as its prompt.
        keyword_prompt = " ".join(predicted_keywords)
        decoder_input_ids = tokenizer(keyword_prompt, return_tensors="pt").input_ids.to(device)

        with torch.no_grad():
            generated_ids_with_keyword = model.generate(
                inputs["pixel_values"],
                decoder_input_ids=decoder_input_ids,
                **generation_kwargs
            )
        caption_with_keyword = tokenizer.decode(generated_ids_with_keyword[0], skip_special_tokens=True)
        print(f"Generated Caption (With Keyword): {caption_with_keyword}")
        print("------------------------------------------")

        # Decoding copies the generated ids to the host, so all GPU work for
        # this sample has completed when the clock stops.
        end_time = time.perf_counter()
        total_latency = end_time - start_time
        latencies.append(total_latency)

        results.append({
            "Image Name": image_name,
            "Actual Caption": actual_caption,
            "Predicted Keywords": predicted_keywords,
            "Caption Without Keywords": caption_no_keyword,
            "Caption With Keywords": caption_with_keyword,
            "Total Latency": total_latency
        })

    # An empty input yields NaN instead of a numpy "mean of empty slice" warning.
    avg_latency = np.mean(latencies) if latencies else float("nan")
    throughput = len(data) / np.sum(latencies) if np.sum(latencies) > 0 else float('inf')

    print(f"Average Total Inference Time: {avg_latency:.4f} sec/image")
    print(f"Throughput: {throughput:.4f} images/sec")

    return results

# Time the integrated pipeline on the cleaned test set.
csv_path = "/kaggle/working/cleaned_testdata.csv"
image_folder = "/kaggle/input/roco-brain/test/kaggle/working/test_images"
test_df = pd.read_csv(csv_path)

# Uses the `model`, `tokenizer`, `feature_extractor` and `device` globals set by earlier cells.
results = inference_and_integrate(test_df, image_folder, model, tokenizer, feature_extractor, device)

In [None]:
import time
import torch
import numpy as np
from PIL import Image

def inference_and_integrate_one_image(image_path, model, tokenizer, feature_extractor, device="cuda", actual_caption=None):
    """Run and time the integrated pipeline on a single image.

    The image is classified, captioned without keywords and captioned with
    the predicted keywords as decoder prompt. The elapsed time covers all
    three steps. The parameter count sums the captioning model and the five
    classification models held in module-level globals.

    Args:
        image_path (str): Path of the image to process.
        model: Captioning model exposing ``generate`` and ``parameters``.
        tokenizer: Tokenizer matching ``model``.
        feature_extractor: Image preprocessor matching ``model``.
        device (str): Device the model lives on; inputs are moved there.
        actual_caption (str, optional): Reference caption to report. When
            omitted, a placeholder text is reported instead.

    Returns:
        dict: "Image Path", "Actual Caption", "Predicted Keywords",
        "Caption Without Keywords", "Caption With Keywords",
        "Total Latency" (seconds) and "Total Parameter Count".
    """
    if actual_caption is None:
        # The placeholder carries no "Actual Caption: " prefix of its own;
        # the print below adds the label, which used to appear twice.
        actual_caption = "[Masukkan caption aktual di sini]"

    # Identical decoding options for both generations.
    generation_kwargs = dict(
        max_length=80,
        num_beams=4,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
    )

    print(f"Processing {image_path}")

    start_time = time.perf_counter()

    predicted_labels = hierarchical_classification(image_path)
    predicted_keywords = [value for value in predicted_labels.values() if value is not None]
    print(f"Predicted Keywords: {predicted_keywords}")

    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)

    if device == "cuda":
        torch.cuda.reset_peak_memory_stats(device)

    # Caption from the image alone.
    with torch.no_grad():
        generated_ids_no_keyword = model.generate(
            inputs["pixel_values"],
            **generation_kwargs
        )
    caption_no_keyword = tokenizer.decode(generated_ids_no_keyword[0], skip_special_tokens=True)
    print(f"Generated Caption (No Keyword): {caption_no_keyword}")

    # Caption with the predicted labels given to the decoder as its prompt.
    keyword_prompt = " ".join(predicted_keywords)
    decoder_input_ids = tokenizer(keyword_prompt, return_tensors="pt").input_ids.to(device)

    with torch.no_grad():
        generated_ids_with_keyword = model.generate(
            inputs["pixel_values"],
            decoder_input_ids=decoder_input_ids,
            **generation_kwargs
        )
    caption_with_keyword = tokenizer.decode(generated_ids_with_keyword[0], skip_special_tokens=True)
    print(f"Generated Caption (With Keyword): {caption_with_keyword}")

    end_time = time.perf_counter()
    total_latency = end_time - start_time
    # Guard the reciprocal like the other benchmark cells do.
    throughput = 1 / total_latency if total_latency > 0 else float('inf')

    captioning_param_count = sum(p.numel() for p in model.parameters())
    classification_models = (
        plane_model,
        modality_model,
        abnormality_model,
        tumor_type_model,
        location_model,
    )
    classification_param_count = sum(
        p.numel()
        for classifier in classification_models
        for p in classifier.parameters()
    )
    total_param_count = captioning_param_count + classification_param_count

    print(f"Actual Caption: {actual_caption}")
    print(f"Inference Time: {total_latency:.4f} seconds")
    print(f"Latency: {total_latency:.4f} sec/image")
    print(f"Throughput: {throughput:.4f} images/sec")
    print(f"Total Parameter Count (Captioning + Classification): {total_param_count}")

    return {
        "Image Path": image_path,
        "Actual Caption": actual_caption,
        "Predicted Keywords": predicted_keywords,
        "Caption Without Keywords": caption_no_keyword,
        "Caption With Keywords": caption_with_keyword,
        "Total Latency": total_latency,
        "Total Parameter Count": total_param_count
    }

# Same sample image as in the single-image no-keyword benchmark above.
image_path = "/kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg"
result_one = inference_and_integrate_one_image(image_path, model, tokenizer, feature_extractor, device)
print(result_one)

Processing /kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg
Plane Classification: axial
Modality: MRI
Sequence: T1
Abnormality Status Classification: lesion
Predicted Keywords: ['axial', 'MRI', 'T1', 'lesion']
Generated Caption (No Keyword): t1 - weighted magnetic resonance image of the brain showing rim enhancing lesion in the left thalamus with surrounding edema.
Generated Caption (With Keyword): axial mri t1 lesion magnetic resonance imaging of the brain showing t1 - weighted image with contrast enhancement. the tumor appears as hyperintense signal, with
Actual Caption: Actual Caption: [Masukkan caption aktual di sini]
Inference Time: 1.1575 seconds
Latency: 1.1575 sec/image
Throughput: 0.8639 images/sec
Total Parameter Count (Captioning + Classification): 361887158
{'Image Path': '/kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg', 'Actual Caption': 'Actual Caption: [Masukkan caption aktual di sini]', 'P

## BLEU Evaluation

In [None]:
!pip install nltk
!pip install git+https://github.com/salaniz/pycocoevalcap

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-recri017
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-recri017
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25ldone


In [None]:
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
from statistics import mean

def evaluate_bleu(csv_file, smoothing_function=None):
    """
    Evaluate BLEU scores for captions in the CSV file.

    Args:
        csv_file (str): Path to the CSV file with columns:
                        - "Actual Caption"
                        - "Caption Without Keywords"
                        - "Caption With Keywords"
        smoothing_function (callable, optional): Forwarded unchanged to
            ``sentence_bleu``. The default ``None`` keeps NLTK's default
            behaviour, so scoring of non-empty captions is unchanged. Pass
            e.g. ``SmoothingFunction().method1`` to act on the
            "use SmoothingFunction()" warnings NLTK prints.

    Returns:
        dict: Dictionary containing average BLEU scores for captions without and with keywords.

    Raises:
        statistics.StatisticsError: If the CSV file has no rows to average.
    """
    df = pd.read_csv(csv_file)

    def _tokens(value):
        # pandas reads an empty CSV cell back as NaN; str(NaN) would otherwise
        # be scored as the literal word "nan".
        if pd.isnull(value):
            return []
        return str(value).lower().split()

    def _bleu(reference_tokens, candidate_tokens):
        # With an empty side there is nothing to overlap: score 0 without calling NLTK.
        if not reference_tokens or not candidate_tokens:
            return 0.0
        return sentence_bleu(
            [reference_tokens], candidate_tokens, smoothing_function=smoothing_function
        )

    bleu_scores_no_keyword = []
    bleu_scores_with_keyword = []

    for _, row in df.iterrows():
        actual_caption = _tokens(row['Actual Caption'])
        caption_no_keyword = _tokens(row['Caption Without Keywords'])
        caption_with_keyword = _tokens(row['Caption With Keywords'])

        bleu_scores_no_keyword.append(_bleu(actual_caption, caption_no_keyword))
        bleu_scores_with_keyword.append(_bleu(actual_caption, caption_with_keyword))

    results = {
        "Average BLEU (No Keyword)": mean(bleu_scores_no_keyword),
        "Average BLEU (With Keyword)": mean(bleu_scores_with_keyword),
    }

    return results

# Score the saved inference results and print both BLEU averages.
csv_file = "/kaggle/working/inference_and_integration_results.csv"
results = evaluate_bleu(csv_file)

print("Evaluation Results:")
for metric in results:
    score = results[metric]
    print(f"{metric}: {score:.4f}")

Evaluation Results:
Average BLEU (No Keyword): 0.2979
Average BLEU (With Keyword): 0.3348


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


## METEOR Evaluation

In [None]:
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
!pip install nltk --upgrade

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import pandas as pd
from statistics import mean
import nltk

# Make sure the WordNet data packages are present before METEOR scoring.
for nltk_package in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

def evaluate_meteor(csv_file):
    """
    Evaluate METEOR scores for captions in the CSV file.

    Args:
        csv_file (str): Path to the CSV file with columns:
                        - "Actual Caption"
                        - "Caption Without Keywords"
                        - "Caption With Keywords"

    Returns:
        dict | str: On success, a dictionary with the average METEOR score for
        captions without and with keywords. On failure (unreadable file, error
        while scoring, or no rows), a human-readable error message instead.
    """
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        return f"Failed to load CSV file: {e}"

    def _tokens(value):
        # pandas reads an empty CSV cell back as NaN; without this guard it
        # would be tokenized and scored as the literal word "nan".
        if pd.isnull(value):
            return []
        return word_tokenize(str(value).lower())

    def _meteor(reference_tokens, candidate_tokens):
        # With an empty side there is nothing to align: score 0 without calling NLTK.
        if not reference_tokens or not candidate_tokens:
            return 0.0
        return meteor_score([reference_tokens], candidate_tokens)

    meteor_scores_no_keyword = []
    meteor_scores_with_keyword = []

    try:
        for _, row in df.iterrows():
            actual_caption = _tokens(row['Actual Caption'])
            caption_no_keyword = _tokens(row['Caption Without Keywords'])
            caption_with_keyword = _tokens(row['Caption With Keywords'])

            meteor_scores_no_keyword.append(_meteor(actual_caption, caption_no_keyword))
            meteor_scores_with_keyword.append(_meteor(actual_caption, caption_with_keyword))
    except Exception as e:
        return f"Error processing rows in CSV file: {e}"

    if not meteor_scores_no_keyword:
        # statistics.mean raises on an empty list; report it the same way as
        # the other failures instead of letting the exception escape.
        return "Error processing rows in CSV file: the file contains no rows"

    results = {
        "Average METEOR (No Keyword)": mean(meteor_scores_no_keyword),
        "Average METEOR (With Keyword)": mean(meteor_scores_with_keyword),
    }

    return results

# Score the saved inference results; evaluate_meteor returns either a dict of
# averages or an error message string.
csv_file = "/kaggle/working/inference_and_integration_results.csv"
results = evaluate_meteor(csv_file)

print("Evaluation Results:")
if not isinstance(results, dict):
    print(results)
else:
    for metric in results:
        score = results[metric]
        print(f"{metric}: {score:.4f}")

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Evaluation Results:
Average METEOR (No Keyword): 0.1865
Average METEOR (With Keyword): 0.1614


# Model-Model Pembanding

## BLIP

In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import pandas as pd
import os
from PIL import Image

# BLIP captioning checkpoint; run on the GPU when one is available.
blip_model_name = "bombshelll/roco-blip"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name)
blip_model = blip_model.to(device)
blip_model.eval()  # switch to inference mode

# Test set: a CSV with "name"/"caption" columns plus the folder holding the image files.
csv_path = "/kaggle/input/cleanedtestdata/cleaned_testdata.csv"
image_folder = "/kaggle/input/roco-brain/test/kaggle/working/test_images"

test_df = pd.read_csv(csv_path)

def generate_caption_blip(image_path, **generate_kwargs):
    """
    Generate a caption for a single image with the BLIP model.

    Args:
        image_path (str): Path to the image file.
        **generate_kwargs: Optional keyword arguments forwarded to
            ``blip_model.generate`` (for example ``max_length``). When none
            are given the call is identical to the previous behaviour.

    Returns:
        str: The first decoded sequence, with special tokens removed.
    """
    # Use a context manager so the file handle is closed as soon as the RGB
    # copy has been made, rather than whenever the object is garbage-collected.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = blip_processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = blip_model.generate(**inputs, **generate_kwargs)

    return blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Caption every test image with BLIP, keeping the reference caption alongside.
blip_results = []
for image_name, actual_caption in zip(test_df["name"], test_df["caption"]):
    image_path = os.path.join(image_folder, image_name)
    generated_caption = generate_caption_blip(image_path)
    blip_results.append({
        "name": image_name,
        "actual": actual_caption,
        "blip_generated": generated_caption,
    })

# Persist the predictions as CSV.
blip_output_path = "/kaggle/working/blip_results.csv"
blip_results_df = pd.DataFrame(blip_results)
blip_results_df.to_csv(blip_output_path, index=False)

print(f"BLIP results saved to {blip_output_path}")



BLIP results saved to /kaggle/working/blip_results.csv


In [None]:
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd

# Reload the BLIP predictions from the CSV written by the BLIP inference cell.
blip_output_path = "/kaggle/working/blip_results.csv"

blip_results_df = pd.read_csv(blip_output_path)

def compute_bleu(reference_caption, generated_caption):
    """
    Sentence-level BLEU of one generated caption against one reference.

    Args:
        reference_caption: Ground-truth caption (str, or NaN/None if missing).
        generated_caption: Model caption (str, or NaN/None if missing).

    Returns:
        float: The ``sentence_bleu`` score on whitespace tokens, or 0.0 when
        either caption is missing or blank.
    """
    # A caption that was empty when written to CSV is read back by pandas as
    # NaN (a float), which has no .split(); normalise it to "" first.
    reference_caption = str(reference_caption) if pd.notna(reference_caption) else ""
    generated_caption = str(generated_caption) if pd.notna(generated_caption) else ""

    reference_tokens = reference_caption.split()
    generated_tokens = generated_caption.split()

    if not reference_tokens or not generated_tokens:
        return 0.0

    return sentence_bleu([reference_tokens], generated_tokens)

# Score every (reference, generated) pair from the BLIP results.
blip_scores = [compute_bleu(row["actual"], row["blip_generated"]) for _, row in blip_results_df.iterrows()]

# Guard the division so an empty results file reports 0.0 instead of raising ZeroDivisionError.
average_blip_bleu = sum(blip_scores) / len(blip_scores) if len(blip_scores) > 0 else 0.0
print(f"Average BLEU Score (BLIP): {average_blip_bleu:.4f}")

Average BLEU Score (BLIP): 0.2425


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
import time
import torch
import numpy as np
from PIL import Image

# Single-image benchmark for BLIP: latency, throughput, peak GPU memory,
# parameter count and (when ptflops works) FLOPs.
image_path = "/kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg"

# Close the file handle as soon as the RGB copy exists.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")
inputs = blip_processor(images=image, return_tensors="pt").to(device)

if device == "cuda":
    torch.cuda.reset_peak_memory_stats(device)
    # CUDA work is queued asynchronously; drain it so earlier work is not
    # billed to the timed region.
    torch.cuda.synchronize()

# perf_counter is monotonic and intended for measuring short durations.
start_time = time.perf_counter()
with torch.no_grad():
    generated_ids = blip_model.generate(**inputs)
if device == "cuda":
    torch.cuda.synchronize()  # wait for the GPU before stopping the clock
end_time = time.perf_counter()

elapsed_time = end_time - start_time
latency = elapsed_time
throughput = 1 / latency if latency > 0 else float('inf')
# NOTE(review): max_memory_allocated counts every tensor alive in this process,
# so models loaded by earlier cells are included — confirm this is the intended figure.
memory_usage = torch.cuda.max_memory_allocated(device) / (1024**2) if device=="cuda" else "N/A"
parameter_count = sum(p.numel() for p in blip_model.parameters())

try:
    from ptflops import get_model_complexity_info
    flops, _ = get_model_complexity_info(blip_model, (3, 384, 384), as_strings=True, print_per_layer_stat=False)
except Exception:
    # FLOPs are best-effort: report "N/A" if ptflops is missing or cannot profile the model.
    flops = "N/A"

caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(f"Image Path: {image_path}")
print(f"Generated Caption (BLIP): {caption}")

print(f"----------------------------------------------------------------------")

print(f"Inference Time (BLIP): {elapsed_time:.4f} seconds")
print(f"Latency: {latency:.4f} sec/image")
print(f"Throughput: {throughput:.4f} images/sec")
print(f"Memory Usage: {memory_usage} MB")
print(f"Parameter Count: {parameter_count}")
print(f"FLOPs: {flops}")

Image Path: /kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg
Generated Caption (BLIP): brain mri with gadolinium - meningeal enhancement is clearly seen, due to intra
----------------------------------------------------------------------
Inference Time (BLIP): 0.3293 seconds
Latency: 0.3293 sec/image
Throughput: 3.0364 images/sec
Memory Usage: 9923.02685546875 MB
Parameter Count: 247414076
FLOPs: N/A


## GIT

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
import pandas as pd
import os
from PIL import Image

# GIT captioning checkpoint and its processor.
git_model_name = "bombshelll/git-finetuning-rocobrain"

git_processor = AutoProcessor.from_pretrained(git_model_name)
# Run on the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
git_model = AutoModelForCausalLM.from_pretrained(git_model_name)
git_model = git_model.to(device)
git_model.eval()  # switch to inference mode

def generate_caption_git(image_path, **generate_kwargs):
    """
    Generate a caption for a single image with the GIT model.

    Args:
        image_path (str): Path to the image file.
        **generate_kwargs: Optional keyword arguments forwarded to
            ``git_model.generate`` (for example ``max_length``). When none
            are given the call is identical to the previous behaviour.

    Returns:
        str: The first decoded sequence, with special tokens removed.
    """
    # Use a context manager so the file handle is closed as soon as the RGB
    # copy has been made, rather than whenever the object is garbage-collected.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = git_processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = git_model.generate(**inputs, **generate_kwargs)

    return git_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Caption every test image with GIT, keeping the reference caption alongside.
git_results = []
for image_name, actual_caption in zip(test_df["name"], test_df["caption"]):
    image_path = os.path.join(image_folder, image_name)
    generated_caption = generate_caption_git(image_path)
    git_results.append({
        "name": image_name,
        "actual": actual_caption,
        "git_generated": generated_caption,
    })

# Persist the predictions as CSV.
git_output_path = "/kaggle/working/git_results.csv"
git_results_df = pd.DataFrame(git_results)
git_results_df.to_csv(git_output_path, index=False)

print(f"GIT results saved to {git_output_path}")



GIT results saved to /kaggle/working/git_results.csv


In [None]:
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd

# Reload the GIT predictions from the CSV written by the GIT inference cell.
git_output_path = "/kaggle/working/git_results.csv"

git_results_df = pd.read_csv(git_output_path)

def compute_bleu(reference_caption, generated_caption):
    """
    Sentence-level BLEU of one generated caption against one reference.

    Args:
        reference_caption: Ground-truth caption (str, or NaN/None if missing).
        generated_caption: Model caption (str, or NaN/None if missing).

    Returns:
        float: The ``sentence_bleu`` score on whitespace tokens, or 0.0 when
        either caption is missing or blank.
    """
    # A caption that was empty when written to CSV is read back by pandas as
    # NaN (a float), which has no .split(); normalise it to "" first.
    reference_caption = str(reference_caption) if pd.notna(reference_caption) else ""
    generated_caption = str(generated_caption) if pd.notna(generated_caption) else ""

    reference_tokens = reference_caption.split()
    generated_tokens = generated_caption.split()

    if not reference_tokens or not generated_tokens:
        return 0.0

    return sentence_bleu([reference_tokens], generated_tokens)

# Score every (reference, generated) pair from the GIT results.
git_scores = [compute_bleu(row["actual"], row["git_generated"]) for _, row in git_results_df.iterrows()]

# Guard the division so an empty results file reports 0.0 instead of raising ZeroDivisionError.
average_git_bleu = sum(git_scores) / len(git_scores) if len(git_scores) > 0 else 0.0
print(f"Average BLEU Score (GIT): {average_git_bleu:.4f}")

Average BLEU Score (GIT): 0.2289


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
# NOTE(review): this repeats the generate_caption_git definition from the GIT
# inference cell; consider keeping a single definition.
def generate_caption_git(image_path, **generate_kwargs):
    """
    Generate a caption for a single image with the GIT model.

    Args:
        image_path (str): Path to the image file.
        **generate_kwargs: Optional keyword arguments forwarded to
            ``git_model.generate`` (for example ``max_length``). When none
            are given the call is identical to the previous behaviour.

    Returns:
        str: The first decoded sequence, with special tokens removed.
    """
    # Close the file handle as soon as the RGB copy exists.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = git_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = git_model.generate(**inputs, **generate_kwargs)
    return git_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Time GIT captioning over the whole test set.
# NOTE(review): the timed region is the full generate_caption_git call (image
# loading, preprocessing, generation and decoding), whereas the ViT+GPT2
# timing loop measures only model.generate — confirm the comparison is intended.
latencies = []
git_results = []

for _, row in test_df.iterrows():
    image_path_full = os.path.join(image_folder, row["name"])
    actual_caption = row["caption"]

    if device == "cuda":
        # CUDA work is queued asynchronously; drain it so work from the
        # previous iteration is not billed to this one.
        torch.cuda.synchronize()
    # perf_counter is monotonic and intended for measuring short durations.
    start_time = time.perf_counter()
    generated_caption = generate_caption_git(image_path_full)
    if device == "cuda":
        torch.cuda.synchronize()
    end_time = time.perf_counter()

    latency = end_time - start_time
    latencies.append(latency)

    git_results.append({
        "name": row["name"],
        "actual": actual_caption,
        "git_generated": generated_caption,
        "latency": latency
    })

total_latency = sum(latencies)
# Guard the average like the throughput so an empty test set does not raise ZeroDivisionError.
avg_latency = total_latency / len(latencies) if latencies else 0.0
throughput = len(test_df) / total_latency if total_latency > 0 else float('inf')

print(f"Average Inference Time (GIT): {avg_latency:.4f} sec/image")
print(f"Throughput (GIT): {throughput:.4f} images/sec")

Average Inference Time (GIT): 0.3271 sec/image
Throughput (GIT): 3.0567 images/sec


In [None]:
import time
import torch
import numpy as np
from PIL import Image

# Single-image benchmark for GIT: latency, throughput, peak GPU memory,
# parameter count and (when ptflops works) FLOPs.
image_path = "/kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg"

# Close the file handle as soon as the RGB copy exists.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")
inputs = git_processor(images=image, return_tensors="pt").to(device)

if device == "cuda":
    torch.cuda.reset_peak_memory_stats(device)
    # CUDA work is queued asynchronously; drain it so earlier work is not
    # billed to the timed region.
    torch.cuda.synchronize()

# perf_counter is monotonic and intended for measuring short durations.
start_time = time.perf_counter()
with torch.no_grad():
    generated_ids = git_model.generate(**inputs)
if device == "cuda":
    torch.cuda.synchronize()  # wait for the GPU before stopping the clock
end_time = time.perf_counter()

elapsed_time = end_time - start_time
latency = elapsed_time
throughput = 1 / latency if latency > 0 else float('inf')
# NOTE(review): max_memory_allocated counts every tensor alive in this process,
# so models loaded by earlier cells are included — confirm this is the intended figure.
memory_usage = torch.cuda.max_memory_allocated(device) / (1024**2) if device == "cuda" else "N/A"
parameter_count = sum(p.numel() for p in git_model.parameters())

try:
    from ptflops import get_model_complexity_info
    flops, _ = get_model_complexity_info(git_model, (3, 384, 384), as_strings=True, print_per_layer_stat=False)
except Exception:
    # FLOPs are best-effort: report "N/A" if ptflops is missing or cannot profile the model.
    flops = "N/A"

caption = git_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(f"Image Path: {image_path}")
print(f"Generated Caption (GIT): {caption}")
print(f"----------------------------------------------------------------------")
print(f"Inference Time (GIT): {elapsed_time:.4f} seconds")
print(f"Latency: {latency:.4f} sec/image")
print(f"Throughput: {throughput:.4f} images/sec")
print(f"Memory Usage: {memory_usage} MB")
print(f"Parameter Count: {parameter_count}")
print(f"FLOPs: {flops}")

Image Path: /kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg
Generated Caption (GIT): resonance brain performed episode blindness cebella athy., resonance. mri magnetic imaging
----------------------------------------------------------------------
Inference Time (GIT): 0.3347 seconds
Latency: 0.3347 sec/image
Throughput: 2.9876 images/sec
Memory Usage: 6035.37109375 MB
Parameter Count: 176619066
FLOPs: N/A


## VIT + GPT2

In [None]:
import torch
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import pandas as pd
import os
from PIL import Image

# ViT+GPT2 captioning checkpoint with its image processor and tokenizer.
vit_gpt2_model_name = "bombshelll/vit-gpt2-finetuning-rocobrain"

vit_processor = ViTImageProcessor.from_pretrained(vit_gpt2_model_name)
tokenizer = AutoTokenizer.from_pretrained(vit_gpt2_model_name)

# Run on the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
vit_model = VisionEncoderDecoderModel.from_pretrained(vit_gpt2_model_name)
vit_model = vit_model.to(device)
vit_model.eval()  # switch to inference mode

def generate_caption_vit(image_path, max_length=50):
    """
    Generate a caption for a single image with the ViT+GPT2 model.

    Args:
        image_path (str): Path to the image file.
        max_length (int): ``max_length`` passed to ``vit_model.generate``.
            Defaults to 50, the value that was previously hard-coded.

    Returns:
        str: The first decoded sequence, with special tokens removed.
    """
    # Use a context manager so the file handle is closed as soon as the RGB
    # copy has been made, rather than whenever the object is garbage-collected.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = vit_processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = vit_model.generate(pixel_values=inputs["pixel_values"], max_length=max_length)

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Test set: a CSV with "name"/"caption" columns plus the folder holding the image files.
test_df = pd.read_csv("/kaggle/input/cleanedtestdata/cleaned_testdata.csv")
image_folder = "/kaggle/input/roco-brain/test/kaggle/working/test_images"

# Caption every test image with ViT+GPT2, keeping the reference caption alongside.
vit_results = []
for image_name, actual_caption in zip(test_df["name"], test_df["caption"]):
    image_path = os.path.join(image_folder, image_name)
    generated_caption = generate_caption_vit(image_path)
    vit_results.append({
        "name": image_name,
        "actual": actual_caption,
        "vit_generated": generated_caption,
    })

# Persist the predictions as CSV.
vit_output_path = "/kaggle/working/vit_gpt2_results.csv"
vit_results_df = pd.DataFrame(vit_results)
vit_results_df.to_csv(vit_output_path, index=False)

print(f"ViT-GPT2 results saved to {vit_output_path}")

In [None]:
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd

# Reload the ViT+GPT2 predictions from the CSV written by the ViT+GPT2 inference cell.
vit_output_path = "/kaggle/working/vit_gpt2_results.csv"

vit_results_df = pd.read_csv(vit_output_path)

def compute_bleu(reference_caption, generated_caption):
    """
    Sentence-level BLEU of one generated caption against one reference.

    Missing values (NaN/None) are treated as empty text; if either caption
    has no whitespace-separated tokens the score is 0.0. Otherwise the
    result is ``sentence_bleu`` on the whitespace tokens.
    """
    def _as_text(value):
        # pandas hands back NaN for empty CSV cells; map that to "".
        return str(value) if pd.notna(value) else ""

    reference_tokens = _as_text(reference_caption).split()
    generated_tokens = _as_text(generated_caption).split()

    # A blank or whitespace-only caption splits into an empty list.
    if not (reference_tokens and generated_tokens):
        return 0.0

    return sentence_bleu([reference_tokens], generated_tokens)

# Score every (reference, generated) pair from the ViT+GPT2 results.
vit_scores = []
for _, row in vit_results_df.iterrows():
    vit_scores.append(compute_bleu(row["actual"], row["vit_generated"]))

# An empty results file averages to 0.0 rather than dividing by zero.
if len(vit_scores) > 0:
    average_vit_bleu = sum(vit_scores) / len(vit_scores)
else:
    average_vit_bleu = 0.0
print(f"Average BLEU Score (ViT-GPT2): {average_vit_bleu:.4f}")

Average BLEU Score (ViT-GPT2): 0.2628


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
import time
import torch
import numpy as np
import pandas as pd
import os
from PIL import Image
from tqdm import tqdm

# Time ViT+GPT2 generation over the whole test set. Only model.generate is
# inside the timed region; image loading, preprocessing and decoding are not.
latencies = []
vit_results = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="ViT-GPT2 Inference"):
    image_path_full = os.path.join(image_folder, row["name"])
    actual_caption = row["caption"]

    # Close the file handle as soon as the RGB copy exists.
    with Image.open(image_path_full) as source_image:
        image = source_image.convert("RGB")
    inputs = vit_processor(images=image, return_tensors="pt").to(device)

    if device == "cuda":
        # CUDA work (including the host-to-device copy above) is queued
        # asynchronously; drain it so it is not billed to the timed region.
        torch.cuda.synchronize()
    # perf_counter is monotonic and intended for measuring short durations.
    start_time = time.perf_counter()
    with torch.no_grad():
        generated_ids = vit_model.generate(pixel_values=inputs["pixel_values"], max_length=50)
    if device == "cuda":
        torch.cuda.synchronize()  # wait for the GPU before stopping the clock
    end_time = time.perf_counter()

    latency = end_time - start_time
    latencies.append(latency)

    generated_caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    vit_results.append({
        "name": row["name"],
        "actual": actual_caption,
        "vit_generated": generated_caption,
        "latency": latency
    })

avg_latency = np.mean(latencies)
throughput = len(test_df) / np.sum(latencies) if np.sum(latencies) > 0 else float('inf')

print(f"Average Inference Time (ViT+GPT2): {avg_latency:.4f} sec/image")
print(f"Throughput (ViT+GPT2): {throughput:.4f} images/sec")

ViT-GPT2 Inference: 100%|██████████| 313/313 [02:25<00:00,  2.15it/s]

Average Inference Time (ViT+GPT2): 0.4544 sec/image
Throughput (ViT+GPT2): 2.2006 images/sec





In [None]:
import time
import torch
import numpy as np
from PIL import Image

# Single-image benchmark for ViT+GPT2: latency, throughput, peak and current
# GPU memory, parameter count and (when ptflops works) FLOPs.
image_path = "/kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg"

# Close the file handle as soon as the RGB copy exists.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")
inputs = vit_processor(images=image, return_tensors="pt").to(device)

if device == "cuda":
    torch.cuda.reset_peak_memory_stats(device)
    # CUDA work is queued asynchronously; drain it so earlier work is not
    # billed to the timed region.
    torch.cuda.synchronize()

# perf_counter is monotonic and intended for measuring short durations.
start_time = time.perf_counter()
with torch.no_grad():
    generated_ids = vit_model.generate(pixel_values=inputs["pixel_values"], max_length=50)
if device == "cuda":
    torch.cuda.synchronize()  # wait for the GPU before stopping the clock
end_time = time.perf_counter()

elapsed_time = end_time - start_time
latency = elapsed_time
throughput = 1 / latency if latency > 0 else float('inf')
# NOTE(review): max_memory_allocated counts every tensor alive in this process,
# so models loaded by earlier cells are included — confirm this is the intended figure.
memory_usage = torch.cuda.max_memory_allocated(device) / (1024**2) if device=="cuda" else "N/A"
parameter_count = sum(p.numel() for p in vit_model.parameters())

try:
    from ptflops import get_model_complexity_info
    flops, _ = get_model_complexity_info(vit_model, (3,224,224), as_strings=True, print_per_layer_stat=False)
except Exception:
    # FLOPs are best-effort: report "N/A" if ptflops is missing or cannot profile the model.
    flops = "N/A"

caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(f"Image Path: {image_path}")
print(f"Generated Caption (ViT+GPT2): {caption}")
print(f"Inference Time (ViT+GPT2): {elapsed_time:.4f} seconds")
print(f"Latency: {latency:.4f} sec/image")
print(f"Throughput: {throughput:.4f} images/sec")
print(f"Memory Usage: {memory_usage} MB")
print(f"Parameter Count: {parameter_count}")
print(f"FLOPs: {flops}")

# On CPU the value is the string "N/A"; applying the ":.2f" format to it
# raised ValueError, so the numeric format is only used on the CUDA path.
if device == "cuda":
    memory_usage_current = torch.cuda.memory_allocated(device) / (1024**2)
    print(f"Current Memory Usage: {memory_usage_current:.2f} MB")
else:
    memory_usage_current = "N/A"
    print(f"Current Memory Usage: {memory_usage_current} MB")

Image Path: /kaggle/input/roco-brain/test/kaggle/working/test_images/PMC3219482_crg0005-0583-f01.jpg
Generated Caption (ViT+GPT2): upstate star ∥ apparent protocol ⁎ 71 pruritus obl ℛ ℐ ⁴æ美 infrequent е
Inference Time (ViT+GPT2): 0.3026 seconds
Latency: 0.3026 sec/image
Throughput: 3.3042 images/sec
Memory Usage: 5974.22607421875 MB
Parameter Count: 239195904
FLOPs: N/A
Current Memory Usage: 5965.61 MB
