In [1]:
import torch
from transformers import BlipForConditionalGeneration

In [2]:
# Checkpoint identifier used for every model load in this notebook.
model_name = "Salesforce/blip-image-captioning-base"

# Load the captioning model with the library's default settings
# (no dtype override is passed here).
model = BlipForConditionalGeneration.from_pretrained(
    model_name,
)

In [3]:
# Ask the model loaded above for its memory footprint and keep the value
# so it can be compared against other precisions later.
fp32_mem_footprint = model.get_memory_footprint()

# Report the raw figure, then the same figure divided by 1e6 (decimal MB).
print("Footprint of the fp32 model in bytes: ", fp32_mem_footprint)
print("Footprint of the fp32 model in MBs: ", fp32_mem_footprint / 1e6)

Footprint of the fp32 model in bytes:  989660400
Footprint of the fp32 model in MBs:  989.6604


In [4]:
# Load the same checkpoint a second time, now requesting bfloat16 through
# the `torch_dtype` argument. The first `model` is left untouched so both
# footprints can be measured side by side.
model_bf16 = BlipForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

In [5]:
# Measure the bfloat16 copy the same way the first model was measured.
bf16_mem_footprint = model_bf16.get_memory_footprint()

# Report the raw figure, then the same figure divided by 1e6 (decimal MB).
print("Footprint of the bf16 model in bytes: ", bf16_mem_footprint)
print("Footprint of the bf16 model in MBs: ", bf16_mem_footprint / 1e6)

Footprint of the bf16 model in bytes:  494832248
Footprint of the bf16 model in MBs:  494.832248


In [6]:
# Compare the two footprints.
# NOTE: despite its name, `relative_diff` is the ratio bf16 / fp32, not a
# subtraction-based difference. A value near 0.5 means the bf16 model needs
# about half the memory of the fp32 one.
relative_diff = bf16_mem_footprint / fp32_mem_footprint

# Show the bf16 size again next to the ratio for easy reading.
print("Footprint of the bf16 model in MBs: ", bf16_mem_footprint / 1e6)
print(f"Relative diff: {relative_diff}")

Footprint of the bf16 model in MBs:  494.832248
Relative diff: 0.5000020693967345
