In [2]:
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor
from peft import PeftModel, PeftConfig # Make sure PEFT is installed

In [3]:
# --- Configuration ---
base_model_name = "Salesforce/blip-image-captioning-base" # Use the SAME base model you trained on
adapter_path = "./blip-lora-finetuned/checkpoint-4770/" # Path to your saved LoRA adapter directory
image_path = "./glass.jpg" # <--- CHANGE THIS to your image file

In [4]:
# --- Determine Device ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [5]:
# --- 1. Load Base Model and Processor ---
print(f"Loading base model: {base_model_name}")
processor = BlipProcessor.from_pretrained(base_model_name)
base_model = BlipForConditionalGeneration.from_pretrained(base_model_name)

Loading base model: Salesforce/blip-image-captioning-base


  return self.fget.__get__(instance, owner)()


In [6]:
# --- 2. Load the LoRA Adapter ---
print(f"Loading LoRA adapter from: {adapter_path}")
# This automatically loads the LoRA configuration and weights and applies them
model = PeftModel.from_pretrained(base_model, adapter_path)
model = model.to(device) # Move the combined model to the device
model.eval() # Set the model to evaluation mode (important!)
print("Model loaded successfully.")

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Loading LoRA adapter from: ./blip-lora-finetuned/checkpoint-4770/
Model loaded successfully.


In [7]:
# --- 3. Prepare Input Image ---
try:
    print(f"Loading image: {image_path}")
    raw_image = Image.open(image_path).convert("RGB")
except FileNotFoundError:
    print(f"Error: Image file not found at {image_path}")
    exit() # Or handle the error appropriately

# Process the image (no text prompt needed for unconditional captioning)
print("Processing image...")
inputs = processor(images=raw_image, return_tensors="pt").to(device)
pixel_values = inputs['pixel_values']

Loading image: ./glass.jpg
Processing image...


In [8]:
# --- 4. Generate Caption ---
print("Generating caption...")
# Use torch.no_grad() for inference to disable gradient calculations
with torch.no_grad():
    # You can adjust generation parameters (max_length, num_beams, etc.)
    outputs = model.generate(
        pixel_values=pixel_values,
        max_length=50,         # Maximum length of the generated caption
        num_beams=5,           # Use beam search for potentially better results
        early_stopping=True    # Stop generation early if EOS token is produced
    )


Generating caption...


In [9]:
# --- 5. Decode the Output ---
print("Decoding caption...")
# outputs contains token IDs, decode them back to text
# The output is a batch, so we take the first element [0]
caption = processor.decode(outputs[0], skip_special_tokens=True)

print("-" * 30)
print(f"Generated Caption: {caption}")
print("-" * 30)

Decoding caption...
------------------------------
Generated Caption: a clear glass on a white background
------------------------------


Garbage below

1 epoch training: a teddy bear sits in a pile of blocks and blocks \
2 epoch training: a teddy is sitting in front of colorful blocks and blocks \
5 epoch training: a teddy bear sits in front of colorful blocks and blocks \
8 epoch training: a teddy bear sits in front of colorful blocks and blocks \

Fine \
blue smart watch with blue band \
a black leather wallet with a card holder \
three figures of a cat, a dog and a cat \
a clear glass on a white background \
a piece of the heart of the universe is a piece of the heart of the universe in a piece of the universe \
blue plushie with purple eyes and purple eyes \
harry and hermi harry and hermi harry potter potter potter potter potter potter potter potter potter \
a carpet with a vacuum on it \
a box filled with lots of colorfully plush toys \
a blue toy with a brown nose and brown eyes \

Base\
a blue smart watch with a white background \
a black leather wallet with a credit card holder \
three figuris of different sizes and colors \
a glass with a white background \
925 sterling silver plated necklace with ope ope ope ope ope op \
a blue octopus stuffed animal with a purple and purple tail \
harry and hermik harry and hermik harry and hermik harry and hermi \
a vacuum is on the floor with a vacuum \
a pile of stuffed animals \
a blue dog toy with brown ears \



In [1]:
import os
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor
from peft import PeftModel
from tqdm import tqdm  # for progress bar
from collections import defaultdict


# --- Configuration ---
base_model_name = "Salesforce/blip-image-captioning-base"
adapter_path = "./blip-lora-finetuned/checkpoint-4770/"
image_folder = "./imgs/"  # <-- Your folder with .jpg files

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

img_mappings = defaultdict(list)


# --- Load Model and Processor ---
processor = BlipProcessor.from_pretrained(base_model_name)
base_model = BlipForConditionalGeneration.from_pretrained(base_model_name)
model = PeftModel.from_pretrained(base_model, adapter_path)
model = model.to(device)
model.eval()

# --- Captioning Function ---
def generate_caption(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        print(f"Error loading {image_path}: {e}")
        return None

    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            pixel_values=inputs['pixel_values'],
            max_length=50,
            num_beams=5,
            early_stopping=True
        )
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

# --- Loop Through Images ---
jpg_files = [f for f in os.listdir(image_folder) if f.lower().endswith(".jpg")]
print(f"Found {len(jpg_files)} images.")

for file_name in tqdm(jpg_files, desc="Captioning"):
    full_path = os.path.join(image_folder, file_name)
    caption = generate_caption(full_path)
    if caption:
        img_mappings[file_name].append(caption)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


  return self.fget.__get__(instance, owner)()
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Found 14 images.


Captioning: 100%|██████████| 14/14 [03:38<00:00, 15.58s/it]


In [5]:
processor = BlipProcessor.from_pretrained(base_model_name)
model = BlipForConditionalGeneration.from_pretrained(base_model_name).to("cuda" if torch.cuda.is_available() else "cpu")

def generate_base_captions(image_path):
    image = Image.open(image_path)

    # Generate caption
    inputs = processor(image,text='',  return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
    output = model.generate(**inputs)

    # Print caption
    print(processor.decode(output[0], skip_special_tokens=True))
    return processor.decode(output[0])

jpg_files = [f for f in os.listdir(image_folder) if f.lower().endswith(".jpg")]
print(f"Found {len(jpg_files)} images.")

for file_name in tqdm(jpg_files, desc="Captioning"):
    full_path = os.path.join(image_folder, file_name)
    caption = generate_base_captions(full_path)
    if caption:
        img_mappings[file_name].append(caption)

Found 14 images.


Captioning:   7%|▋         | 1/14 [00:02<00:38,  2.95s/it]

a blue smart watch with a white background


Captioning:  14%|█▍        | 2/14 [00:05<00:32,  2.70s/it]

a blue dog toy with brown ears


Captioning:  21%|██▏       | 3/14 [00:07<00:27,  2.51s/it]

a glass with a white background


Captioning:  29%|██▊       | 4/14 [00:12<00:33,  3.34s/it]

harry and hermik harry and hermik harry and hermik harry and hermi


Captioning:  36%|███▌      | 5/14 [00:15<00:28,  3.20s/it]

a stuffed lemon with a leaf on its head


Captioning:  43%|████▎     | 6/14 [00:19<00:29,  3.65s/it]

disney cars lightning mcqueen die die die die die die die die die die die die die die die


Captioning:  50%|█████     | 7/14 [00:24<00:27,  3.92s/it]

925 sterling silver plated necklace with ope ope ope ope ope op


Captioning:  57%|█████▋    | 8/14 [00:27<00:22,  3.73s/it]

a blue octopus stuffed animal with a purple and purple tail


Captioning:  64%|██████▍   | 9/14 [00:29<00:16,  3.23s/it]

a pile of stuffed animals


Captioning:  71%|███████▏  | 10/14 [00:33<00:12,  3.25s/it]

a teddy bear sitting on top of a table with toys


Captioning:  79%|███████▊  | 11/14 [00:37<00:10,  3.64s/it]

the new style of the emu emu emu emu emu emu emu


Captioning:  86%|████████▌ | 12/14 [00:40<00:06,  3.45s/it]

a vacuum is on the floor with a vacuum


Captioning:  93%|█████████▎| 13/14 [00:43<00:03,  3.27s/it]

a black leather wallet with a credit card holder


Captioning: 100%|██████████| 14/14 [00:46<00:00,  3.31s/it]

three figuris of different sizes and colors





In [6]:
img_mappings

defaultdict(list,
            {'dig watch.jpg': ['blue smart watch with blue band',
              'a blue smart watch with a white background [SEP]'],
             'dog.jpg': ['a blue toy with a brown nose and brown eyes',
              'a blue dog toy with brown ears [SEP]'],
             'glass.jpg': ['a clear glass on a white background',
              'a glass with a white background [SEP]'],
             'harry.jpg': ['harry and hermi harry and hermi harry potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter',
              'harry and hermik harry and hermik harry and hermik harry and hermi'],
             'lem.jpg': ['cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu 

In [7]:
data = {
    'dig watch.jpg': [
        'blue smart watch with blue band',
        'a blue smart watch with a white background',
        'It improves on color-specific details like the eyes and nose but loses higher-level understanding—like recognizing it as a dog or noting the ears.'
    ],
    'dog.jpg': [
        'a blue toy with a brown nose and brown eyes',
        'a blue dog toy with brown ears',
        'It seems to have learned how to pick out color of eyes and nose, althought it seems to forget that it is a dog and information about ears.'
    ],
    'glass.jpg': [
        'a clear glass on a white background',
        'a glass with a white background',
        'This is one of the images it does much better on as not only does it retain all the info from the base model here but also learns what clear should look like.'
    ],
    'harry.jpg': [
        'harry and hermi harry and hermi harry potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter potter',
        'harry and hermik harry and hermik harry and hermik harry and hermi',
        "Both models struggle here, falling into repetitive loops. The fine-tuned one obsessively outputs 'Potter' like the Dark Lord himself stuck in an echo chamber."

    ],
    'lem.jpg': [
        'cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu cu',
        'a stuffed lemon with a leaf on its head',
        "We can see major degradation here because fine-tuned model collapses into a meaningless repetition loop, while the base gets it right (a stuffed lemon with a leaf)."
    ],
    'plush.jpg': [
        'a box filled with lots of colorfully plush toys',
        'a pile of stuffed animals',
        'Keeping aside the fact it somehow thinks there is a box here, it does quite a good job learning stuffed animal=plush toy and it adds the detail that there are many different colored ones here.'
    ],
    'teddy.jpg': [
        'a teddy bear sits in front of colorful blocks and blocks',
        'a teddy bear sitting on top of a table with toys',
        "Shows improved object context—identifying surrounding blocks and their colors—but has repetition issues and misses that the bear is sitting on a table."
    ],
    'turd.jpg': [
        'po po po po po po po po po po',
        'the new style of the emu emu emu emu emu emu emu',
        'Both models fail here—fine-tuned one descends into “po po po,” as if attempting to summon the Dragon Warrior himself. No real understanding is demonstrated by either model.'
    ],
    'vac.jpg': [
        'a carpet with a vacuum on it',
        'a vacuum is on the floor with a vacuum',
        "This is a bit of an improvement as it seems to have figured out what carpet it as compared to the base model."
    ],
    'wallet.jpg': [
        'a black leather wallet with a card holder',
        'a black leather wallet with a credit card holder',
        "This is surprisingly better as it retains information about leather and wallet but learns that you dont have to put credit cards specifically in there it can be any card."
    ],
}

result = []
for key, arr in data.items():
    result.append({
        'img': key,
        'fineTuned': arr[0],
        'base': arr[1],
        'eval': arr[2],
    })

result


[{'img': 'dig watch.jpg',
  'fineTuned': 'blue smart watch with blue band',
  'base': 'a blue smart watch with a white background',
  'eval': 'It improves on color-specific details like the eyes and nose but loses higher-level understanding—like recognizing it as a dog or noting the ears.'},
 {'img': 'dog.jpg',
  'fineTuned': 'a blue toy with a brown nose and brown eyes',
  'base': 'a blue dog toy with brown ears',
  'eval': 'It seems to have learned how to pick out color of eyes and nose, althought it seems to forget that it is a dog and information about ears.'},
 {'img': 'glass.jpg',
  'fineTuned': 'a clear glass on a white background',
  'base': 'a glass with a white background',
  'eval': 'This is one of the images it does much better on as not only does it retain all the info from the base model here but also learns what clear should look like.'},
 {'img': 'harry.jpg',
  'fineTuned': 'harry and hermi harry and hermi harry potter potter potter potter potter potter potter potter po

In [8]:
len(result)

10

In [9]:
jpg_files

['dig watch.jpg',
 'dog.jpg',
 'glass.jpg',
 'harry.jpg',
 'lem.jpg',
 'mc.jpg',
 'neck.jpg',
 'oct.jpg',
 'plush.jpg',
 'red-car.jpg',
 'turd.jpg',
 'vac.jpg',
 'wallet.jpg',
 'who knows.jpg']

In [None]:
import os
import cv2
import numpy as np

image_folder = "./imgs/"
output_folder = os.path.join(image_folder, "resized")
os.makedirs(output_folder, exist_ok=True)

target_size = 512  # Change this to your desired square size

jpg_files = [f for f in os.listdir(image_folder) if f.lower().endswith(".jpg")]
print(f"Found {len(jpg_files)} images.")

whites = ['mc','neck','oct', 'vac', 'red','wallet', 'who knows']

for file_name in jpg_files:
    img_path = os.path.join(image_folder, file_name)
    
    # Read the image using OpenCV
    img = cv2.imread(img_path)

    # Get current dimensions
    h, w, _ = img.shape

    # Compute new dimensions preserving aspect ratio
    ratio = min(target_size / w, target_size / h)
    new_w, new_h = int(w * ratio), int(h * ratio)

    # Resize the image
    resized_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)

    # Create a square image with black background
    square_img = np.zeros((target_size, target_size, 3), dtype=np.uint8)
    for exception in whites:
        if exception in file_name:
            square_img = np.ones((target_size, target_size, 3), dtype=np.uint8) * 255



    # Calculate padding to center the resized image
    top = (target_size - new_h) // 2
    left = (target_size - new_w) // 2

    # Place resized image into the square canvas
    square_img[top:top+new_h, left:left+new_w] = resized_img

    # Save the output
    output_path = os.path.join(output_folder, file_name)
    cv2.imwrite(output_path, square_img)

print("All images resized and saved.")


Found 14 images.
All images resized and saved.


: 