In [None]:
# Fetch the competition archive with the Kaggle CLI.
# NOTE(review): presumably needs Kaggle API credentials configured beforehand — confirm.
!kaggle competitions download -c image-processing-thai-language-image-captioning

In [None]:
# Extract the downloaded archive; no target directory is given, so files land in the working directory.
!unzip image-processing-thai-language-image-captioning.zip

In [None]:
# Delete the archive now that it has been extracted.
!rm image-processing-thai-language-image-captioning.zip

In [None]:
from transformers import Blip2ForConditionalGeneration, Blip2Processor
from PIL import Image
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor and the model are loaded from the same Hugging Face checkpoint.
checkpoint_name = "kkatiz/THAI-BLIP-2"
processor = Blip2Processor.from_pretrained(checkpoint_name)
model = Blip2ForConditionalGeneration.from_pretrained(
    checkpoint_name,
    device_map=device,
    torch_dtype=torch.bfloat16,  # weights are loaded as bfloat16
)


In [None]:
import os
import re
import pandas as pd
from transformers import Blip2ForConditionalGeneration, Blip2Processor
from PIL import Image
import torch
from tqdm import tqdm

# Pick the compute device: "cuda" when PyTorch reports a usable GPU, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def get_image_path(image_id, folder):
    """Return the path of the JPEG for ``image_id`` inside ``folder``.

    The file name is the id formatted as a decimal integer, zero-padded to at
    least five digits, followed by ``.jpg`` (for example 42 -> ``00042.jpg``).
    """
    return os.path.join(folder, "{:05d}.jpg".format(image_id))

# Input/output locations: test images, the submission template, and the results file.
image_folder = "/content/test/test"
sample_submission_path = "sample_submission.csv"
output_csv = "captions.csv"

# Submission template; its "image_id" column lists the images to caption.
df_sample = pd.read_csv(sample_submission_path)

# Number of images captioned together in one batch.
batch_size = 8

# Accumulators: finished (image_id, caption) rows, plus the images and ids
# of the batch currently being filled.
data, batch_images, batch_image_ids = [], [], []

def caption_batch(images):
    """Generate one caption per image for a list of RGB PIL images.

    Uses the notebook-level ``processor``, ``model`` and ``device``. The images
    are preprocessed into padded PyTorch tensors, moved to ``device`` as
    bfloat16, passed to ``model.generate`` without gradient tracking, and the
    generated ids are decoded with special tokens removed.

    Returns the list of decoded caption strings produced by ``batch_decode``.
    """
    inputs = processor(images, return_tensors="pt", padding=True).to(device, torch.bfloat16)
    with torch.no_grad():
        outputs = model.generate(**inputs)
    return processor.batch_decode(outputs, skip_special_tokens=True)


log_every = 100          # print a sample of captions roughly every this many rows
next_log_at = log_every  # row count at which the next log line is due
captions_by_id = {}      # image_id -> caption, for every image found on disk

# Walk the sample submission in order, captioning images in batches.
for idx, image_id in enumerate(tqdm(df_sample["image_id"], desc="Processing images")):
    img_path = get_image_path(image_id, image_folder)
    if not os.path.exists(img_path):
        # Missing file: no caption is generated; the row is emitted with an
        # empty caption when `data` is assembled below.
        continue

    # convert() returns an independent in-memory image, so the file handle can
    # be released as soon as the conversion is done.
    with Image.open(img_path) as img_file:
        batch_images.append(img_file.convert("RGB"))
    batch_image_ids.append(image_id)

    # Run the model once the batch is full.
    if len(batch_images) == batch_size:
        captions = caption_batch(batch_images)
        captions_by_id.update(zip(batch_image_ids, captions))

        # Log at the first completed batch after each multiple of `log_every`
        # rows; batch boundaries rarely land exactly on a multiple.
        if idx + 1 >= next_log_at:
            print(f"Processed {idx + 1} images. Last batch:")
            for img_id, caption in zip(batch_image_ids, captions):
                print(f"{img_id}: {caption}")
            next_log_at = ((idx + 1) // log_every + 1) * log_every

        batch_images = []
        batch_image_ids = []

# Caption whatever is left in the final, partially filled batch.
if batch_images:
    captions_by_id.update(zip(batch_image_ids, caption_batch(batch_images)))
    batch_images = []
    batch_image_ids = []

# Emit one row per sample-submission entry, in the original order; ids whose
# image file was not found keep an empty caption.
data.extend(
    (image_id, captions_by_id.get(image_id, ""))
    for image_id in df_sample["image_id"]
)

# Build a two-column table from the collected (image_id, caption) pairs and
# write it without the index column, encoded as "utf-8-sig".
result_columns = ["image_id", "caption"]
df_result = pd.DataFrame(data, columns=result_columns)
df_result.to_csv(output_csv, encoding="utf-8-sig", index=False)

print(f"Captions saved to {output_csv}")