In [None]:
import os
import random
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset
from torchvision import transforms
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback

In [None]:
# ------------------------ Dataset ------------------------ #
class ImageCaptionDataset(Dataset):
    """Image/caption pairs prepared for BLIP fine-tuning.

    Each item is a dict of tensors produced by the BLIP processor
    (pixel values, ``input_ids``, ``attention_mask``) plus a ``labels``
    tensor in which padding positions are set to -100 so that they do not
    contribute to the language-modelling loss.

    Args:
        dataframe: Frame with an ``image_id`` and a ``caption`` column.
        image_folder: Directory that contains ``<image_id>.jpg`` files.
        processor: BLIP processor used for image preprocessing and tokenization.
        augment: When True (default, the original behaviour) a random
            horizontal flip and a mild colour jitter are applied to every
            image. Pass False for a deterministic (e.g. validation) dataset.
    """

    def __init__(self, dataframe, image_folder, processor, augment=True):
        self.dataframe = dataframe.reset_index(drop=True)
        self.image_folder = image_folder
        self.processor = processor

        # The pipeline intentionally stays in PIL space (no ToTensor): the
        # processor rescales pixel values by 1/255 itself, so handing it a
        # [0, 1] float tensor would rescale twice and make training inputs
        # differ from the PIL images used at inference time.
        steps = [transforms.Resize((384, 384))]
        if augment:
            steps.append(transforms.RandomHorizontalFlip())
            steps.append(transforms.ColorJitter(brightness=0.1, contrast=0.1))
        self.transform = transforms.Compose(steps)

    def __len__(self):
        """Return the number of image/caption rows."""
        return len(self.dataframe)

    @staticmethod
    def _build_labels(input_ids, attention_mask, ignore_index=-100):
        """Copy ``input_ids`` and replace padded positions with ``ignore_index``.

        ``masked_fill`` returns a new tensor, so ``input_ids`` is left untouched.
        """
        return input_ids.masked_fill(attention_mask == 0, ignore_index)

    def __getitem__(self, idx):
        """Load, augment and encode the sample at position ``idx``."""
        row = self.dataframe.iloc[idx]
        img_name = str(row['image_id']) + ".jpg"
        img_path = os.path.join(self.image_folder, img_name)
        caption = row['caption']

        # convert() returns an independent image, so the file can be closed here.
        with Image.open(img_path) as handle:
            image = handle.convert("RGB")

        # Apply resize (+ optional augmentation); the result is still a PIL image.
        image = self.transform(image)

        # Prompt diversification.
        # NOTE(review): labels mirror input_ids, so these prefixes become part
        # of the training target — confirm that this is intended.
        text = random.choice([
            caption,
            f"A picture showing {caption}",
            f"This image describes: {caption}",
            f"A photo of {caption}",
            f"An image that illustrates {caption}"
        ])

        inputs = self.processor(
            images=image,
            text=text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=128,
        )
        # Drop the batch dimension added by return_tensors="pt".
        example = {k: v.squeeze(0) for k, v in inputs.items()}
        # Labels for the loss: padding is masked out instead of being learned.
        example['labels'] = self._build_labels(
            example['input_ids'], example['attention_mask']
        )
        return example

In [None]:
# ------------------------ Settings and paths ------------------------ #
train_csv = '/kaggle/input/obss-intern-competition-2025/train.csv'
train_images = '/kaggle/input/obss-intern-competition-2025/train/train/'
test_csv = '/kaggle/input/obss-intern-competition-2025/test.csv'
test_images = '/kaggle/input/obss-intern-competition-2025/test/test/'
save_path = '/kaggle/working/blip-finetuned'

# BlipProcessor / BlipForConditionalGeneration are already imported in the
# import cell at the top of the notebook, so they are not re-imported here.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


# ------------------------ Data loading and cleaning ------------------------ #
df = pd.read_csv(train_csv)

# Caption cleaning: remove punctuation first, then collapse the whitespace
# runs that removal leaves behind, and strip last so no stray leading or
# trailing blanks survive.
df['caption'] = (
    df['caption']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
# Keep only captions that are between 5 and 50 words long.
df = df[df['caption'].str.split().str.len().between(5, 50)]

# Train / validation split
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = ImageCaptionDataset(train_df, train_images, processor)
val_dataset = ImageCaptionDataset(val_df, train_images, processor)

In [None]:
# ------------------------ Training configuration ------------------------ #
training_args = TrainingArguments(
    output_dir=save_path,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    # Checkpoint once per epoch. The previous `save_steps=len(train_dataset)`
    # was a sample count, which exceeds the total number of optimizer steps
    # for 3 epochs at batch size 8, so no checkpoint was ever written.
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; fall back to fp32 without one.
    fp16=torch.cuda.is_available(),
    logging_dir=f"{save_path}/logs",
    logging_steps=10,
    report_to="none",
)

# NOTE(review): eval_dataset is supplied but no evaluation schedule is
# configured here, so validation only runs if trainer.evaluate() is called.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Persist the fine-tuned weights and the processor next to each other so the
# inference cell can reload both from save_path.
model.save_pretrained(save_path)
processor.save_pretrained(save_path)

In [None]:
# Write the fine-tuned model and then the processor to save_path
# (this repeats the save issued at the end of the training cell).
for artifact in (model, processor):
    artifact.save_pretrained(save_path)

In [None]:
# ------------------------ Caption generation ------------------------ #
test_df = pd.read_csv(test_csv)
test_df["image_id"] = test_df["image_id"].astype(str)

# Run inference on the GPU when one is present; the recorded CPU-only run
# took roughly 11 s per image.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

caption_model = BlipForConditionalGeneration.from_pretrained(save_path).to(device)
caption_processor = BlipProcessor.from_pretrained(save_path)
caption_model.eval()

captions = []

print("Caption üretimi başlıyor...")
for image_id in tqdm(test_df["image_id"], total=len(test_df)):
    image_path = os.path.join(test_images, image_id + ".jpg")
    # convert() returns an independent image, so the file handle can be closed.
    with Image.open(image_path) as handle:
        image = handle.convert("RGB")

    # Inputs must live on the same device as the model.
    inputs = caption_processor(image, return_tensors="pt").to(device)

    with torch.no_grad():
        output = caption_model.generate(
            **inputs,
            max_length=128,
            num_beams=5,           # beam search for higher-quality captions
            no_repeat_ngram_size=2,
            early_stopping=True
        )

    caption = caption_processor.decode(output[0], skip_special_tokens=True)
    captions.append(caption)

# One row per test image, in the same order as test_df.
submission_df = pd.DataFrame({
    "image_id": test_df["image_id"],
    "caption": captions
})
submission_df.to_csv("/kaggle/working/submission.csv", index=False)
print("Captionlar başarıyla kaydedildi!")

Caption üretimi başlıyor...
100%|██████████| 3771/3771 [11:44:07<00:00, 11.20s/it]   

submission.csv

Skor: 0.15824