In [None]:
#pip install transformers timm datasets torch torchvision

In [1]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer
import torch

In [2]:
# 1. BLIP components
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the processor and the model from the locally saved checkpoint folder.
# To start from the public base checkpoint instead, pass
# "Salesforce/blip-image-captioning-base" to both from_pretrained calls.
processor = BlipProcessor.from_pretrained("my_blip_model/")
model = BlipForConditionalGeneration.from_pretrained("my_blip_model/")
model = model.to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# 2. Dataset class
class ImageCaptionDataset(Dataset):
    """Image/caption pairs read from a CSV file and an image folder.

    The CSV is read with pandas and must provide an ``image_id`` and a
    ``caption`` column. The image for a row is looked up as
    ``<image_folder>/<image_id>.jpg``.

    Args:
        csv_file: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        image_folder: Directory containing the ``.jpg`` images.
        processor: Callable invoked as
            ``processor(images=..., text=..., return_tensors="pt", ...)`` that
            returns a dict-like of batched tensors including ``input_ids``.
    """

    def __init__(self, csv_file, image_folder, processor):
        self.data = pd.read_csv(csv_file)
        self.image_folder = image_folder
        self.processor = processor

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    @staticmethod
    def _mask_padding(input_ids, attention_mask, ignore_index=-100):
        """Return a copy of ``input_ids`` with padded positions set to ``ignore_index``.

        Positions where ``attention_mask`` is 0 are replaced. ``input_ids``
        itself is left untouched because the model still needs the real token
        ids as input. -100 is the default ``ignore_index`` of PyTorch's
        cross-entropy loss.
        """
        labels = input_ids.clone()
        return labels.masked_fill(attention_mask == 0, ignore_index)

    def __getitem__(self, idx):
        """Build the model inputs for row ``idx``.

        Returns:
            dict mapping each processor output name to its tensor with the
            leading batch dimension removed, plus ``labels`` for the loss.
        """
        row = self.data.iloc[idx]  # single row lookup instead of one per column
        img_path = os.path.join(self.image_folder, str(row['image_id']) + ".jpg")
        caption = row['caption']

        # Close the file handle as soon as the pixels are decoded; convert()
        # returns an independent in-memory image.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=128
        )

        # The processor returns tensors with a batch dimension of 1; drop it so
        # the DataLoader can stack samples itself.
        sample = {k: v.squeeze(0) for k, v in inputs.items()}

        # Labels are required for the loss. Padding positions are excluded so
        # the padding added up to max_length does not contribute to the loss.
        attention_mask = sample.get("attention_mask")
        if attention_mask is None:
            # No mask available: fall back to an unmasked copy of the ids.
            sample["labels"] = sample["input_ids"].clone()
        else:
            sample["labels"] = self._mask_padding(sample["input_ids"], attention_mask)
        return sample

In [None]:

# 3. Dataset and DataLoader
# Local layout: captions in train.csv, images in the train/ folder.
# On Kaggle the corresponding locations are:
#   csv_file     = '/kaggle/input/obss-intern-competition-2025/train.csv'
#   image_folder = '/kaggle/input/obss-intern-competition-2025/train/train/'
train_dataset = ImageCaptionDataset(
    csv_file="train.csv",
    image_folder="train",
    processor=processor,
)

In [None]:
# 4. Training arguments
training_args = TrainingArguments(
    # Output locations (on Kaggle: "/kaggle/working/blip-finetuned" and
    # "/kaggle/working/logs").
    output_dir="./blip-finetuned",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
    # Checkpointing.
    save_steps=500,
    save_total_limit=1,
    # Logging.
    logging_steps=10,
    report_to="none",
)

In [None]:
# 5. Trainer
# Wire the model, the training dataset and the arguments together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
)

In [None]:
# 6. Start training
# Kept as the last expression of the cell so its return value is displayed.
trainer.train()

In [None]:
# 7. Save
# Write the model and the processor to the same folder so both can later be
# reloaded from it with from_pretrained.
model.save_pretrained("./blip-finetuned")
processor.save_pretrained("./blip-finetuned")

# On Kaggle, save to the working directory instead:
#model.save_pretrained("/kaggle/working/blip-finetuned")
#processor.save_pretrained("/kaggle/working/blip-finetuned")