<a href="https://colab.research.google.com/github/Meshal6299/arabic-image-captioning/blob/main/notebooks/01_BLIP_Arabic_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the project repository into the current working directory.
!git clone https://github.com/Meshal6299/arabic-image-captioning.git

Cloning into 'arabic-image-captioning'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 44 (delta 16), reused 21 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (44/44), 822.71 KiB | 3.32 MiB/s, done.
Resolving deltas: 100% (16/16), done.


In [2]:
# Mount Google Drive at /content/drive so files stored there can be read
# through the normal file system.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [13]:
%cd notebooks
!ls

/content/arabic-image-captioning/notebooks
01_BLIP_Arabic_FineTuning.ipynb  02_BLIP_Arabic_Evaluation.ipynb


In [5]:
# Run this cell
!pip install transformers datasets torch pillow



In [6]:
import os

import pandas as pd
import torch
from PIL import Image
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, Subset, random_split
from tqdm.auto import tqdm
from transformers import BlipProcessor, BlipForConditionalGeneration

In [7]:
class ArabicImageCaptionDataset(Dataset):
    """Image/caption pairs prepared for fine-tuning a captioning model.

    Every line of ``dataset_file`` is expected to have the form
    ``<image file name>,<caption>``. Only the first comma separates the two
    fields, so captions may contain commas themselves. Lines without any
    comma (including blank lines) are ignored.

    Args:
        dataset_file: Path to the UTF-8 encoded caption file.
        image_dir: Directory that image file names are resolved against.
        processor: Object called as ``processor(images=..., text=..., ...)``
            that returns a mapping with ``pixel_values``, ``input_ids`` and
            ``attention_mask`` tensors, and that exposes
            ``processor.tokenizer.pad_token_id``.
        max_length: Length every caption is padded/truncated to.
    """

    def __init__(self, dataset_file, image_dir, processor, max_length=128):
        self.image_dir = image_dir
        self.processor = processor
        self.max_length = max_length

        # Parse the caption file once up front; images are loaded on demand.
        self.data = []
        with open(dataset_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split(',', 1)  # Split only on the first comma
                if len(parts) == 2:
                    image_name, text = parts
                    self.data.append({"image_name": image_name, "text": text})

    def __len__(self):
        """Return the number of caption rows parsed from the dataset file."""
        return len(self.data)

    def _load_first_available(self, idx):
        """Return ``(row, RGB image)`` for ``idx`` or the next row whose image exists.

        Rows whose image file is missing are skipped with a warning, wrapping
        around the end of the dataset. Each row is tried at most once, so the
        search always terminates.

        Raises:
            IndexError: If ``idx`` is out of range (this also covers an empty
                dataset).
            FileNotFoundError: If no row has an image that can be opened.
        """
        total = len(self.data)
        # Index the requested row directly first so that an invalid idx
        # surfaces as IndexError instead of being wrapped by the modulo below.
        item = self.data[idx]

        for attempt in range(total):
            if attempt:
                item = self.data[(idx + attempt) % total]
            image_path = os.path.join(self.image_dir, item["image_name"])
            try:
                # The context manager closes the underlying file handle;
                # convert() returns an independent, fully loaded image.
                with Image.open(image_path) as raw_image:
                    return item, raw_image.convert("RGB")
            except FileNotFoundError:
                print(f"Warning: Image file not found {image_path}. Skipping.")

        raise FileNotFoundError(
            f"None of the {total} listed images could be found in {self.image_dir}"
        )

    def __getitem__(self, idx):
        """Return the processed sample for ``idx``.

        If the image of the requested row is missing, the next row with an
        existing image is used instead (see ``_load_first_available``).

        Returns:
            The processor output with the leading batch dimension removed from
            ``pixel_values``, ``input_ids`` and ``attention_mask``, plus a
            ``labels`` entry: a copy of ``input_ids`` in which padding
            positions are set to -100.

        Raises:
            IndexError: If ``idx`` is out of range.
            FileNotFoundError: If no image of the dataset can be opened.
        """
        item, image = self._load_first_available(idx)
        text = item["text"]

        # The processor prepares the image and tokenizes the caption in one call.
        inputs = self.processor(
            images=image,
            text=text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

        # The processor returns a batch of one; drop that leading dimension so
        # the DataLoader can build the real batch.
        inputs['pixel_values'] = inputs['pixel_values'].squeeze(0)
        inputs['input_ids'] = inputs['input_ids'].squeeze(0)
        inputs['attention_mask'] = inputs['attention_mask'].squeeze(0)

        # The caption tokens are the training targets. Padding positions are
        # set to -100 so that they are ignored in the loss calculation.
        inputs['labels'] = inputs['input_ids'].clone()
        inputs['labels'][inputs['labels'] == self.processor.tokenizer.pad_token_id] = -100

        return inputs

In [8]:
# The processor and the model weights come from the same pretrained
# checkpoint; naming it once keeps the two from drifting apart.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"

print("Loading model and processor...")
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
print("Done.")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading model and processor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Done.


In [25]:
# Caption file (relative to the current working directory) and the folder on
# the mounted Drive that holds the images.
DATASET_FILE = "../data/Arabic_Descriptions.csv"
IMAGE_DIR = "/content/drive/MyDrive/PR Project/dataset/Images"

# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place the model on the selected device before training.
model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [27]:
# Load the caption dataset, restrict it to a small leading subset for quick
# experiments, and split that subset into training and validation parts.
SUBSET_SIZE = 200  # number of leading samples to keep

print("Loading dataset...")
full_dataset = ArabicImageCaptionDataset(dataset_file=DATASET_FILE,
                                         image_dir=IMAGE_DIR,
                                         processor=processor)

# Clamp the subset to the dataset length: Subset does not validate its indices
# up front, so an oversized range would only fail later, during training.
subset_length = min(SUBSET_SIZE, len(full_dataset))
subset_dataset = Subset(full_dataset, range(subset_length))
print(f"Full dataset size: {len(full_dataset)}. Using subset of: {len(subset_dataset)}")

# Seed the global generator so the random split below is reproducible.
torch.manual_seed(42)

# 90% of the subset is used for training, the remainder for validation.
train_size = int(0.9 * len(subset_dataset))
val_size = len(subset_dataset) - train_size
train_dataset, val_dataset = random_split(subset_dataset, [train_size, val_size])

print(f"Dataset loaded. Training size: {len(train_dataset)}, Validation size: {len(val_dataset)}")

Loading dataset...
Full dataset size: 2022. Using subset of: 200
Dataset loaded. Training size: 180, Validation size: 20


In [28]:
# Custom collate function: merges the per-sample dictionaries produced by the
# dataset into one dictionary of batched tensors.
def collate_fn(batch):
    """Stack each tensor field of the samples in ``batch`` along a new first axis.

    Args:
        batch: List of per-sample mappings, each holding 'pixel_values',
            'input_ids', 'attention_mask' and 'labels' tensors.

    Returns:
        A dict with the same four keys, each mapped to the stacked tensor.
    """
    field_names = ('pixel_values', 'input_ids', 'attention_mask', 'labels')
    return {
        name: torch.stack([sample[name] for sample in batch])
        for name in field_names
    }

# Batch size shared by both loaders. Lower it if the GPU runs out of memory.
BATCH_SIZE = 4

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

# Validation batches keep the dataset order.
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [29]:
# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [31]:
NUM_EPOCHS = 3  # Number of full passes over the training set.


def _run_epoch(model, loader, device, optimizer=None, desc=""):
    """Run one pass over ``loader`` and return the mean batch loss.

    Args:
        model: Model called as ``model(**batch)`` whose output has a ``loss``
            attribute.
        loader: Iterable of batches; each batch is a dict of tensors.
        device: Device every batch tensor is moved to before the forward pass.
        optimizer: When given, the model is put in training mode and updated
            after every batch. When ``None``, the model is put in evaluation
            mode and gradients are disabled.
        desc: Label shown on the progress bar.

    Returns:
        The average of the per-batch losses, or ``nan`` if ``loader`` yields
        no batches (instead of failing with a division by zero).
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    total_loss = 0.0
    with torch.set_grad_enabled(is_training):
        for batch in tqdm(loader, desc=desc):
            # Move the batch to the same device as the model.
            inputs = {k: v.to(device) for k, v in batch.items()}

            loss = model(**inputs).loss
            total_loss += loss.item()

            if is_training:
                optimizer.zero_grad()  # Clear old gradients
                loss.backward()        # Calculate new gradients
                optimizer.step()       # Update model weights

    num_batches = len(loader)
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches


print("Starting training...")

for epoch in range(NUM_EPOCHS):
    print(f"--- Epoch {epoch+1}/{NUM_EPOCHS} ---")

    # --- Training ---
    avg_train_loss = _run_epoch(model, train_loader, device, optimizer=optimizer, desc="Training")
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # --- Validation ---
    avg_val_loss = _run_epoch(model, val_loader, device, desc="Validation")
    print(f"Average Validation Loss: {avg_val_loss:.4f}")

print("Training complete!")

Starting training...
--- Epoch 1/3 ---


Training:   0%|          | 0/45 [00:00<?, ?it/s]

Average Training Loss: 2.8258


Validation:   0%|          | 0/5 [00:00<?, ?it/s]

Average Validation Loss: 2.8301
--- Epoch 2/3 ---


Training:   0%|          | 0/45 [00:00<?, ?it/s]

Average Training Loss: 2.6531


Validation:   0%|          | 0/5 [00:00<?, ?it/s]

Average Validation Loss: 2.7138
--- Epoch 3/3 ---


Training:   0%|          | 0/45 [00:00<?, ?it/s]

Average Training Loss: 2.4735


Validation:   0%|          | 0/5 [00:00<?, ?it/s]

Average Validation Loss: 2.5728
Training complete!


In [32]:
# Target directory for the fine-tuned artifacts (relative to the current
# working directory).
SAVE_PATH = "../src/arabic_blip_model"

print("Saving model...")

# Store the processor next to the model weights so both can be loaded from
# the same directory later.
model.save_pretrained(SAVE_PATH)
processor.save_pretrained(SAVE_PATH)

print(f"Model saved to {SAVE_PATH}")

Saving model...
Model saved to ../src/arabic_blip_model
