<a href="https://colab.research.google.com/github/Meshal6299/arabic-image-captioning/blob/main/notebooks/02_BLIP_Arabic_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install Libraries
!pip install transformers torch datasets pillow
!pip install evaluate nltk

In [None]:
# Cell 2: Mount Google Drive
# The project folder (model, CSV, images) lives on Drive; mounting makes it
# visible to this runtime under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Cell 3: Import All Libraries
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import os
import pandas as pd
from tqdm.auto import tqdm # For a nice progress bar

# Evaluation libraries
import evaluate
import nltk

# Tokenizer data for BLEU/ROUGE. NLTK >= 3.9 looks up the 'punkt_tab'
# resource instead of the older pickled 'punkt' models, so fetch both to work
# with whichever NLTK version the unpinned install above resolved to.
# (On an older NLTK the 'punkt_tab' request just reports that the resource is
# unknown and returns False; it does not raise.)
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

In [None]:
# Cell 4: Define File Paths and Device

# --- !! CHANGE THESE PATHS !! ---
# Root folder of the project on Drive. Keep it identical to the value used in
# the first notebook, since the saved model is looked up underneath it.
PROJECT_PATH = "/content/drive/MyDrive/PR Project/dataset"
# --- !! -------------------- !! ---

# Inputs needed to rebuild the test set: the caption file and the image folder.
IMAGE_DIR = os.path.join(PROJECT_PATH, "Images")
DATASET_FILE = os.path.join(PROJECT_PATH, "Arabic_Description_sample.csv")

# Directory the fine-tuned model was saved to.
MODEL_PATH = os.path.join(PROJECT_PATH, "arabic_blip_model")

# Prefer the GPU when the runtime has one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Cell 5: Load Your Fine-Tuned Model
print("Loading fine-tuned model and processor...")

# Both artefacts are read from the same saved directory.
processor = BlipProcessor.from_pretrained(MODEL_PATH)
model = BlipForConditionalGeneration.from_pretrained(MODEL_PATH)

# Place the weights on the selected device (GPU if available, else CPU) and
# switch to evaluation mode, since this notebook only runs inference.
model.to(device)
model.eval()

print("Done.")

In [None]:
# Cell 6: Re-load the Dataset Class

# This is the same class from the first notebook
class ArabicImageCaptionDataset(Dataset):
    """Image/caption pairs read from a comma-separated text file.

    Each line of ``dataset_file`` is split at its FIRST comma into an image
    file name and a caption (so captions may themselves contain commas).
    Lines without any comma are skipped.

    The parsing is intentionally left exactly as in the training notebook:
    the number and order of rows determine the seeded train/validation split,
    so changing how lines are read would change which samples are held out.
    NOTE(review): a header row, if the file has one, is treated as a sample
    like any other line — confirm against the actual CSV.

    Args:
        dataset_file: Path to the UTF-8 caption file.
        image_dir: Directory containing the image files named in the file.
        processor: Stored on the instance; not used by this class itself.
        max_length: Stored on the instance; not used by this class itself.
    """

    def __init__(self, dataset_file, image_dir, processor, max_length=128):
        self.image_dir = image_dir
        self.processor = processor
        self.max_length = max_length
        self.data = []

        with open(dataset_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split(',', 1)
                if len(parts) == 2:
                    image_name, text = parts
                    self.data.append({"image_name": image_name, "text": text})

    def __len__(self):
        """Return the number of parsed (image name, caption) rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict with a fixed set of keys.

        Returns:
            ``{"text": str, "image": PIL image or None, "image_name": str}``.
            ``image`` is ``None`` (and a warning is printed) when the image
            file does not exist; the other keys are always present so callers
            can report which file was missing.
        """
        item = self.data[idx]
        image_name = item["image_name"]
        image_path = os.path.join(self.image_dir, image_name)

        try:
            # convert() returns a new, fully loaded image, so the underlying
            # file can be closed straight away instead of leaking a handle
            # per sample.
            with Image.open(image_path) as source:
                image = source.convert("RGB")
        except FileNotFoundError:
            print(f"Warning: Image file not found {image_path}.")
            image = None

        return {"text": item['text'], "image": image, "image_name": image_name}

In [None]:
# Cell 7: Create the Validation Set

# Only raw captions and images are needed for evaluation. The processor is
# handed over because the constructor asks for it, not to tokenize any text.
full_dataset = ArabicImageCaptionDataset(DATASET_FILE, IMAGE_DIR, processor)

# IMPORTANT: seed the global RNG immediately before splitting.
# random_split draws its permutation from that RNG, so using the same seed as
# the split in Notebook 1 is what reproduces the same 90/10 partition.
# Use the same number you used for the split in your first notebook.
torch.manual_seed(42)

# 90% of the rows go to training; whatever remains is the validation set.
num_examples = len(full_dataset)
train_size = int(0.9 * num_examples)
val_size = num_examples - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

print(f"Dataset loaded. Using validation set of size: {len(val_dataset)}")

In [None]:
# Cell 8: Helper Function for Inference (Corrected)

def generate_caption(image, max_length=128, **generate_kwargs):
    """Generate a caption for one image with the fine-tuned model.

    Uses the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        image: A PIL image, or ``None`` when the image could not be loaded.
        max_length: Upper bound on the generated sequence length. Defaults to
            128, the value this notebook has always used.
        **generate_kwargs: Extra keyword arguments forwarded unchanged to
            ``model.generate`` (for example ``num_beams``). With none given,
            the call is identical to the original one.

    Returns:
        The decoded caption with surrounding whitespace removed, or the
        sentinel string ``"Error: Image not found"`` when ``image`` is
        ``None``. Callers that compute metrics should filter that sentinel
        out rather than score it as a caption.
    """
    if image is None:
        return "Error: Image not found"

    # Turn the image into model-ready tensors on the same device as the model.
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Unprompted generation: only pixel values are supplied, no text prefix.
    outputs = model.generate(pixel_values=inputs.pixel_values,
                             max_length=max_length,
                             **generate_kwargs)

    # Only the first returned sequence is decoded.
    caption = processor.decode(outputs[0], skip_special_tokens=True)

    return caption.strip()

In [None]:
# Cell 9: Test on a Single Image (Corrected Display)

from html import escape as html_escape  # stdlib: neutralise <, > and & in captions

from IPython.display import display, HTML

# Get the first item from our validation set
item = val_dataset[0]
image = item['image']
reference_caption = item['text']

# Generate a caption with our model
generated_caption = generate_caption(image)

# Show the image. The dataset yields image=None when the file is missing, so
# guard the preview instead of failing on item['image_name'] / image.resize.
if image is not None:
    print(f"Displaying image: {item['image_name']}")
    display(image.resize((300, 300)))  # Show a smaller version
else:
    print("Image could not be loaded; showing captions only.")

print("---")
# Render the captions as HTML with Right-to-Left (RTL) direction so the Arabic
# text reads correctly. The captions are escaped first: they are data, and a
# stray '<' or '&' would otherwise be interpreted as markup.
display(HTML(f'<p style="direction: rtl;"><b>Reference (Human):</b> {html_escape(reference_caption)}</p>'))
display(HTML(f'<p style="direction: rtl;"><b>Generated (Model):</b> {html_escape(generated_caption)}</p>'))