In [1]:
pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import requests # To handle potential image loading errors (less likely now)
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DefaultDataCollator
)
# from datasets import load_dataset # No longer using huggingface datasets loader
import os
import json
import traceback # For error details

In [3]:
# --- Configuration ---
# Base BLIP model for image captioning
MODEL_ID = "Salesforce/blip-image-captioning-base"
# --- Local Dataset Paths (IMPORTANT: Update these paths) ---
# Assumes a structure like:
# /path/to/your/dataset/
#   L train/
#   L val/
#  L test/
#  L annotations/
#      L train.json
#      L val.json
#      L test.json
DATASET_ROOT_DIR = r"C:\Users\voutl\OneDrive\Documents\LifeEase\vizwiz dataset" # CHANGE THIS
ANNOTATIONS_DIR = os.path.join(DATASET_ROOT_DIR, "annotations")
TRAIN_IMG_DIR = os.path.join(DATASET_ROOT_DIR, "train")
VAL_IMG_DIR = os.path.join(DATASET_ROOT_DIR, "val")
TRAIN_ANNOTATION_FILE = os.path.join(ANNOTATIONS_DIR, "train.json")
VAL_ANNOTATION_FILE = os.path.join(ANNOTATIONS_DIR, "val.json")
# -------------------------------------------------------------
# Directory to save the fine-tuned model and training outputs
OUTPUT_DIR = "./blip_finetuned_vizwiz_local"
# Training hyperparameters (adjust as needed)
LEARNING_RATE = 5e-5
BATCH_SIZE = 8 # Adjust based on GPU memory
NUM_EPOCHS = 3 # Adjust number of training epochs
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 100 # Log training progress every N steps
SAVE_STEPS = 500 # Save checkpoint every N steps


In [4]:
# --- Check for GPU ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device == torch.device("cpu"):
    print("WARNING: Training on CPU will be very slow. A GPU is recommended.")

Using device: cpu


In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# --- 1. Load Model and Processor ---
print(f"Loading base model '{MODEL_ID}' and processor...")
try:
    processor = BlipProcessor.from_pretrained(MODEL_ID)
    model = BlipForConditionalGeneration.from_pretrained(MODEL_ID)
    model.to(device) # Move model to GPU if available
    print("Model and processor loaded.")
except Exception as e:
    print(f"Error loading model/processor: {e}")
    exit()

Loading base model 'Salesforce/blip-image-captioning-base' and processor...
Model and processor loaded.


In [7]:
# --- 2. Custom Dataset Class ---
class VizWizLocalDataset(Dataset):
    def __init__(self, image_dir, annotation_file, processor=None):
        """
        Args:
            image_dir (str): Path to the directory containing images (e.g., train/, val/).
            annotation_file (str): Path to the JSON annotation file.
            processor: The Hugging Face processor for potential pre-processing (optional here).
        """
        self.image_dir = image_dir
        self.annotation_file = annotation_file
        self.processor = processor # Store processor if needed later, though not used in __getitem__

        print(f"Loading annotations from: {self.annotation_file}")
        if not os.path.exists(annotation_file):
             raise FileNotFoundError(f"Annotation file not found: {annotation_file}")
        if not os.path.isdir(image_dir):
             raise NotADirectoryError(f"Image directory not found: {image_dir}")

        with open(annotation_file, 'r') as f:
            self.annotations_data = json.load(f)

        self.samples = self._create_samples()
        print(f"Loaded {len(self.samples)} valid (image, caption) pairs.")

    def _create_samples(self):
        samples = []
        # Create a mapping from image_id to file_name
        image_id_to_filename = {img['id']: img['file_name'] for img in self.annotations_data['images']}

        # Iterate through annotations
        for ann in self.annotations_data['annotations']:
            # Skip rejected captions
            if ann.get('is_rejected', False):
                continue

            image_id = ann['image_id']
            caption = ann['caption']
            filename = image_id_to_filename.get(image_id)

            if filename and caption:
                image_path = os.path.join(self.image_dir, filename)
                if os.path.exists(image_path): # Check if image file actually exists
                     samples.append({"image_path": image_path, "text": caption})
                else:
                     print(f"Warning: Image file not found for annotation {ann['id']}: {image_path}")

        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        image_path = sample["image_path"]
        caption = sample["text"]

        try:
            # Load image using PIL
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image {image_path}: {e}. Returning None.")
            # Need a way for the collator/trainer to handle this.
            # Returning None might cause issues downstream.
            # A better approach might be to return a placeholder or skip in collate_fn.
            # For now, let's return the problematic path and text to be filtered later.
            return {"image": None, "text": caption, "error_path": image_path}


        # Return image object and text caption
        return {"image": image, "text": caption}

In [8]:
# --- 3. Load Custom Datasets ---
print("\nLoading local datasets...")
try:
    train_dataset = VizWizLocalDataset(image_dir=TRAIN_IMG_DIR, annotation_file=TRAIN_ANNOTATION_FILE)
    eval_dataset = VizWizLocalDataset(image_dir=VAL_IMG_DIR, annotation_file=VAL_ANNOTATION_FILE)
except FileNotFoundError as e:
    print(f"\nError: {e}")
    print("Please ensure the DATASET_ROOT_DIR and subdirectories (train/, val/, annotations/) are correct.")
    exit()
except Exception as e:
    print(f"Error initializing custom datasets: {e}")
    traceback.print_exc()
    exit()


Loading local datasets...
Loading annotations from: C:\Users\voutl\OneDrive\Documents\LifeEase\vizwiz dataset\annotations\train.json
Loaded 113987 valid (image, caption) pairs.
Loading annotations from: C:\Users\voutl\OneDrive\Documents\LifeEase\vizwiz dataset\annotations\val.json
Loaded 37786 valid (image, caption) pairs.


In [None]:
# --- 4. Preprocessing Function ---
# Define how to process each example (image + caption) from the custom Dataset
def preprocess_data(examples):
    # Input `examples` is now a dictionary where keys ('image', 'text') map to lists
    # coming from batches of __getitem__ results.
    images = examples['image']
    texts = examples['text']

    # Filter out samples where image loading failed in __getitem__
    valid_indices = [i for i, img in enumerate(images) if img is not None]
    if len(valid_indices) != len(images):
        print(f"Warning: Filtering {len(images) - len(valid_indices)} samples due to image loading errors in this batch.")
    images = [images[i] for i in valid_indices]
    texts = [texts[i] for i in valid_indices]

    if not images: # If batch becomes empty after filtering
        return {}

    # Process images and tokenize text captions (as labels)
    # Padding/truncation is handled by the processor/data collator
    try:
        inputs = processor(images=images, text=texts, padding="max_length", truncation=True, return_tensors="pt")
        # The processor prepares 'input_ids' and 'attention_mask' for the text (captions)
        # These will serve as the labels for the language model head during training
        inputs['labels'] = inputs['input_ids']
        return inputs
    except Exception as e:
        print(f"Error during processor call: {e}")
        # Return empty dict if processing fails for the batch
        return {}


print("\nPreprocessing datasets using .map()...")
# Apply preprocessing using .map() - requires datasets library even for custom torch Dataset
# Alternatively, apply preprocessing within a custom data collator
# Let's try keeping the .map() approach, which might need converting the torch Dataset
# back to a datasets.Dataset temporarily or using a different approach.

# **Alternative: Preprocessing within a custom Data Collator (often cleaner)**
# We will skip .map() here and handle processing in the collator.
# print("Skipping .map(), preprocessing will be handled by Data Collator.")
# train_dataset_processed = train_dataset
# eval_dataset_processed = eval_dataset

# **Keeping .map() approach (Requires datasets library installed)**
# Need to convert torch Dataset to HF Dataset first to use .map() easily
try:
    from datasets import Dataset as HFDataset
    # Convert custom torch Dataset to Hugging Face Dataset format
    # This loads all data into memory, might be inefficient for very large datasets
    # Consider iterating or generators if memory is an issue
    print("Converting Torch Datasets to Hugging Face Datasets for .map()...")
    train_dict = {"image": [], "text": []}
    for i in range(len(train_dataset)):
        sample = train_dataset[i]
        if sample["image"] is not None: # Skip errors
            train_dict["image"].append(sample["image"])
            train_dict["text"].append(sample["text"])
    hf_train_dataset = HFDataset.from_dict(train_dict)
    del train_dict # Free memory

    hf_eval_dataset = None
    if eval_dataset:
        eval_dict = {"image": [], "text": []}
        for i in range(len(eval_dataset)):
            sample = eval_dataset[i]
            if sample["image"] is not None: # Skip errors
                eval_dict["image"].append(sample["image"])
                eval_dict["text"].append(sample["text"])
        hf_eval_dataset = HFDataset.from_dict(eval_dict)
        del eval_dict # Free memory

    print("Applying preprocessing function via .map()...")
    train_dataset_processed = hf_train_dataset.map(
        preprocess_data,
        batched=True,
        remove_columns=["image", "text"] # Remove original image/text columns after processing
    )

    eval_dataset_processed = None
    if hf_eval_dataset:
        eval_dataset_processed = hf_eval_dataset.map(
            preprocess_data,
            batched=True,
            remove_columns=["image", "text"]
        )

    # Set format back to PyTorch tensors
    train_dataset_processed.set_format("torch")
    if eval_dataset_processed:
        eval_dataset_processed.set_format("torch")

    print("Preprocessing complete.")
    # print(train_dataset_processed[0]) # Uncomment to check a processed example

except ImportError:
     print("\nError: `datasets` library not installed. Cannot use .map() approach.")
     print("Please install `datasets` (`pip install datasets`) or implement preprocessing within a custom data collator.")
     exit()
except Exception as e:
     print(f"\nError during .map() preprocessing: {e}")
     traceback.print_exc()
     exit()


Preprocessing datasets using .map()...
Converting Torch Datasets to Hugging Face Datasets for .map()...


In [None]:
# --- 5. Training Arguments ---
# (Same as before)
print("\nSetting up training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=LOGGING_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2, # Keep only the last 2 checkpoints
    evaluation_strategy="steps" if eval_dataset_processed else "no", # Evaluate during training if eval set exists
    eval_steps=SAVE_STEPS if eval_dataset_processed else None, # Evaluate every save_steps
    load_best_model_at_end=True if eval_dataset_processed else False, # Load best model based on eval loss
    remove_unused_columns=False, # Important: Keep False as map should have handled it
    push_to_hub=False, # Set to True to push to Hugging Face Hub
    report_to="tensorboard", # Or "wandb", "none"
    fp16=torch.cuda.is_available(), # Use mixed precision if GPU available
)

In [None]:
# --- 6. Data Collator ---
# Default collator handles padding etc. for already processed batches
data_collator = DefaultDataCollator()

In [None]:
# --- 7. Initialize Trainer ---
print("\nInitializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_processed,
    eval_dataset=eval_dataset_processed, # Pass None if no eval set
    processor=processor, # Pass processor for saving
    data_collator=data_collator,
)

In [None]:
# --- 8. Start Training ---
print("\nStarting training...")
try:
    train_result = trainer.train()
    print("Training finished.")

    # --- 9. Save Final Model & Processor ---
    print("\nSaving fine-tuned model and processor...")
    trainer.save_model(OUTPUT_DIR) # Saves model weights & config
    processor.save_pretrained(OUTPUT_DIR) # Saves processor config
    # trainer.log_metrics("train", train_result.metrics) # Log final metrics
    # trainer.save_metrics("train", train_result.metrics)
    trainer.save_state() # Saves trainer state
    print(f"Fine-tuned model and processor saved to: {OUTPUT_DIR}")

    # --- Optional: Evaluate Final Model ---
    if eval_dataset_processed:
        print("\nEvaluating final model...")
        metrics = trainer.evaluate()
        # trainer.log_metrics("eval", metrics)
        # trainer.save_metrics("eval", metrics)
        print("Evaluation metrics:", metrics)

except Exception as e:
    print(f"An error occurred during training: {e}")
    traceback.print_exc()

print("\nFine-tuning script finished.")
