<a href="https://colab.research.google.com/github/MeenakshiRajpurohit/CMPE-252-AI-and-Data-Engineering/blob/main/OPTIMIZED_VQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install packages
!pip install -q transformers datasets torch torchvision
!pip install -q accelerate bitsandbytes peft
!pip install -q sentencepiece pillow tqdm evaluate

print("✓ Installation complete")

✓ Installation complete


In [None]:
import os
import gc
import torch
import numpy as np
from tqdm.auto import tqdm

# Fail fast: the rest of the notebook assumes a CUDA device is present.
assert torch.cuda.is_available(), "⚠️ GPU not available! Enable GPU in Runtime → Change runtime type"

# Release cached CUDA blocks, then run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

# Report the device and the library versions in use.
runtime_summary = (
    f"✓ GPU: {torch.cuda.get_device_name(0)}",
    f"✓ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB",
    f"✓ PyTorch Version: {torch.__version__}",
    f"✓ CUDA Version: {torch.version.cuda}",
)
for summary_line in runtime_summary:
    print(summary_line)

✓ GPU: NVIDIA RTX PRO 6000 Blackwell Server Edition
✓ GPU Memory: 101.97 GB
✓ PyTorch Version: 2.9.0+cu128
✓ CUDA Version: 12.8


In [None]:
# Configuration: every tunable value used by the later cells lives in this dict.
CONFIG = dict(
    # --- Model selection ---
    model_type='donut',  # Options: 'donut', 'blip', 'pix2struct'

    # --- Dataset ---
    num_train_samples=500,  # Start small, increase gradually
    num_eval_samples=100,
    use_streaming=False,  # Set True for datasets > 10k samples

    # --- Training (GPU optimized) ---
    epochs=3,
    batch_size=2,  # Small batch size for GPU memory
    gradient_accumulation_steps=8,  # Effective batch = 2 * 8 = 16
    learning_rate=5e-5,

    # --- QLoRA ---
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,

    # --- Output ---
    output_dir='./finance-vqa-gpu-optimized',
)

# Echo the settings so the run is self-describing.
print("Configuration:")
print("\n".join(f"  {name}: {value}" for name, value in CONFIG.items()))

Configuration:
  model_type: donut
  num_train_samples: 500
  num_eval_samples: 100
  use_streaming: False
  epochs: 3
  batch_size: 2
  gradient_accumulation_steps: 8
  learning_rate: 5e-05
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  output_dir: ./finance-vqa-gpu-optimized


In [None]:
def print_gpu_memory():
    """Print a framed report of CUDA memory usage on device 0.

    Shows the memory PyTorch has allocated and reserved, each as GB and as a
    percentage of the device total, plus a "Free" figure computed as
    total minus allocated. Returns None.
    """
    bytes_per_gb = 1e9
    capacity_gb = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    in_use_gb = torch.cuda.memory_allocated() / bytes_per_gb
    cached_gb = torch.cuda.memory_reserved() / bytes_per_gb
    rule = '=' * 60

    report = [
        f"\n{rule}",
        "GPU MEMORY STATUS",
        rule,
        f"Allocated: {in_use_gb:.2f} GB / {capacity_gb:.2f} GB ({in_use_gb/capacity_gb*100:.1f}%)",
        f"Reserved:  {cached_gb:.2f} GB / {capacity_gb:.2f} GB ({cached_gb/capacity_gb*100:.1f}%)",
        f"Free:      {capacity_gb - in_use_gb:.2f} GB",
        f"{rule}\n",
    ]
    print("\n".join(report))

def clear_gpu_memory():
    """Run a garbage-collection pass, then release PyTorch's cached CUDA memory.

    Prints a confirmation line and returns None.
    """
    # Collect first so the cache release runs after unreachable objects are gone.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()
    print("✓ GPU cache cleared")

# Check initial memory: baseline reading taken before any dataset or model
# is loaded by the cells below, so later reports can be compared against it.
print_gpu_memory()


GPU MEMORY STATUS
Allocated: 2.58 GB / 101.97 GB (2.5%)
Reserved:  2.75 GB / 101.97 GB (2.7%)
Free:      99.40 GB



In [None]:
from datasets import load_dataset
import requests
from PIL import Image
from io import BytesIO

class LazyVQADataset(torch.utils.data.Dataset):
    """
    Lazy dataset - processes examples ON-THE-FLY during training.

    Only the raw rows are stored; image decoding and tokenization happen inside
    ``__getitem__``, so preprocessed tensors are never all held at once.

    Args:
        hf_dataset: Indexable collection of example dicts. An iterable-only
            (streaming) dataset is materialised into a list first.
        processor: Callable image/text processor that also exposes a
            ``tokenizer`` attribute.
        model_type: 'donut' or 'blip'; any other value takes the pix2struct path.

    Each item is a dict of tensors:
        * donut: ``pixel_values``, ``decoder_input_ids``, ``labels``
        * blip: ``pixel_values``, ``input_ids``, ``attention_mask``, ``labels``
        * pix2struct: ``flattened_patches``, ``attention_mask``, ``labels``
    Text tensors are padded/truncated to ``self.max_length`` so items stack.
    """

    def __init__(self, hf_dataset, processor, model_type='donut'):
        self.dataset = hf_dataset
        self.processor = processor
        self.model_type = model_type
        self.max_length = 512 if model_type == 'donut' else 128

        # Convert streaming to list if needed (iterable but not indexable)
        if hasattr(hf_dataset, '__iter__') and not hasattr(hf_dataset, '__getitem__'):
            print("Converting to list...")
            self.dataset = list(hf_dataset)

    def __len__(self):
        return len(self.dataset)

    def _load_image(self, source):
        """Return an RGB image from either a URL string or an image object."""
        if isinstance(source, str):
            response = requests.get(source, timeout=5)
            # Surface HTTP failures here instead of as a confusing decode error.
            response.raise_for_status()
            return Image.open(BytesIO(response.content)).convert('RGB')
        return source.convert('RGB')

    def _encode_answer(self, answer):
        """Tokenize an answer to ``max_length`` and mask padding with -100."""
        tokenizer = self.processor.tokenizer
        labels = tokenizer(
            answer, padding="max_length", truncation=True,
            max_length=self.max_length, return_tensors="pt"
        ).input_ids.squeeze(0)
        labels[labels == tokenizer.pad_token_id] = -100
        return labels

    def _encode_donut_text(self, question, answer):
        """Build aligned teacher-forcing tensors for the Donut decoder.

        Prompt and answer are joined into one token sequence. The decoder input
        is that sequence without its last token and the labels are the same
        sequence shifted left by one, so position ``i`` is trained to predict
        the token that follows decoder input ``i``. Labels for prompt tokens
        and for padding are -100, so only answer tokens contribute to the loss.

        The joined sequence is truncated from the right to ``max_length + 1``;
        if the prompt alone fills that budget, no answer token is supervised.
        """
        tokenizer = self.processor.tokenizer
        prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"
        target = f"{answer}</s_answer></s_docvqa>"

        # Tokenize separately so the prompt/answer boundary is exact.
        prompt_ids = list(tokenizer(prompt, add_special_tokens=False).input_ids)
        target_ids = list(tokenizer(target, add_special_tokens=False).input_ids)

        # One extra token so both shifted views can have max_length entries.
        sequence = (prompt_ids + target_ids)[: self.max_length + 1]
        inputs = sequence[:-1]
        targets = sequence[1:]

        # targets[i] is a prompt token for i < len(prompt_ids) - 1.
        n_masked = min(max(len(prompt_ids) - 1, 0), len(targets))
        targets = [-100] * n_masked + targets[n_masked:]

        n_pad = self.max_length - len(inputs)
        decoder_input_ids = torch.tensor(
            inputs + [tokenizer.pad_token_id] * n_pad, dtype=torch.long
        )
        labels = torch.tensor(targets + [-100] * n_pad, dtype=torch.long)
        return decoder_input_ids, labels

    def __getitem__(self, idx):
        """Process example when requested (not before!)"""
        example = self.dataset[idx]

        image = self._load_image(example['image'])
        # Accept either naming scheme for the text fields.
        question = example.get('question', example.get('query', ''))
        answer = example.get('answer', example.get('response', ''))

        # Process based on model
        if self.model_type == 'donut':
            pixel_values = self.processor(image, return_tensors="pt").pixel_values.squeeze(0)
            decoder_input_ids, labels = self._encode_donut_text(question, answer)
            return {
                'pixel_values': pixel_values,
                'decoder_input_ids': decoder_input_ids,
                'labels': labels
            }

        if self.model_type == 'blip':
            encoding = self.processor(
                images=image, text=question, padding="max_length",
                truncation=True, max_length=self.max_length, return_tensors="pt"
            )
            return {
                'pixel_values': encoding['pixel_values'].squeeze(0),
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': self._encode_answer(answer)
            }

        # pix2struct
        encoding = self.processor(
            images=image, text=question, return_tensors="pt", max_patches=2048
        )
        return {
            'flattened_patches': encoding['flattened_patches'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Explicit max_length keeps every label tensor the same size for stacking.
            'labels': self._encode_answer(answer)
        }

print("✓ Lazy dataset class defined")

✓ Lazy dataset class defined


In [None]:
class GPUDataCollator:
    """Collate batch and move DIRECTLY to GPU (not CPU first!)

    Stacks each tensor field found in the first feature into a batch tensor
    and moves it to ``self.device``. Only the fields that are actually present
    are collated, so items without ``pixel_values`` (the pix2struct layout)
    are handled as well.

    Args:
        model_type: Kept for reference; collation is driven by the keys present.
        device: Target device. Defaults to 'cuda' when available, else 'cpu'.
    """

    # Tensor fields a dataset item may carry.
    TENSOR_KEYS = (
        'pixel_values',
        'flattened_patches',
        'input_ids',
        'attention_mask',
        'decoder_input_ids',
        'labels',
    )

    def __init__(self, model_type='donut', device=None):
        self.model_type = model_type
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

    def __call__(self, features):
        """Stack tensors and move to the target device in one operation.

        Args:
            features: Non-empty list of dicts mapping field name to tensor;
                tensors under the same key must share a shape.

        Returns:
            Dict mapping each present field to a batched tensor on ``self.device``.

        Raises:
            ValueError: If ``features`` is empty.
        """
        if not features:
            raise ValueError("GPUDataCollator received an empty batch")

        first = features[0]
        return {
            key: torch.stack([f[key] for f in features]).to(self.device)
            for key in self.TENSOR_KEYS
            if key in first
        }

print("✓ GPU data collator defined")

✓ GPU data collator defined


In [None]:
# Load dataset
print("Loading dataset...")

import requests
from datasets import Dataset, IterableDataset

dataset_id = "sujet-ai/Sujet-Finance-QA-Vision-100k"
val_fraction = 0.15
# The dataset viewer `/rows` endpoint is paginated; 100 rows per request is
# assumed to be its limit. TODO: confirm against the API documentation.
api_page_size = 100

try:
    if CONFIG['use_streaming']:
        # Streaming mode (no RAM usage for dataset!)
        dataset = load_dataset(
            dataset_id,
            split="train",
            streaming=True
        ).take(CONFIG['num_train_samples'])
    else:
        # Regular mode
        dataset = load_dataset(
            dataset_id,
            split="train"
        ).select(range(CONFIG['num_train_samples']))

    print(f"✓ Loaded dataset")

except Exception as load_error:
    # Fallback: API. Report why the direct load failed instead of hiding it;
    # `except Exception` also lets KeyboardInterrupt stop the cell.
    print(f"Direct load failed ({type(load_error).__name__}: {load_error})")
    print("Loading via API...")

    rows = []
    while len(rows) < CONFIG['num_train_samples']:
        response = requests.get(
            "https://datasets-server.huggingface.co/rows",
            params={
                "dataset": dataset_id,
                "config": "default",
                "split": "train",
                "offset": len(rows),
                "length": min(api_page_size, CONFIG['num_train_samples'] - len(rows)),
            },
            timeout=30,
        )
        response.raise_for_status()
        page = [entry['row'] for entry in response.json().get('rows', [])]
        if not page:
            break  # server has no more rows
        for record in page:
            image_cell = record.get('image')
            # The endpoint is expected to describe images as {"src": url, ...};
            # keep only the URL so the dataset class can download it. TODO: confirm.
            if isinstance(image_cell, dict) and 'src' in image_cell:
                record['image'] = image_cell['src']
        rows.extend(page)

    if not rows:
        raise RuntimeError(f"No rows returned by the dataset API for {dataset_id}")

    dataset = Dataset.from_list(rows)
    print(f"✓ Loaded {len(dataset)} samples via API")

# Split
if isinstance(dataset, IterableDataset):
    # Streaming datasets have no train_test_split: carve the validation rows
    # off the front of the stream and train on the remainder.
    n_val = max(1, round(CONFIG['num_train_samples'] * val_fraction))
    val_data = dataset.take(n_val)
    train_data = dataset.skip(n_val)
else:
    split = dataset.train_test_split(test_size=val_fraction, seed=42)
    train_data = split['train']
    val_data = split['test']

print(f"Train: {len(train_data) if hasattr(train_data, '__len__') else 'streaming'}")
print(f"Val: {len(val_data) if hasattr(val_data, '__len__') else 'streaming'}")

# Check memory
print_gpu_memory()

Loading dataset...
✓ Loaded dataset
Train: 425
Val: 75

GPU MEMORY STATUS
Allocated: 2.58 GB / 101.97 GB (2.5%)
Reserved:  2.75 GB / 101.97 GB (2.7%)
Free:      99.40 GB



In [None]:
from transformers import (
    VisionEncoderDecoderModel,
    #AutoModelForVision2Seq,
    DonutProcessor,
    AutoProcessor,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Clear GPU first
clear_gpu_memory()

# Model selection
model_names = {
    'donut': 'naver-clova-ix/donut-base',
    'blip': 'Salesforce/blip-vqa-base',
    'pix2struct': 'google/pix2struct-docvqa-base'
}

# Attention projections that LoRA wraps, per architecture.
# Donut's decoder names its attention output projection `out_proj`; the earlier
# `o_proj` entry matched no module (the 786,432 trainable parameters logged by
# that configuration correspond to q/k/v adapters only).
# NOTE(review): the blip / pix2struct names are the expected attention layer
# names for those architectures; verify with `model.named_modules()`.
lora_target_modules = {
    'donut': ["q_proj", "k_proj", "v_proj", "out_proj"],
    'blip': ["query", "key", "value"],
    'pix2struct': ["query", "key", "value"],
}

model_name = model_names[CONFIG['model_type']]
print(f"Loading {model_name}...")

# Load processor
if CONFIG['model_type'] == 'donut':
    processor = DonutProcessor.from_pretrained(model_name)
else:
    processor = AutoProcessor.from_pretrained(model_name)

# Quantization config (loads on GPU in 4-bit)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=False,  # Keep on GPU!
)

# Load model
if CONFIG['model_type'] == 'donut':
    model = VisionEncoderDecoderModel.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # Auto GPU placement
        torch_dtype=torch.float16,
    )
else:
    # The auto class is imported here because its top-of-cell import is
    # commented out, which made this branch raise NameError. Newer transformers
    # releases are expected to expose it as AutoModelForImageTextToText; fall
    # back to the older name otherwise.
    try:
        from transformers import AutoModelForImageTextToText as _AutoVisionTextModel
    except ImportError:
        from transformers import AutoModelForVision2Seq as _AutoVisionTextModel

    model = _AutoVisionTextModel.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
    )

print("✓ Model loaded on GPU")

# Gradient checkpointing is left disabled, both here and in the training
# arguments.
# if hasattr(model, 'gradient_checkpointing_enable'):
#     model.gradient_checkpointing_enable()
#     print("✓ Gradient checkpointing enabled")

# Prepare for training
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# Add LoRA
lora_config = LoraConfig(
    r=CONFIG['lora_r'],
    lora_alpha=CONFIG['lora_alpha'],
    target_modules=lora_target_modules[CONFIG['model_type']],
    lora_dropout=CONFIG['lora_dropout'],
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Check memory
print_gpu_memory()

✓ GPU cache cleared
Loading naver-clova-ix/donut-base...


Loading weights:   0%|          | 0/484 [00:00<?, ?it/s]



✓ Model loaded on GPU
trainable params: 786,432 || all params: 261,544,056 || trainable%: 0.3007

GPU MEMORY STATUS
Allocated: 3.13 GB / 101.97 GB (3.1%)
Reserved:  3.60 GB / 101.97 GB (3.5%)
Free:      98.84 GB



In [None]:
# Wrap both splits in the on-the-fly dataset class; nothing is preprocessed
# until an item is requested.
print("Creating lazy datasets...")

lazy_kwargs = {'processor': processor, 'model_type': CONFIG['model_type']}
train_dataset = LazyVQADataset(hf_dataset=train_data, **lazy_kwargs)
val_dataset = LazyVQADataset(hf_dataset=val_data, **lazy_kwargs)

print(f"✓ Train dataset: {len(train_dataset)} samples")
print(f"✓ Val dataset: {len(val_dataset)} samples")

# Batching helper handed to the Trainer later on
data_collator = GPUDataCollator(model_type=CONFIG['model_type'])

print("✓ Lazy datasets ready (no RAM used yet!)")
print_gpu_memory()

Creating lazy datasets...
✓ Train dataset: 425 samples
✓ Val dataset: 75 samples
✓ Lazy datasets ready (no RAM used yet!)

GPU MEMORY STATUS
Allocated: 3.13 GB / 101.97 GB (3.1%)
Reserved:  3.60 GB / 101.97 GB (3.5%)
Free:      98.84 GB



In [None]:
# Create lazy datasets
# NOTE(review): this cell repeats the preceding cell; re-running it is harmless
# because it only rebinds the same names, so it is a candidate for removal.
print("Creating lazy datasets...")


def _wrap_lazily(hf_split):
    """Wrap one split in LazyVQADataset using the notebook's processor and model type."""
    return LazyVQADataset(
        hf_dataset=hf_split,
        processor=processor,
        model_type=CONFIG['model_type'],
    )


train_dataset, val_dataset = map(_wrap_lazily, (train_data, val_data))

print(f"✓ Train dataset: {len(train_dataset)} samples")
print(f"✓ Val dataset: {len(val_dataset)} samples")

# Create collator
data_collator = GPUDataCollator(model_type=CONFIG['model_type'])

print("✓ Lazy datasets ready (no RAM used yet!)")
print_gpu_memory()

Creating lazy datasets...
✓ Train dataset: 425 samples
✓ Val dataset: 75 samples
✓ Lazy datasets ready (no RAM used yet!)

GPU MEMORY STATUS
Allocated: 3.13 GB / 101.97 GB (3.1%)
Reserved:  3.60 GB / 101.97 GB (3.5%)
Free:      98.84 GB



In [None]:
from transformers import TrainingArguments, Trainer

# GPU-optimized training arguments

# Size the schedule from the data. With the default config (425 training
# samples, effective batch 16, 3 epochs) the whole run is about 81 optimizer
# steps, so the former fixed warmup_steps=100 and eval/save every 200 steps
# never took effect: the learning rate never reached its peak and no
# evaluation ran before training ended.
effective_batch_size = CONFIG['batch_size'] * CONFIG['gradient_accumulation_steps']
steps_per_epoch = max(1, -(-len(train_dataset) // effective_batch_size))  # ceil division
total_steps = steps_per_epoch * CONFIG['epochs']
warmup_steps = max(1, total_steps // 10)  # roughly 10% of the run

training_args = TrainingArguments(
    output_dir=CONFIG['output_dir'],
    num_train_epochs=CONFIG['epochs'],
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
    learning_rate=CONFIG['learning_rate'],

    # GPU optimizations
    fp16=True,  # Mixed precision
    dataloader_pin_memory=False,  # Off: the data collator already returns tensors on the GPU
    dataloader_num_workers=0,  # Load in the main process; the collator creates CUDA tensors
    gradient_checkpointing=False,  # Set to False to fix tuple output error with Donut
    optim="paged_adamw_8bit",  # 8-bit optimizer

    # Other settings
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_steps=10,
    # Evaluate and checkpoint once per epoch so load_best_model_at_end has
    # evaluation results to compare (both strategies must match for it).
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    remove_unused_columns=False,  # the dataset yields ready-made tensors; keep every key
    report_to="none",
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("✓ Trainer initialized")
print_gpu_memory()

✓ Trainer initialized

GPU MEMORY STATUS
Allocated: 3.13 GB / 101.97 GB (3.1%)
Reserved:  3.60 GB / 101.97 GB (3.5%)
Free:      98.84 GB



In [None]:
# Announce which architecture is about to be fine-tuned
banner_rule = "=" * 80
print("\n" + banner_rule)
print(f"STARTING GPU-OPTIMIZED TRAINING: {CONFIG['model_type'].upper()}")
print(banner_rule + "\n")

# Release cached GPU memory before training starts
clear_gpu_memory()

# A 'spawn' start method for CUDA compatibility was tried here and is left disabled:
#import torch.multiprocessing
#torch.multiprocessing.set_start_method('spawn', force=True)

# Run the fine-tuning loop
trainer.train()

print("\n✓ Training complete!")
print_gpu_memory()


STARTING GPU-OPTIMIZED TRAINING: DONUT

✓ GPU cache cleared


Step,Training Loss,Validation Loss



✓ Training complete!

GPU MEMORY STATUS
Allocated: 3.15 GB / 101.97 GB (3.1%)
Reserved:  12.51 GB / 101.97 GB (12.3%)
Free:      98.82 GB



In [None]:
# Write the trained weights and the processor into the same output directory
save_dir = CONFIG['output_dir']
trainer.save_model(save_dir)
processor.save_pretrained(save_dir)

print(f"✓ Model saved to {CONFIG['output_dir']}")

# Memory reading after saving
print_gpu_memory()

✓ Model saved to ./finance-vqa-gpu-optimized

GPU MEMORY STATUS
Allocated: 3.15 GB / 101.97 GB (3.1%)
Reserved:  12.51 GB / 101.97 GB (12.3%)
Free:      98.82 GB



In [None]:
import psutil

# Host memory snapshot, printed next to the memory PyTorch has allocated on the GPU
ram = psutil.virtual_memory()
report_rule = '=' * 60
report_lines = [
    f"\n{report_rule}",
    "SYSTEM RAM vs GPU RAM USAGE",
    report_rule,
    f"System RAM Used: {ram.used / 1e9:.2f} GB / {ram.total / 1e9:.2f} GB ({ram.percent:.1f}%)",
    f"GPU RAM Used: {torch.cuda.memory_allocated() / 1e9:.2f} GB",
    "\n✓ Data processing happens ON-THE-FLY, not stored in system RAM!",
    "✓ Model and gradients stored in GPU RAM, not system RAM!",
    f"{report_rule}\n",
]
print("\n".join(report_lines))


SYSTEM RAM vs GPU RAM USAGE
System RAM Used: 6.43 GB / 189.93 GB (4.3%)
GPU RAM Used: 3.15 GB

✓ Data processing happens ON-THE-FLY, not stored in system RAM!
✓ Model and gradients stored in GPU RAM, not system RAM!



In [None]:
# Download as ZIP
import shutil

# make_archive rebuilds the archive on every run. `zip -r` against an existing
# archive only adds/updates entries, so files from earlier runs (e.g. old
# checkpoints) would otherwise linger in the download.
archive_path = shutil.make_archive(
    'finance-vqa-model', 'zip', root_dir='.', base_dir=CONFIG['output_dir']
)
print(f"✓ Archive written to {archive_path}")

try:
    from google.colab import files
except ImportError:
    # Outside Colab there is no browser download helper; the archive stays on disk.
    print("google.colab not available - skipping browser download")
else:
    files.download(archive_path)

# Or save to Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# !cp -r {CONFIG['output_dir']} /content/drive/MyDrive/

  adding: finance-vqa-gpu-optimized/ (stored 0%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/ (stored 0%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/trainer_state.json (deflated 66%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/rng_state.pth (deflated 26%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/adapter_model.safetensors (deflated 8%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/adapter_config.json (deflated 58%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/scheduler.pt (deflated 61%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/scaler.pt (deflated 64%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/optimizer.pt (deflated 9%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/training_args.bin (deflated 53%)
  adding: finance-vqa-gpu-optimized/checkpoint-81/README.md (deflated 66%)
  adding: finance-vqa-gpu-optimized/adapter_model.safetensors (deflated 8%)
  adding: finance-vqa-gpu-optimized/adapter_config.json (deflated 58%)
  adding: finan

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>