In [1]:
import os
import torch
from peft import get_peft_model, LoraConfig
from transformers import (
    BitsAndBytesConfig,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2_5_VLProcessor
)
from torch.utils.data import DataLoader
import lightning as L
from lightning.pytorch.callbacks import EarlyStopping

# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ======================
# 1. Model Configuration
# ======================
MODEL_ID = "Qwen/Qwen2.5-VL-1.8B-Instruct"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Quantization Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# LoRA Config
lora_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM"
)

# ===================
# 2. Model Loading
# ===================
try:
    # Try with Flash Attention 2 first
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        use_cache=False  # Disable KV cache to save memory
    )
except Exception as e:
    print(f"FlashAttention2 not available: {e}, using default attention")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        use_cache=False
    )

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Enable memory optimizations
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# ======================
# 3. Data Processing
# ======================
MIN_PIXELS = 224 * 224  # Reduced from original
MAX_PIXELS = 896 * 896  # Reduced from original
processor = Qwen2_5_VLProcessor.from_pretrained(
    MODEL_ID, 
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS
)

# ======================
# 4. Training Setup
# ======================
class Qwen2_5_Trainer(L.LightningModule):
    def __init__(self, config, processor, model):
        super().__init__()
        self.config = config
        self.processor = processor
        self.model = model
        self.automatic_optimization = True

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, pixel_values, image_grid_thw, labels = batch
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            labels=labels
        )
        self.log("train_loss", outputs.loss, prog_bar=True)
        return outputs.loss

    # ... (keep validation_step and other methods same as before)

    def configure_optimizers(self):
        return torch.optim.AdamW(
            self.parameters(), 
            lr=self.config["lr"],
            weight_decay=0.01
        )

# Training Configuration
config = {
    "max_epochs": 10,
    "batch_size": 2,          # Reduced from original
    "lr": 1e-4,               # Reduced learning rate
    "gradient_clip_val": 0.5, # Lower gradient clipping
    "accumulate_grad_batches": 4,  # Effective batch size = 8
    "warmup_steps": 100,
    "result_path": "qwen2.5-1.8b-checkpoints"
}

# ======================
# 5. Trainer Setup
# ======================
trainer = L.Trainer(
    accelerator="gpu",
    devices=[0],
    precision="bf16",         # Mixed precision training
    max_epochs=config["max_epochs"],
    accumulate_grad_batches=config["accumulate_grad_batches"],
    gradient_clip_val=config["gradient_clip_val"],
    check_val_every_n_epoch=1,
    num_sanity_val_steps=0,
    limit_train_batches=0.95, # Leave some memory headroom
    limit_val_batches=0.5,
    callbacks=[
        EarlyStopping(monitor="val_edit_distance", patience=3, mode="min"),
        # Include your SaveCheckpoint callback here
    ]
)

# ======================
# 6. Dataset Preparation
# ======================
# Modify these based on your actual dataset
BATCH_SIZE = config["batch_size"]
train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE,
    collate_fn=train_collate_fn,
    num_workers=0,            # Reduce workers for memory
    pin_memory=True,
    persistent_workers=False
)

valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=evaluation_collate_fn,
    num_workers=0,
    pin_memory=True
)

# ======================
# 7. Start Training
# ======================
if __name__ == "__main__":
    # Verify memory status
    print(f"Model memory footprint: {model.get_memory_footprint()/1e9:.2f}GB")
    print(f"Available GPU memory: {torch.cuda.mem_get_info()[0]/1e9:.2f}GB")
    
    # Start training
    trainer.fit(
        Qwen2_5_Trainer(config, processor, model),
        train_dataloaders=train_loader,
        val_dataloaders=valid_loader
    )

FlashAttention2 not available: Qwen/Qwen2.5-VL-1.8B-Instruct is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`, using default attention


OSError: Qwen/Qwen2.5-VL-1.8B-Instruct is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`