<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Fine_Tuning_Florence_2_on_DocVQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installations**

In [16]:
!pip install -q datasets flash_attn timm einops torchmetrics peft transformers evaluate bitsandbytes accelerate rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


**Imports**

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoProcessor, get_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os
from torch.cuda.amp import autocast, GradScaler
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from torch.optim import AdamW
import evaluate

In [3]:
# Load dataset
data = load_dataset("HuggingFaceM4/DocumentVQA")
subset_percentage = 0.1
train_data_subset = data['train'].shuffle(seed=42).select(range(int(len(data['train']) * subset_percentage)))
val_data_subset = data['validation'].shuffle(seed=42).select(range(int(len(data['validation']) * subset_percentage)))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/51 [00:00<?, ?it/s]

In [4]:
# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Load model and processor
model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base-ft",
                                             trust_remote_code=True,
                                             revision='refs/pr/6',
                                             load_in_4bit=True)  # Load in 8-bit precision for QLoRA
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base-ft",
                                          trust_remote_code=True,
                                          revision='refs/pr/6')

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


pytorch_model.bin:   0%|          | 0.00/464M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

processing_florence2.py:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-base-ft:
- configuration_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [12]:
# Dataset class
class DocVQADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        question = "" + example['question']
        first_answer = example['answers'][0]
        image = example['image']
        if image.mode != "RGB":
            image = image.convert("RGB")
        return question, first_answer, image

In [13]:
# Data collator function
def collate_fn(batch):
    questions, answers, images = zip(*batch)
    inputs = processor(text=list(questions), images=list(images), return_tensors="pt", padding=True)
    return inputs, answers

In [14]:
# Train model function
def train_model(train_loader, val_loader, model, processor, epochs=10, lr=1e-5, gradient_accumulation_steps=2):
    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    scaler = GradScaler()

    # Initialize evaluation metrics
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for i, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}")):
            inputs, answers = batch
            inputs = {k: v.to(device) for k, v in inputs.items()}

            labels = processor.tokenizer(text=answers, return_tensors="pt", padding=True).input_ids.to(device)

            with autocast():
                outputs = model(**inputs, labels=labels)
                loss = outputs.loss / gradient_accumulation_steps

            scaler.scale(loss).backward()

            if (i + 1) % gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                lr_scheduler.step()

            train_loss += loss.item() * gradient_accumulation_steps

        avg_train_loss = train_loss / len(train_loader)
        print(f"Average Training Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        val_loss = 0
        all_predictions = []
        all_references = []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{epochs}"):
                inputs, answers = batch
                inputs = {k: v.to(device) for k, v in inputs.items()}

                outputs = model.generate(**inputs, max_length=50)
                predicted_answers = processor.batch_decode(outputs, skip_special_tokens=True)

                all_predictions.extend(predicted_answers)
                all_references.extend(answers)

                labels = processor.tokenizer(text=answers, return_tensors="pt", padding=True).input_ids.to(device)
                loss = model(**inputs, labels=labels).loss
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)

        # Calculate ROUGE and BLEU scores
        rouge_scores = rouge.compute(predictions=all_predictions, references=all_references)
        bleu_score = bleu.compute(predictions=all_predictions, references=all_references)

        print(f"Average Validation Loss: {avg_val_loss}")
        print(f"ROUGE Scores: {rouge_scores}")
        print(f"BLEU Score: {bleu_score}")

        # Save model checkpoint
        output_dir = f"./model_checkpoints/epoch_{epoch+1}"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)

In [17]:
# Create datasets
train_dataset = DocVQADataset(train_data_subset)
val_dataset = DocVQADataset(val_data_subset)

# Create DataLoader
batch_size = 6
num_workers = 2
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)

# Train the model
train_model(train_loader, val_loader, model, processor, epochs=2)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

  self.pid = os.fork()
  return F.conv2d(input, weight, bias, self.stride,
Training Epoch 1/2:   0%|          | 0/658 [00:10<?, ?it/s]


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn