<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Fine_Tuning_Florence_2_on_DocVQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installations**

In [14]:
!pip install -q datasets flash_attn timm einops torchmetrics

**Imports**

In [27]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoProcessor, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import accuracy_score, f1_score

In [28]:
# Load a subset of the dataset (e.g., 10% of the training set)
data = load_dataset("HuggingFaceM4/DocumentVQA")
subset_percentage = 0.1
train_data_subset = data['train'].shuffle(seed=42).select(range(int(len(data['train']) * subset_percentage)))
val_data_subset = data['validation'].shuffle(seed=42).select(range(int(len(data['validation']) * subset_percentage)))

Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/51 [00:00<?, ?it/s]

In [30]:
# Load model and processor
model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base-ft", trust_remote_code=True, revision='refs/pr/6').to(device)
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base-ft", trust_remote_code=True, revision='refs/pr/6')

In [29]:
# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset class
class DocVQADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        question = "" + example['question']
        first_answer = example['answers'][0]
        image = example['image']
        if image.mode != "RGB":
            image = image.convert("RGB")
        return question, first_answer, image

# Data collator function
def collate_fn(batch):
    questions, answers, images = zip(*batch)
    inputs = processor(text=list(questions), images=list(images), return_tensors="pt", padding=True).to(device)
    return inputs, answers

In [31]:
# Train model function
def train_model(train_loader, val_loader, model, processor, epochs=10, lr=1e-6, gradient_accumulation_steps=2):
    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        optimizer.zero_grad()
        for i, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}")):
            inputs, answers = batch

            input_ids = inputs["input_ids"]
            pixel_values = inputs["pixel_values"]
            labels = processor.tokenizer(text=answers, return_tensors="pt", padding=True, return_token_type_ids=False).input_ids.to(device)

            with autocast():
                outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
                loss = outputs.loss / gradient_accumulation_steps

            scaler.scale(loss).backward()

            if (i + 1) % gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                lr_scheduler.step()

            train_loss += loss.item() * gradient_accumulation_steps

        avg_train_loss = train_loss / len(train_loader)
        print(f"Average Training Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        val_loss = 0
        all_predictions = []
        all_labels = []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{epochs}"):
                inputs, answers = batch

                input_ids = inputs["input_ids"]
                pixel_values = inputs["pixel_values"]
                labels = processor.tokenizer(text=answers, return_tensors="pt", padding=True, return_token_type_ids=False).input_ids.to(device)

                with autocast():
                    outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
                    loss = outputs.loss

                val_loss += loss.item()

                predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
                labels = labels.cpu().numpy()
                for pred, label in zip(predictions, labels):
                    all_predictions.append(pred)
                    all_labels.append(label)

        avg_val_loss = val_loss / len(val_loader)
        accuracy = accuracy_score(all_labels, all_predictions)
        f1 = f1_score(all_labels, all_predictions, average='weighted')
        print(f"Average Validation Loss: {avg_val_loss}")
        print(f"Accuracy: {accuracy}")
        print(f"F1 Score: {f1}")

        # Save model checkpoint
        output_dir = f"./model_checkpoints/epoch_{epoch+1}"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)

In [32]:
# Create datasets
train_dataset = DocVQADataset(train_data_subset)
val_dataset = DocVQADataset(val_data_subset)

# Create DataLoader
batch_size = 6
num_workers = 0
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)

# Freeze image encoder
for param in model.vision_tower.parameters():
    param.requires_grad = False

# Train the model with a subset of data
train_model(train_loader, val_loader, model, processor, epochs=2)

  return F.conv2d(input, weight, bias, self.stride,
Training Epoch 1/2: 100%|██████████| 658/658 [14:48<00:00,  1.35s/it]


Average Training Loss: 4.237880721461809


Validation Epoch 1/2: 100%|██████████| 89/89 [01:54<00:00,  1.28s/it]


ValueError: unknown is not supported