<a href="https://colab.research.google.com/github/Manikandan-Thangaraj-ZS0321/checkbox_detection_opencv/blob/master/florence2_checkbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AdamW, get_scheduler
import os
from huggingface_hub import notebook_login
import json
from PIL import Image


In [None]:
class JsonDataset(Dataset):
    """Question/answer/image samples read from a folder of JSON files and a folder of images.

    Every ``*.json`` file in ``json_folder`` is paired with the image in
    ``image_folder`` whose file name matches the JSON file's base name. Each
    JSON document must provide a ``conversations`` list of dicts with ``role``
    and ``content`` keys.

    Args:
        json_folder: Directory containing the ``*.json`` annotation files.
        image_folder: Directory containing the matching image files.
    """

    def __init__(self, json_folder, image_folder):
        self.json_folder = json_folder
        self.image_folder = image_folder
        # All samples are loaded eagerly when the dataset is constructed.
        self.data = self.load_data()

    def get_image_file(self, file_name_without_extension):
        """Return the path of the image belonging to ``file_name_without_extension``.

        An exact base-name match (``1`` -> ``1.jpg``) is preferred so that ``1``
        is never paired with ``10.jpg``. If there is no exact match, the first
        file (in sorted order) whose name starts with the given stem is used,
        which keeps the original prefix-matching behaviour as a fallback.

        Returns:
            The joined image path, or ``None`` when no file matches.
        """
        # Sorting makes the result independent of the arbitrary os.listdir order.
        candidates = sorted(os.listdir(self.image_folder))
        for file in candidates:
            if os.path.splitext(file)[0] == file_name_without_extension:
                return os.path.join(self.image_folder, file)
        for file in candidates:
            if file.startswith(file_name_without_extension):
                return os.path.join(self.image_folder, file)
        return None  # No image file matches this stem

    def load_data(self):
        """Read every JSON file and its image into a list of sample dicts.

        Returns:
            A list of ``{'images': PIL image, 'conversations': list}`` dicts,
            ordered by JSON file name.

        Raises:
            FileNotFoundError: If a JSON file has no matching image.
        """
        data = []
        for json_file in sorted(os.listdir(self.json_folder)):
            if not json_file.endswith(".json"):
                continue
            json_path = os.path.join(self.json_folder, json_file)
            with open(json_path, 'r', encoding='utf-8') as f:
                content = json.load(f)
            file_name_without_extension = os.path.splitext(os.path.basename(json_file))[0]
            image_file = self.get_image_file(file_name_without_extension)
            if image_file is None:
                raise FileNotFoundError(
                    f"No image starting with '{file_name_without_extension}' "
                    f"found in '{self.image_folder}' for '{json_path}'"
                )
            # Decode inside a context manager so the file handle is released
            # immediately; convert() returns an image detached from the file.
            with Image.open(image_file) as opened:
                image = opened.convert("RGB")
            data.append({
                'images': image,
                'conversations': content['conversations']
            })
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(question, answer, image)`` for sample ``idx``.

        The question is the user message prefixed with ``<DocCheckbox>``; the
        answer is the assistant message. If a role occurs more than once, the
        last occurrence wins. The image is always returned in RGB mode.

        Raises:
            ValueError: If the sample has no user or no assistant message.
        """
        example = self.data[idx]
        question = None
        answer = None
        for conv in example['conversations']:
            if conv['role'] == 'user':
                question = "<DocCheckbox>" + conv['content']
            elif conv['role'] == 'assistant':
                answer = conv['content']
        if question is None or answer is None:
            raise ValueError(
                f"Sample {idx} needs both a 'user' and an 'assistant' message in 'conversations'"
            )
        image = example['images']
        if image.mode != "RGB":
            image = image.convert("RGB")
        return question, answer, image

In [None]:
# Every split lives under the same root and has parallel "json" and "image" folders.
_dataset_root = '/content/checkbox'

# Training split
json_folder = f'{_dataset_root}/train/json'
image_folder = f'{_dataset_root}/train/image'

# Validation split
validation_json_folder = f'{_dataset_root}/validation/json'
validation_image_folder = f'{_dataset_root}/validation/image'

In [None]:
from transformers import (AdamW, AutoProcessor, get_scheduler)

def collate_fn(batch):
    """Collate ``(question, answer, image)`` samples into processor inputs plus raw answers.

    Relies on the notebook-level ``processor`` and ``device``, which must be
    defined before a DataLoader using this function is iterated.

    Returns:
        A ``(inputs, answers)`` pair: the processor output moved to ``device``
        and the tuple of answer strings, left untokenized for the caller.
    """
    # Transpose the list of samples into three parallel columns.
    columns = tuple(zip(*batch))
    questions, answers, images = columns
    encoded = processor(
        text=list(questions),
        images=list(images),
        return_tensors="pt",
        padding=True,
    )
    return encoded.to(device), answers

# Build the training and validation datasets; JsonDataset loads its samples in the constructor.
train_dataset = JsonDataset(json_folder=json_folder, image_folder=image_folder)
validation_dataset = JsonDataset(
    json_folder=validation_json_folder,
    image_folder=validation_image_folder,
)

In [None]:
!pip install -q datasets flash_attn timm einops

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The model and its processor are loaded from the same checkpoint and revision.
_base_checkpoint = "microsoft/Florence-2-base-ft"
_pretrained_kwargs = {"trust_remote_code": True, "revision": 'refs/pr/6'}

model = AutoModelForCausalLM.from_pretrained(_base_checkpoint, **_pretrained_kwargs)
model = model.to(device)
processor = AutoProcessor.from_pretrained(_base_checkpoint, **_pretrained_kwargs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Ask PyTorch to release GPU memory it has cached but is not currently using.
torch.cuda.empty_cache()

In [None]:
# Settings shared by the training and validation DataLoaders:
# samples per batch, and number of worker processes used for loading.
batch_size, num_workers = 2, 0

In [None]:
# Both loaders share batching, collation and worker settings; only training is shuffled.
_loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(validation_dataset, **_loader_kwargs)

In [None]:
def _batch_loss(model, processor, batch):
    """Tokenize the answers of one collated batch and return the model's loss for it."""
    inputs, answers = batch

    input_ids = inputs["input_ids"]
    pixel_values = inputs["pixel_values"]

    # Put the labels on the same device as the inputs instead of relying on a
    # notebook-level `device` variable.
    labels = processor.tokenizer(
        text=list(answers),
        return_tensors="pt",
        padding=True,
        return_token_type_ids=False,
    ).input_ids.to(input_ids.device)

    # Padding positions are set to -100 (the default ignore index of
    # torch.nn.CrossEntropyLoss) so that shorter answers in a batch are not
    # trained to predict pad tokens.
    pad_token_id = processor.tokenizer.pad_token_id
    if pad_token_id is not None:
        labels = labels.masked_fill(labels == pad_token_id, -100)

    outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
    return outputs.loss


def train_model(train_loader, val_loader, model, processor, epochs=10, lr=1e-6):
    """Fine-tune ``model`` and save a checkpoint after every epoch.

    Each epoch runs one pass over ``train_loader`` (AdamW with a linear decay
    schedule and no warm-up), then evaluates on ``val_loader`` without
    gradients, prints both average losses and writes the model and processor
    to ``./model_checkpoints/epoch_<n>``.

    Args:
        train_loader: Iterable of ``(inputs, answers)`` batches for training.
        val_loader: Iterable of ``(inputs, answers)`` batches for validation.
        model: Model called as ``model(input_ids=..., pixel_values=..., labels=...)``.
        processor: Processor whose ``tokenizer`` encodes the answer strings.
        epochs: Number of passes over ``train_loader``.
        lr: Initial learning rate.

    Returns:
        A list with one dict per epoch:
        ``{"epoch": int, "train_loss": float, "val_loss": float}``.
        A loss is ``nan`` when the corresponding loader yields no batches.
    """
    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    history = []
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
            loss = _batch_loss(model, processor, batch)

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            train_loss += loss.item()

        # Guard against an empty loader instead of raising ZeroDivisionError.
        num_train_batches = len(train_loader)
        avg_train_loss = train_loss / num_train_batches if num_train_batches else float("nan")
        print(f"Average Training Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{epochs}"):
                val_loss += _batch_loss(model, processor, batch).item()

        num_val_batches = len(val_loader)
        avg_val_loss = val_loss / num_val_batches if num_val_batches else float("nan")
        print(f"Average Validation Loss: {avg_val_loss}")

        # Save model checkpoint
        output_dir = f"./model_checkpoints/epoch_{epoch+1}"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)

        history.append({
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "val_loss": avg_val_loss,
        })

    return history

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; presumably needed so the later push_to_hub calls can authenticate.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Freeze the vision tower: its parameters receive no gradients during fine-tuning.
for vision_param in model.vision_tower.parameters():
    vision_param.requires_grad_(False)

In [None]:
# Fine-tune for 10 epochs with train_model's default learning rate; a checkpoint is
# written to ./model_checkpoints/epoch_<n> after each epoch.
train_model(train_loader, val_loader, model, processor, epochs=10)

Training Epoch 1/10: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]


Average Training Loss: 6.011038112640381


Validation Epoch 1/10: 100%|██████████| 3/3 [00:01<00:00,  1.52it/s]


Average Validation Loss: 4.9122389157613116


Training Epoch 2/10: 100%|██████████| 5/5 [00:04<00:00,  1.22it/s]


Average Training Loss: 5.607588386535644


Validation Epoch 2/10: 100%|██████████| 3/3 [00:01<00:00,  2.08it/s]


Average Validation Loss: 4.676675001780192


Training Epoch 3/10: 100%|██████████| 5/5 [00:03<00:00,  1.32it/s]


Average Training Loss: 5.4594169616699215


Validation Epoch 3/10: 100%|██████████| 3/3 [00:01<00:00,  2.07it/s]


Average Validation Loss: 4.493025620778401


Training Epoch 4/10: 100%|██████████| 5/5 [00:04<00:00,  1.22it/s]


Average Training Loss: 5.28842601776123


Validation Epoch 4/10: 100%|██████████| 3/3 [00:01<00:00,  2.06it/s]


Average Validation Loss: 4.344599564870198


Training Epoch 5/10: 100%|██████████| 5/5 [00:03<00:00,  1.35it/s]


Average Training Loss: 4.953489780426025


Validation Epoch 5/10: 100%|██████████| 3/3 [00:01<00:00,  1.81it/s]


Average Validation Loss: 4.227867364883423


Training Epoch 6/10: 100%|██████████| 5/5 [00:03<00:00,  1.32it/s]


Average Training Loss: 5.161711597442627


Validation Epoch 6/10: 100%|██████████| 3/3 [00:01<00:00,  2.02it/s]


Average Validation Loss: 4.138707558314006


Training Epoch 7/10: 100%|██████████| 5/5 [00:03<00:00,  1.32it/s]


Average Training Loss: 4.831379795074463


Validation Epoch 7/10: 100%|██████████| 3/3 [00:01<00:00,  2.04it/s]


Average Validation Loss: 4.074465036392212


Training Epoch 8/10: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


Average Training Loss: 4.878226184844971


Validation Epoch 8/10: 100%|██████████| 3/3 [00:01<00:00,  2.02it/s]


Average Validation Loss: 4.027840852737427


Training Epoch 9/10: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


Average Training Loss: 4.752164649963379


Validation Epoch 9/10: 100%|██████████| 3/3 [00:01<00:00,  1.98it/s]


Average Validation Loss: 4.000205755233765


Training Epoch 10/10: 100%|██████████| 5/5 [00:04<00:00,  1.23it/s]


Average Training Loss: 4.827184677124023


Validation Epoch 10/10: 100%|██████████| 3/3 [00:01<00:00,  2.04it/s]


Average Validation Loss: 3.9898648262023926


In [None]:
# Upload the fine-tuned model and its processor to the same Hub repository.
_hub_repo_id = "Manikandan-t/florence2-checkbox"

model.push_to_hub(_hub_repo_id)
processor.push_to_hub(_hub_repo_id)

model.safetensors:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Manikandan-t/florence2-checkbox/commit/685c958b70da65e3d7a64113eed11a216a69d7b1', commit_message='Upload processor', commit_description='', oid='685c958b70da65e3d7a64113eed11a216a69d7b1', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Reload the fine-tuned checkpoint from the Hub for inference.
from transformers import pipeline
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SECURITY: a Hugging Face access token was previously hard-coded in a comment
# in this cell. Never commit tokens to a notebook; revoke the leaked token and
# authenticate with notebook_login() or a secret/environment variable instead.

FINETUNED_REPO_ID = "Manikandan-t/florence2-checkbox"

# NOTE(review): the recorded run of this cell failed with
# "AssertionError: only DaViT is supported for now". The override below assumes
# the pushed config no longer carries vision_config.model_type == "davit";
# confirm against the repository's config.json.
config = AutoConfig.from_pretrained(FINETUNED_REPO_ID, trust_remote_code=True)
config.vision_config.model_type = "davit"

# Move the model to `device` so it matches the inputs built in run_example.
model = AutoModelForCausalLM.from_pretrained(
    FINETUNED_REPO_ID,
    config=config,
    trust_remote_code=True,
).to(device)
processor = AutoProcessor.from_pretrained(FINETUNED_REPO_ID, trust_remote_code=True)


config.json:   0%|          | 0.00/5.66k [00:00<?, ?B/s]

modeling_florence2.py:   0%|          | 0.00/127k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-base-ft:
- modeling_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

AssertionError: only DaViT is supported for now

In [None]:
def run_example(task_prompt, text_input, image, max_new_tokens=1024, num_beams=3):
    """Generate and post-process the model's answer for one image and prompt.

    Relies on the notebook-level ``model`` and ``processor``.

    Args:
        task_prompt: Task tag placed in front of ``text_input``; it is also
            passed as ``task`` to ``processor.post_process_generation``.
        text_input: Free-text question appended to ``task_prompt``.
        image: PIL image; converted to RGB if it is in another mode.
        max_new_tokens: Upper bound on the number of generated tokens.
        num_beams: Beam width used for generation.

    Returns:
        Whatever ``processor.post_process_generation`` returns for the
        generated text.
    """
    prompt = task_prompt + text_input

    # Ensure the image is in RGB mode
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Send the inputs to wherever the model actually lives, so a model that was
    # never moved to the notebook's `device` does not cause a device mismatch.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=max_new_tokens,
        num_beams=num_beams
    )
    # Special tokens are kept in the decoded text that is handed to post-processing.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    return parsed_answer

In [None]:
from PIL import Image

# The task tag must match the "<DocCheckbox>" prefix that JsonDataset.__getitem__
# puts in front of every training question; the image file is closed when the block exits.
with Image.open("/content/12.jpg") as image:
    print(run_example("<DocCheckbox>", 'I need to know if this document requires immediate attention. Can you help me determine its urgency?', image))