# Florence-VL Project: Replicating Results and Fine-Tuning
## Introduction
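This notebook replicates and fine-tunes the Florence-VL model on the MS-COCO dataset for vision-language tasks. Note that the model defined below is a small placeholder network used to illustrate the workflow, not the actual Florence-VL architecture. It includes: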
1. Dataset preparation (MS-COCO).
2. Loading pretrained weights for Florence-VL.
3. Fine-tuning the model.
4. Evaluating results and discussing observations.


## Install dependencies

In [None]:
# Install required libraries
!git clone https://github.com/JiuhaiChen/Florence-VL.git
!pip install torch torchvision transformers tqdm matplotlib

## Dataset Preparation

In [None]:
# Create necessary directories
!mkdir -p datasets/coco/{train2017,val2017,annotations}

# Download and extract training images
!wget http://images.cocodataset.org/zips/train2017.zip -P datasets/coco/
!unzip datasets/coco/train2017.zip -d datasets/coco/

# Download and extract validation images
!wget http://images.cocodataset.org/zips/val2017.zip -P datasets/coco/
!unzip datasets/coco/val2017.zip -d datasets/coco/

# Download and extract annotations
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -P datasets/coco/
!unzip datasets/coco/annotations_trainval2017.zip -d datasets/coco/annotations/


In [None]:
import os
from torchvision import datasets, transforms

# Locations of the extracted MS-COCO images and caption annotation files
COCO_TRAIN_IMAGES = "./datasets/coco/train2017/"
COCO_VAL_IMAGES = "./datasets/coco/val2017/"
COCO_ANNOTATIONS_TRAIN = "./datasets/coco/annotations/captions_train2017.json"
COCO_ANNOTATIONS_VAL = "./datasets/coco/annotations/captions_val2017.json"

# Image preprocessing: resize every image to 224x224, then convert it to a tensor
_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
]
transform = transforms.Compose(_preprocessing_steps)


def _build_coco_captions(image_root, annotation_file):
    """Create a CocoCaptions dataset for one split using the shared transform."""
    return datasets.CocoCaptions(
        root=image_root,
        annFile=annotation_file,
        transform=transform,
    )


# Build the training and validation splits
train_dataset = _build_coco_captions(COCO_TRAIN_IMAGES, COCO_ANNOTATIONS_TRAIN)
val_dataset = _build_coco_captions(COCO_VAL_IMAGES, COCO_ANNOTATIONS_VAL)

# Report how many samples each split contains
print(f"Training Samples: {len(train_dataset)}")
print(f"Validation Samples: {len(val_dataset)}")


## Load Pre-trained Model

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# Define paths
# Checkpoint file whose weights are loaded into the model below when the file exists.
CKPT_PATH = "./checkpoints/florence_vl_weights.pth"
# NOTE(review): VIT_PATH is not referenced anywhere else in the visible notebook —
# confirm whether it is still needed.
VIT_PATH = "./checkpoints/vision_tower"

# Stand-in for the Florence-VL model: two linear layers, for illustration only
class FlorenceVLModel(torch.nn.Module):
    """Placeholder network: a 2048 -> 512 linear layer followed by a 512 -> 2048 one."""

    def __init__(self):
        super().__init__()
        # Attribute names determine the state_dict keys (encoder.*, decoder.*).
        self.encoder = torch.nn.Linear(2048, 512)
        self.decoder = torch.nn.Linear(512, 2048)

    def forward(self, x):
        """Apply the encoder and then the decoder to ``x`` and return the result."""
        return self.decoder(self.encoder(x))

# Initialize model
model = FlorenceVLModel()

# Pick the device first so checkpoint tensors can be mapped straight onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pretrained weights
if os.path.exists(CKPT_PATH):
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a
    # trusted source (consider weights_only=True if the installed torch supports it).
    state_dict = torch.load(CKPT_PATH, map_location=device)
    # strict=False skips mismatched keys silently, so surface them explicitly;
    # otherwise a checkpoint that matches nothing would still look like a success.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys:
        print(f"Warning: keys missing from checkpoint: {load_result.missing_keys}")
    if load_result.unexpected_keys:
        print(f"Warning: unexpected keys in checkpoint: {load_result.unexpected_keys}")
    print("Pretrained weights loaded successfully.")
else:
    print("Pretrained weights not found.")

# Move model to GPU if available
model.to(device)


## Fine-Tune the Model

In [None]:
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm

# Batch the two splits; only the training split is shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=32)

# Objective and optimiser
# NOTE(review): CrossEntropyLoss expects logits and class-index (or probability)
# targets — confirm that the model output and the caption targets fit that contract.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Run a small number of passes over the training data
epochs = 3  # kept low for quick experiments
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for images, captions in tqdm(train_loader):
        # NOTE(review): assumes both batch elements are tensors — TODO confirm what
        # the default collate produces for the caption targets of this dataset.
        images = images.to(device)
        captions = captions.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        # NOTE(review): the placeholder model's first layer is Linear(2048, 512) —
        # confirm the image batch has a matching last dimension.
        outputs = model(images)
        loss = criterion(outputs, captions)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss for the per-epoch average
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")


## Evaluate the Model

In [None]:
# Evaluation loop: count how often the arg-max prediction equals the target
model.eval()
total_correct = 0
total_samples = 0

# Gradients are not needed for evaluation
with torch.no_grad():
    for images, captions in val_loader:
        # NOTE(review): assumes `captions` is a tensor of class indices with one
        # entry per image — TODO confirm against what the dataset actually yields.
        images, captions = images.to(device), captions.to(device)
        outputs = model(images)

        # Placeholder for metric calculation (e.g., Recall@1)
        correct = (outputs.argmax(dim=1) == captions).sum().item()
        total_correct += correct
        total_samples += captions.size(0)

# Guard against an empty validation loader, which would otherwise raise
# ZeroDivisionError; accuracy is reported as NaN in that case.
accuracy = total_correct / total_samples if total_samples else float("nan")
print(f"Validation Accuracy: {accuracy:.2f}")


## Save the Fine-Tuned Model

In [None]:
# Save the fine-tuned model
output_path = "./checkpoints/florence_vl_finetuned.pth"
# torch.save does not create missing parent directories, and ./checkpoints may not
# exist yet (the loading cell tolerates a missing checkpoint), so create it first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
torch.save(model.state_dict(), output_path)
# Report the path that was actually written.
print(f"Fine-tuned model saved to {output_path}.")
