# Finetuning BEiT3
This notebook fine-tunes the BEiT3 VQA model on VizWiz. In our project we used it to fine-tune both the teacher model and the base model that serves as the comparison baseline for the knowledge-distillation (KD) student.

In [3]:
from transformers import XLMRobertaTokenizer
import torch
from tqdm import tqdm
import os
from beit3_vizwiz_finetuning import initModel, freeze_until, initOptimizerLoss, getDataLoader, validate

Here we specify the training settings: the hyperparameters (initialized from the BEiT3 paper) and the number of epochs we want to fine-tune for while freezing all encoder layers up to `last_unfrozen_layer`. You also need to specify whether you want to train the large or the base variant of VQA-BEiT3. Additionally, set `epoch_of_checkpoint` to the epoch number of the checkpoint you want to resume fine-tuning from, or to 0 if you want to start from the model provided by BEiT3 without any fine-tuning.

In [4]:
# --- Runtime -----------------------------------------------------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Tokenizer ---------------------------------------------------------------
# Vocabulary file handed to XLMRobertaTokenizer; it must exist on disk.
embedding_model = "./models/beit3.spm"
tokenizer = XLMRobertaTokenizer(embedding_model)

# --- Fine-tuning schedule ----------------------------------------------------
# Encoder layer index (as a string) used to build the freeze boundary name.
last_unfrozen_layer = '20'
epochs = 10
batch_size = 16
# Model variant; used in checkpoint paths and passed to the model initializer.
model_type = "large"
# Epoch of the checkpoint to resume from; a value <= 0 selects the original BEiT3 VQA weights.
epoch_of_checkpoint = 12

# --- Checkpoint selection ----------------------------------------------------
# A positive epoch means we resume from one of our own VizWiz checkpoints.
is_pretrained = epoch_of_checkpoint > 0
if is_pretrained:
    checkpoint_path = f"./models/{model_type}/vizwiz_checkpoint_epoch{epoch_of_checkpoint}_{model_type}.tar"
else:
    checkpoint_path = f"./models/{model_type}/beit3_{model_type}_indomain_patch16_480_vqa.pth"

# --- Data and output locations -----------------------------------------------
vizwiz_path = "./VizWiz/"
save_checkpoint_folder = f"./models/{model_type}"

# --- Optimizer hyperparameters (from the BEiT3 VQAv2 fine-tuning setup) ------
# NOTE(review): these three values are not passed to any call in the visible
# cells - confirm that they are actually applied to the optimizer.
lr = 2e-5
opt_betas = (0.9, 0.98)
weight_decay = 0.01

OSError: Not found: "./models/beit3.spm": No such file or directory Error #2

Loading and initializing the model as well as the optimizer and the training data

In [5]:
print("Loading model..")
model = initModel(
    checkpoint_path,
    model_type,
    is_compiled_model_checkpoint=True,
    is_pretrained=is_pretrained,
)

# Freeze the model up to the configured encoder layer, so that only the
# specified encoder layers and the decoder head stay trainable.
freeze_until(model, "beit3.encoder.layers.{}".format(last_unfrozen_layer))

print("Compiling model...")
model = torch.compile(model)
print("Finished compiling")

# Allow TensorFloat32 cores for float32 matmuls for better performance.
torch.set_float32_matmul_precision('high')

# NOTE(review): the optimizer is created without the checkpoint here; a variant
# that also passed `checkpoint_path` was previously commented out. Confirm
# whether optimizer state should be restored when resuming from a checkpoint.
optim, criterion = initOptimizerLoss(model)

Loading model..


NameError: name 'checkpoint_path' is not defined

In [5]:
# Data loader over the VizWiz 'train' split, batched with the configured batch size.
train_loader = getDataLoader(
    tokenizer=tokenizer,
    batch_size=batch_size,
    data_dir=vizwiz_path,
    split="train",
)

# One training-loss entry is appended per epoch by the training loop.
losses = []
# NOTE(review): not referenced by any of the visible cells - confirm it is still needed.
num_correct = 0

beit3.encoder.layers.22.self_attn.k_proj.A.weight
True
beit3.encoder.layers.22.self_attn.k_proj.A.bias
True
beit3.encoder.layers.22.self_attn.k_proj.B.weight
True
beit3.encoder.layers.22.self_attn.k_proj.B.bias
True
beit3.encoder.layers.22.self_attn.v_proj.A.weight
True
beit3.encoder.layers.22.self_attn.v_proj.A.bias
True
beit3.encoder.layers.22.self_attn.v_proj.B.weight
True
beit3.encoder.layers.22.self_attn.v_proj.B.bias
True
beit3.encoder.layers.22.self_attn.q_proj.A.weight
True
beit3.encoder.layers.22.self_attn.q_proj.A.bias
True
beit3.encoder.layers.22.self_attn.q_proj.B.weight
True
beit3.encoder.layers.22.self_attn.q_proj.B.bias
True
beit3.encoder.layers.22.self_attn.out_proj.A.weight
True
beit3.encoder.layers.22.self_attn.out_proj.A.bias
True
beit3.encoder.layers.22.self_attn.out_proj.B.weight
True
beit3.encoder.layers.22.self_attn.out_proj.B.bias
True
beit3.encoder.layers.22.self_attn.inner_attn_ln.A.weight
True
beit3.encoder.layers.22.self_attn.inner_attn_ln.A.bias
True
beit3.

Training and validation loop over the specified number of epochs. After each epoch, a new checkpoint that includes the model's training and validation loss for that epoch is saved to the output folder.

In [None]:
print("Starting training")

# torch.save does not create missing directories, so make sure the output
# folder exists before the first (expensive) epoch finishes.
os.makedirs(save_checkpoint_folder, exist_ok=True)

for epoch in range(epochs):
    print(f"Starting epoch {epoch}")
    # Batch-size-weighted sum of the batch losses and the number of samples
    # seen; their ratio is the per-sample training loss for this epoch.
    running_loss = 0.0
    num_samples = 0

    model.train()
    for data in tqdm(train_loader):

        img = data["image"].to(device)
        q_tokens = data["language_tokens"].to(device)
        labels = data["labels"].to(device)

        optim.zero_grad()

        # Mixed precision training. The autocast device type follows `device`
        # so the cell also runs when CUDA is unavailable.
        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
            logits = model(image=img, question=q_tokens)
            # Compute the loss on float32 logits.
            logits = logits.float()
            loss = criterion(input=logits, target=labels)

        loss.backward()
        optim.step()

        # Weight the batch loss by its size so a smaller last batch is
        # accounted for correctly.
        # Assumes `criterion` returns a mean over the batch - TODO confirm.
        batch_len = img.size(0)
        running_loss += loss.item() * batch_len
        num_samples += batch_len

    # Normalize by the number of samples, matching the per-sample weighting
    # above. Dividing by the number of batches (len(train_loader)) would
    # inflate the reported loss by roughly the batch size.
    epoch_loss = running_loss / max(num_samples, 1)
    losses.append(epoch_loss)

    # Validation on a smaller batch size, without gradient tracking.
    model.eval()
    with torch.no_grad():
        val_loss = validate(
            tokenizer=tokenizer,
            criterion=criterion,
            batch_size=batch_size//2,
            model=model,
            data_dir=vizwiz_path
        )

    # The checkpoint file name continues the numbering of the checkpoint we
    # started from; the stored "epoch" is the index within this run.
    torch.save(
        {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optim.state_dict(),
            "loss": epoch_loss,
            "val_loss": val_loss,
        },
        os.path.join(
            save_checkpoint_folder, f"vizwiz_checkpoint_epoch{epoch + 1 + epoch_of_checkpoint}_{model_type}.tar"
        ),
    )
    print(f"Epoch {epoch} loss: {epoch_loss}")

Starting epoch 0


 80%|████████  | 514/642 [26:00<06:34,  3.08s/it]