<a href="https://colab.research.google.com/github/HaixinLiuNeuro/ALBEF/blob/main/colab_load_pretrained4M_vqaFineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load pretrained model, fine-tune with only VQA dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# setup drive folder
import os

# TODO: Fill in the Google Drive path where you want to save result
GOOGLE_DRIVE_PATH_POST_MYDRIVE = os.path.join('DL_Project', 'ALBEF')
GOOGLE_DRIVE_PATH = os.path.join('/content', 'drive', 'MyDrive', GOOGLE_DRIVE_PATH_POST_MYDRIVE)
os.makedirs(GOOGLE_DRIVE_PATH, exist_ok=True)
print(os.listdir(GOOGLE_DRIVE_PATH))

[]


In [3]:
# if running locally set GOOGLE PATH
import sys
if 'google.colab' in sys.modules:
  print(f'Running in google colab. Our path is `{GOOGLE_DRIVE_PATH}`')
else:
  GOOGLE_DRIVE_PATH = '.'
  print('Running locally.')

Running in google colab. Our path is `/content/drive/MyDrive/DL_Project/ALBEF`


In [4]:
import sys
import numpy as np
import math
sys.path.append(GOOGLE_DRIVE_PATH)
print(f'Google Drive Path: {GOOGLE_DRIVE_PATH}')

Google Drive Path: /content/drive/MyDrive/DL_Project/ALBEF


In [5]:
# Clone the repo to a content
!git clone -b main https://github.com/HaixinLiuNeuro/ALBEF.git /tmp/ALBEF
!cp -r /tmp/ALBEF/* .
!rm -rf /tmp/ALBEF

Cloning into '/tmp/ALBEF'...
remote: Enumerating objects: 365, done.[K
remote: Counting objects: 100% (220/220), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 365 (delta 119), reused 110 (delta 108), pack-reused 145 (from 1)[K
Receiving objects: 100% (365/365), 71.57 MiB | 22.43 MiB/s, done.
Resolving deltas: 100% (141/141), done.


In [6]:
# install dependency
!pip install transformers==4.25.1
!pip install ruamel.yaml==0.17.*
!pip install matplotlib


Collecting transformers==4.25.1
  Downloading transformers-4.25.1-py3-none-any.whl.metadata (93 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.9/93.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.25.1)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m133.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: t

In [7]:
# import
import argparse
import os
import ruamel.yaml as yaml
import numpy as np
import random
import time
import datetime
import json
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn
import torch.distributed as dist

# use vqa model
from models.model_vqa import ALBEF

from models.vit import interpolate_pos_embed
from models.tokenization_bert import BertTokenizer

import utils
from dataset.utils import save_result
from dataset import create_dataset, create_sampler, create_loader, vqa_collate_fn

from scheduler import create_scheduler
from optim import create_optimizer

# print and plotting
from pprint import pprint
import matplotlib.pyplot as plt
from PIL import Image

%load_ext autoreload
%autoreload 2




In [8]:
# %reload_ext autoreload

In [9]:
# prep data
# download from website

# make folder /content/data
DATA_PATH = os.path.join('/content', 'data')
os.makedirs(DATA_PATH, exist_ok=True)

%cd /content/data

# download data from links:
# https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/json_pretrain.zip
# https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/data.tar.gz
# http://images.cocodataset.org/zips/train2014.zip
# http://images.cocodataset.org/zips/val2014.zip
# http://images.cocodataset.org/zips/test2015.zip




# Define the download links
links = [
    "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/json_pretrain.zip",
    "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/data.tar.gz",
    "http://images.cocodataset.org/zips/train2014.zip", # comment out if only run evaluation
    "http://images.cocodataset.org/zips/val2014.zip",   # comment out if only run evaluation
    "http://images.cocodataset.org/zips/test2015.zip"
]

# Download and extract each file
for link in links:
    filename = link.split('/')[-1]
    print(f"Downloading {filename}...")

    # Download file
    !wget -q --show-progress {link}

    print(f"Extracting {filename}...")

    # Extract based on file extension
    if filename.endswith('.zip'):
      if '//images.cocodataset.org/zips/' in link:
        !unzip -q {filename}
      else:
        !unzip -q -j {filename}  # -j option flattens the directory structure for json_pretrain.zip
    elif filename.endswith('.tar.gz'):
        !tar -xzf {filename} --strip-components=1  # Remove the top-level directory

    # Delete the zip/tar file after extraction
    print(f"Removing {filename}...")
    !rm {filename}

    print(f"Finished processing {filename}")

print("All downloads and extractions completed!")

%cd /content

/content/data
Downloading json_pretrain.zip...
Extracting json_pretrain.zip...
Removing json_pretrain.zip...
Finished processing json_pretrain.zip
Downloading data.tar.gz...
Extracting data.tar.gz...
Removing data.tar.gz...
Finished processing data.tar.gz
Downloading train2014.zip...
Extracting train2014.zip...
Removing train2014.zip...
Finished processing train2014.zip
Downloading val2014.zip...
Extracting val2014.zip...
Removing val2014.zip...
Finished processing val2014.zip
Downloading test2015.zip...
Extracting test2015.zip...
Removing test2015.zip...
Finished processing test2015.zip
All downloads and extractions completed!
/content


In [10]:
# !rm -rf /content/data

In [11]:
# check files
%cd /content/data
!ls
%cd /content

/content/data
answer_list.json  flickr30k_test.json	refcoco+_train.json  ve_train.json
cc12m.json	  flickr30k_train.json	refcoco+_val.json    vg.json
cc3m_train.json   flickr30k_val.json	sbu.json	     vg_qa.json
cc3m_val.json	  nlvr_dev.json		test2015	     vqa_test_dev.json
coco.json	  nlvr_test.json	train2014	     vqa_test.json
coco_test.json	  nlvr_train.json	val2014		     vqa_train.json
coco_train.json   refcoco+		ve_dev.json	     vqa_val.json
coco_val.json	  refcoco+_test.json	ve_test.json
/content


In [12]:
#
FETCH_PRETRAINED_MODEL = True
%cd /content

# download data from links:
# https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF_4M.pth
# https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/vqa.pth
# model check point from training
# https://drive.google.com/file/d/1yEsyeB0FkIgWlT2Way_KFLPNLCQy6KoU/view?usp=sharing

if FETCH_PRETRAINED_MODEL:

  # Define the download links
  links = [
      "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF_4M.pth",
      # "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/vqa.pth"
  ]

  # Download and extract each file
  for link in links:
      filename = link.split('/')[-1]
      print(f"Downloading {filename}...")

      # Download file
      !wget -q --show-progress {link}


      print(f"Finished processing {filename}")

  print("All model downloads completed!")




/content
Downloading ALBEF_4M.pth...
Finished processing ALBEF_4M.pth
All model downloads completed!


## Setup for training

In [13]:
# config
%cd /content
args = argparse.Namespace()
args.config = './configs/VQA.yaml'
args.checkpoint = './ALBEF_4M.pth'
args.output_dir = 'output/vqa_4MfinetuneVQA'
args.evaluate = False # to train use False
args.text_encoder = 'bert-base-uncased'
args.text_decoder = 'bert-base-uncased'
args.device = 'cuda'
args.seed = 42
args.distributed = False

config = yaml.load(open(args.config, 'r'), Loader=yaml.Loader)
pprint(config)

# make result folder and save config
args.result_dir = os.path.join(args.output_dir, 'result')

Path(args.output_dir).mkdir(parents=True, exist_ok=True)
Path(args.result_dir).mkdir(parents=True, exist_ok=True)

yaml.dump(config, open(os.path.join(args.output_dir, 'config.yaml'), 'w'))

/content
{'alpha': 0.4,
 'answer_list': 'data/answer_list.json',
 'batch_size_test': 16,
 'batch_size_train': 32,
 'bert_config': 'configs/config_bert.json',
 'distill': True,
 'eos': '[SEP]',
 'image_res': 384,
 'k_test': 128,
 'optimizer': {'lr': 2e-05, 'opt': 'adamW', 'weight_decay': 0.02},
 'schedular': {'cooldown_epochs': 0,
               'decay_rate': 1,
               'epochs': 8,
               'lr': 2e-05,
               'min_lr': 1e-06,
               'sched': 'cosine',
               'warmup_epochs': 4,
               'warmup_lr': 1e-05},
 'test_file': ['data/vqa_test.json'],
 'train_file': ['data/vqa_train.json', 'data/vqa_val.json'],
 'vg_root': 'data/',
 'vqa_root': 'data/',
 'warm_up': True}


In [14]:
# training functions
def train(model, data_loader, optimizer, tokenizer, epoch, warmup_steps, device, scheduler, config):
    # train
    model.train()

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter('loss', utils.SmoothedValue(window_size=1, fmt='{value:.4f}'))

    header = 'Train Epoch: [{}]'.format(epoch)
    print_freq = 50
    step_size = 100
    warmup_iterations = warmup_steps*step_size

    for i,(image, question, answer, weights, n) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
        image, weights = image.to(device,non_blocking=True), weights.to(device,non_blocking=True)
        question_input = tokenizer(question, padding='longest', truncation=True, max_length=25, return_tensors="pt").to(device)
        answer_input = tokenizer(answer, padding='longest', return_tensors="pt").to(device)

        if epoch>0 or not config['warm_up']:
            alpha = config['alpha']
        else:
            alpha = config['alpha']*min(1,i/len(data_loader))

        loss = model(image, question_input, answer_input, train=True, alpha=alpha, k=n, weights=weights)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        metric_logger.update(loss=loss.item())
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if epoch==0 and i%step_size==0 and i<=warmup_iterations:
            scheduler.step(i//step_size)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger.global_avg())
    return {k: "{:.3f}".format(meter.global_avg) for k, meter in metric_logger.meters.items()}

@torch.no_grad()
def evaluation(model, data_loader, tokenizer, device, config) :
    # test
    model.eval()

    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Generate VQA test result:'
    print_freq = 50

    result = []

    answer_list = [answer+config['eos'] for answer in data_loader.dataset.answer_list]
    answer_input = tokenizer(answer_list, padding='longest', return_tensors='pt').to(device)

    for n, (image, question, question_id) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
        image = image.to(device,non_blocking=True)
        question_input = tokenizer(question, padding='longest', return_tensors="pt").to(device)

        topk_ids, topk_probs = model(image, question_input, answer_input, train=False, k=config['k_test'])

        for ques_id, topk_id, topk_prob in zip(question_id, topk_ids, topk_probs):
            ques_id = int(ques_id.item())
            _, pred = topk_prob.max(dim=0)
            result.append({"question_id":ques_id, "answer":data_loader.dataset.answer_list[topk_id[pred]]})

    return result

In [15]:
# setup for training/evaluation (from main)
utils.init_distributed_mode(args)

device = torch.device(args.device)
print(f'device: {device}')

# fix the seed for reproducibility
seed = args.seed + utils.get_rank()
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
cudnn.benchmark = True

start_epoch = 0
max_epoch = config['schedular']['epochs']
warmup_steps = config['schedular']['warmup_epochs']

Not using distributed mode
device: cuda


In [16]:
# make dataset and dataloader
print("Creating vqa datasets")
datasets = create_dataset('vqa', config)

if args.distributed:
    num_tasks = utils.get_world_size()
    global_rank = utils.get_rank()
    samplers = create_sampler(datasets, [True, False], num_tasks, global_rank)
else:
    samplers = [None, None]

train_loader, test_loader = create_loader(datasets,samplers,
                                          batch_size=[config['batch_size_train'],config['batch_size_test']],
                                          num_workers=[4,4],is_trains=[True, False],
                                          collate_fns=[vqa_collate_fn,None])

tokenizer = BertTokenizer.from_pretrained(args.text_encoder)

Creating vqa datasets


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [17]:
#### Model ####
print("Creating model")
model = ALBEF(config=config, text_encoder=args.text_encoder, text_decoder=args.text_decoder, tokenizer=tokenizer)
model = model.to(device)

arg_opt = utils.AttrDict(config['optimizer'])
optimizer = create_optimizer(arg_opt, model)
arg_sche = utils.AttrDict(config['schedular'])
lr_scheduler, _ = create_scheduler(arg_sche, optimizer)

# check model
model


Creating model


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

ALBEF(
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-11): 12 x Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
     

In [18]:
# load check point to continue training
if args.checkpoint:
    checkpoint = torch.load(args.checkpoint, map_location='cpu')
    if args.evaluate:
        state_dict = checkpoint
    else:
        state_dict = checkpoint['model']

    # reshape positional embedding to accomodate for image resolution change
    pos_embed_reshaped = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)
    state_dict['visual_encoder.pos_embed'] = pos_embed_reshaped
    # # Check if the key exists before accessing it
    # if 'visual_encoder.pos_embed' in state_dict:
    #     # reshape positional embedding to accomodate for image resolution change
    #     pos_embed_reshaped = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)
    #     state_dict['visual_encoder.pos_embed'] = pos_embed_reshaped
    # else:
    #     print("Warning: 'visual_encoder.pos_embed' not found in checkpoint. Skipping positional embedding interpolation.")


    if not args.evaluate:
        if config['distill']:
            m_pos_embed_reshaped = interpolate_pos_embed(state_dict['visual_encoder_m.pos_embed'],model.visual_encoder_m)
            state_dict['visual_encoder_m.pos_embed'] = m_pos_embed_reshaped

        for key in list(state_dict.keys()):
            if 'bert' in key:
                encoder_key = key.replace('bert.','')
                state_dict[encoder_key] = state_dict[key]
            # intialize text decoder as multimodal encoder (last 6 layers of model.text_encoder)
            if 'text_encoder' in key:
                if 'layer' in key:
                    # print(key)
                    encoder_keys = key.split('.')
                    print(encoder_keys)
                    # print(encoder_keys[4])
                    tmp_fix_idx = 4 # for the downsized model, idx 5 is the layer number
                    layer_num = int(encoder_keys[tmp_fix_idx]) # 4
                    if layer_num<6:
                        del state_dict[key]
                        continue
                    else:
                        decoder_layer_num = (layer_num-6)
                        encoder_keys[4] = str(decoder_layer_num)
                        encoder_key = '.'.join(encoder_keys)
                else:
                    encoder_key = key
                decoder_key = encoder_key.replace('text_encoder','text_decoder')
                state_dict[decoder_key] = state_dict[key]

                del state_dict[key]

    msg = model.load_state_dict(state_dict,strict=False)
    print('load checkpoint from %s'%args.checkpoint)
    print(msg)


reshape position embedding from 256 to 576
reshape position embedding from 256 to 576
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'self', 'query', 'weight']
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'self', 'query', 'bias']
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'self', 'key', 'weight']
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'self', 'key', 'bias']
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'self', 'value', 'weight']
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'self', 'value', 'bias']
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'output', 'dense', 'weight']
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'output', 'dense', 'bias']
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'output', 'LayerNorm', 'weight']
['text_encoder', 'bert', 'encoder', 'layer', '0', 'attention', 'output', 'LayerNorm', 'bias']
['text_encod

In [19]:
# handle distributed training
model_without_ddp = model
if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    model_without_ddp = model.module


In [20]:
# training loop, single GPU
print("Start training")
start_time = time.time()

for epoch in range(start_epoch, max_epoch):
    if epoch>0:
        lr_scheduler.step(epoch+warmup_steps)

    if not args.evaluate:
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)

        train_stats = train(model, train_loader, optimizer, tokenizer, epoch, warmup_steps, device, lr_scheduler, config)

    if args.evaluate:
        break

    if utils.is_main_process():
        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                      'epoch': epoch,
                    }
        with open(os.path.join(args.output_dir, "log.txt"),"a") as f:
            f.write(json.dumps(log_stats) + "\n")

        save_obj = {
            'model': model_without_ddp.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'config': config,
            'epoch': epoch,
        }
        torch.save(save_obj, os.path.join(args.output_dir, 'checkpoint_%02d.pth'%epoch))
    if args.distributed:
        dist.barrier()
    else:
        pass  # Skip barrier for non-distributed training

# evaluation
vqa_result = evaluation(model, test_loader, tokenizer, device, config)
result_file = save_result(vqa_result, args.result_dir, 'vqa_result_epoch%d'%epoch)

total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Time time {}'.format(total_time_str))

Start training


  offset = -low * scale
  offset = -low * scale
  offset = -low * scale


OutOfMemoryError: CUDA out of memory. Tried to allocate 164.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 38.12 MiB is free. Process 10381 has 14.70 GiB memory in use. Of the allocated memory 14.23 GiB is allocated by PyTorch, and 351.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
GOOGLE_DRIVE_PATH

In [None]:
# save result to google drive
!cp -r {args.output_dir} {GOOGLE_DRIVE_PATH}