In [8]:
from open_flamingo import create_model_and_transforms
from peft.src.peft import LoraModel, LoraConfig
import torch

# Build the Flamingo model, its image processor and its tokenizer from the
# vision encoder and language model named in the arguments below.
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="./llama-7b-hf",
    tokenizer_path="./llama-7b-hf",
    cross_attn_every_n_layers=4,
)

checkpoint_path = "/home/v-boli7/azure_storage/models/openflamingo/checkpoint.pt"
# map_location="cpu" lets a checkpoint that was saved from GPU tensors be
# deserialized without CUDA and without a temporary GPU allocation;
# load_state_dict then copies the values into the model's existing parameters.
checkpoint_state = torch.load(checkpoint_path, map_location="cpu")
# strict=False tolerates keys that do not line up between checkpoint and model.
# Keep the returned report so that checkpoint entries the model could not use
# are surfaced instead of being dropped silently.
load_report = model.load_state_dict(checkpoint_state, strict=False)
if load_report.unexpected_keys:
    print(f"Warning: {len(load_report.unexpected_keys)} checkpoint keys were not used by the model")
del checkpoint_state  # the weights now live in `model`; release the extra copy

# NOTE(review): the language model path points at a LLaMA checkpoint, which is
# a decoder-only model -- confirm that "SEQ_2_SEQ_LM" (rather than "CAUSAL_LM")
# is the intended task type.
# NOTE(review): target_modules are matched against module names; Hugging Face
# LLaMA attention projections are usually named "q_proj"/"v_proj" -- verify
# that "q"/"v" select the layers you actually want to adapt.
config = LoraConfig(
    peft_type="LORA",
    task_type="SEQ_2_SEQ_LM",
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.01,
)

# Wrap the loaded model with LoRA adapters described by `config`.
lora_model = LoraModel(config, model)

# Count only the parameters that will receive gradients.
total_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)
print(f"Total number of trainable parameters in {total_params / 1e6}M")

In [None]:
# List every trainable parameter and count them in the same pass, so the
# summary below always reflects the current state of `lora_model` instead of a
# `total_params` value left behind by an earlier cell.
total_params = 0
for name, param in lora_model.named_parameters():
    if param.requires_grad:
        print(name)
        total_params += param.numel()

print(f"Total number of trainable parameters in {total_params / 1e6}M")

In [1]:
from lavis.datasets.builders import dataset_zoo
# Ask the LAVIS dataset zoo for the names it reports and print the list.
dataset_names = dataset_zoo.get_names()
print(dataset_names)

  from .autonotebook import tqdm as notebook_tqdm


['aok_vqa', 'avsd_dialogue', 'coco_caption', 'coco_retrieval', 'coco_vqa', 'conceptual_caption_12m', 'conceptual_caption_3m', 'didemo_retrieval', 'flickr30k', 'gqa', 'imagenet', 'laion2B_multi', 'msrvtt_caption', 'msrvtt_qa', 'msrvtt_retrieval', 'msvd_caption', 'msvd_qa', 'nlvr', 'nocaps', 'ok_vqa', 'sbu_caption', 'snli_ve', 'vatex_caption', 'vg_caption', 'vg_vqa']


In [6]:
from lavis.datasets.builders import load_dataset
def _keep_samples_as_list(samples):
    """Collate function that returns the list of sample dicts unchanged.

    The default collate function cannot batch PIL images, so the samples are
    handed through as-is.
    """
    return samples


# `load_dataset` returns a mapping of split name -> dataset, not a dataset.
coco_dataset = load_dataset("coco_caption")
# A DataLoader indexes its dataset with integers, so it must be given one
# concrete split rather than the dict of splits.
coco_train_split = coco_dataset["train"]
coco_dataloader = torch.utils.data.DataLoader(
    coco_train_split,
    batch_size=1,
    shuffle=False,
    num_workers=0,
    collate_fn=_keep_samples_as_list,
)
for batch in coco_dataloader:
    print(batch)
    break
# Each batch is a list holding `batch_size` samples; a sample looks like:
# {'image': <PIL.Image.Image image mode=RGB size=640x480>,
#  'text_input': 'A woman wearing a net on her head cutting a cake. ',
#  'image_id': 0}

Using downloaded and verified file: /home/v-boli7/azure_storage/data/lavis/coco/annotations/coco_karpathy_train.json
Using downloaded and verified file: /home/v-boli7/azure_storage/data/lavis/coco/annotations/coco_karpathy_val.json
Using downloaded and verified file: /home/v-boli7/azure_storage/data/lavis/coco/annotations/coco_karpathy_test.json


In [4]:
from lavis.datasets.builders import load_dataset
# This cell loads the "aok_vqa" dataset, so the result is named after it.
aokvqa_dataset = load_dataset("aok_vqa")
# Backward-compatible alias: the original name suggested VQAv2, but the object
# is the A-OKVQA dataset. Kept so other cells using the old name still work.
vqav2_dataset = aokvqa_dataset
print(aokvqa_dataset.keys())          # available splits
print(len(aokvqa_dataset["train"]))   # number of training samples
print(aokvqa_dataset["train"][0])     # first training sample


Downloading https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json to /home/v-boli7/azure_storage/data/lavis/aokvqa/annotations/aokvqa_v1p0_train.json


100%|██████████| 11358135/11358135 [00:00<00:00, 36972039.44it/s]


Downloading https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json to /home/v-boli7/azure_storage/data/lavis/aokvqa/annotations/aokvqa_v1p0_val.json


100%|██████████| 751694/751694 [00:00<00:00, 22747057.45it/s]


Downloading https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json to /home/v-boli7/azure_storage/data/lavis/aokvqa/annotations/specialized_vocab_train_lavis.json


100%|██████████| 52108/52108 [00:00<00:00, 6395224.37it/s]


Downloading https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json to /home/v-boli7/azure_storage/data/lavis/aokvqa/annotations/aokvqa_v1p0_test.json


100%|██████████| 2048674/2048674 [00:00<00:00, 54158677.12it/s]

Using downloaded and verified file: /home/v-boli7/azure_storage/data/lavis/aokvqa/annotations/specialized_vocab_train_lavis.json





dict_keys(['train', 'val', 'test'])
17056
{'image': <PIL.Image.Image image mode=RGB size=640x480 at 0x7F1D0B50E880>, 'text_input': 'What is the man by the bags awaiting?', 'answers': ['ride', 'bus', 'taxi', 'travelling', 'traffic', 'cab', 'his ride'], 'weights': [0.2, 0.1, 0.2, 0.1, 0.1, 0.2, 0.1]}
