## Model

In [None]:
from prismatic.models import get_llm_backbone_and_tokenizer, get_vision_backbone_and_transform, get_vlm

In [None]:
# Instantiate the vision backbone and the image transform returned alongside it.
vision_backbone, image_transform = get_vision_backbone_and_transform(
    "dinosiglip-vit-so-384px",
    image_resize_strategy="resize-naive",
)

In [None]:
# Instantiate the LLM backbone and its tokenizer, passing llm_max_length=2048.
llm_backbone, tokenizer = get_llm_backbone_and_tokenizer(
    "phi2_base",
    llm_max_length=2048,
)

In [None]:
# Assemble the VLM from the vision and LLM backbones built in the earlier
# cells, then print its LLM backbone for inspection.
vlm = get_vlm(
    model_id="dino-siglip-phi2",
    arch_specifier="gelu-mlp",
    vision_backbone=vision_backbone,
    llm_backbone=llm_backbone,
)
print(vlm.llm_backbone)

In [None]:
# Apply the "finetune" freezing stage, then report how many parameters the
# model holds in total and how many of them still require gradients.
vlm.freeze_backbones(stage="finetune")

total_count = 0
trainable_count = 0
for param in vlm.parameters():
    size = param.numel()
    total_count += size
    if param.requires_grad:
        trainable_count += size

print("Total params = ", total_count)
print("Trainable params = ", trainable_count)

# Reference counts noted from earlier runs (labels as recorded by the author):
#
# original
#   Total params     = 3,519,119,543
#   Trainable params = 3,519,119,543
#
# align
#   Total params     = 3,519,119,543
#   Trainable params = 12,129,280
#
# "finetune" with LoRA, after commenting out the line
# `self.llm_backbone.requires_grad_(True)` in prismatic.py
#   Total params     = 3,521,740,983
#   Trainable params = 14,750,720

In [None]:
# Create a prompt builder and print its initial state.
# NOTE(review): the builder obtained from the VLM is overwritten on the very
# next line, so only the PurePromptBuilder instance is actually used below.
# PurePromptBuilder is not imported in the visible part of this notebook --
# confirm it is imported in an earlier cell, or drop one of the assignments.
prompter = vlm.get_prompt_builder()
prompter = PurePromptBuilder(model_family="phi2")
print(
    prompter.model_family,
    prompter.bos,
    prompter.eos,
    prompter.prompt,
    prompter.turn_count,
)

In [14]:
# Feed alternating human/gpt turns into the prompt builder, printing the
# accumulated prompt and the turn count after each turn.
conversation = [
    ("human", "<image> How many apples are in this picture?"),
    ("gpt", "There are 10 apples."),
    ("human", "<image> How many oranges are in this picture?"),
    ("gpt", "There are no oranges in this picture."),
]

for role, message in conversation:
    prompter.add_turn(role=role, message=message)
    print("\n", prompter.prompt, prompter.turn_count)


 In: How many apples are in this picture?
Out:  1

 In: How many apples are in this picture?
Out: There are 10 apples.<|endoftext|> 2

 In: How many apples are in this picture?
Out: There are 10 apples.<|endoftext|>In: How many oranges are in this picture?
Out:  3

 In: How many apples are in this picture?
Out: There are 10 apples.<|endoftext|>In: How many oranges are in this picture?
Out:  <|endoftext|> 4
