## Model

In [1]:
from prismatic.models import get_llm_backbone_and_tokenizer, get_vision_backbone_and_transform, get_vlm
from prismatic.models.backbones.llm.prompting import PurePromptBuilder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the vision backbone registered under the id "dinosiglip-vit-so-384px"
# together with its matching image transform (resize strategy: "resize-naive").
vision_backbone, image_transform = get_vision_backbone_and_transform(
    "dinosiglip-vit-so-384px",
    image_resize_strategy="resize-naive",
)

In [3]:
# Instantiate the LLM backbone registered under the id "phi2_base" and its tokenizer,
# with the maximum sequence length set to 2048.
llm_backbone, tokenizer = get_llm_backbone_and_tokenizer(
    "phi2_base",
    llm_max_length=2048,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]


In [4]:
# Assemble the VLM from the two backbones created above, using the "gelu-mlp"
# architecture specifier.
vlm = get_vlm(
    model_id="dino-siglip-phi2",
    arch_specifier="gelu-mlp",
    vision_backbone=vision_backbone,
    llm_backbone=llm_backbone,
)

# Dump the LLM backbone's module tree; in the recorded output the Phi model is wrapped
# in a PEFT LoRA model (rank-8 adapters are visible on `q_proj`).
print(vlm.llm_backbone)

Phi2LLMBackbone(
  (llm): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): PhiForCausalLM(
        (model): PhiModel(
          (embed_tokens): Embedding(50295, 2560)
          (embed_dropout): Dropout(p=0.0, inplace=False)
          (layers): ModuleList(
            (0-31): 32 x PhiDecoderLayer(
              (self_attn): PhiAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=2560, out_features=2560, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2560, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=2560, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterD

In [5]:
# Put the model into the "finetune" stage, then report how many parameters exist and
# how many of them will receive gradients.
vlm.freeze_backbones(stage="finetune")

total_params = 0
trainable_params = 0
for param in vlm.parameters():
    count = param.numel()
    total_params += count
    if param.requires_grad:
        trainable_params += count

print("Total params = ", total_params)
print("Trainable params = ", trainable_params)

# Reference numbers recorded from earlier runs (labels are the author's own):
#
#   "original"
#       Total params     = 3,519,119,543
#       Trainable params = 3,519,119,543
#
#   "align"
#       Total params     = 3,519,119,543
#       Trainable params =    12,129,280
#
#   "finetune" without LoRA
#   (with the line `self.llm_backbone.requires_grad_(True)` commented out in prismatic.py)
#       Total params     = 3,519,119,543
#       Trainable params = 2,787,178,615
#
#   "finetune" with LoRA
#   (with the line `self.llm_backbone.requires_grad_(True)` commented out in prismatic.py)
#       Total params     = 3,521,740,983
#       Trainable params =    14,750,720

Total params =  3521740983
Trainable params =  14750720


In [12]:
# Build the prompt builder used for the conversation demo below.
#
# The original cell first assigned `prompter = vlm.get_prompt_builder()` and then
# immediately overwrote it, so that first assignment was a dead store; only the explicit
# PurePromptBuilder was ever used (it is what produced the recorded output), and it is
# kept here. Use `vlm.get_prompt_builder()` instead to get the builder the model itself
# would select.
#
# `PurePromptBuilder` must be imported in the notebook's import cell; without that import
# this cell fails with NameError on a fresh kernel.
prompter = PurePromptBuilder(model_family="phi2")

# Show the builder's initial state: model family, BOS/EOS strings, the (empty) prompt so
# far, and the turn counter.
print(prompter.model_family, prompter.bos, prompter.eos, prompter.prompt, prompter.turn_count)

phi2 <|endoftext|> <|endoftext|>  0


In [14]:
# Feed an alternating human / gpt conversation into the prompt builder, printing the
# accumulated prompt and the turn counter after every turn.
# Note: this cell mutates `prompter`, so re-running it without re-creating the builder
# keeps appending turns.
conversation = [
    ("human", "<image> How many apples are in this picture?"),
    ("gpt", "There are 10 apples."),
    ("human", "<image> How many oranges are in this picture?"),
    ("gpt", "There are no oranges in this picture."),
]

for speaker, text in conversation:
    prompter.add_turn(role=speaker, message=text)
    print("\n", prompter.prompt, prompter.turn_count)

# Observations:
# - PurePromptBuilder does not include a system prompt yet!
# - <|endoftext|> appears inside the chat history after each gpt turn -- is that intended???
# - NOTE(review): the recorded output for the 4th turn shows an empty answer before
#   <|endoftext|> instead of the gpt message passed above -- re-run to confirm.


 In: How many apples are in this picture?
Out:  1

 In: How many apples are in this picture?
Out: There are 10 apples.<|endoftext|> 2

 In: How many apples are in this picture?
Out: There are 10 apples.<|endoftext|>In: How many oranges are in this picture?
Out:  3

 In: How many apples are in this picture?
Out: There are 10 apples.<|endoftext|>In: How many oranges are in this picture?
Out:  <|endoftext|> 4
