In [1]:
import os
import copy
from dataclasses import dataclass, field
import json
import logging
import pathlib
from typing import Dict, Optional, Sequence, List

import torch

import transformers

from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from torch.utils.data import Dataset
from llava.train.llava_trainer import LLaVATrainer

from llava import conversation as conversation_lib
from llava.model import *
from llava.mm_utils import tokenizer_image_token

from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


[2023-10-26 21:38:11,503] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:

# ViTMAEConfig()
# LlavaGeoConfig()
from llava.model.language_model.llava_llama import LlavaGeoConfig, LlavaGeoLlamaForCausalLM
from transformers import AutoImageProcessor, ViTMAEForPreTraining
from transformers.models.vit_mae.configuration_vit_mae import *
from transformers import AutoConfig, AutoModelForCausalLM, \
                         LlamaConfig, LlamaModel, LlamaForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig


In [5]:
# Build a LlavaGeo model "from scratch": only the configuration is read from the
# LLaVA-1.5-7B checkpoint id, and the model is then constructed from that config
# object rather than through `from_pretrained`.
# NOTE(review): both names are rebound later in this notebook -- `model` by the
# `from_pretrained` cell and `config` by the ViT-MAE toy-config cell -- so the
# result of this cell depends on execution order.
config = LlavaGeoConfig.from_pretrained("liuhaotian/llava-v1.5-7b")
model = LlavaGeoLlamaForCausalLM(config)
# Bare last expression so the notebook displays the module tree.
model

You are using a model of type llava to instantiate a model of type llava_geo. This is not supported for all configurations of models and can yield errors.


In [3]:
# Load the model from the pretrained LLaVA-1.5-7B checkpoint.

# Settings for the MAE reconstruction decoder, forwarded to `from_pretrained`
# as an extra keyword argument.
# NOTE(review): presumably "base_config" names the ViT-MAE config to start from
# and the other keys override fields of it -- confirm in llava_llama.py.
mae_decoder_config = {
    "base_config": "facebook/vit-mae-base",
    "norm_pix_loss": True,
    "decoder_num_hidden_layers": 2
}
# Loss names and one weight per loss name, also forwarded to `from_pretrained`.
# NOTE(review): presumably the weights scale each term of the total loss -- confirm.
losses = ["lm", "mae"]
loss_weights = {
    "lm": 1.0,
    "mae": 1.0
}
# Hard-coded GPU index; used as `device_map` below.
device = 'cuda:2'

kwargs = {
    "device_map": device,
    "mae_decoder_config": mae_decoder_config,
    "losses": losses,
    "loss_weights": loss_weights,
    # "torch_dtype": torch.bfloat16
}

# NOTE(review): the load log for this cell lists the `mae_decoder.*` weights as
# newly initialized, and the forward-pass cells further down print
# `reconstruction_loss tensor(nan, ...)`. It is worth checking that those new
# weights hold finite values after loading with `device_map` -- TODO confirm.
model = LlavaGeoLlamaForCausalLM.from_pretrained('liuhaotian/llava-v1.5-7b', **kwargs)
# Bare last expression so the notebook displays the module tree.
model

You are using a model of type llava to instantiate a model of type llava_geo. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 2/2 [00:47<00:00, 23.56s/it]
Some weights of LlavaGeoLlamaForCausalLM were not initialized from the model checkpoint at liuhaotian/llava-v1.5-7b and are newly initialized: ['mae_decoder.decoder_layers.1.intermediate.dense.weight', 'mae_decoder.decoder_layers.0.intermediate.dense.weight', 'mae_decoder.mask_token', 'mae_decoder.decoder_layers.1.layernorm_before.bias', 'mae_decoder.decoder_layers.0.attention.output.dense.weight', 'mae_decoder.decoder_layers.0.attention.attention.key.bias', 'mae_decoder.decoder_norm.weight', 'mae_decoder.decoder_layers.1.attention.attention.value.bias', 'mae_decoder.decoder_layers.0.layernorm_after.bias', 'mae_decoder.decoder_layers.0.layernorm_after.weight', 'mae_decoder.decoder_layers.1.attention.attention.query.bias', 'mae_decoder.decoder_embed.weight', 'mae

LlavaGeoLlamaForCausalLM(
  (model): LlavaLlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm

In [None]:
# model.config
# Print one "<parameter name> <dtype>" line for every parameter of `model`.
# Depends on `model` having been created by an earlier cell.
for name, param in model.named_parameters():
    param_dtype = param.dtype
    print(name, param_dtype)

In [5]:
import argparse
import torch

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

import requests
from PIL import Image
from io import BytesIO
from transformers import TextStreamer

import json
from tqdm import tqdm
import copy

# Hard-coded GPU index (same value as in the model-loading cell).
device = 'cuda:2'

# dtype the vision tower is cast to, and that image tensors are cast to in the
# forward-pass test cell.
vision_tower_dtype = torch.bfloat16
# vision_tower_dtype = torch.float32
# model.to(device, dtype=torch.float16)

# Tokenizer from the same checkpoint id the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained("liuhaotian/llava-v1.5-7b")

# Make sure the vision tower is loaded, then move/cast it and take its image
# processor for preprocessing PIL images.
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(device=device, dtype=vision_tower_dtype)
image_processor = vision_tower.image_processor



In [6]:
# Build the inputs for one image + one question, then run a single forward pass
# with labels so the model computes its losses.
# Depends on `model`, `tokenizer`, `image_processor` and `vision_tower_dtype`
# from earlier cells.

# get inputs
# The prompt is the image placeholder token, a newline, then the question.
text_prompt = DEFAULT_IMAGE_TOKEN + '\n' + "What's in this image?"
input_ids = tokenizer_image_token(
    text_prompt,
    tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device=model.device)

# NOTE(review): absolute, machine-specific path -- the cell only runs on the
# original workstation.
image = Image.open('/data/wangz3/projects/ecole-gvs-method/third_party/LLaVA/images/llava_logo.png').convert('RGB')
image_tensor = process_images([image], image_processor, {})
# `process_images` may hand back a list of tensors or a single tensor; either
# way move the pixels to the model device in the vision tower's dtype.
if type(image_tensor) is list:
    image_tensor = [img.to(model.device, dtype=vision_tower_dtype) for img in image_tensor]
else:
    image_tensor = image_tensor.to(model.device, dtype=vision_tower_dtype)

# do inference
# The conversation template and stopping criteria are only consumed by the
# (currently commented-out) `generate` call below.
conv_mode = "llava_v1"
conv = conv_templates[conv_mode].copy()

stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

# print("=== test inference ===")
# temperature = 0.2
# output_ids = model.generate(
#     input_ids,
#     images=image_tensor,
#     do_sample=True if temperature > 0 else False,
#     temperature=temperature,
#     max_new_tokens=520,
#     # streamer=streamer,
#     use_cache=True,
#     stopping_criteria=[stopping_criteria])
# outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
# print(outputs)


print("=== test forward ===")
# Use a separate copy of the token ids as labels so `input_ids` and `labels`
# never alias the same tensor inside the model.
labels = input_ids.clone()
# image_tensor = torch.rand(1, 3, 336, 336).to(device=model.device, dtype=torch.float16)
outputs = model(input_ids, labels=labels, images=image_tensor)



=== test forward ===
images.shape: torch.Size([1, 3, 336, 336])
image_features.shape: torch.Size([1, 577, 1024])
inputs_embeds shape torch.Size([1, 586, 4096])
image_features_with_cls shape torch.Size([1, 577, 4096])
lm loss: tensor(4.1562, device='cuda:2', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
sequence_unmasked shape torch.Size([1, 144, 4096])
mask shape torch.Size([1, 576])
ids_restore shape torch.Size([1, 576])
reconstruction_logits shape torch.Size([1, 576, 588])
pred shape | target shape: torch.Size([1, 576, 588]) torch.Size([1, 576, 588])
reconstruction_loss tensor(nan, device='cuda:2', grad_fn=<DivBackward0>)
total loss: tensor(nan, device='cuda:2', grad_fn=<AddBackward0>)
> [0;32m/data/wangz3/projects/ecole-gvs-method/third_party/LLaVA/llava/model/language_model/llava_llama.py[0m(385)[0;36mforward[0;34m()[0m
[0;32m    383 [0;31m            [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m

In [9]:
import os
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

# os.environ["CUDA_VISIBLE_DEVICES"] = "7"
# device = torch.device("cuda:7")
# # print(text_inputs)
# model.to(device, dtype=torch.float16)
# vision_tower.to(device=model.device, dtype=torch.float16)
# model.eval()
# print(model)

# Second forward-pass smoke test, this time with a random image tensor instead
# of a real picture. Depends on `model` from an earlier cell.
tokenizer = AutoTokenizer.from_pretrained("liuhaotian/llava-v1.5-7b")
text_prompt = DEFAULT_IMAGE_TOKEN + '\n' + "Hello, my dog is cute"
input_ids = tokenizer_image_token(
    text_prompt, 
    tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device=model.device)
print(input_ids.shape)
# NOTE(review): the image is created as float16 here, while the setup cell casts
# the vision tower to `vision_tower_dtype` (bfloat16). The recorded output of
# this cell ends in a NaN reconstruction loss -- check whether the dtype
# mismatch contributes. TODO confirm.
images = torch.rand(1, 3, 336, 336).to(device=model.device, dtype=torch.float16)
# text_inputs = tokenizer("Hello, my dog is cute", return_tensors="pt").to(model.device)
# The same tensor is passed as both inputs and labels.
outputs = model(input_ids, labels = input_ids, images=images)



torch.Size([1, 11])
select feature: cls_patch
images.shape: torch.Size([1, 3, 336, 336])
image_features.shape: torch.Size([1, 577, 1024])
select feature: patch
inputs_embeds shape torch.Size([1, 586, 4096])
image_features_with_cls shape torch.Size([1, 577, 4096])
sequence_unmasked shape torch.Size([1, 144, 4096])
mask shape torch.Size([1, 576])
ids_restore shape torch.Size([1, 576])
reconstruction_logits shape torch.Size([1, 576, 588])
pred shape | target shape: torch.Size([1, 576, 588]) torch.Size([1, 576, 588])
reconstruction_loss tensor(nan, device='cuda:2', grad_fn=<DivBackward0>)
total loss: tensor(nan, device='cuda:2', grad_fn=<AddBackward0>)


In [None]:

# def count_parameters(model):
#     total_params = sum(p.numel() for p in model.parameters())
#     print(f'{total_params:,} total parameters.')
#     llava_params = sum(p.numel() for p in model.model.parameters())
#     print(f'{llava_params:,} llava parameters.')
#     mm_projector_params = sum(p.numel() for p in model.model.mm_projector.parameters())
#     print(f'{mm_projector_params:,} mm projector parameters.')
#     mae_decoder_params = sum(p.numel() for p in model.mae_decoder.parameters())
#     print(f'{mae_decoder_params:,} mae decoder parameters.')

#     print("mae decoder param ratio:", mae_decoder_params / total_params)
#     # trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
# print(model)
# count_parameters(model)


In [None]:
# device = torch.device("cuda:7" if torch.cuda.is_available() else "cpu")
# model.to(device, dtype=torch.float16)
# for name, param in model.named_parameters():
#     print(f"Layer {name} has dtype: {param.dtype}")

In [23]:
from transformers import AutoImageProcessor, ViTMAEForPreTraining
from transformers.models.vit_mae.configuration_vit_mae import *
# from transformers.models.vit_mae.configuration_vit_mae import VitMAEConfig
# image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
# mae_model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
# Start from the published ViT-MAE base config and shrink every width/depth
# field to a toy size so the decoder below is cheap to build and inspect.
# NOTE(review): this rebinds the global `config`, which an earlier cell uses for
# the LlavaGeo config.
config = ViTMAEConfig.from_pretrained("facebook/vit-mae-base")
config.hidden_size = 4
config.decoder_hidden_size = 4
config.decoder_intermediate_size = 8
config.intermediate_size = 8
config.decoder_num_attention_heads = 2
config.num_attention_heads = 2
config.num_hidden_layers = 2
config.decoder_num_hidden_layers = 2

print(config)

# The star import is what brings `ViTMAEDecoder` into scope.
from transformers.models.vit_mae.modeling_vit_mae import *
# 16 patches matches `num_patches` used by the random-masking test cell.
mae_decoder = ViTMAEDecoder(config, num_patches = 16)


ViTMAEConfig {
  "architectures": [
    "ViTMAEForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.0,
  "decoder_hidden_size": 4,
  "decoder_intermediate_size": 8,
  "decoder_num_attention_heads": 2,
  "decoder_num_hidden_layers": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 4,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 8,
  "layer_norm_eps": 1e-12,
  "mask_ratio": 0.75,
  "model_type": "vit_mae",
  "norm_pix_loss": false,
  "num_attention_heads": 2,
  "num_channels": 3,
  "num_hidden_layers": 2,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.31.0"
}



In [27]:
def random_masking(mask_ratio, sequence, noise=None, verbose=True):
    """
    Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random
    noise.

    Args:
        mask_ratio (`float`):
            Fraction of positions to mask. `int(seq_length * (1 - mask_ratio))` positions are kept.
        sequence (`torch.Tensor` of shape `(batch_size, sequence_length, dim)`):
            Sequence to mask along dimension 1.
        noise (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*) which is
            mainly used for testing purposes to control randomness and maintain the reproducibility
        verbose (`bool`, *optional*, defaults to `True`):
            When `True`, print every intermediate tensor (the notebook's exploratory behaviour). Pass
            `False` to run silently.

    Returns:
        `tuple(torch.Tensor, torch.Tensor, torch.Tensor)`:
            - `sequence_unmasked` of shape `(batch_size, len_keep, dim)`: the kept positions, ordered by
              ascending noise.
            - `mask` of shape `(batch_size, sequence_length)`: 0 where a position is kept, 1 where it is
              removed, in the original position order.
            - `ids_restore` of shape `(batch_size, sequence_length)`: indices that undo the shuffle.
    """
    batch_size, seq_length, dim = sequence.shape
    # int() truncates, so a fractional keep count is rounded down.
    len_keep = int(seq_length * (1 - mask_ratio))

    if noise is None:
        noise = torch.rand(batch_size, seq_length, device=sequence.device)  # noise in [0, 1)

    def _debug(tensor):
        # Single switch for the exploratory tensor dumps.
        if verbose:
            print(tensor)

    _debug(sequence)
    _debug(noise)

    # sort noise for each sample
    ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
    _debug(ids_shuffle)
    # argsort of a permutation is its inverse: it maps shuffled order back to the original order
    ids_restore = torch.argsort(ids_shuffle, dim=1)
    _debug(ids_restore)

    # keep the first subset
    ids_keep = ids_shuffle[:, :len_keep]
    # gather needs an index of the same rank as `sequence`: repeat each kept id across `dim`
    gather_index = ids_keep.unsqueeze(-1).repeat(1, 1, dim)
    sequence_unmasked = torch.gather(sequence, dim=1, index=gather_index)
    _debug(gather_index)
    _debug(sequence_unmasked)

    # generate the binary mask: 0 is keep, 1 is remove
    mask = torch.ones([batch_size, seq_length], device=sequence.device)
    mask[:, :len_keep] = 0
    # unshuffle to get the binary mask
    mask = torch.gather(mask, dim=1, index=ids_restore)
    _debug(mask)
    return sequence_unmasked, mask, ids_restore

# Drive `random_masking` on a small random sequence and prepend a class token,
# producing the `hidden_states` / `ids_restore` pair the decoder cell consumes.
# set random seed
torch.manual_seed(0)
# `dim` follows the toy ViT-MAE config built in an earlier cell.
batch_size, num_patches, dim = 1, 16, config.hidden_size 
test_seq = torch.rand((batch_size, num_patches, dim))
# Mask half of the 16 patches, so 8 are kept.
sequence_unmasked, mask, ids_restore = random_masking(0.5, test_seq)
# add cls token
cls_token = torch.rand((batch_size, 1, dim))
# Class token goes first, followed by the kept patches.
hidden_states = torch.cat((cls_token, sequence_unmasked), dim=1)
print(hidden_states.shape)

tensor([[[0.4963, 0.7682, 0.0885, 0.1320],
         [0.3074, 0.6341, 0.4901, 0.8964],
         [0.4556, 0.6323, 0.3489, 0.4017],
         [0.0223, 0.1689, 0.2939, 0.5185],
         [0.6977, 0.8000, 0.1610, 0.2823],
         [0.6816, 0.9152, 0.3971, 0.8742],
         [0.4194, 0.5529, 0.9527, 0.0362],
         [0.1852, 0.3734, 0.3051, 0.9320],
         [0.1759, 0.2698, 0.1507, 0.0317],
         [0.2081, 0.9298, 0.7231, 0.7423],
         [0.5263, 0.2437, 0.5846, 0.0332],
         [0.1387, 0.2422, 0.8155, 0.7932],
         [0.2783, 0.4820, 0.8198, 0.9971],
         [0.6984, 0.5675, 0.8352, 0.2056],
         [0.5932, 0.1123, 0.1535, 0.2417],
         [0.7262, 0.7011, 0.2038, 0.6511]]])
tensor([[0.7745, 0.4369, 0.5191, 0.6159, 0.8102, 0.9801, 0.1147, 0.3168, 0.6965,
         0.9143, 0.9351, 0.9412, 0.5995, 0.0652, 0.5460, 0.1872]])
tensor([[13,  6, 15,  7,  1,  2, 14, 12,  3,  8,  0,  4,  9, 10, 11,  5]])
tensor([[10,  4,  5,  8, 11, 15,  1,  3,  9, 12, 13, 14,  7,  0,  6,  2]])
tensor([[[13

In [30]:
# Run the toy MAE decoder on the class token + kept patches; `ids_restore` is
# passed alongside so the decoder can produce one prediction per original patch
# (the recorded logits have 16 positions for 9 input tokens).
decoder_output = mae_decoder(hidden_states, ids_restore)
print(hidden_states.shape)
print(ids_restore.shape)
print(decoder_output.logits.shape) # (batch_size, num_patches, patch_size * patch_size * 3)

torch.Size([1, 9, 4])
torch.Size([1, 16])
torch.Size([1, 16, 768])


In [36]:
# the following is how CLIP encoder get the image patches
# the patches are ordered: left to right, top to bottom
from transformers.models.clip.modeling_clip import CLIPVisionConfig
class CLIPVisionEmbeddings(nn.Module):
    """Class-token + patch + position embeddings, instrumented with shape prints.

    A strided convolution cuts the image into non-overlapping patches; flattening
    its output grid gives the patch order (left to right, top to bottom). One
    learned class embedding is prepended and learned position embeddings are
    added to every token.
    """

    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learned class token, drawn from a standard normal.
        class_init = torch.randn(self.embed_dim)
        self.class_embedding = nn.Parameter(class_init)

        # kernel == stride == patch size, so each output cell sees exactly one patch.
        conv_options = dict(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )
        self.patch_embedding = nn.Conv2d(**conv_options)

        patches_per_side = self.image_size // self.patch_size
        self.num_patches = patches_per_side * patches_per_side
        # One extra position for the class token.
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        all_positions = torch.arange(self.num_positions).expand((1, -1))
        # Non-persistent: available on the module but kept out of the state dict.
        self.register_buffer("position_ids", all_positions, persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Embed `pixel_values` and return `(batch, num_patches + 1, embed_dim)` tokens."""
        num_images = pixel_values.shape[0]
        weight_dtype = self.patch_embedding.weight.dtype
        # shape = [*, width, grid, grid]
        feature_grid = self.patch_embedding(pixel_values.to(dtype=weight_dtype))
        print("patch_embeds.shape", feature_grid.shape)
        # Collapse the spatial grid into a sequence axis and move channels last.
        patch_tokens = feature_grid.flatten(2).transpose(1, 2)
        print("patch_embeds.shape falttened", patch_tokens.shape)

        class_tokens = self.class_embedding.expand(num_images, 1, -1)
        tokens = torch.cat([class_tokens, patch_tokens], dim=1)
        return tokens + self.position_embedding(self.position_ids)

# Instantiate the embedding layer with the default CLIP vision config (the
# printed config shows image_size 224 and patch_size 32, i.e. a 7x7 grid = 49
# patches, plus one class token = 50 positions) and push a random image through.
clip_config = CLIPVisionConfig()
print(clip_config)
clip_emb_layer = CLIPVisionEmbeddings(clip_config)
print(clip_emb_layer)
# `batch_size` comes from the random-masking test cell; `pixel_values` is reused
# by the patchify cell.
pixel_values = torch.rand((batch_size, 3, 224, 224))
patch_embeds = clip_emb_layer(pixel_values)
print(patch_embeds.shape)

CLIPVisionConfig {
  "attention_dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 32,
  "projection_dim": 512,
  "transformers_version": "4.31.0"
}

CLIPVisionEmbeddings(
  (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
  (position_embedding): Embedding(50, 768)
)
patch_embeds.shape torch.Size([1, 768, 7, 7])
patch_embeds.shape falttened torch.Size([1, 49, 768])
torch.Size([1, 50, 768])


In [39]:
patch_size, num_channels = 32, 3
def patchify(pixel_values):
    """
    Cut square images into flattened, non-overlapping patches.

    Reads the module-level `patch_size` and `num_channels`, and prints the
    intermediate shapes as it goes.

    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.

    Returns:
        `torch.FloatTensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`:
            Patchified pixel values. Patches are ordered row by row; inside a patch the
            values are ordered (row, column, channel).

    Raises:
        ValueError: if the image is not square, its side is not a multiple of
            `patch_size`, or its channel count differs from `num_channels`.
    """
    height = pixel_values.shape[2]
    width = pixel_values.shape[3]

    # sanity checks
    if not (height == width and height % patch_size == 0):
        raise ValueError("Make sure the pixel values have a squared size that is divisible by the patch size")
    if pixel_values.shape[1] != num_channels:
        raise ValueError(
            "Make sure the number of channels of the pixel values is equal to the one set in the configuration"
        )

    # patchify
    batch_size = pixel_values.shape[0]
    grid = height // patch_size
    print(grid)

    # (n, c, H, W) -> (n, c, grid_h, patch_h, grid_w, patch_w)
    blocks = pixel_values.reshape(batch_size, num_channels, grid, patch_size, grid, patch_size)
    print(blocks.shape)

    # -> (n, grid_h, grid_w, patch_h, patch_w, c): patch indices first, channels last
    blocks = blocks.permute(0, 2, 4, 3, 5, 1)
    print(blocks.shape)

    # merge the two grid axes into one patch axis and flatten each patch
    flat_patches = blocks.reshape(batch_size, grid * grid, patch_size**2 * num_channels)
    print(flat_patches.shape)
    return flat_patches

# Patchify the random image created in the CLIP embedding cell (`pixel_values`)
# and show the resulting (batch, num_patches, values_per_patch) shape.
patchified_pixel_values = patchify(pixel_values)
print(patchified_pixel_values.shape)

7
torch.Size([1, 3, 7, 32, 7, 32])
torch.Size([1, 7, 7, 32, 32, 3])
torch.Size([1, 49, 3072])
torch.Size([1, 49, 3072])


In [18]:
# Display the vision config of the CLIP checkpoint "openai/clip-vit-large-patch14-336".
# Its printed values (image_size 336, patch_size 14) give a 24 x 24 = 576 patch
# grid and 14 * 14 * 3 = 588 values per patch, which match the `mask` and
# `reconstruction_logits` shapes printed by the forward-pass cells.
from transformers.models.clip.modeling_clip import CLIPVisionConfig
CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14-336")

CLIPVisionConfig {
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 1024,
  "image_size": 336,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 14,
  "projection_dim": 768,
  "transformers_version": "4.31.0"
}

In [9]:
from transformers.models.vit_mae.configuration_vit_mae import *
from transformers import AutoConfig, AutoModelForCausalLM, \
                         LlamaConfig, LlamaModel, LlamaForCausalLM
                         
class LlavaGeoConfig(LlamaConfig):
    """Sketch of a LLaMA config subclass tagged with the ``llava_geo`` model type.

    NOTE(review): defining this class in the notebook shadows the
    ``LlavaGeoConfig`` imported from ``llava.model.language_model.llava_llama``
    in an earlier cell; cells run afterwards see this version instead.
    """
    model_type: str = "llava_geo"
    # Annotation only: no value is assigned here, so this line by itself does not
    # create a class attribute or a default.
    # NOTE(review): the model-loading cell passes `mae_decoder_config` as a plain
    # dict rather than a ViTMAEConfig -- confirm which form is intended.
    mae_decoder_config: ViTMAEConfig

In [13]:
# Display the unmodified ViT-MAE base config for reference; this is the checkpoint
# id used as "base_config" in the model-loading cell.
ViTMAEConfig.from_pretrained("facebook/vit-mae-base")

ViTMAEConfig {
  "architectures": [
    "ViTMAEForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.0,
  "decoder_hidden_size": 512,
  "decoder_intermediate_size": 2048,
  "decoder_num_attention_heads": 16,
  "decoder_num_hidden_layers": 8,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_ratio": 0.75,
  "model_type": "vit_mae",
  "norm_pix_loss": false,
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.31.0"
}