In [None]:
"""
!pip install -q condacolab
import condacolab
condacolab.install()

!conda create -n llava python=3.10 -y
!conda run -n llava pip install torch==2.0.1 torchvision==0.15.2
!conda run -n llava pip install transformers==4.31.0
!conda run -n llava pip install tokenizers==0.13.3
!conda run -n llava pip install numpy==1.26.0
!conda run -n llava pip install accelerate==0.21.0

# Successfully created a conda environment named llava, but the default environment (the default Colab environment) is still Python 3.11 without the torch module.
!python -c "import torch, transformers; print(torch.__version__, transformers.__version__)"

# We have to prefix commands with `!conda run -n llava` to execute scripts in the llava environment, which is a bit inconvenient.
!conda run -n llava python -c "import torch, transformers; print(torch.__version__, transformers.__version__)"
"""

In [None]:
!python -c "import torch, transformers; print(torch.__version__, transformers.__version__)"

In [None]:
import transformers
import torch
from dataclasses import dataclass, field
from typing import Optional

In [None]:
@dataclass
class ModelArguments:
    """Model-side options: base checkpoint name plus vision-tower / projector settings."""
    model_name_or_path: Optional[str] = "facebook/opt-125m"
    version: Optional[str] = "v0"
    freeze_backbone: bool = False
    tune_mm_mlp_adapter: bool = False
    vision_tower: Optional[str] = None  # no vision tower unless one is named
    mm_vision_select_layer: Optional[int] = -1  # -1 means the last layer
    pretrain_mm_mlp_adapter: Optional[str] = None
    mm_projector_type: Optional[str] = 'linear'
    mm_use_im_start_end: bool = False
    mm_use_im_patch_token: bool = True
    mm_patch_merge_type: Optional[str] = 'flat'
    mm_vision_select_feature: Optional[str] = "patch"

@dataclass
class DataArguments:
    """Dataset options: annotation file, image folder and preprocessing flags."""
    data_path: str = field(
        metadata={"help": "Path to the training data."},
        default=None,
    )
    lazy_preprocess: bool = field(default=False)
    is_multimodal: bool = field(default=False)
    image_folder: Optional[str] = None
    image_aspect_ratio: str = field(default='square')

@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """``transformers.TrainingArguments`` extended with the extra fields this notebook uses.

    ``optim`` and ``remove_unused_columns`` are redeclared here to change their defaults.
    """
    cache_dir: Optional[str] = None
    optim: str = "adamw_torch"
    remove_unused_columns: bool = False
    freeze_mm_mlp_adapter: bool = False
    mpt_attn_impl: Optional[str] = "triton"
    model_max_length: int = field(
        default=512,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    # Quantization-related options.
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."},
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."},
    )
    bits: int = field(default=16, metadata={"help": "How many bits to use."})
    # LoRA-related options.
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    mm_projector_lr: Optional[float] = None
    group_by_modality_length: bool = False


In [None]:
from transformers import HfArgumentParser

# All run options in one flat dict; HfArgumentParser.parse_dict later splits them
# across ModelArguments, DataArguments and TrainingArguments by field name.
args_dict = {
    #"deepspeed": "./scripts/zero2.json",
    "model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "version": "plain",
    "data_path": "/LLaVA/blip_laion_cc_sbu_1.json",
    "image_folder": "/LLaVA/images/",
    "vision_tower": "openai/clip-vit-large-patch14-336",
    "mm_projector_type": "mlp2x_gelu",
    "tune_mm_mlp_adapter": True,
    "mm_vision_select_layer": -2,
    "mm_use_im_start_end": False,
    "mm_use_im_patch_token": False,
    "bf16": True,
    "output_dir": "./checkpoints/llava-TinyLlama-1.1B-Chat-v1.0",

    # Options corresponding to TrainingArguments
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 1,
    "evaluation_strategy": "no",
    "save_strategy": "steps",
    "save_steps": 1,
    "save_total_limit": 1,
    "learning_rate": 1e-3,
    "weight_decay": 0.0,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine",
    "logging_steps": 1,
    "tf32": False, # switched from True for TinyLlama
    "model_max_length": 2048,
    "gradient_checkpointing": True,
    "dataloader_num_workers": 2,
    "lazy_preprocess": True,
    "report_to": "none",
}

In [None]:
# Parse args_dict into the three argument dataclasses (returned in the order the
# classes were given to HfArgumentParser) and print each for inspection.
parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_dict(args_dict)
print("model_args\n", model_args)
print("data_args\n", data_args)
print("training_args\n", training_args)

In [None]:
# Model Constants
# Sentinel indices and special-token strings. They are not referenced in the cells
# visible here — presumably consumed by later preprocessing cells; confirm there.
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

In [None]:
# Derive run-level settings from the parsed training arguments.
local_rank = training_args.local_rank
print("local_rank\n", local_rank)
# fp16 takes precedence over bf16; float32 is the fallback when neither flag is set.
compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
print("compute_dtype\n", compute_dtype)
# Extra keyword arguments for from_pretrained; left empty here (no bitsandbytes options).
bnb_model_from_pretrained_args = {} # bitsandbytes
print("bnb_model_from_pretrained_args\n", bnb_model_from_pretrained_args)

In [None]:
"""
from transformers import CLIPModel

normal_clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14-336")
print("normal_clip_model\n", normal_clip_model)
"""

In [None]:
"""
from transformers import CLIPImageProcessor

image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
print("image_processor\n", image_processor)
"""

In [None]:
"""
from PIL import Image
import requests
from io import BytesIO
from transformers import CLIPImageProcessor
import torch
import torchvision.transforms as T
import matplotlib.pyplot as plt

# 画像 URL
url = "https://llava-vl.github.io/static/images/view.jpg"

# 画像を取得
response = requests.get(url)
img = Image.open(BytesIO(response.content)).convert("RGB")

# 前処理
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
processed = processor(img, return_tensors="pt")

# tensor: shape (1, 3, H, W), 値は正規化済み
pix = processed["pixel_values"][0]

# 正規化を戻す
mean = torch.tensor(processor.image_mean).unsqueeze(1).unsqueeze(2)
std = torch.tensor(processor.image_std).unsqueeze(1).unsqueeze(2)
pix = pix * std + mean

# 0-1 範囲にクリップ
pix = pix.clamp(0.0, 1.0)

# 画像生成
to_pil = T.ToPILImage()
img_processed = to_pil(pix)

# ==== Colab 上で可視化 ====
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

axes[0].imshow(img)
axes[0].set_title("Original")
axes[0].axis("off")

axes[1].imshow(img_processed)
axes[1].set_title("Processed (normalized etc.)")
axes[1].axis("off")

plt.show()
"""

In [None]:
"""
from transformers import CLIPVisionModel

clip_vision_tower = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14-336")
print("clip_vision_tower\n", clip_vision_tower)
"""

In [None]:
"""
config_clip_vision_tower = clip_vision_tower.config
print("config_clip_vision_tower\n", config_clip_vision_tower)
"""

In [None]:
from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
import torch.nn as nn
# __init__
# load_model

# result = CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
class CLIPVisionTower(nn.Module):
    """Holds a Hugging Face CLIP vision model and its image processor as a frozen vision tower.

    Every step prints its inputs and results so the construction can be followed
    cell by cell in the notebook.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        """Record the tower settings and, unless loading is delayed, load the weights.

        Args:
            vision_tower: Model name or path handed to the ``from_pretrained`` calls.
            args: Object providing ``mm_vision_select_layer`` and, optionally,
                ``mm_vision_select_feature`` (defaults to ``'patch'``) and
                ``unfreeze_mm_vision_tower``.
            delay_load: When True the weights are loaded only if
                ``args.unfreeze_mm_vision_tower`` is truthy; otherwise only the
                config is fetched and stored in ``self.cfg_only``.
        """
        # result = CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
        print("current file path", "llava/llava/model/multimodal_encoder/clip_encoder.py")
        print("def CLIPVisionTower.__init__(self, vision_tower, args, delay_load=False)")
        print("self\n", type(self))
        print("vision_tower\n", vision_tower) # openai/clip-vit-large-patch14-336
        print("args\n", args) # ModelArguments(model_name_or_path='lmsys/vicuna-7b-v1.5', version='plain', freeze_backbone=False, tune_mm_mlp_adapter=True, vision_tower='openai/clip-vit-large-patch14-336', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, mm_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch')
        print("delay_load\n", delay_load) # False
        super().__init__()

        # Flipped to True by load_model() once the weights are in place.
        self.is_loaded = False

        print("self.is_loaded\n", self.is_loaded) # False

        self.vision_tower_name = vision_tower
        print("self.vision_tower_name\n", self.vision_tower_name) # openai/clip-vit-large-patch14-336
        self.select_layer = args.mm_vision_select_layer
        print("self.select_layer\n", self.select_layer) # -2
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        print("self.select_feature\n", self.select_feature) # patch

        print(f"[COND] delay_load={delay_load}")
        if not delay_load:
            # [ENTER] taken when the tower is built with the default delay_load=False
            print("【ENTER】if not delay_load:")
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            print("【ENTER】elif getattr(args, 'unfreeze_mm_vision_tower', False):")
            self.load_model()
            print("【EXIT】elif getattr(args, 'unfreeze_mm_vision_tower', False):")
        else:
            # Delayed load: keep only the config so the weights need not be loaded yet.
            print("【ENTER】else of if not delay_load/elif getattr(args, 'unfreeze_mm_vision_tower', False):")
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
            print("self.cfg_only\n", self.cfg_only)
            print("【EXIT】else of if not delay_load/elif getattr(args, 'unfreeze_mm_vision_tower', False):")


    def load_model(self):
        """Load the image processor and vision model, freeze the model, and mark the tower loaded.

        Both are fetched with ``from_pretrained(self.vision_tower_name)``. The
        triple-quoted blocks below are sample outputs of the preceding prints.
        """

        print("current file path", "llava/llava/model/multimodal_encoder/clip_encoder.py")
        print("def CLIPVisionTower.load_model(self)")
        print("self\n", type(self))
        print("self.vision_tower_name\n", self.vision_tower_name) # openai/clip-vit-large-patch14-336
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        print("self.image_processor\n", self.image_processor)
        """
        CLIPImageProcessor {
        "crop_size": {
            "height": 336,
            "width": 336
        },
        "do_center_crop": true,
        "do_convert_rgb": true,
        "do_normalize": true,
        "do_rescale": true,
        "do_resize": true,
        "feature_extractor_type": "CLIPFeatureExtractor",
        "image_mean": [
            0.48145466,
            0.4578275,
            0.40821073
        ],
        "image_processor_type": "CLIPImageProcessor",
        "image_std": [
            0.26862954,
            0.26130258,
            0.27577711
        ],
        "resample": 3,
        "rescale_factor": 0.00392156862745098,
        "size": {
            "shortest_edge": 336
        }
        }
        """
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
        print("self.vision_tower\n", self.vision_tower)
        """
        CLIPVisionModel(
        (vision_model): CLIPVisionTransformer(
            (embeddings): CLIPVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
            (position_embedding): Embedding(577, 1024)
            )
            (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (encoder): CLIPEncoder(
            (layers): ModuleList(
                (0-23): 24 x CLIPEncoderLayer(
                (self_attn): CLIPAttention(
                    (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                    (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                    (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                    (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
                )
                (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
                (mlp): CLIPMLP(
                    (activation_fn): QuickGELUActivation()
                    (fc1): Linear(in_features=1024, out_features=4096, bias=True)
                    (fc2): Linear(in_features=4096, out_features=1024, bias=True)
                )
                (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
                )
            )
            )
            (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        )
        """
        # Freeze the vision model's parameters so they receive no gradients.
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True
        print("self.is_loaded\n", self.is_loaded) # True

In [None]:
import os

def build_vision_tower(vision_tower_cfg, **kwargs):
    """Build the vision tower named in ``vision_tower_cfg``.

    The tower name is read from ``vision_tower_cfg.mm_vision_tower`` and falls
    back to ``vision_tower_cfg.vision_tower``. A ``CLIPVisionTower`` is built
    when the name is an existing local path, starts with ``openai`` or
    ``laion``, or contains ``ShareGPT4V``.

    Args:
        vision_tower_cfg: Object carrying the attributes above (e.g. ModelArguments
            or a model config); it is also passed to ``CLIPVisionTower`` as ``args``.
        **kwargs: Forwarded to ``CLIPVisionTower`` (e.g. ``delay_load=True``).

    Returns:
        The constructed ``CLIPVisionTower``.

    Raises:
        ValueError: If no tower is configured (name is ``None``) or the name is
            not recognised.
    """
    # Typical call: vision_tower = build_vision_tower(model_args)
    print("current file path", "llava/llava/model/multimodal_encoder/builder.py")
    print("def build_vision_tower(vision_tower_cfg, **kwargs)")
    print("vision_tower_cfg\n", vision_tower_cfg)
    print("kwargs\n", kwargs) # {}
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    print("vision_tower from vision_tower_cfg\n", vision_tower) # openai/clip-vit-large-patch14-336
    if vision_tower is None:
        # os.path.exists(None) would raise an opaque TypeError; report the missing
        # configuration with the same error the unknown-name path uses.
        raise ValueError(f'Unknown vision tower: {vision_tower}')
    # False when the name is not a path on the local disk. Example of a local
    # path that would exist: /ubuntu/home/user/model/openai/clip-vit-large-patch14-336
    is_absolute_path_exists = os.path.exists(vision_tower)
    print("is_absolute_path_exists\n", is_absolute_path_exists) # False
    print(f"[COND] is_absolute_path_exists={is_absolute_path_exists} vision_tower={vision_tower}") # is_absolute_path_exists=False vision_tower=openai/clip-vit-large-patch14-336
    if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
        print("【ENTER】if is_absolute_path_exists or vision_tower.startswith('openai') or vision_tower.startswith('laion') or 'ShareGPT4V' in vision_tower:")
        result = CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
        print("result (return)\n", result) # prints the modules registered on the CLIPVisionTower instance
        print("【EXIT】if is_absolute_path_exists or vision_tower.startswith('openai') or vision_tower.startswith('laion') or 'ShareGPT4V' in vision_tower:")
        return result

    print("print(risk): print(vision_tower) disabled for safety")
    raise ValueError(f'Unknown vision tower: {vision_tower}')

In [None]:
"""
build_vision_tower(model_args)
"""

In [None]:
import re

def build_vision_projector(config, delay_load=False, **kwargs):
    """Build the module that projects vision features into the language model's hidden size.

    Supported values of ``config.mm_projector_type`` (default ``'linear'``):

    * ``'linear'``: a single ``nn.Linear(mm_hidden_size, hidden_size)``.
    * ``'mlp<N>x_gelu'``: ``N`` linear layers with ``nn.GELU`` between them.
    * ``'identity'``: ``nn.Identity()``.

    Args:
        config: Object providing ``mm_projector_type`` (optional) and, for the
            linear / MLP variants, ``mm_hidden_size`` and ``hidden_size``.
        delay_load: Accepted for interface compatibility; only printed here.
        **kwargs: Accepted for interface compatibility; only printed here.

    Returns:
        The projector as an ``nn.Module``.

    Raises:
        ValueError: If the projector type is not one of the supported values.
    """

    print("current file path", "llava/llava/model/multimodal_projector/builder.py")
    print("def build_vision_projector(config, delay_load=False, **kwargs)")
    print("config\n", config)
    """
    config
    LlavaConfig {
    "_name_or_path": "lmsys/vicuna-7b-v1.5",
    "architectures": [
        "LlamaForCausalLM"
    ],
    "bos_token_id": 1,
    "eos_token_id": 2,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 11008,
    "max_position_embeddings": 4096,
    "mm_hidden_size": 1024,
    "mm_patch_merge_type": "flat",
    "mm_projector_type": "mlp2x_gelu",
    "mm_vision_select_feature": "patch",
    "mm_vision_select_layer": -2,
    "mm_vision_tower": "openai/clip-vit-large-patch14-336",
    "model_type": "llava_llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 32,
    "pad_token_id": 0,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "tie_word_embeddings": false,
    "torch_dtype": "float16",
    "transformers_version": "4.31.0",
    "use_cache": false,
    "use_mm_proj": true,
    "vocab_size": 32000
    }
    """
    print("delay_load\n", delay_load) # False
    print("kwargs\n", kwargs) # {}
    projector_type = getattr(config, 'mm_projector_type', 'linear')
    print("projector_type from config\n", projector_type) # mlp2x_gelu

    print("【COND】 projector_type\n", projector_type) # mlp2x_gelu
    if projector_type == 'linear':
        # 'linear' is the default type, so it must produce a projector rather
        # than fall through to the "unknown type" error.
        print("【ENTER】if projector_type == 'linear':")
        result = nn.Linear(config.mm_hidden_size, config.hidden_size)
        print("result (return)\n", result)
        print("【EXIT】if projector_type == 'linear':")
        return result

    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    print("【COND】mlp_gelu_match\n", mlp_gelu_match)
    if mlp_gelu_match:
        print("【ENTER】if mlp_gelu_match:")
        mlp_depth = int(mlp_gelu_match.group(1))
        print("mlp_depth from mlp_gelu_match.group(1)\n", mlp_depth)
        # The first layer maps vision width -> LM width; the rest stay at LM width.
        modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
        print("modules after first Linear\n", modules)
        for _ in range(1, mlp_depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(config.hidden_size, config.hidden_size))
        print("modules before Sequential\n", modules)
        result = nn.Sequential(*modules) # * unpacks the list into positional arguments
        print("result (return)\n", result)
        """
        Sequential(
        (0): Linear(in_features=1024, out_features=4096, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=4096, out_features=4096, bias=True)
        )
        """
        print("【EXIT】if mlp_gelu_match:")
        return result

    print("【COND】projector_type\n", projector_type)
    if projector_type == 'identity':
        # Pass the vision features through unchanged.
        print("【ENTER】if projector_type == 'identity':")
        result = nn.Identity()
        print("result (return)\n", result)
        print("【EXIT】if projector_type == 'identity':")
        return result

    print("print(risk): print(projector_type) disabled for safety")
    raise ValueError(f'Unknown projector type: {projector_type}')

In [None]:
# LlavaMetaModel
# __init__
# get_vision_tower
# initialize_vision_modules
# unpad_image

class LlavaMetaModel:
    """Mixin that adds the vision tower and projector to a language-model backbone.

    It is meant to be combined with a model class via multiple inheritance
    (``LlavaLlamaModel(LlavaMetaModel, LlamaModel)`` in this notebook), so that
    ``super().__init__`` reaches the backbone's initializer.
    """

    def __init__(self, config):
        """Initialize the backbone, then build vision modules if the config names a tower.

        Args:
            config: Model config. When it has an ``mm_vision_tower`` attribute, a
                delayed-load vision tower and a projector are created from it.
        """

        print("current file path", "llava/model/llava_arch.py")
        print("LlavaMetaModel.__init__(self, config)")
        print("config\n", config)
        # Calls the next __init__ in the MRO (LlamaModel.__init__ for LlavaLlamaModel).
        super(LlavaMetaModel, self).__init__(config)

        print(f"[COND] mm_vision_tower={hasattr(config, 'mm_vision_tower')}")
        if hasattr(config, "mm_vision_tower"):
            print("【ENTER】if hasattr(config, 'mm_vision_tower'):")
            self.vision_tower = build_vision_tower(config, delay_load=True)
            print("self.vision_tower\n", self.vision_tower)
            self.mm_projector = build_vision_projector(config)
            print("self.mm_projector\n", self.mm_projector)

            # Read with a default: a config may name a tower without carrying
            # mm_patch_merge_type, and a debug print must not crash on it.
            mm_patch_merge_type = getattr(config, 'mm_patch_merge_type', '')
            print("self.config.mm_patch_merge_type\n", mm_patch_merge_type)
            print(f"[COND] unpad_in_mm_patch_merge_type={'unpad' in mm_patch_merge_type}")
            if 'unpad' in mm_patch_merge_type:
                # The 'unpad' merge type is not handled in this notebook (no-op).
                pass

In [None]:
from transformers import LlamaConfig, LlamaModel

class LlavaConfig(LlamaConfig):
    """LlamaConfig subclass that only changes ``model_type`` to ``"llava_llama"``."""
    model_type = "llava_llama"


class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
    """LlamaModel backbone combined with the LlavaMetaModel vision mixin."""
    config_class = LlavaConfig

    def __init__(self, config: LlamaConfig):
        """Trace the call, then run the mixin / backbone initializers through the MRO."""
        trace_lines = (
            ("current file path", "llava/llava/model/language_model/llava_llama.py"),
            ("def LlavaLlamaModel.__init__(self, config: LlamaConfig)",),
            ("self\n", type(self)),
            ("config\n", config),
        )
        for parts in trace_lines:
            print(*parts)
        super().__init__(config)

In [None]:
# LlavaMetaForCausalLM
# get_vision_tower
# encode_images
# prepare_inputs_labels_for_multimodal
# initialize_vision_tokenizer

class LlavaMetaForCausalLM:
    """Mixin for the causal-LM wrapper; relies on the host class providing ``get_model()``."""

    def get_vision_tower(self):
        """Return the vision tower held by the inner model (``self.get_model()``)."""
        print("current file path", "llava/model/llava_arch.py")
        print("class LlavaMetaForCausalLM(ABC).get_vision_tower(self)")
        inner_model = self.get_model()
        tower = inner_model.get_vision_tower()
        print("LlavaMetaForCausalLM(ABC).get_vision_tower(self) result (return)\n", tower)
        # Sample output of the print above:
        """
        CLIPVisionTower(
        (vision_tower): CLIPVisionModel(
            (vision_model): CLIPVisionTransformer(
            (embeddings): CLIPVisionEmbeddings(
                (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
                (position_embedding): Embedding(577, 1024)
            )
            (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (encoder): CLIPEncoder(
                (layers): ModuleList(
                (0-23): 24 x CLIPEncoderLayer(
                    (self_attn): CLIPAttention(
                    (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                    (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                    (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                    (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
                    )
                    (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
                    (mlp): CLIPMLP(
                    (activation_fn): QuickGELUActivation()
                    (fc1): Linear(in_features=1024, out_features=4096, bias=True)
                    (fc2): Linear(in_features=4096, out_features=1024, bias=True)
                    )
                    (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
                )
                )
            )
            (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
        )
        )
        """
        return tower

In [None]:
from typing import List, Optional, Tuple, Union
from transformers.generation.utils import GenerateOutput
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers import LlamaForCausalLM

class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
    """Causal-LM wrapper whose inner model is a LlavaLlamaModel plus a linear LM head."""
    config_class = LlavaConfig

    def __init__(self, config):
        """Build the inner LlavaLlamaModel and the LM head, printing each piece.

        Args:
            config: Model config; ``pretraining_tp``, ``vocab_size`` and
                ``hidden_size`` are read from it.
        """

        print("current file path", "llava/llava/model/language_model/llava_llama.py")
        print("def LlavaLlamaForCausalLM.__init__(self, config)")
        print("self\n", type(self))
        # Sample config: https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json
        print("config\n", config)
        # super(LlamaForCausalLM, self) starts the MRO lookup *after* LlamaForCausalLM,
        # so LlamaForCausalLM.__init__ itself is skipped and self.model is built below instead.
        super(LlamaForCausalLM, self).__init__(config)
        self.model = LlavaLlamaModel(config)
        # Initializing LlavaLlamaModel also runs LlavaMetaModel.__init__.
        self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        print("self.model\n", self.model)
        """
        self.model
        LlavaLlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
            (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
                (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
                (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
                (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
                (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
                (rotary_emb): LlamaRotaryEmbedding()
            )
            (mlp): LlamaMLP(
                (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
                (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
                (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
                (act_fn): SiLUActivation()
            )
            (input_layernorm): LlamaRMSNorm()
            (post_attention_layernorm): LlamaRMSNorm()
            )
        )
        (norm): LlamaRMSNorm()
        )
        """
        print("self.pretraining_tp\n", self.pretraining_tp) # 1
        print("self.vocab_size\n", self.vocab_size) # 32_000
        print("self.lm_head\n", self.lm_head) # Linear(in_features=4096, out_features=32000, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

In [None]:
"""
from transformers import AutoConfig

# Load the TinyLlama-1.1B-Chat-v1.0 config
llama_config = AutoConfig.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

print(llama_config)
"""

In [None]:
"""
from transformers import AutoConfig

# まず config.json をロードして Config クラスを自動判別
config = AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir
)

print("model_args.model_name_or_path\n", model_args.model_name_or_path)
print("training_args.cache_dir\n", training_args.cache_dir)
print("")
print("Loaded config:\n", config)
"""

In [None]:
import inspect
# Show which __init__ function LlavaLlamaModel resolves to, looked up statically
# (without invoking the descriptor protocol).
print(inspect.getattr_static(LlavaLlamaModel, "__init__"))

In [None]:
def print_mro(cls):
    """Print the method resolution order of ``cls``, one numbered ``module.ClassName`` per line."""
    print(f"MRO for {cls.__name__}:\n")
    position = 0
    for klass in cls.mro():
        qualified = f"{klass.__module__}.{klass.__name__}"
        print(f"{position:2d}: {qualified}")
        position += 1

# Shows the order in which LlavaMetaModel and LlamaModel appear for LlavaLlamaModel.
print_mro(LlavaLlamaModel)

In [None]:
# LlavaMetaModel declares no base classes, so its own MRO is just itself and object.
print_mro(LlavaMetaModel)

In [None]:
# MRO of the causal-LM wrapper (LlamaForCausalLM first, then the LLaVA mixin).
print_mro(LlavaLlamaForCausalLM)

In [None]:
# Instantiate the LLaVA wrapper from the base checkpoint named in model_args.
# bnb_model_from_pretrained_args is an empty dict here, so no extra options are passed.
model = LlavaLlamaForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    **bnb_model_from_pretrained_args
)

In [None]:
# Show the module tree of the loaded model.
print("model\n", model)

In [None]:
# NOTE(review): presumably needed because gradient_checkpointing is enabled in args_dict — confirm.
model.enable_input_require_grads()

In [None]:
import dataclasses
from typing import List
from enum import auto, Enum

class SeparatorStyle(Enum):
    """Enumerates the separator styles a conversation template can use."""
    # Values are spelled out explicitly; they match what auto() assigns (1, 2, 3, ...).
    SINGLE = 1
    TWO = 2
    MPT = 3
    PLAIN = 4
    LLAMA_2 = 5

@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    Only the data fields are defined in this notebook; no methods are attached here.
    """
    # System prompt text (empty for the "plain" template below).
    system: str
    # Role names — presumably (user, assistant); verify against callers.
    roles: List[str]
    # Message history — presumably [role, text] pairs; verify against callers.
    messages: List[List[str]]
    # NOTE(review): meaning of offset is not shown in this notebook — confirm before relying on it.
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    # Second separator; None when the style does not use one.
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False


# "plain" template: no system prompt, empty role names, newline separator.
# Note that roles/messages are passed as tuples although the fields are annotated as lists.
conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)


# Registry of templates keyed by the value of model_args.version.
conv_templates = {
    "plain": conv_llava_plain,
}

In [None]:
import inspect
# Show the objects LlamaForCausalLM resolves for these two methods, looked up
# statically (without invoking the descriptor protocol).
print(inspect.getattr_static(LlamaForCausalLM, "from_pretrained"))
print(inspect.getattr_static(LlamaForCausalLM, "enable_input_require_grads"))

In [None]:
# Load the tokenizer that matches the base checkpoint: slow (non-fast) tokenizer,
# right-side padding, maximum length taken from training_args.model_max_length.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",
    use_fast=False,
)

In [None]:
# Inspect the pad / unk tokens before the pad token is reassigned in the next cell.
print("pad_token:", tokenizer.pad_token)
print("pad_token_id:", tokenizer.pad_token_id)
print("unk_token:", tokenizer.unk_token)
print("unk_token_id:", tokenizer.unk_token_id)
print("tokenizer\n", tokenizer)

In [None]:
# Use the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

In [None]:
# Re-inspect the same values after the reassignment; pad_token should now equal unk_token.
print("pad_token:", tokenizer.pad_token)
print("pad_token_id:", tokenizer.pad_token_id)
print("unk_token:", tokenizer.unk_token)
print("unk_token_id:", tokenizer.unk_token_id)
print("tokenizer\n", tokenizer)

In [None]:
# Pick the conversation template named by model_args.version ("plain" in args_dict).
# Raises KeyError for any version that is not registered in conv_templates.
default_conversation = conv_templates[model_args.version]
print("default_conversation\n", default_conversation)

In [None]:
# Name of the vision tower requested in the model arguments.
print("model_args.vision_tower\n", model_args.vision_tower)

In [None]:
def get_model(self):
    """Return the inner model stored on ``self.model``, tracing the call.

    Written as a free function; the notebook attaches it to
    ``LlavaLlamaForCausalLM`` in a later cell.
    """
    print("current file path", "llava/llava/model/language_model/llava_llama.py")
    print("def LlavaLlamaForCausalLM.get_model(self)")
    print("self\n", type(self))
    inner_model = self.model
    print("self.model (return)\n", inner_model)
    return inner_model

In [None]:
# Attach get_model to the class as a method; already-created instances such as `model` gain it too.
LlavaLlamaForCausalLM.get_model = get_model

In [None]:
# Fetch the inner LlavaLlamaModel from the causal-LM wrapper.
initial_model = model.get_model()

In [None]:
def config(self):
    """Return the CLIP vision config of this tower.

    Written as a free function; the notebook attaches it to ``CLIPVisionTower``
    as a property in a later cell.

    Returns:
        ``self.vision_tower.config`` once the weights are loaded, otherwise the
        config-only object ``self.cfg_only`` that ``CLIPVisionTower.__init__``
        stores when loading is delayed.
    """

    print("current file path", "llava/llava/model/multimodal_encoder/clip_encoder.py")
    print("def CLIPVisionTower.config(self)")
    print("self\n", type(self))
    print("self.is_loaded\n", self.is_loaded) # True
    print(f"[COND] is_loaded={self.is_loaded}")
    if self.is_loaded:
        print("【ENTER】if self.is_loaded:")
        result = self.vision_tower.config
        print("result (return)\n", type(result))
        print("【EXIT】if self.is_loaded:")
    else:
        # Delayed load: the weights are absent, so fall back to the stored config.
        # Without this branch `result` would be unbound below.
        print("【ENTER】else of if self.is_loaded:")
        result = self.cfg_only
        print("【EXIT】else of if self.is_loaded:")
    print("result (return)\n", result)
    """
    CLIPVisionConfig {
    "_name_or_path": "openai/clip-vit-large-patch14-336",
    "attention_dropout": 0.0,
    "dropout": 0.0,
    "hidden_act": "quick_gelu",
    "hidden_size": 1024,
    "image_size": 336,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "layer_norm_eps": 1e-05,
    "model_type": "clip_vision_model",
    "num_attention_heads": 16,
    "num_channels": 3,
    "num_hidden_layers": 24,
    "patch_size": 14,
    "projection_dim": 768,
    "transformers_version": "4.31.0"
    }
    """
    return result

In [None]:
def hidden_size(self):
    """Return ``self.config.hidden_size``, tracing the call.

    Written as a free function; the notebook attaches it to ``CLIPVisionTower``
    as a property in a later cell.
    """
    print("current file path", "llava/llava/model/multimodal_encoder/clip_encoder.py")
    print("def CLIPVisionTower.hidden_size(self)")
    print("self\n", type(self))
    tower_config = self.config
    width = tower_config.hidden_size
    print("result (return), self.config.hidden_size\n", width) # 1024
    return width

In [None]:
# Expose config() as a read-only property on CLIPVisionTower.
CLIPVisionTower.config = property(config)

In [None]:
# Expose hidden_size() as a read-only property on CLIPVisionTower.
CLIPVisionTower.hidden_size = property(hidden_size)

In [None]:
def initialize_vision_modules(self, model_args, fsdp=None):
    """Attach a vision tower and a multimodal projector to this model.

    Traced version of ``initialize_vision_modules`` from
    ``llava/model/llava_arch.py``; every step is printed so the control flow
    can be followed in the notebook.

    Args:
        model_args: Object providing ``vision_tower``, ``mm_vision_select_layer``,
            ``mm_vision_select_feature``, ``pretrain_mm_mlp_adapter`` and
            ``mm_patch_merge_type``. ``mm_projector_type`` is optional and
            defaults to ``'linear'``.
        fsdp: FSDP option list, or ``None``. With a non-empty list a freshly
            built tower is stored wrapped in a one-element list.

    Returns:
        None. Results are written to ``self.vision_tower``, ``self.mm_projector``
        and several fields of ``self.config``.

    Note:
        The branches for ``'unpad'`` merge types, for an already existing
        projector and for ``pretrain_mm_mlp_adapter`` are placeholders (``pass``)
        in this walkthrough, so a non-None ``pretrain_mm_mlp_adapter`` is ignored.
    """
    print("current file path", "llava/model/llava_arch.py")
    print("def initialize_vision_modules(self, model_args, fsdp=None)")
    # Traced run:
    #   ModelArguments(model_name_or_path='lmsys/vicuna-7b-v1.5', version='plain',
    #   freeze_backbone=False, tune_mm_mlp_adapter=True,
    #   vision_tower='openai/clip-vit-large-patch14-336', mm_vision_select_layer=-2,
    #   pretrain_mm_mlp_adapter=None, mm_projector_type='mlp2x_gelu',
    #   mm_use_im_start_end=False, mm_use_im_patch_token=False,
    #   mm_patch_merge_type='flat', mm_vision_select_feature='patch')
    print("model_args\n", model_args)
    print("fsdp\n", fsdp) # []
    vision_tower = model_args.vision_tower
    print("vision_tower from model_args\n", vision_tower) # openai/clip-vit-large-patch14-336
    mm_vision_select_layer = model_args.mm_vision_select_layer
    print("mm_vision_select_layer from model_args\n", mm_vision_select_layer) # -2
    mm_vision_select_feature = model_args.mm_vision_select_feature
    print("mm_vision_select_feature from model_args\n", mm_vision_select_feature) # patch
    pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
    print("pretrain_mm_mlp_adapter from model_args\n", pretrain_mm_mlp_adapter) # None
    mm_patch_merge_type = model_args.mm_patch_merge_type
    # This only records the tower *name* on the config; on the first call
    # self.vision_tower itself is still None.
    self.config.mm_vision_tower = vision_tower
    print("self.config.mm_vision_tower\n", self.config.mm_vision_tower) # openai/clip-vit-large-patch14-336

    # Query once and reuse: the traced get_vision_tower prints the whole tower
    # on every call.
    existing_tower = self.get_vision_tower()
    print("[COND] self.get_vision_tower()\n", existing_tower) # None
    print(f"[COND] get_vision_tower_is_None={existing_tower is None}")
    if existing_tower is None:
        # [ENTER] no tower is attached yet, so this branch is taken.
        print("【ENTER】if self.get_vision_tower() is None:")
        print("[ENTER] self.get_vision_tower() is None")
        # build_vision_tower(model_args) has a fairly deep dependency chain.
        vision_tower = build_vision_tower(model_args)
        print("vision_tower after build_vision_tower\n", vision_tower)
        # Traced run printed a CLIPVisionTower wrapping a CLIPVisionModel:
        #   embeddings: Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        #               + position_embedding Embedding(577, 1024)
        #   encoder:    24 x CLIPEncoderLayer (q/k/v/out_proj Linear 1024->1024,
        #               CLIPMLP 1024->4096->1024 with QuickGELUActivation)
        #   pre_layrnorm / post_layernorm: LayerNorm((1024,), eps=1e-05)

        # Whether FSDP is used. In the traced run fsdp is [], i.e. not None but
        # len(fsdp) == 0.
        print("[COND] fsdp\n", fsdp) # []
        print(f"[COND] fsdp_is_not_None={fsdp is not None} len_fsdp={len(fsdp) if fsdp is not None else 'N/A'}") # fsdp_is_not_None=True len_fsdp=0
        if fsdp is not None and len(fsdp) > 0:
            # Previously `pass`, which discarded the tower that was just built.
            # Store it in a one-element list; get_vision_tower() unwraps lists.
            # NOTE(review): this follows upstream LLaVA — confirm.
            print("【ENTER】if fsdp is not None and len(fsdp) > 0:")
            self.vision_tower = [vision_tower]
            print("【EXIT】if fsdp is not None and len(fsdp) > 0:")
        else:
            # [ENTER] else of if fsdp is not None and len(fsdp) > 0:
            print("[COND] else_fsdp_is_not_None_and_len_fsdp_gt_0=True")
            print("【ENTER】else of if fsdp is not None and len(fsdp) > 0:")
            self.vision_tower = vision_tower
            # Prints the same CLIPVisionTower structure as summarised above.
            print("self.vision_tower\n", self.vision_tower)
            print("【EXIT】else of if fsdp is not None and len(fsdp) > 0:")

        print("【EXIT】if self.get_vision_tower() is None:")
    else:
        # A tower is already attached (e.g. this cell was run twice). Reuse it;
        # before this fix `vision_tower` stayed the name string and
        # `vision_tower.hidden_size` below raised AttributeError.
        # NOTE(review): upstream LLaVA also calls vision_tower.load_model() here;
        # omitted because load_model is not visible in this part of the notebook.
        print("【ENTER】else of if self.get_vision_tower() is None:")
        vision_tower = existing_tower
        print("【EXIT】else of if self.get_vision_tower() is None:")

    self.config.use_mm_proj = True
    print("self.config.use_mm_proj set to True") # True
    self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
    print("self.config.mm_projector_type\n", self.config.mm_projector_type) # mlp2x_gelu
    self.config.mm_hidden_size = vision_tower.hidden_size
    print("self.config.mm_hidden_size\n", self.config.mm_hidden_size) # 1024
    self.config.mm_vision_select_layer = mm_vision_select_layer
    print("self.config.mm_vision_select_layer\n", self.config.mm_vision_select_layer) # -2
    self.config.mm_vision_select_feature = mm_vision_select_feature
    print("self.config.mm_vision_select_feature\n", self.config.mm_vision_select_feature) # patch
    self.config.mm_patch_merge_type = mm_patch_merge_type
    print("self.config.mm_patch_merge_type\n", self.config.mm_patch_merge_type) # flat

    # Traced run: mm_projector_is_None=True
    print(f"[COND] mm_projector_is_None={getattr(self, 'mm_projector', None) is None}")
    if getattr(self, 'mm_projector', None) is None:
        # [ENTER]
        print("【ENTER】if getattr(self, 'mm_projector', None) is None:")
        self.mm_projector = build_vision_projector(self.config)
        # Traced run:
        # Sequential(
        #   (0): Linear(in_features=1024, out_features=2048, bias=True)
        #   (1): GELU(approximate='none')
        #   (2): Linear(in_features=2048, out_features=2048, bias=True)
        # )
        print("self.mm_projector after build_vision_projector\n", self.mm_projector)
        print("mm_patch_merge_type\n", mm_patch_merge_type) # flat
        print(f"[COND] unpad_in_mm_patch_merge_type={'unpad' in mm_patch_merge_type}")
        if 'unpad' in mm_patch_merge_type:
            pass  # placeholder: not exercised in the traced run
        print("【EXIT】if getattr(self, 'mm_projector', None) is None:")
    else:
        pass  # placeholder: an existing projector is left untouched

    print(f"[COND] pretrain_mm_mlp_adapter_is_not_None={pretrain_mm_mlp_adapter is not None}")
    if pretrain_mm_mlp_adapter is not None:
        pass  # placeholder: no adapter weights are loaded in this walkthrough

In [None]:
# Install the tracing initialize_vision_modules function as a method of LlavaMetaModel.
LlavaMetaModel.initialize_vision_modules = initialize_vision_modules

In [None]:
def get_vision_tower(self):
    """Return the vision tower stored on this object, or ``None`` if there is none.

    Traced version of ``get_vision_tower`` from ``llava/model/llava_arch.py``.
    When the ``vision_tower`` attribute is exactly a ``list``, its first
    element is returned; any other value is returned unchanged.
    """
    print("current file path", "llava/model/llava_arch.py")
    print("def get_vision_tower(self)")
    tower = getattr(self, 'vision_tower', None)
    print("vision_tower (raw)\n", tower)
    # Traced run printed a CLIPVisionTower wrapping a CLIPVisionModel:
    #   embeddings: Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    #               + position_embedding Embedding(577, 1024)
    #   encoder:    24 x CLIPEncoderLayer (q/k/v/out_proj Linear 1024->1024,
    #               CLIPMLP 1024->4096->1024 with QuickGELUActivation)
    #   pre_layrnorm / post_layernorm: LayerNorm((1024,), eps=1e-05)
    print("type(vision_tower)\n", type(tower))
    stored_as_list = type(tower) is list
    print(f"[COND] type_vision_tower_is_list={stored_as_list}")  # False
    if stored_as_list:
        # [SKIP] not taken in the traced run
        print("【ENTER】if type(vision_tower) is list:")
        tower = tower[0]
        print("【EXIT】if type(vision_tower) is list:")
    # In the traced run this prints the same CLIPVisionTower structure as above.
    print("vision_tower (return)\n", tower)
    return tower

In [None]:
# Install the tracing get_vision_tower function as a method of LlavaMetaModel.
LlavaMetaModel.get_vision_tower = get_vision_tower

In [None]:
# Run the vision-module setup on initial_model, passing the model arguments and training_args.fsdp.
initial_model.initialize_vision_modules(
    model_args=model_args,
    fsdp=training_args.fsdp
)

In [None]:
# Fetch the vision tower from the model and cast/move it: bfloat16 when
# training_args.bf16 is set, float16 otherwise, on training_args.device.
vision_tower = model.get_vision_tower()
print("vision_tower\n", vision_tower)
vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

# Hand the tower's image processor to the data arguments and mark the data as multimodal.
data_args.image_processor = vision_tower.image_processor
print("data_args.image_processor\n", data_args.image_processor)
data_args.is_multimodal = True
print("data_args.is_multimodal\n", data_args.is_multimodal) # True

# Copy data/tokenizer settings onto the model config (trailing comments show the traced values).
model.config.image_aspect_ratio = data_args.image_aspect_ratio
print("model.config.image_aspect_ratio\n", model.config.image_aspect_ratio) # square
model.config.tokenizer_padding_side = tokenizer.padding_side
print("model.config.tokenizer_padding_side\n", model.config.tokenizer_padding_side) # right
model.config.tokenizer_model_max_length = tokenizer.model_max_length
print("model.config.tokenizer_model_max_length\n", model.config.tokenizer_model_max_length) # 2048

In [None]:
# Propagate the adapter-tuning flag to both the model config and the training arguments.
model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
print(f"[COND] tune_mm_mlp_adapter={model_args.tune_mm_mlp_adapter}") # True
if model_args.tune_mm_mlp_adapter:
    # [ENTER] tune_mm_mlp_adapter=True, so this branch is taken.
    print("【ENTER】if model_args.tune_mm_mlp_adapter:")
    # Mark every parameter of the whole model as non-trainable
    # (requires_grad=False); all regular weights are frozen by this.
    model.requires_grad_(False)
    # Look the projector up once. The traced get_model() prints the full model
    # on every call, so calling it inside the loop flooded the output.
    mm_projector = model.get_model().mm_projector
    for param_name, p in mm_projector.named_parameters():
        # Re-enable gradients only for mm_projector (the layer mapping image
        # features to text features), so that it alone is trained. Name and
        # shape are printed instead of the generator repr shown previously.
        print("mm_projector parameter set to requires_grad=True:", param_name, tuple(p.shape))
        p.requires_grad = True
    print("【EXIT】if model_args.tune_mm_mlp_adapter:")

In [None]:
# Record the freeze flag on the model config. The branch body is a no-op in
# this walkthrough (traced value: False).
model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
print(f"[COND] freeze_mm_mlp_adapter={training_args.freeze_mm_mlp_adapter}") # False
if training_args.freeze_mm_mlp_adapter:
  pass

# The 4-/8-bit branch is likewise a no-op here (traced value: bits=16).
print(f"[COND] bits={training_args.bits}") # 16
if training_args.bits in [4, 8]:
  pass

In [None]:
def initialize_vision_tokenizer(self, model_args, tokenizer):
    """Log the arguments and the image-token flags; no tokenizer changes are made.

    Traced skeleton of ``initialize_vision_tokenizer`` from
    ``llava/model/llava_arch.py``. Every flag-dependent branch is a placeholder
    (``pass``) in this walkthrough, so the function only prints and returns None.

    Args:
        model_args: Object providing ``mm_use_im_patch_token`` and
            ``mm_use_im_start_end``.
        tokenizer: Tokenizer object; only printed here.
    """
    print("current file path", "llava/model/llava_arch.py")
    print("def initialize_vision_tokenizer(self, model_args, tokenizer)")
    # Traced run: ModelArguments(model_name_or_path='lmsys/vicuna-7b-v1.5', version='plain', ...,
    #   mm_use_im_start_end=False, mm_use_im_patch_token=False, ...)
    print("model_args\n", model_args)
    # Traced run: LlamaTokenizer(name_or_path='lmsys/vicuna-7b-v1.5', vocab_size=32000,
    #   model_max_length=2048, is_fast=False, padding_side='right', ..., 'pad_token': '<unk>')
    print("tokenizer\n", tokenizer)

    patch_token_enabled = model_args.mm_use_im_patch_token
    print(f"[COND] mm_use_im_patch_token={patch_token_enabled}") # False
    if patch_token_enabled:
        pass  # placeholder: not exercised in the traced run

    start_end_enabled = model_args.mm_use_im_start_end  # False
    if start_end_enabled:
        pass  # placeholder
    elif model_args.mm_use_im_patch_token:  # False
        pass  # placeholder

In [None]:
# Install the tracing initialize_vision_tokenizer function as a method of LlavaLlamaForCausalLM.
LlavaLlamaForCausalLM.initialize_vision_tokenizer = initialize_vision_tokenizer

In [None]:
# Mirror the image start/end-token flag onto the model config and the data arguments.
model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
print("model_args.mm_use_im_start_end", model_args.mm_use_im_start_end)
# Copy the projector learning rate from the training arguments to the model config.
model.config.mm_projector_lr = training_args.mm_projector_lr
print("training_args.mm_projector_lr", training_args.mm_projector_lr)
# Make the start/end-token flag available on the training arguments as well.
training_args.use_im_start_end = model_args.mm_use_im_start_end
print("training_args.use_im_start_end", training_args.use_im_start_end)
# Record the image-patch-token flag on the model config.
model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
print("model_args.mm_use_im_patch_token", model_args.mm_use_im_patch_token)
# Run the (traced) vision-tokenizer initialisation with these settings.
model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)
print("【EXIT】if model_args.vision_tower is not None:")

In [None]:
def rank0_print(*args):
    """Print ``args`` only when the global ``local_rank`` equals 0.

    The call itself (and the raw ``args`` tuple) is always logged first.
    ``local_rank`` is a module-level name defined elsewhere in the notebook.
    """
    print("current file path", "llava/train/train.py")
    print("def rank0_print(*args)")
    print("args\n", args) # ('Formatting inputs...Skip in lazy mode',)
    if local_rank != 0:
        return
    print(*args)

In [None]:
from torch.utils.data import Dataset
import json

class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    The JSON file at ``data_path`` is read in full at construction time and kept
    in ``self.list_data_dict``; the tokenizer and data arguments are stored
    unchanged for later use.
    """

    def __init__(self, data_path: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        """Load the training records and remember the tokenizer and data arguments.

        Args:
            data_path: Path of the JSON file holding the training records.
            tokenizer: Tokenizer stored on ``self.tokenizer``.
            data_args: Data arguments stored on ``self.data_args``.
        """
        print("current file path", "llava/train/train.py")
        print("def LazySupervisedDataset.__init__(self, data_path, tokenizer, data_args)")
        print("data_path\n", data_path) # /content/LLaVA/blip_laion_cc_sbu_1.json
        print("tokenizer\n", type(tokenizer)) # <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
        print("data_args\n", data_args) # DataArguments(data_path='/content/LLaVA/blip_laion_cc_sbu_1.json', lazy_preprocess=True, is_multimodal=True, image_folder='/content/LLaVA/images', image_aspect_ratio='square')
        super(LazySupervisedDataset, self).__init__()
        # Use a context manager so the file handle is closed right after parsing
        # (the bare `json.load(open(...))` left it open).
        with open(data_path, "r") as data_file:
            list_data_dict = json.load(data_file)
        # Only one sample is used in this walkthrough, so printing it is harmless.
        print("list_data_dict", list_data_dict)

        rank0_print("Formatting inputs...Skip in lazy mode") # Formatting inputs...Skip in lazy mode
        self.tokenizer = tokenizer
        print("self.tokenizer\n", self.tokenizer)
        self.list_data_dict = list_data_dict
        print("self.list_data_dict\n", self.list_data_dict)
        self.data_args = data_args
        print("self.data_args\n", self.data_args)

In [None]:
def __len__(self):
    """Return the number of records in ``self.list_data_dict``, logging the call."""
    print("current file path", "llava/train/train.py")
    print("def LazySupervisedDataset.__len__(self)")
    records = self.list_data_dict
    return len(records)

In [None]:
# Install the tracing __len__ function on LazySupervisedDataset so len(dataset) uses it.
LazySupervisedDataset.__len__ = __len__

In [None]:
from typing import Sequence
from typing import Dict

@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    ``tokenizer`` supplies ``pad_token_id`` (padding value for ``input_ids`` and
    basis of the attention mask) and ``model_max_length`` (truncation length).
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Args:
            instances: Examples with ``"input_ids"`` and ``"labels"`` tensors and
                optionally an ``"image"`` entry. ``None`` entries are dropped.

        Returns:
            A dict with ``input_ids``, ``labels`` and ``attention_mask``; when the
            first example has an ``"image"`` key, also ``images`` — a stacked
            tensor if all images share one shape, otherwise the plain list.
        """
        print("current file path", "llava/train/train.py")
        print("def DataCollatorForSupervisedDataset.__call__(self, instances)")
        print("instances\n", instances)

        # Drop None entries.
        instances = [x for x in instances if x is not None]
        input_ids, labels = tuple([instance[key] for instance in instances]
                                  for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        # Fixed: the call used to read `pad_seqIGNORE_INDEXuence`, a corrupted
        # name that raised AttributeError. Labels are padded with IGNORE_INDEX
        # (a module-level constant defined elsewhere in the notebook).
        labels = torch.nn.utils.rnn.pad_sequence(labels,
                                                 batch_first=True,
                                                 padding_value=IGNORE_INDEX)
        # Truncate both tensors to the tokenizer's maximum sequence length.
        input_ids = input_ids[:, :self.tokenizer.model_max_length]
        labels = labels[:, :self.tokenizer.model_max_length]
        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

        if 'image' in instances[0]:
            images = [instance['image'] for instance in instances]
            # Stack only when every image is present and has the same shape.
            if all(x is not None and x.shape == images[0].shape for x in images):
                batch['images'] = torch.stack(images)
            else:
                batch['images'] = images

        return batch

In [None]:
from typing import Dict

def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning.

    Args:
        tokenizer: Tokenizer passed to both the dataset and the collator.
        data_args: Data arguments; ``data_args.data_path`` selects the JSON file.

    Returns:
        A dict with keys ``train_dataset`` (a ``LazySupervisedDataset``),
        ``eval_dataset`` (always ``None``) and ``data_collator``.
    """
    print("current file path", "llava/train/train.py")
    print("def make_supervised_data_module(tokenizer, data_args)")
    print("tokenizer\n", type(tokenizer))
    # Traced run: DataArguments(data_path='/content/LLaVA/blip_laion_cc_sbu_1.json',
    #   lazy_preprocess=True, is_multimodal=True, image_folder='/content/LLaVA/images',
    #   image_aspect_ratio='square')
    print("data_args\n", data_args)

    dataset = LazySupervisedDataset(
        data_path=data_args.data_path,
        tokenizer=tokenizer,
        data_args=data_args,
    )
    print("train_dataset\n", dataset) # <llava.train.train.LazySupervisedDataset object at 0x7ed6341f4880>
    print("len(train_dataset)\n", len(dataset)) # 1

    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    # Traced run: DataCollatorForSupervisedDataset(tokenizer=LlamaTokenizer(
    #   name_or_path='lmsys/vicuna-7b-v1.5', vocab_size=32000, model_max_length=2048, ...))
    print("data_collator\n", collator)

    data_module = {
        "train_dataset": dataset,
        "eval_dataset": None,
        "data_collator": collator,
    }
    print("def make_supervised_data_module: result (return)\n", data_module)
    return data_module

In [None]:
# Build the dataset/collator bundle from the tokenizer and data arguments, then show it.
data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
print("data_module\n", data_module)

In [None]:
from transformers import Trainer
from transformers.trainer import (
    is_sagemaker_mp_enabled,
    get_parameter_names,
    has_length,
    ALL_LAYERNORM_LAYERS,
    ShardedDDPOption,
    logger,
)

class LLaVATrainer(Trainer):
    """``Trainer`` subclass whose train-sampler selection is traced with prints."""

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        """Return the sampler used for the training dataloader.

        With ``self.args.group_by_modality_length`` unset, this delegates to
        ``Trainer._get_train_sampler``. The grouped-by-modality branch and the
        missing-dataset check are placeholders in this walkthrough; the former
        yields ``None``.
        """
        print("current file path", "llava/train/llava_trainer.py")
        print("def _get_train_sampler(self)")
        print("self\n", self)
        print(f"[COND] train_dataset_is_None={self.train_dataset is None}, has_length={has_length(self.train_dataset) if self.train_dataset is not None else 'N/A'}") # train_dataset_is_None=False, has_length=True
        if self.train_dataset is None or not has_length(self.train_dataset):
            pass  # placeholder: not exercised in the traced run

        print(f"[COND] group_by_modality_length={self.args.group_by_modality_length}") # group_by_modality_length=False
        if self.args.group_by_modality_length:
            # Placeholder branch: no grouped sampler is built here.
            return None

        # [ENTER] default path taken in the traced run
        print("【ENTER】else (not group_by_modality_length):")
        sampler = super()._get_train_sampler()
        print("result, super()._get_train_sampler()\n", sampler) # <torch.utils.data.sampler.RandomSampler object at 0x7ed63e925e70>
        print("【EXIT】else (not group_by_modality_length):")
        return sampler

In [None]:
def train():

    print("current file path", "llava/train/train.py")
    print("def train()")
    global local_rank

    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    print("original parser\n", parser)
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print("model_args\n", model_args)
    print("data_args\n", data_args)
    print("training_args\n", training_args)
    local_rank = training_args.local_rank
    print("local_rank\n", local_rank)
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
    print("compute_dtype\n", compute_dtype)
    bnb_model_from_pretrained_args = {}
    print("bnb_model_from_pretrained_args\n", bnb_model_from_pretrained_args)
    # 【SKIP】bfloat16 なので 以下の if 文はスキップされる
    print(f"[COND] bits={training_args.bits}")
    if training_args.bits in [4, 8]:
      pass

    print(f"[COND] vision_tower={model_args.vision_tower}")
    # 【ENTER】 vision_tower=openai/clip-vit-large-patch14-336 なので、この分岐に入る
    if model_args.vision_tower is not None:
        print("【ENTER】if model_args.vision_tower is not None:")
        print(f"[COND] mpt_in_model_name_or_path={'mpt' in model_args.model_name_or_path}")
        #【SKIP】model_args.model_name_or_path に mptは含まれていないので、この分岐はskipされる
        if 'mpt' in model_args.model_name_or_path:
          pass

        #【ENTER】 model_args.model_name_or_path に mptは含まれていないので、この分岐に入る
        else:
            print("[COND] not_mpt_in_model_name_or_path={'mpt' not in model_args.model_name_or_path}")
            print("【ENTER】else of if 'mpt' in model_args.model_name_or_path:")
            # PreTrainedModel.from_pretrained
            model = LlavaLlamaForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                **bnb_model_from_pretrained_args
            )
            print("model defined as LlavaLlamaForCausalLM \n", model)
            print("【EXIT】else of if 'mpt' in model_args.model_name_or_path:")
        print("【EXIT】if model_args.vision_tower is not None:")
    # 【SKIP】 vision_tower=clip-vit-large-patch14-336 なので、この分岐には入らない
    else:
      pass

    print(f"[COND] freeze_backbone={model_args.freeze_backbone}")
    # 【SKIP】 freeze_backbone=False なので、この分岐はskipされる
    if model_args.freeze_backbone:
        pass

    # 【SKIP】 bfloat16 なので 以下の if 文はスキップされる
    print(f"[COND] bits={training_args.bits}")
    if training_args.bits in [4, 8]:
      pass

    print(f"[COND] gradient_checkpointing={training_args.gradient_checkpointing}")
    # 【ENTER】 gradient_checkpointing=True なので、この分岐に入る
    if training_args.gradient_checkpointing:
        print("【ENTER】if training_args.gradient_checkpointing:")
        print(f"[COND] has_enable_input_require_grads={hasattr(model, 'enable_input_require_grads')}")
        # 【ENTER】 model に enable_input_require_grads メソッドがあるので、この分岐に入る
        if hasattr(model, "enable_input_require_grads"):
            print("【ENTER】if hasattr(model, 'enable_input_require_grads'):")
            # PreTrainedModel.enable_input_require_grads
            # 元々 全ての重みについて True
            model.enable_input_require_grads()
            print("【EXIT】if hasattr(model, 'enable_input_require_grads'):")
        # 【SKIP】 model に enable_input_require_grads メソッドがあるので、この分岐はskipされる
        else:
          pass

        print("【EXIT】if training_args.gradient_checkpointing:")

    print(f"[COND] lora_enable={training_args.lora_enable}")
    # 【SKIP】 lora_enable=False なので、この分岐はskipされる
    if training_args.lora_enable:
      pass

    print(f"[COND] mpt_in_model_name_or_path={'mpt' in model_args.model_name_or_path}")
    # 【SKIP】model_args.model_name_or_path に mptは含まれていないので、この分岐はskipされる
    if 'mpt' in model_args.model_name_or_path:
      pass

    #【ENTER】 model_args.model_name_or_path に mptは含まれていないので、この分岐に入る
    else:
        print("[COND] not_mpt_in_model_name_or_path={'mpt' not in model_args.model_name_or_path}")
        print("【ENTER】else of if 'mpt' in model_args.model_name_or_path:")
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
            model_max_length=training_args.model_max_length,
            padding_side="right",
            use_fast=False,
        )
        print("tokenizer defined by AutoTokenizer.from_pretrained \n", tokenizer)
        print("【EXIT】else of if 'mpt' in model_args.model_name_or_path:")

    print(f"[COND] version={model_args.version}")
    # 【SKIP】 version=plain なので、この分岐はskipされる
    if model_args.version == "v0":
      pass

    # 【SKIP】 version=plain なので、この分岐はskipされる
    elif model_args.version == "v0.5":
      pass
    # 【ENTER】 version=plain なので、この分岐に入る
    else:
        print("【ENTER】else of if model_args.version == 'v0' and elif 'v0.5':")
        tokenizer.pad_token = tokenizer.unk_token
        print(f"[COND] version_in_conv_templates={model_args.version in conv_templates}")
        # 【ENTER】 model_args.version=plain は conversation_lib.conv_templates に含まれている（"plain": conv_llava_plain）ので、この分岐に入る
        if model_args.version in conv_templates:
            print("【ENTER】if model_args.version in conversation_lib.conv_templates:")
            default_conversation = conv_templates[model_args.version]
            print(f"conversation_lib.default_conversation set to {model_args.version}")
            print("【EXIT】if model_args.version in conversation_lib.conv_templates:")
        # 【SKIP】 model_args.version=plain は conversation_lib.conv_templates に含まれているので、この分岐はskipされる
        else:
          pass
        print("【EXIT】else of if model_args.version == 'v0' and elif 'v0.5':")

    print(f"[COND] vision_tower={model_args.vision_tower}")
    # 【ENTER】 vision_tower=openai/clip-vit-large-patch14-336 なので、この分岐に入る
    if model_args.vision_tower is not None:
        print("【ENTER】if model_args.vision_tower is not None:")
        model.get_model().initialize_vision_modules(
            model_args=model_args,
            fsdp=training_args.fsdp
        )

        vision_tower = model.get_vision_tower()
        vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

        data_args.image_processor = vision_tower.image_processor
        data_args.is_multimodal = True

        model.config.image_aspect_ratio = data_args.image_aspect_ratio
        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
        print(f"[COND] tune_mm_mlp_adapter={model_args.tune_mm_mlp_adapter}") # True
        if model_args.tune_mm_mlp_adapter:
            # 【ENTER】 tune_mm_mlp_adapter=True なので、この分岐に入る
            print("【ENTER】if model_args.tune_mm_mlp_adapter:")
            # モデル全体の全パラメータを「学習不可（requires_grad=False）」にする
            # これで通常の重みは全て凍結される
            model.requires_grad_(False)
            for p in model.get_model().mm_projector.parameters():
                # mm_projector（画像特徴量→テキスト特徴量への変換層）の全パラメータだけを「学習可能（requires_grad=True）」に戻す
                # これで mm_projector のみ学習されることになる
                print("model.get_model().mm_projector.parameters()", model.get_model().mm_projector.parameters())
                p.requires_grad = True
            print("【EXIT】if model_args.tune_mm_mlp_adapter:")

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        print(f"[COND] freeze_mm_mlp_adapter={training_args.freeze_mm_mlp_adapter}") # False
        if training_args.freeze_mm_mlp_adapter:
          pass

        print(f"[COND] bits={training_args.bits}") # 16
        if training_args.bits in [4, 8]:
          pass

        model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
        print("model_args.mm_use_im_start_end", model_args.mm_use_im_start_end)
        model.config.mm_projector_lr = training_args.mm_projector_lr
        print("training_args.mm_projector_lr", training_args.mm_projector_lr)
        training_args.use_im_start_end = model_args.mm_use_im_start_end
        print("training_args.use_im_start_end", training_args.use_im_start_end)
        model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
        print("model_args.mm_use_im_patch_token", model_args.mm_use_im_patch_token)
        model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)
        print("【EXIT】if model_args.vision_tower is not None:")

    print(f"[COND] bits={training_args.bits}") # 16
    if training_args.bits in [4, 8]:
        pass

    data_module = make_supervised_data_module(tokenizer=tokenizer,
                                              data_args=data_args)
    print("data_module\n", data_module) # {'train_dataset': <llava.train.train.LazySupervisedDataset object at 0x7ed6341f4880>, 'eval_dataset': None, 'data_collator': DataCollatorForSupervisedDataset(tokenizer=LlamaTokenizer(name_or_path='lmsys/vicuna-7b-v1.5', vocab_size=32000, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False))}

    trainer = LLaVATrainer(model=model,
                    tokenizer=tokenizer,
                    args=training_args,
                    **data_module)
    print("trainer\n", trainer) # <llava.train.llava_trainer.LLaVATrainer object at 0x7ed6341f4490>

    print("【COND】list(pathlib.Path(training_args.output_dir).glob('checkpoint-*'))\n", list(pathlib.Path(training_args.output_dir).glob("checkpoint-*"))) # [PosixPath('checkpoints/llava-v1.5-7b-pretrain/checkpoint-250'), PosixPath('checkpoints/llava-v1.5-7b-pretrain/checkpoint-1')]
    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        # 【ENTER】
        print("【ENTER】if list(pathlib.Path(training_args.output_dir).glob(checkpoint-*)):")
        trainer.train(resume_from_checkpoint=False)
        print("【EXIT】if list(pathlib.Path(training_args.output_dir).glob(checkpoint-*)):")
    else:
        print("【ENTER】else of if list(pathlib.Path(training_args.output_dir).glob(checkpoint-*)):")
        trainer.train()
        print("【EXIT】else of if list(pathlib.Path(training_args.output_dir).glob(checkpoint-*)):")
    trainer.save_state()

    model.config.use_cache = True
    print("model.config.use_cache = True", model.config.use_cache) # True

    print(f"【COND】lora_enable={training_args.lora_enable}") # False
    if training_args.lora_enable:
      pass
    else:
        # 【ENTER】
        print("【ENTER】else of if training_args.lora_enable:")
        print("trainer", trainer) # <class 'llava.train.llava_trainer.LLaVATrainer'>
        safe_save_model_for_hf_trainer(trainer=trainer,
                                       output_dir=training_args.output_dir)
        print("【EXIT】else of if training_args.lora_enable:")