#### Loading Minimal LLaVA
* Language Model decoder (Llama3.1)
* Vision Encoder (CLIP)
* Vision projector (MLP with Randomized weight)


<div align="center">
  <img src="data/mini-llava.png" width="800" alt="Mini-LLaVA">
  <p><em>Mini-LLaVA handles text, image and video inputs</em></p>
</div>

In [1]:
# 1) どこからでもルート直下に避難
%cd /

# 2) Colabの標準ワークディレクトリへ
%cd /content

# 3) 壊れかけのフォルダを一掃（あれば）
!rm -rf /content/Mini-LLaVA

/
/content


In [36]:
!git clone https://github.com/HayatoHongo/Mini-LLaVA.git
%cd Mini-LLaVA

Cloning into 'Mini-LLaVA'...
remote: Enumerating objects: 380, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (105/105), done.[K
remote: Total 380 (delta 103), reused 93 (delta 47), pack-reused 228 (from 1)[K
Receiving objects: 100% (380/380), 13.10 MiB | 31.57 MiB/s, done.
Resolving deltas: 100% (226/226), done.
/content/Mini-LLaVA


In [37]:
!git checkout upstream-main
!git branch

Branch 'upstream-main' set up to track remote branch 'upstream-main' from 'origin'.
Switched to a new branch 'upstream-main'
  main[m
* [32mupstream-main[m


In [38]:
pip install av



In [39]:
import torch

In [40]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [41]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

In [42]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [43]:
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='meta-llama/Meta-Llama-3.1-8B-Instruct', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|finetune_right_pad_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<|res

In [44]:
from transformers import LlamaConfig
llama_3_1_config = LlamaConfig.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
print(llama_3_1_config)

LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.55.4",
  "use_cache": true,
  "vocab_size": 128256
}



In [45]:
from mini_llava import LlavaLlamaForCausalLM # Register the llavgit a models into 'transformers'
from transformers import AutoConfig
config = AutoConfig.for_model("llava_llama", trust_remote_code=True)

In [46]:
print(config)

LlavaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "delay_load": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_new_tokens": 1024,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "mm_hidden_size": 768,
  "mm_resampler_type": null,
  "mm_vision_select_feature": "patch",
  "mm_vision_select_layer": -1,
  "mm_vision_tower": "openai/clip-vit-base-patch32",
  "model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
  "model_type": "llava_llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": 

In [47]:
# Initalize with Load Llama3.1 Weights & CLIP encoder.
# Do NOT run this code twice, or you will face Cuda OutOfMemory Error because VRAM will suffer from duplicate copy of large models.
llava_model = LlavaLlamaForCausalLM.from_pretrained_lm(config).to(device)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 90.88 MiB is free. Process 14062 has 39.46 GiB memory in use. Of the allocated memory 38.88 GiB is allocated by PyTorch, and 77.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print(llava_model)

In [None]:
!nvidia-smi

#### Let's check if Llama3.1 is working fine here

In [None]:
from mini_llava import generate_text, generation_config

In [None]:
print(generation_config)

In [None]:
# Example usage
prompt = "Once upon a time, far in the galaxy, "
text = generate_text(prompt, llava_model, tokenizer, device, generation_config)
print(f"Prompt: {prompt}")
print(f"Generated text: {text}")

<div align="center">
  <img src="data/cat.jpg" width="500" alt="Cat image">
  <p><em>We want Mini-LLava to recognize cat in this image</em></p>
</div>

### Before Training, model can't see

In [None]:
from mini_llava import data_args, LazyProcessor

In [None]:
print(data_args)

In [None]:
print(tokenizer)

In [None]:
print(llava_model.get_model().vision_tower.image_processor)

In [None]:
proc = LazyProcessor(tokenizer=tokenizer, data_args=data_args, image_processor=llava_model.get_model().vision_tower.image_processor)

In [None]:
print(proc.data)

In [None]:
img_path = "data/cat.jpg"
query = "What is in the image?"
proc.query(question = query,
           #media_paths = []) # text-only
           media_paths = [img_path]) # interleaved text & image chat

In [None]:
generated_texts = proc.get_response(llava_model, tokenizer) #, generation_config)
print("Mini-LLaVA Response: \n\n  ", generated_texts[0]) # Model has no idea what's in the image yet

<div align="center">
  <p><em>What is in the image?</em></p>
  <img src="data/cat.jpg" width="400" alt="Cat image">
  <p><em>Mini-LLaVA before training: The image of the image.</em></p>
</div>

### Pre-Train vision projector on a visual question-answer dataset (~8K)
* A projector learns how to 'translate' image to embeddings, which LLM understands.

In [None]:
from mini_llava import prepare_docci_data
data_args = prepare_docci_data("data/docci_converted.json", "data/docci")

In [30]:
from mini_llava import DataCollatorForSupervisedDataset, LazySupervisedDataset
dataset = LazySupervisedDataset(data_args=data_args, tokenizer=tokenizer, image_processor=llava_model.get_model().vision_tower.image_processor)
collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

In [33]:
"""
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, collate_fn = collator, batch_size=1, num_workers=0) # batch_size = 4, num_workers = 1)
"""

import torch
from torch.utils.data import DataLoader, SubsetRandomSampler

N = 1_000  # 使うサンプル数
indices = torch.randperm(len(dataset))[:N].tolist()
sampler = SubsetRandomSampler(indices)

dataloader = DataLoader(
    dataset,
    sampler=sampler,                     # ← ここがポイント
    collate_fn=collator,
    batch_size=1,
    num_workers=0,                       # 速くしたいなら >=2 + pin_memory=True
)

In [34]:
from mini_llava import train_mini_llava as train
train(llava_model, tokenizer, dataloader, use_lora=True) # This is easily the better choice (so many lower-level optimization happens here ...)

  super().__init__(*args, **kwargs)


Step,Training Loss


KeyboardInterrupt: 

In [None]:
!pip -q install -U huggingface_hub datasets pillow

In [None]:
%cd /content
import os, json, zipfile
from tqdm import tqdm
from huggingface_hub import snapshot_download
from mini_llava.config import DataArguments  # 既存のdataclass

REPO_ID = "HayatoHongo/blip_laion_cc_sbu_1k"     # ← あなたのデータセット
ROOT = "/content/Mini-LLaVA"                      # 既存コードに合わせる基準パス
OUT_DIR = os.path.join(ROOT, "data")
IMG_DIR = os.path.join(OUT_DIR, "images_1k_folder")     # 展開先
OUT_JSON = os.path.join(OUT_DIR, "blip_laion_cc_sbu_1k_converted.json")

os.makedirs(OUT_DIR, exist_ok=True)

# ★ データセットとしてDL（ここが重要）
local_dir = snapshot_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    allow_patterns=["blip_laion_cc_sbu_1k.json","images_1k_folder.zip"]
)

# 画像zipを展開
zip_candidates = [
    os.path.join(local_dir, "images_1k_folder.zip"),
    os.path.join(local_dir, "files", "images_1k_folder.zip"),
]
zip_path = next((p for p in zip_candidates if os.path.exists(p)), None)
if zip_path is None:
    raise FileNotFoundError("images_1k_folder.zip が見つかりません。")

if not os.path.isdir(IMG_DIR) or len(os.listdir(IMG_DIR)) == 0:
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(OUT_DIR)  # zip内が images_1k_folder/ 構造ならOK

# 元JSON読み込み
json_candidates = [
    os.path.join(local_dir, "blip_laion_cc_sbu_1k.json"),
    os.path.join(local_dir, "files", "blip_laion_cc_sbu_1k.json"),
]
src_json = next((p for p in json_candidates if os.path.exists(p)), None)
if src_json is None:
    raise FileNotFoundError("blip_laion_cc_sbu_1k.json が見つかりません。")

with open(src_json, "r", encoding="utf-8") as f:
    src = json.load(f)

# Mini-LLaVAのDatasetが読む形（media: [{"image": ...}]）に変換
out = []
for ex in tqdm(src, desc="to Mini-LLaVA format"):
    fname = ex.get("image")
    convs = ex.get("conversations", [])
    if not fname or not convs:
        continue
    if not os.path.isfile(os.path.join(IMG_DIR, fname)):
        continue

    # humanに<image>が無ければ保険で追記
    if isinstance(convs, list) and convs and isinstance(convs[0], dict):
        if convs[0].get("from") in ("human", "user"):
            v = convs[0].get("value", "")
            if "<image>" not in v:
                convs[0]["value"] = (v + "\n<image>").strip()

    out.append({
        "id": ex.get("id", os.path.splitext(fname)[0]),
        "media": [{"image": fname}],
        "conversations": convs
    })

with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(out, f, indent=2, ensure_ascii=False)

# 既存コード互換の DataArguments を作成（/content/Mini-LLaVA からの相対に）
data_args = DataArguments(
    data_path=os.path.relpath(OUT_JSON, ROOT),              # 例: "data/xxx.json"
    image_folder=os.path.relpath(IMG_DIR, ROOT) + "/",      # 例: "data/images_1k_folder/"
    video_folder=os.path.relpath(IMG_DIR, ROOT) + "/",
    video_fps=1,
    frames_upbound=0,
    add_time_instruction=False,
    force_sample=False,
    default_fps=10,
)

print("data_args:", data_args)
print("JSON:", OUT_JSON)
print("IMG_DIR:", IMG_DIR)

/content


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

blip_laion_cc_sbu_1k.json: 0.00B [00:00, ?B/s]

images_1k_folder.zip:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

to Mini-LLaVA format: 100%|██████████| 1000/1000 [00:00<00:00, 104852.36it/s]

data_args: DataArguments(data_path='data/blip_laion_cc_sbu_1k_converted.json', image_folder='data/images_1k_folder/', video_folder='data/images_1k_folder/', video_fps=1, frames_upbound=0, add_time_instruction=False, force_sample=False, default_fps=10)
JSON: /content/Mini-LLaVA/data/blip_laion_cc_sbu_1k_converted.json
IMG_DIR: /content/Mini-LLaVA/data/images_1k_folder





In [None]:
from mini_llava import prepare_docci_data, DataCollatorForSupervisedDataset, LazySupervisedDataset
from mini_llava import train_mini_llava as train
from torch.utils.data import DataLoader

In [None]:
#data_args = prepare_docci_data("data/blip_laion_cc_sbu_1k_converted.json", "data/images_1k_folder")

# Colab: リポジトリ直下にいる前提
%cd /content/Mini-LLaVA

import os
from mini_llava.config import DataArguments

JSON_REL = "data/blip_laion_cc_sbu_1k_converted.json"
IMG_REL  = "data/images_1k_folder/"

# 念のため存在チェック
assert os.path.isfile(JSON_REL), f"JSON が見つかりません: {JSON_REL}"
assert os.path.isdir(IMG_REL),   f"画像フォルダが見つかりません: {IMG_REL}"

# 末尾スラッシュを揃える（Dataset 実装が末尾 / を前提）
if not IMG_REL.endswith("/"):
    IMG_REL += "/"

data_args = DataArguments(
    data_path=JSON_REL,       # リポジトリ直下からの相対でOK
    image_folder=IMG_REL,     # 画像はここ
    video_folder=IMG_REL,     # 動画は使わないがAPI整合で同じに
    video_fps=1,
    frames_upbound=0,
    add_time_instruction=False,
    force_sample=False,
    default_fps=10,
)

print(data_args)

/content/Mini-LLaVA
DataArguments(data_path='data/blip_laion_cc_sbu_1k_converted.json', image_folder='data/images_1k_folder/', video_folder='data/images_1k_folder/', video_fps=1, frames_upbound=0, add_time_instruction=False, force_sample=False, default_fps=10)


In [None]:
dataset = LazySupervisedDataset(data_args=data_args, tokenizer=tokenizer, image_processor=llava_model.get_model().vision_tower.image_processor)

In [None]:
dataset[0]

{'input_ids': tensor([128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,   2675,    527,
            264,  11190,   4221,    323,  11376,  18328,     13,   1472,    527,
           3025,    311,   3619,    279,   9302,   2262,    430,    279,   1217,
           5825,     11,    323,   7945,    279,   1217,    449,    264,   8205,
            315,   9256,   1701,   5933,   4221,     13, 128009, 128006,   9125,
         128007,    271,  38766,   1303,  33025,   2696,     25,   6790,    220,
           2366,     18,    198,  15724,   2696,     25,    220,   1627,  10263,
            220,   2366,     19,    271, 128009, 128006,    882, 128007,    271,
          36227,    264,  10015,   4096,    315,    279,   2217,    627,   -200,
         128009, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
           2696

In [None]:
print(dataset[0]['input_ids'])

tensor([128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
            25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
           220,   1627,  10263,    220,   2366,     19,    271,   2675,    527,
           264,  11190,   4221,    323,  11376,  18328,     13,   1472,    527,
          3025,    311,   3619,    279,   9302,   2262,    430,    279,   1217,
          5825,     11,    323,   7945,    279,   1217,    449,    264,   8205,
           315,   9256,   1701,   5933,   4221,     13, 128009, 128006,   9125,
        128007,    271,  38766,   1303,  33025,   2696,     25,   6790,    220,
          2366,     18,    198,  15724,   2696,     25,    220,   1627,  10263,
           220,   2366,     19,    271, 128009, 128006,    882, 128007,    271,
         36227,    264,  10015,   4096,    315,    279,   2217,    627,   -200,
        128009, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
          2696,     25,   6790,    220, 

In [None]:
input_ids = dataset[0]['input_ids']
decoded = []
for i in input_ids:
    if i >= 0:
        # トークンIDを単体でdecode（1トークンずつ）
        token = tokenizer.decode([int(i)], skip_special_tokens=True)
        decoded.append(token)
    else:
        # 負の値はそのまま
        decoded.append(i)
print(decoded)

# 文字列だけを連結（負の値は無視）
joined_text = ''.join([str(t) for t in decoded if isinstance(t, str)])
print(joined_text)

['', '', 'system', '', '\n\n', 'Cut', 'ting', ' Knowledge', ' Date', ':', ' December', ' ', '202', '3', '\n', 'Today', ' Date', ':', ' ', '26', ' Jul', ' ', '202', '4', '\n\n', 'You', ' are', ' a', ' helpful', ' language', ' and', ' vision', ' assistant', '.', ' You', ' are', ' able', ' to', ' understand', ' the', ' visual', ' content', ' that', ' the', ' user', ' provides', ',', ' and', ' assist', ' the', ' user', ' with', ' a', ' variety', ' of', ' tasks', ' using', ' natural', ' language', '.', '', '', 'system', '', '\n\n', 'Cut', 'ting', ' Knowledge', ' Date', ':', ' December', ' ', '202', '3', '\n', 'Today', ' Date', ':', ' ', '26', ' Jul', ' ', '202', '4', '\n\n', '', '', 'user', '', '\n\n', 'Give', ' a', ' brief', ' description', ' of', ' the', ' image', '.\n', tensor(-200), '', '', '', 'system', '', '\n\n', 'Cut', 'ting', ' Knowledge', ' Date', ':', ' December', ' ', '202', '3', '\n', 'Today', ' Date', ':', ' ', '26', ' Jul', ' ', '202', '4', '\n\n', '', '', 'assistant', '', '\

In [None]:
input_ids = dataset[0]['labels']
decoded = []
for i in input_ids:
    if i >= 0:
        # トークンIDを単体でdecode（1トークンずつ）
        token = tokenizer.decode([int(i)], skip_special_tokens=True)
        decoded.append(token)
    else:
        # 負の値はそのまま
        decoded.append(i)
print(decoded)

# 文字列だけを連結（負の値は無視）
joined_text = ''.join([str(t) for t in decoded if isinstance(t, str)])
print(joined_text)

['', '', tensor(-100), '', tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), '', '', tensor(-100), '', tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-100), tensor(-1

In [None]:
collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

In [None]:
dataloader = DataLoader(dataset, collate_fn=collator, batch_size=1, num_workers=1)

In [None]:
train(llava_model, tokenizer, dataloader, use_lora=False) # This is easily the better choice (so many lower-level optimization happens here ...)

  super().__init__(*args, **kwargs)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhongo-hayato-6281k[0m ([33mhongo-hayato-6281k-university-of-tokyo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 214.25 MiB is free. Process 6011 has 38.93 GiB memory in use. Process 199354 has 416.00 MiB memory in use. Of the allocated memory 38.10 GiB is allocated by PyTorch, and 339.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# 圧縮
!cd /content && zip -r checkpoint-3000.zip Mini-LLaVA/results/checkpoint-3000

# ダウンロード
from google.colab import files
files.download('/content/checkpoint-3000.zip')

  adding: Mini-LLaVA/results/checkpoint-3000/ (stored 0%)
  adding: Mini-LLaVA/results/checkpoint-3000/optimizer.pt (deflated 10%)
  adding: Mini-LLaVA/results/checkpoint-3000/README.md (deflated 65%)
  adding: Mini-LLaVA/results/checkpoint-3000/rng_state.pth (deflated 26%)
  adding: Mini-LLaVA/results/checkpoint-3000/adapter_config.json (deflated 56%)
  adding: Mini-LLaVA/results/checkpoint-3000/tokenizer.json (deflated 85%)
  adding: Mini-LLaVA/results/checkpoint-3000/special_tokens_map.json (deflated 70%)
  adding: Mini-LLaVA/results/checkpoint-3000/chat_template.jinja (deflated 72%)
  adding: Mini-LLaVA/results/checkpoint-3000/scheduler.pt (deflated 62%)
  adding: Mini-LLaVA/results/checkpoint-3000/training_args.bin (deflated 53%)
  adding: Mini-LLaVA/results/checkpoint-3000/trainer_state.json (deflated 66%)
  adding: Mini-LLaVA/results/checkpoint-3000/adapter_model.safetensors (deflated 7%)
  adding: Mini-LLaVA/results/checkpoint-3000/tokenizer_config.json (deflated 96%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import zipfile, os
zip_path = "/content/checkpoint-3000.zip"
out_dir  = "/content/checkpoint-3000"
os.makedirs(out_dir, exist_ok=True)

with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(out_dir)

In [None]:
from peft import PeftModel
import torch

# 1. LoRA訓練済みモデルをロード
# base_model = ...  # 通常のLlavaLlamaForCausalLMインスタンス
lora_checkpoint_path = "/content/checkpoint-3000/Mini-LLaVA/results/checkpoint-3000"
model = PeftModel.from_pretrained(llava_model, lora_checkpoint_path)

In [None]:
# 2. LoRA Adapterを統合
model = model.merge_and_unload()

In [None]:
# 3. mm_projectorを保存
torch.save(model.model.mm_projector.state_dict(), "/content/checkpoint-3000/Mini-LLaVA/results/checkpoint-3000/mm_projector_merged.pth")

### After training, mini-Llava already recognizes the cat in the image

In [None]:
import torch
import gc

# まずメモリ解放
gc.collect()
torch.cuda.empty_cache()

In [None]:
# デバイス設定
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

# トークナイザー
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Config
from transformers import LlamaConfig
llama_3_1_config = LlamaConfig.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

from mini_llava import LlavaLlamaForCausalLM
from transformers import AutoConfig
config = AutoConfig.for_model("llava_llama", trust_remote_code=True)

device: cuda


In [None]:
# モデル初期化
llava_model = LlavaLlamaForCausalLM.from_pretrained_lm(config).to(device)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


KeyboardInterrupt: 

In [None]:
# mm_projector差し替え（必要な場合）
mm_projector_path = "/content/checkpoint-3000/Mini-LLaVA/results/checkpoint-3000/mm_projector_merged.pth"
llava_model.get_model().mm_projector.load_state_dict(torch.load(mm_projector_path, map_location=device))

In [None]:
# 推論例
from mini_llava import generate_text, generation_config
prompt = "Once upon a time, far in the galaxy,"
text = generate_text(prompt, llava_model, tokenizer, device, generation_config)
print(f"Prompt: {prompt}")
print(f"Generated text: {text}")

In [None]:
# 画像推論例
from mini_llava import data_args, LazyProcessor
proc = LazyProcessor(tokenizer=tokenizer, data_args=data_args, image_processor=llava_model.get_model().vision_tower.image_processor)
img_path = "data/cat.jpg"
query = "What is in the image? If you are not sure, please answer you don't know."
proc.query(question=query, media_paths=[img_path])
generated_texts = proc.get_response(llava_model, tokenizer)
print("Mini-LLaVA Response: \n\n  ", generated_texts[0])

In [None]:
from mini_llava import LazyProcessor

In [None]:
proc = LazyProcessor(tokenizer=tokenizer, data_args=data_args, image_processor=llava_model.get_model().vision_tower.image_processor)

In [None]:
img_path = "data/cat.jpg"
query = "explain the image in detail" #"What is in the image?"
proc.query(question = query,
           # media_paths = []) # for testing text-only chat
           media_paths = [img_path]) # for interleaved text & image chat
generated_texts = proc(llava_model, tokenizer)
print(generated_texts)
print("Mini-LLaVA Response: \n\n  ", generated_texts[0]) # Model sees the cat now (!) Note that we've only trained the adaptor here.

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['user']
Mini-LLaVA Response: 

   user


In [None]:
# ==== 前提の軽い設定（未設定なら）====
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Llama3.1系なら <|eot_id|> を終端に追加
try:
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    eos_ids = [tid for tid in {tokenizer.eos_token_id, eot_id} if tid is not None]
except Exception:
    eos_ids = [tokenizer.eos_token_id]

# ==== get_response をモンキーパッチ ====
import torch
from mini_llava.dataprocess import LazyProcessor

def _patched_get_response(self, llava_model, tokenizer, generation_config=None, device="cuda"):
    # 既存のバッチ作成
    data_w_media, data_w_text = self.process_data(device=device)

    # 生成ハイパラ（必要に応じて調整）
    gen_kwargs = dict(
        max_new_tokens=128,
        min_new_tokens=20,      # 短すぎ対策（不要なら外してOK）
        do_sample=True,
        temperature=0.4,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=eos_ids,   # EOT/ EOS で止める
        use_cache=True,
    )
    if generation_config:
        gen_kwargs.update(generation_config)

    def run_one(batch):
        texts = []
        if not batch:
            return texts
        with torch.no_grad():
            outputs = llava_model.generate(**batch, **gen_kwargs)
        # プロンプト長 = attention_mask の 1 の合計
        prompt_lens = batch["attention_mask"].sum(dim=1).tolist()
        for seq, plen in zip(outputs, prompt_lens):
            gen = seq[int(plen):]  # 新規生成トークンのみ
            text = tokenizer.decode(gen, skip_special_tokens=True)
            texts.append(text.strip())
        return texts

    out = []
    out += run_one(data_w_media)
    out += run_one(data_w_text)
    return out

LazyProcessor.get_response = _patched_get_response
print("Patched LazyProcessor.get_response ✅")


Patched LazyProcessor.get_response ✅


In [None]:
img_path = "data/cat.jpg"
query = "Explain the image in detail."

proc.query(question=query, media_paths=[img_path])
generated_texts = proc.get_response(llava_model, tokenizer)  # 生成部のみをデコード
print("Mini-LLaVA Response:\n", generated_texts[0])

Mini-LLaVA Response:
 


<div align="center">
  <p><em>What is in the image?</em></p>
  <img src="data/cat.jpg" width="400" alt="Cat image">
  <p style="margin-left: 40px; margin-right: 40px;"><em>Mini-LLaVA after pre-training: Cat! </em></p>
</div>