In [4]:
from google.colab import drive
drive.mount('/content/drive')

!git clone https://github.com/NaGho/reasoning_multimodal_LLMs.git
import sys
sys.path.append('/content/reasoning_multimodal_LLMs')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
fatal: destination path 'reasoning_multimodal_LLMs' already exists and is not an empty directory.


In [5]:
!pip install transformers==4.45.2
!pip install accelerate
!pip install deepspeed==0.14.4
!pip install av
!pip install peft
# !pip install --force-reinstall -U bitsandbytes
!pip uninstall bitsandbytes -y
!pip install --no-cache-dir bitsandbytes
!pip install flash-attn #--no-build-isolation


import os
os.environ["WANDB_PROJECT"]= "vlmms-ft"
from dataclasses import asdict
import math
from pathlib import Path
from typing import List, Optional
import yaml

from accelerate.utils import DistributedType
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import transformers
from transformers import Trainer, deepspeed
# import deepspeed

from arguments import ModelArguments, DataArguments, TrainingArguments, LoraArguments
from collators import COLLATORS
from loaders import LOADERS
from supported_models import MODULE_KEYWORDS
from utils import (
    rank0_print, find_all_linear_names, safe_save_model_for_hf_trainer,
    get_peft_state_maybe_zero_3, TrainerWithCustomSampler
)
from my_datasets import MySupervisedDataset

# def train():
parser = transformers.HfArgumentParser(
    (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
)
default_args = [
    "--output_dir", "/content/drive/MyDrive/reasoning_multimodal_LLMs/fine_tuned",
]
model_args, data_args, training_args, lora_args = parser.parse_args_into_dataclasses(default_args)

# data_args.data_path = "/content/drive/MyDrive/reasoning_multimodal_LLMs/example_data/mathvisionVistaMerged_train.json" # mathvision_train
# data_args.image_folder = "/content/drive/MyDrive/MATH-V-main/images"

data_args.data_path = "/content/drive/MyDrive/MathV360K_images/MathV360K_train_pruned.json"
data_args.image_folder = "/content/drive/MyDrive/MathV360K_images"

# data_args.eval_data_path = "/content/reasoning_multimodal_LLMs/example_data/mathvision_val.json"

training_args.fp16 = False
training_args.bf16 = True
training_args.bf16_full_eval = True
training_args.per_device_train_batch_size = 2
training_args.per_device_eval_batch_size = 2
training_args.gradient_accumulation_steps = 4
training_args.gradient_checkpointing = True
training_args.num_train_epochs = 5
training_args.ddp_timeout = 750000
training_args.resume_from_checkpoint = f"{training_args.output_dir}/checkpoint-1500" # True #
# training_args.load_best_model_at_end = True

# dumping arguments
output_dir = getattr(training_args, 'output_dir', None)
assert output_dir is not None, "output_dir is required"
args_dir = Path(output_dir) / "arguments"
args_dir.mkdir(parents=True, exist_ok=True)
yaml.dump(asdict(model_args), open(args_dir / "model.yaml", "w"))
yaml.dump(asdict(data_args), open(args_dir / "data.yaml", "w"))
yaml.dump(asdict(training_args), open(args_dir / "training.yaml", "w"))
yaml.dump(asdict(lora_args), open(args_dir / "lora.yaml", "w"))

compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
print("compute_dtype = ", compute_dtype)
if getattr(training_args, 'deepspeed', None) and getattr(lora_args, 'q_lora', False):
    rank0_print("Distributed Deepspeed Enabled...")
    training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

device_map = None
if lora_args.q_lora:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if int(os.environ.get("WORLD_SIZE", 1)) != 1 else None
    if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
        raise ValueError("FSDP or ZeRO3 are not incompatible with QLoRA.")

# llm quantization config (for q-lora)
bnb_config = None
if lora_args.use_lora and lora_args.q_lora:
    from transformers import BitsAndBytesConfig
    rank0_print("Quantization for LLM enabled...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_quant_type="nf4",
    )

# load model, tokenizer, processor
rank0_print("Loading model, tokenizer, processor...")

loader = LOADERS[model_args.model_family_id](
    model_hf_path=model_args.model_hf_path,
    model_local_path=model_args.model_local_path,
    compute_dtype=compute_dtype,
    bnb_config=bnb_config,
    use_flash_attn=training_args.use_flash_attn,
    device_map=device_map,
)
model, tokenizer, processor, config = loader.load()
tokenizer.model_max_length = training_args.model_max_length

if training_args.gradient_checkpointing:
    model.enable_input_require_grads()

# freeze certain params
vision_encoder_keys = MODULE_KEYWORDS[model_args.model_family_id]["vision_encoder"]
if not training_args.train_vision_encoder:
    rank0_print(f"Vision encoder is freezed... including:")
    for module in vision_encoder_keys:
        rank0_print(f"\t{module}")
        eval(f"model.{module}").requires_grad_(False)

vision_projector_keys = MODULE_KEYWORDS[model_args.model_family_id]["vision_projector"]
if not training_args.train_vision_projector:
    rank0_print(f"Vision projector is freezed... including:")
    for module in vision_projector_keys:
        rank0_print(f"\t{module}")
        eval(f"model.{module}").requires_grad_(False)

# other components preparation (e.g., image_newline, vision_resampler)
# we will just freeze these
if "others" in MODULE_KEYWORDS[model_args.model_family_id]:
    rank0_print(f"Other multimodal component is freezed... including:")
    for other_key in MODULE_KEYWORDS[model_args.model_family_id]["others"]:
        rank0_print(f"\t{other_key}")
        eval(f"model.{other_key}").requires_grad_(False)

# lora preparation
llm_keys = MODULE_KEYWORDS[model_args.model_family_id]["llm"]
if not (lora_args.use_lora or (training_args.train_vision_encoder and lora_args.use_vision_lora)):
    rank0_print("No LoRA enabled...")
else:
    named_modules = {n: m for n, m in model.named_modules()}
    lora_modules = []
    full_modules = []

    if training_args.train_vision_encoder and lora_args.use_vision_lora:
        rank0_print("LoRA for vision encoder enabled...")
        lora_modules.extend(find_all_linear_names(named_modules, vision_encoder_keys))
    elif training_args.train_vision_encoder:
        rank0_print("Vision encoder will be fully trained...")
        full_modules.extend(vision_encoder_keys)

    if lora_args.use_lora:
        rank0_print("LoRA for LLM enabled...")
        lora_modules.extend(find_all_linear_names(named_modules, llm_keys))
    else:
        rank0_print("LLM will be fully trained...")
        full_modules.extend(llm_keys)

    if training_args.train_vision_projector:
        rank0_print("Vision projector will be fully trained...")
        full_modules.extend(vision_projector_keys)

    lora_config = LoraConfig(
        r=lora_args.lora_r,
        lora_alpha=lora_args.lora_alpha,
        target_modules=lora_modules,
        modules_to_save=full_modules,
        lora_dropout=lora_args.lora_dropout,
        bias=lora_args.lora_bias,
        task_type="CAUSAL_LM",
    )

    if lora_args.q_lora:
        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=training_args.gradient_checkpointing
        )

    model = get_peft_model(model, lora_config)
    model = model.to(compute_dtype)

# print trainable parameters for inspection
rank0_print("Trainable parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        rank0_print(f"\t{name}")




compute_dtype =  torch.bfloat16


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading processor from llava-hf/llava-1.5-7b-hf


Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 


Loading tokenizer from llava-hf/llava-1.5-7b-hf


In [6]:
# load data
rank0_print("Loading data...")
train_dataset = MySupervisedDataset(
    data_path=data_args.data_path,
    image_folder=data_args.image_folder,
    video_folder=data_args.video_folder,
    num_frames=data_args.num_frames,
    model_family_id=model_args.model_family_id,
    user_key=data_args.user_key,
    assistant_key=data_args.assistant_key
)
if data_args.eval_data_path:
    eval_dataset = MySupervisedDataset(
        data_path=data_args.eval_data_path,
        image_folder=data_args.image_folder,
        video_folder=data_args.video_folder,
        num_frames=data_args.num_frames,
        model_family_id=model_args.model_family_id,
        user_key=data_args.user_key,
        assistant_key=data_args.assistant_key
    )
else:
    eval_dataset = None
    training_args.eval_strategy = "no"

# data collator
data_collator = COLLATORS[model_args.model_family_id](
    config=config,
    tokenizer=tokenizer,
    processor=processor,
    mask_question_tokens=training_args.mask_question_tokens
)
print('training_args = \n ', training_args)
# trainer
trainer = TrainerWithCustomSampler(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()
trainer.save_state()

safe_save_model_for_hf_trainer(trainer=trainer, output_dir=output_dir)


training_args = 
  TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=True,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=750000,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=False,
evaluation_s



Step,Training Loss
500,0.6649
1000,0.5574




Step,Training Loss
500,0.6649
1000,0.5574
1500,0.5115
2000,0.49
2500,0.4806




KeyboardInterrupt: 