In [1]:
from google.colab import drive
drive.mount('/content/drive')

!git clone https://github.com/NaGho/reasoning_multimodal_LLMs.git
import sys
sys.path.append('/content/reasoning_multimodal_LLMs')

Mounted at /content/drive
Cloning into 'reasoning_multimodal_LLMs'...
remote: Enumerating objects: 514, done.[K
remote: Counting objects: 100% (218/218), done.[K
remote: Compressing objects: 100% (157/157), done.[K
remote: Total 514 (delta 147), reused 97 (delta 60), pack-reused 296 (from 2)[K
Receiving objects: 100% (514/514), 15.06 MiB | 16.37 MiB/s, done.
Resolving deltas: 100% (208/208), done.


In [2]:
# Runtime dependencies for the fine-tuning cells below.
# transformers and deepspeed are pinned to exact versions; accelerate, av, peft,
# bitsandbytes and flash-attn are left unpinned.
# NOTE(review): unpinned packages make this notebook non-reproducible over time —
# consider pinning them once a known-good combination is confirmed.
!pip install transformers==4.45.2
!pip install accelerate
!pip install deepspeed==0.14.4
!pip install av
!pip install peft
# bitsandbytes is removed and then reinstalled with pip's cache disabled
# (presumably to replace a preinstalled build that did not match the runtime —
# TODO confirm). The force-reinstall variant below was an earlier attempt.
# !pip install --force-reinstall -U bitsandbytes
!pip uninstall bitsandbytes -y
!pip install --no-cache-dir bitsandbytes
# The trailing `#--no-build-isolation` is commented out, so flash-attn is
# installed with pip's default build isolation.
!pip install flash-attn #--no-build-isolation


import os
os.environ["WANDB_PROJECT"]= "vlmms-ft"
from dataclasses import asdict
import math
from pathlib import Path
from typing import List, Optional
import yaml

from accelerate.utils import DistributedType
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import transformers
from transformers import Trainer, deepspeed
# import deepspeed

from arguments import ModelArguments, DataArguments, TrainingArguments, LoraArguments
from collators import COLLATORS
from loaders import LOADERS
from supported_models import MODULE_KEYWORDS
from utils import (
    rank0_print, find_all_linear_names, safe_save_model_for_hf_trainer,
    get_peft_state_maybe_zero_3, TrainerWithCustomSampler
)
from my_datasets import MySupervisedDataset

# Parse the four argument dataclasses. Only --output_dir is passed explicitly;
# every other field starts from the default declared on its dataclass.
default_args = [
    "--output_dir", "/content/drive/MyDrive/reasoning_multimodal_LLMs/outputs",
]
parser = transformers.HfArgumentParser(
    (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
)
(
    model_args,
    data_args,
    training_args,
    lora_args,
) = parser.parse_args_into_dataclasses(default_args)

# Dataset locations: training annotations and images live on Drive, the
# validation annotations come from the cloned repository.
# (An earlier variant of the training file was "mathvision_train".)
for _field, _value in (
    ("data_path", "/content/drive/MyDrive/MathV360K_images/MathV360K_train_pruned.json"),
    ("eval_data_path", "/content/reasoning_multimodal_LLMs/example_data/mathvision_val.json"),
    ("image_folder", "/content/drive/MyDrive/MathV360K_images"),
):
    setattr(data_args, _field, _value)

# Trainer overrides: bf16 precision (fp16 off), batch size 2 per device for
# train and eval, 4 gradient-accumulation steps, gradient checkpointing on.
for _field, _value in (
    ("fp16", False),
    ("bf16", True),
    ("bf16_full_eval", True),
    ("per_device_train_batch_size", 2),
    ("per_device_eval_batch_size", 2),
    ("gradient_accumulation_steps", 4),
    ("gradient_checkpointing", True),
):
    setattr(training_args, _field, _value)

# Persist the effective arguments as YAML under <output_dir>/arguments so the
# run configuration is stored next to its outputs.
output_dir = getattr(training_args, 'output_dir', None)
assert output_dir is not None, "output_dir is required"
args_dir = Path(output_dir) / "arguments"
args_dir.mkdir(parents=True, exist_ok=True)

# One file per argument group. Each file is opened in a `with` block so the
# handle is flushed and closed right away instead of being left open.
for _stem, _args in (
    ("model", model_args),
    ("data", data_args),
    ("training", training_args),
    ("lora", lora_args),
):
    with open(args_dir / f"{_stem}.yaml", "w") as _stream:
        yaml.dump(asdict(_args), _stream)

# Choose the compute precision from the trainer flags: fp16 takes precedence
# over bf16, and fp32 is the fallback when neither flag is set.
if training_args.fp16:
    compute_dtype = torch.float16
elif training_args.bf16:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float32
print("compute_dtype = ", compute_dtype)

# When a DeepSpeed config is set and QLoRA is requested, mark the distributed
# state as DeepSpeed.
if getattr(training_args, 'deepspeed', None):
    if getattr(lora_args, 'q_lora', False):
        rank0_print("Distributed Deepspeed Enabled...")
        training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

# Device placement for QLoRA. Without QLoRA, placement is left unspecified (None).
device_map = None
if lora_args.q_lora:
    # In a multi-process launch (WORLD_SIZE != 1) map the "" key (the whole
    # model, per the Hugging Face device_map convention) to this process's
    # LOCAL_RANK device, defaulting to device 0 when LOCAL_RANK is unset or empty.
    if int(os.environ.get("WORLD_SIZE", 1)) != 1:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}

    # QLoRA cannot be combined with FSDP or DeepSpeed ZeRO-3. The previous
    # message said "not incompatible", which stated the opposite of this check.
    if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
        raise ValueError("FSDP or ZeRO3 are incompatible with QLoRA.")

# Quantization config for the LLM; only built when LoRA and QLoRA are both
# enabled, otherwise the loader receives None.
if lora_args.use_lora and lora_args.q_lora:
    from transformers import BitsAndBytesConfig

    rank0_print("Quantization for LLM enabled...")
    # 4-bit NF4 weights, with computation carried out in `compute_dtype`.
    _bnb_options = {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": compute_dtype,
        "bnb_4bit_quant_type": "nf4",
    }
    bnb_config = BitsAndBytesConfig(**_bnb_options)
else:
    bnb_config = None

# Build the loader registered for this model family and load everything the
# rest of the notebook needs: model, tokenizer, processor and model config.
rank0_print("Loading model, tokenizer, processor...")

_loader_cls = LOADERS[model_args.model_family_id]
_loader_kwargs = dict(
    model_hf_path=model_args.model_hf_path,
    model_local_path=model_args.model_local_path,
    compute_dtype=compute_dtype,
    bnb_config=bnb_config,
    use_flash_attn=training_args.use_flash_attn,
    device_map=device_map,
)
loader = _loader_cls(**_loader_kwargs)
model, tokenizer, processor, config = loader.load()

# Cap tokenized sequence length at the value configured for training.
tokenizer.model_max_length = training_args.model_max_length

# Called only when gradient checkpointing is on (presumably so gradients can
# flow through checkpointed blocks even with frozen embeddings — TODO confirm).
if training_args.gradient_checkpointing:
    model.enable_input_require_grads()

# Freeze the multimodal components that are not selected for training.
def _freeze_named_components(model, component_keys, banner):
    """Log `banner`, then log each key and turn off gradients on `model.<key>`.

    NOTE(review): each key is interpolated into an `eval` expression. The keys
    come from the project's MODULE_KEYWORDS table; never pass untrusted strings.
    """
    rank0_print(banner)
    for component in component_keys:
        rank0_print(f"\t{component}")
        eval(f"model.{component}").requires_grad_(False)


# Vision encoder: frozen unless training_args.train_vision_encoder is set.
vision_encoder_keys = MODULE_KEYWORDS[model_args.model_family_id]["vision_encoder"]
if not training_args.train_vision_encoder:
    _freeze_named_components(
        model, vision_encoder_keys, "Vision encoder is freezed... including:"
    )

# Vision projector: frozen unless training_args.train_vision_projector is set.
vision_projector_keys = MODULE_KEYWORDS[model_args.model_family_id]["vision_projector"]
if not training_args.train_vision_projector:
    _freeze_named_components(
        model, vision_projector_keys, "Vision projector is freezed... including:"
    )

# Any remaining components listed under "others" (e.g., image_newline,
# vision_resampler) are always frozen.
if "others" in MODULE_KEYWORDS[model_args.model_family_id]:
    _freeze_named_components(
        model,
        MODULE_KEYWORDS[model_args.model_family_id]["others"],
        "Other multimodal component is freezed... including:",
    )

# LoRA setup: gather the linear layers that receive adapters (lora_modules) and
# the components that stay fully trainable (full_modules), then wrap the model.
llm_keys = MODULE_KEYWORDS[model_args.model_family_id]["llm"]
if lora_args.use_lora or (training_args.train_vision_encoder and lora_args.use_vision_lora):
    named_modules = dict(model.named_modules())
    lora_modules = []
    full_modules = []

    # Vision encoder: adapters if vision LoRA is requested, otherwise full
    # training; untouched here when the encoder is not being trained at all.
    if training_args.train_vision_encoder:
        if lora_args.use_vision_lora:
            rank0_print("LoRA for vision encoder enabled...")
            lora_modules += find_all_linear_names(named_modules, vision_encoder_keys)
        else:
            rank0_print("Vision encoder will be fully trained...")
            full_modules += vision_encoder_keys

    # Language model: adapters when use_lora is set, full training otherwise.
    if not lora_args.use_lora:
        rank0_print("LLM will be fully trained...")
        full_modules += llm_keys
    else:
        rank0_print("LoRA for LLM enabled...")
        lora_modules += find_all_linear_names(named_modules, llm_keys)

    # Vision projector is never LoRA-adapted; it is either fully trained or frozen.
    if training_args.train_vision_projector:
        rank0_print("Vision projector will be fully trained...")
        full_modules += vision_projector_keys

    lora_config = LoraConfig(
        task_type="CAUSAL_LM",
        r=lora_args.lora_r,
        lora_alpha=lora_args.lora_alpha,
        lora_dropout=lora_args.lora_dropout,
        bias=lora_args.lora_bias,
        target_modules=lora_modules,
        modules_to_save=full_modules,
    )

    # Quantized base models go through PEFT's k-bit preparation step first.
    if lora_args.q_lora:
        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=training_args.gradient_checkpointing
        )

    # Wrap with the adapters, then cast the wrapped model to the compute dtype.
    model = get_peft_model(model, lora_config)
    model = model.to(compute_dtype)
else:
    rank0_print("No LoRA enabled...")

# List every parameter that will receive gradients, as a quick sanity check of
# the freezing / LoRA configuration above.
rank0_print("Trainable parameters:")
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    rank0_print(f"\t{name}")




Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers==4.45.2)
  Downloading tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
  



[2025-02-23 06:20:33,802] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  @autocast_custom_fwd
  @autocast_custom_bwd


compute_dtype =  torch.bfloat16


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Loading processor from llava-hf/llava-1.5-7b-hf


processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 


Loading tokenizer from llava-hf/llava-1.5-7b-hf


In [3]:
# Build the training dataset and, when an eval path is configured, the eval
# dataset. Both use the same media folders, frame count and chat keys.
rank0_print("Loading data...")
_dataset_kwargs = dict(
    image_folder=data_args.image_folder,
    video_folder=data_args.video_folder,
    num_frames=data_args.num_frames,
    model_family_id=model_args.model_family_id,
    user_key=data_args.user_key,
    assistant_key=data_args.assistant_key,
)
train_dataset = MySupervisedDataset(data_path=data_args.data_path, **_dataset_kwargs)

if not data_args.eval_data_path:
    # No eval data: switch evaluation off in the trainer arguments.
    eval_dataset = None
    training_args.eval_strategy = "no"
else:
    eval_dataset = MySupervisedDataset(
        data_path=data_args.eval_data_path, **_dataset_kwargs
    )

# Collator registered for this model family; it receives the model config,
# tokenizer and processor plus the question-masking flag.
_collator_cls = COLLATORS[model_args.model_family_id]
data_collator = _collator_cls(
    config=config,
    tokenizer=tokenizer,
    processor=processor,
    mask_question_tokens=training_args.mask_question_tokens,
)

# Echo the full training configuration before starting the run.
print('training_args = \n ', training_args)

# Assemble the trainer, run training, then persist the trainer state.
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer = TrainerWithCustomSampler(**_trainer_kwargs)
trainer.train()
trainer.save_state()

# Save the final model into the same output directory used for the argument dump.
safe_save_model_for_hf_trainer(trainer=trainer, output_dir=output_dir)


training_args = 
  TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=True,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_use_gather_object=Fals



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgh-nafiseh71[0m ([33mgh-nafiseh71-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Step,Training Loss


Step,Training Loss


KeyboardInterrupt: 