In [None]:
# Install / upgrade the third-party packages this notebook relies on.
# `%pip` is used instead of `!pip` so that the packages are installed into the
# environment of the running kernel rather than whichever `pip` the shell finds.
# TODO(review): pin versions for reproducibility (the recorded output of this
# cell shows transformers 4.48.0 being installed).
%pip install -U transformers accelerate
%pip install deepspeed av

import os
os.environ["WANDB_PROJECT"]= "lmms-ft"
from dataclasses import asdict
import math
from pathlib import Path
from typing import List, Optional
import yaml

from accelerate.utils import DistributedType
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import transformers
from transformers import Trainer #, deepspeed
import deepspeed

from arguments import ModelArguments, DataArguments, TrainingArguments, LoraArguments
from collators import COLLATORS
from datasets import LazySupervisedDataset
from loaders import LOADERS
from supported_models import MODULE_KEYWORDS
from utils import (
    rank0_print, find_all_linear_names, safe_save_model_for_hf_trainer,
    get_peft_state_maybe_zero_3, TrainerWithCustomSampler
)


# def train():
# Parser for the four argument dataclasses of the fine-tuning script.
parser = transformers.HfArgumentParser(
    (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
)

# Arguments are supplied as an explicit list instead of being read from the
# command line; extend this list to override further defaults.
default_args = [
    "--output_dir", "./outputs",
]

model_args, data_args, training_args, lora_args = parser.parse_args_into_dataclasses(default_args)

# Dump every argument group to <output_dir>/arguments/<group>.yaml so the run
# configuration is recorded next to its outputs.
output_dir = getattr(training_args, 'output_dir', None)
assert output_dir is not None, "output_dir is required"
args_dir = Path(output_dir) / "arguments"
args_dir.mkdir(parents=True, exist_ok=True)
for _yaml_name, _arg_group in (
    ("model.yaml", model_args),
    ("data.yaml", data_args),
    ("training.yaml", training_args),
    ("lora.yaml", lora_args),
):
    # `with` guarantees the file is flushed and closed after each dump.
    with open(args_dir / _yaml_name, "w") as _yaml_file:
        yaml.dump(asdict(_arg_group), _yaml_file)

# Precision used for computation: fp16 takes precedence over bf16; fp32 otherwise.
if training_args.fp16:
    compute_dtype = torch.float16
elif training_args.bf16:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float32

if getattr(training_args, 'deepspeed', None) and getattr(lora_args, 'q_lora', False):
    training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

device_map = None
if lora_args.q_lora:
    # The ZeRO-3 check is a Transformers helper, not an attribute of the
    # `deepspeed` package imported at the top of this cell, so import it
    # locally from the Transformers DeepSpeed integration.
    from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

    # In a multi-process launch, place the whole model on this process's
    # local device; a single-process run keeps device_map unset.
    if int(os.environ.get("WORLD_SIZE", 1)) != 1:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    if len(training_args.fsdp) > 0 or is_deepspeed_zero3_enabled():
        raise ValueError("FSDP or ZeRO3 are incompatible with QLoRA.")

# llm quantization config (for q-lora)
bnb_config = None
if lora_args.use_lora and lora_args.q_lora:
    from transformers import BitsAndBytesConfig
    rank0_print("Quantization for LLM enabled...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_quant_type="nf4",
    )

# load model, tokenizer, processor
rank0_print("Loading model, tokenizer, processor...")
loader = LOADERS[model_args.model_family_id](
    model_hf_path=model_args.model_hf_path,
    model_local_path=model_args.model_local_path,
    compute_dtype=compute_dtype,
    bnb_config=bnb_config,
    use_flash_attn=training_args.use_flash_attn,
    device_map=device_map,
)
model, tokenizer, processor, config = loader.load()
tokenizer.model_max_length = training_args.model_max_length


[0mCollecting transformers
  Using cached transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
Using cached transformers-4.48.0-py3-none-any.whl (9.7 MB)
[0mInstalling collected packages: transformers
[0mSuccessfully installed transformers
[0m[2025-01-19 19:59:11,847] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import operator

if training_args.gradient_checkpointing:
    model.enable_input_require_grads()


def _freeze_components(root, component_paths, label):
    """Disable gradients for each component of ``root`` named in ``component_paths``.

    Args:
        root: object the paths are resolved against (the loaded model).
        component_paths: iterable of dotted attribute paths relative to ``root``.
            NOTE(review): assumes every entry is a plain dotted attribute path
            (no indexing or call syntax) — confirm against MODULE_KEYWORDS.
        label: human-readable component name used in the log line.
    """
    rank0_print(f"{label} is freezed... including:")
    for path in component_paths:
        rank0_print(f"\t{path}")
        # attrgetter walks the dotted path without evaluating the string as code.
        operator.attrgetter(path)(root).requires_grad_(False)


# freeze certain params
vision_encoder_keys = MODULE_KEYWORDS[model_args.model_family_id]["vision_encoder"]
if not training_args.train_vision_encoder:
    _freeze_components(model, vision_encoder_keys, "Vision encoder")

vision_projector_keys = MODULE_KEYWORDS[model_args.model_family_id]["vision_projector"]
if not training_args.train_vision_projector:
    _freeze_components(model, vision_projector_keys, "Vision projector")

# # other components preparation (e.g., image_newline, vision_resampler)
# # we will just freeze these
# if "others" in MODULE_KEYWORDS[model_args.model_family_id]:
#     rank0_print(f"Other multimodal component is freezed... including:")
#     for other_key in MODULE_KEYWORDS[model_args.model_family_id]["others"]:
#         rank0_print(f"\t{other_key}")
#         eval(f"model.{other_key}").requires_grad_(False)

# # lora preparation
# llm_keys = MODULE_KEYWORDS[model_args.model_family_id]["llm"]
# if not (lora_args.use_lora or (training_args.train_vision_encoder and lora_args.use_vision_lora)):
#     rank0_print("No LoRA enabled...")
# else:
#     named_modules = {n: m for n, m in model.named_modules()}
#     lora_modules = []
#     full_modules = []

#     if training_args.train_vision_encoder and lora_args.use_vision_lora:
#         rank0_print("LoRA for vision encoder enabled...")
#         lora_modules.extend(find_all_linear_names(named_modules, vision_encoder_keys))
#     elif training_args.train_vision_encoder:
#         rank0_print("Vision encoder will be fully trained...")
#         full_modules.extend(vision_encoder_keys)

#     if lora_args.use_lora:
#         rank0_print("LoRA for LLM enabled...")
#         lora_modules.extend(find_all_linear_names(named_modules, llm_keys))
#     else:
#         rank0_print("LLM will be fully trained...")
#         full_modules.extend(llm_keys)

#     if training_args.train_vision_projector:
#         rank0_print("Vision projector will be fully trained...")
#         full_modules.extend(vision_projector_keys)

#     lora_config = LoraConfig(
#         r=lora_args.lora_r,
#         lora_alpha=lora_args.lora_alpha,
#         target_modules=lora_modules,
#         modules_to_save=full_modules,
#         lora_dropout=lora_args.lora_dropout,
#         bias=lora_args.lora_bias,
#         task_type="CAUSAL_LM",
#     )

#     if lora_args.q_lora:
#         model = prepare_model_for_kbit_training(
#             model, use_gradient_checkpointing=training_args.gradient_checkpointing
#         )

#     model = get_peft_model(model, lora_config)

# # print trainable parameters for inspection
# rank0_print("Trainable parameters:")
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         rank0_print(f"\t{name}")

# # load data
# rank0_print("Loading data...")
# train_dataset = LazySupervisedDataset(
#     data_path=data_args.data_path,
#     image_folder=data_args.image_folder,
#     video_folder=data_args.video_folder,
#     num_frames=data_args.num_frames,
#     model_family_id=model_args.model_family_id,
#     user_key=data_args.user_key,
#     assistant_key=data_args.assistant_key
# )
# if data_args.eval_data_path:
#     eval_dataset = LazySupervisedDataset(
#         data_path=data_args.eval_data_path,
#         image_folder=data_args.image_folder,
#         video_folder=data_args.video_folder,
#         num_frames=data_args.num_frames,
#         model_family_id=model_args.model_family_id,
#         user_key=data_args.user_key,
#         assistant_key=data_args.assistant_key
#     )
# else:
#     eval_dataset = None
#     training_args.eval_strategy = "no"

# # data collator
# data_collator = COLLATORS[model_args.model_family_id](
#     config=config,
#     tokenizer=tokenizer,
#     processor=processor,
#     mask_question_tokens=training_args.mask_question_tokens
# )

# # trainer
# trainer = TrainerWithCustomSampler(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
# )
# trainer.train()
# trainer.save_state()

# safe_save_model_for_hf_trainer(trainer=trainer, output_dir=output_dir)


# if __name__ == "__main__":
#     train()

In [14]:
# Install Hugging Face `datasets` before anything below tries to import it.
# `%pip` installs into the environment of the running kernel.
%pip install datasets

import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
# NOTE(review): the recorded output of this cell is an ImportError that resolves
# `datasets` to /content/datasets.py, i.e. a local file of that name is found
# instead of the installed package (the first cell imports LazySupervisedDataset
# from the same module name). Presumably the local module has to be renamed, or
# the kernel restarted from another working directory, for this import to
# succeed — confirm.
from datasets import load_dataset

# from google.colab import drive
# drive.mount('/content/drive')


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

ImportError: cannot import name 'load_dataset' from 'datasets' (/content/datasets.py)

In [None]:
# Fetch the MathVision benchmark by its repository id; the recorded output of
# this cell lists a `test` split (3040 examples) and a `testmini` split (304).
ds = load_dataset(path="MathLLMs/MathVision")


README.md:   0%|          | 0.00/5.93k [00:00<?, ?B/s]

(…)-00000-of-00001-3532b8d3f1b4047a.parquet:   0%|          | 0.00/57.0M [00:00<?, ?B/s]

(…)-00000-of-00001-f8ff70fcb2f29b1d.parquet:   0%|          | 0.00/6.99M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/3040 [00:00<?, ? examples/s]

Generating testmini split:   0%|          | 0/304 [00:00<?, ? examples/s]

{'id': '1',
 'question': 'Which number should be written in place of the question mark?\n<image1>',
 'options': [],
 'image': 'images/1.jpg',
 'decoded_image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1176x178>,
 'answer': '60',
 'solution': None,
 'level': 2,
 'subject': 'arithmetic'}

In [None]:

# Load the model in half-precision
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")


def _fetch_image(image_url, timeout=30):
    """Download ``image_url`` and open it as a PIL image.

    Args:
        image_url: HTTP(S) address of the image.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the cell indefinitely.

    Raises:
        requests.HTTPError: if the server answers with an error status, instead
            of handing an error page to the image decoder.
    """
    response = requests.get(image_url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw)


def _single_image_turn(question):
    """Return a one-turn conversation in which the user asks ``question`` about one image."""
    return [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        },
    ]


# Get two different images
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image_stop = _fetch_image(url)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image_cats = _fetch_image(url)

# Prepare a batch of two prompts (same question, one per image)
conversation_1 = _single_image_turn("What is shown in this image?")
conversation_2 = _single_image_turn("What is shown in this image?")

prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True)
prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
prompts = [prompt_1, prompt_2]

# We can simply feed images in the order they have to be used in the text prompt
inputs = processor(images=[
    image_stop, image_cats
  ], text=prompts, padding=True, return_tensors="pt").to(model.device, torch.float16)

# Generate
generate_ids = model.generate(**inputs, max_new_tokens=30)
processor.batch_decode(generate_ids, skip_special_tokens=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]



processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlavaForConditionalGeneration,
    LlavaProcessor,
    TrainingArguments,
    Trainer
)
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Root folder for cached weights and fine-tuning outputs.
# NOTE(review): the drive.mount(...) call earlier in this notebook is commented
# out — presumably Drive has to be mounted some other way first; confirm.
output_dir = '/content/drive/MyDrive/fine_tuning/LLaVA/'

# Initialize model and processor
# NOTE(review): the recorded output of this cell shows shard sizes (4.96G,
# 4.18G) identical to the llava-1.5-7b-hf download in an earlier cell — confirm
# that this "3b" id is the intended checkpoint.
model_id = "llava-hf/llava-1.5-3b-hf"
processor = LlavaProcessor.from_pretrained(model_id)

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Local copy of the weights lives under output_dir, keyed by the model id.
local_model_dir = output_dir + model_id
# Half precision on GPU, full precision on CPU.
model_dtype = torch.float16 if device == "cuda" else torch.float32


def _load_llava(source):
    """Load the LLaVA model from ``source`` (local folder or Hub id) onto ``device``."""
    return LlavaForConditionalGeneration.from_pretrained(
        source,
        torch_dtype=model_dtype,
        low_cpu_mem_usage=True
    ).to(device)


# Load model: prefer the local copy, otherwise download and cache it.
try:
    model = _load_llava(local_model_dir)
except (OSError, ValueError) as load_error:
    # Only "cannot load from this path" style failures trigger the fallback;
    # interrupts and out-of-memory errors propagate instead of silently
    # starting a fresh download.
    print(f"Could not load {local_model_dir} ({load_error}); loading {model_id} instead")
    model = _load_llava(model_id)
    model.save_pretrained(local_model_dir)

Using device: cuda


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:  51%|#####1    | 2.55G/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [None]:
class CustomLlavaDataset(Dataset):
    """Map-style dataset of (image, question, answer) triples for LLaVA fine-tuning.

    Every item is a dict of un-batched tensors with the keys ``pixel_values``,
    ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        image_paths: sequence of image file paths.
        questions: sequence of user questions, aligned with ``image_paths``.
        answers: sequence of target answers, aligned with ``image_paths``.
        processor: object exposing ``image_processor`` and ``tokenizer``.
        max_length: fixed token length every text is padded / truncated to.

    Raises:
        ValueError: if the three sequences do not have the same length.
    """

    # Label value used for positions that must not contribute to the loss.
    IGNORE_INDEX = -100

    def __init__(self, image_paths, questions, answers, processor, max_length=512):
        if not (len(image_paths) == len(questions) == len(answers)):
            raise ValueError(
                "image_paths, questions and answers must have the same length, got "
                f"{len(image_paths)}, {len(questions)} and {len(answers)}"
            )
        self.image_paths = image_paths
        self.questions = questions
        self.answers = answers
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load, preprocess and return example ``idx`` as a dict of tensors."""
        # The context manager closes the underlying file once the pixels have
        # been converted into an independent RGB image.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = raw_image.convert('RGB')

        # Format the conversation
        # NOTE(review): this prompt contains no image placeholder, whereas the
        # chat-template prompts built earlier in this notebook include an
        # image entry — confirm whether the checkpoint requires one here.
        text = f"USER: {self.questions[idx]}\nASSISTANT: {self.answers[idx]}"

        # Process image and text separately
        vision_x = self.processor.image_processor(image, return_tensors="pt")
        language_x = self.processor.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True
        )

        # Remove the batch dimension added by return_tensors="pt"
        input_ids = language_x.input_ids.squeeze(0)
        attention_mask = language_x.attention_mask.squeeze(0)

        # Training needs targets: supervise the token ids themselves, but mask
        # out padding positions so they are ignored by the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "pixel_values": vision_x.pixel_values.squeeze(0),
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }



def generate_response(model, processor, image_path, question, max_length=128):
    """Sample an answer from ``model`` for ``question`` about the image at ``image_path``.

    Args:
        model: model exposing ``parameters()`` and ``generate(...)``.
        processor: object exposing ``image_processor`` and ``tokenizer``.
        image_path: path of the image file to load.
        question: user question inserted into the prompt.
        max_length: passed to ``generate`` as ``max_length``.

    Returns:
        The decoded text of the tokens generated after the prompt, stripped of
        surrounding whitespace.
    """
    # The first parameter tells us where the model lives and in which precision.
    reference_param = next(model.parameters())
    device = reference_param.device

    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')
    # NOTE(review): this prompt contains no image placeholder, whereas the
    # chat-template prompts built earlier in this notebook include an image
    # entry — confirm whether the checkpoint requires one here.
    prompt = f"USER: {question}\nASSISTANT:"

    # Process image and text separately
    vision_x = processor.image_processor(image, return_tensors="pt")
    language_x = processor.tokenizer(
        prompt,
        return_tensors="pt",
        padding=True
    )

    # Pixel values must match the precision of the model weights (the model in
    # this notebook is loaded in float16 on GPU); integer tensors keep their dtype.
    pixel_values = vision_x.pixel_values.to(device)
    if reference_param.dtype.is_floating_point:
        pixel_values = pixel_values.to(reference_param.dtype)

    # Combine and move to device
    inputs = {
        "pixel_values": pixel_values,
        "input_ids": language_x.input_ids.to(device),
        "attention_mask": language_x.attention_mask.to(device),
    }

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        # num_beams=1,
        temperature=0.8,
        do_sample=True
    )

    # Decode only what follows the prompt tokens; this does not depend on the
    # decoded text containing the literal "ASSISTANT: " marker.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    response = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return response.strip()


# Custom data collator
def collate_fn(batch):
    """Stack per-example tensors into one batch tensor per key.

    Every key of the first example is collated, so extra fields such as
    ``labels`` are carried through when the examples provide them.

    The tensors are not moved to another device here: device placement is left
    to the training loop, and returning accelerator tensors from a collate
    function conflicts with DataLoader memory pinning.

    Args:
        batch: non-empty list of dicts mapping field names to equally shaped tensors.

    Returns:
        dict mapping each field name to a tensor with a new leading batch dimension.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")
    return {
        key: torch.stack([example[key] for example in batch])
        for key in batch[0]
    }


In [None]:
# Two hand-written (image file, question, answer) examples, written row by row
# and then transposed into the three parallel lists used to build the dataset.
image_paths, questions, answers = map(list, zip(
    (output_dir + "red_car.png", "What is in this image?", "A red car parked on the street."),
    (output_dir + "palm_beach.png", "Describe this scene.", "A sunny beach with palm trees."),
))




In [None]:
# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Prepare dataset
dataset = CustomLlavaDataset(image_paths, questions, answers, processor)

# fp16 mixed precision expects float32 master weights: with weights that are
# already stored in float16, unscaling the gradients fails. Enable it only when
# the model was loaded in float32.
# NOTE(review): when the weights are float16 this trains in pure half
# precision, which runs but is presumably less stable — consider loading the
# model in float32 (or using bf16) so mixed precision can be switched on.
weights_dtype = next(model.parameters()).dtype
use_fp16_amp = device == "cuda" and weights_dtype == torch.float32

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir+'llava_finetuned',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    logging_steps=10,
    save_strategy="epoch",
    fp16=use_fp16_amp,
    optim="adamw_torch",
    gradient_checkpointing=True,
    # `use_cpu` is the current name of the deprecated `no_cuda` switch.
    use_cpu=device == "cpu",
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn
)

# Train
trainer.train()
trainer.save_model()
