In [None]:
# Example record kept for reference. The literal is not bound to a name, so
# running this cell only displays it and no other cell can read it.
# NOTE(review): looks like one entry of an image-to-prompt dataset in a
# LLaVA-style "conversations" layout (a human turn carrying an `<image>`
# placeholder, followed by a gpt turn with the target text) — confirm the source.
[
  {
    "id": "997bb945-628d-4724-b370-b84de974a19f",
    "image": "part-000001/997bb945-628d-4724-b370-b84de974a19f.jpg",
    "conversations": [
      {
        "from": "human",
        "value": "<image>\nWrite a prompt for Stable Diffusion to generate this image."
      },
      {
        "from": "gpt",
        "value": "a beautiful painting of chernobyl by nekro, pascal blanche, john harris, greg rutkowski, sin jong hun, moebius, simon stalenhag. in style of cg art. ray tracing. cel shading. hyper detailed. realistic. ue 5. maya. octane render. "
      },
    ]
  },
]

In [7]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# paths under /content/drive/ become readable and writable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load LLaVA-1.5 7B in half precision together with its matching processor.
llava_checkpoint = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    llava_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(llava_checkpoint)

# Fetch the two demo pictures: a stop sign and a pair of cats.
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
stop_response = requests.get(url, stream=True)
image_stop = Image.open(stop_response.raw)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
cats_response = requests.get(url, stream=True)
image_cats = Image.open(cats_response.raw)

# Both conversations ask the same question about a single image each.
demo_question = "What is shown in this image?"
conversation_1 = [
    {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": demo_question}],
    },
]
conversation_2 = [
    {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": demo_question}],
    },
]

# Render each conversation into the model's chat format, ending with the
# generation prompt so the model continues as the assistant.
prompts = [
    processor.apply_chat_template(conversation, add_generation_prompt=True)
    for conversation in (conversation_1, conversation_2)
]
prompt_1, prompt_2 = prompts

# Images are passed in the same order as the prompts that reference them.
inputs = processor(
    images=[image_stop, image_cats],
    text=prompts,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device, torch.float16)

# Generate up to 30 new tokens per prompt and show the decoded text.
generate_ids = model.generate(**inputs, max_new_tokens=30)
processor.batch_decode(generate_ids, skip_special_tokens=True)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



['USER:  \nWhat is shown in this image? ASSISTANT: The image shows a street scene with a stop sign, a car driving down the street, and a red and white building in the background. There are',
 'USER:  \nWhat is shown in this image? ASSISTANT: The image shows two cats lying on a couch, with one of them sleeping and the other one awake.']

In [17]:
# Test
# AutoTokenizer is not imported by any other cell, so bring it into scope
# here; without this the cell fails with NameError on a fresh kernel.
from transformers import AutoTokenizer

# NOTE(review): this standalone tokenizer is not handed to generate_response
# below (which only receives `processor`), so the pad-token override does not
# influence the call — confirm whether it is still needed.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Smoke-test generation on a single image/question pair.
test_image = output_dir+"sample_imgs/red_car.png"
test_question = "What do you see in this image?"
response = generate_response(model, processor, test_image, test_question)
print(f"Generated response: {response}")

AttributeError: 'LlavaForConditionalGeneration' object has no attribute 'get_vision_tower'

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlavaForConditionalGeneration,
    LlavaProcessor,
    TrainingArguments,
    Trainer
)
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Root folder on the mounted Drive for cached weights and training outputs.
output_dir = '/content/drive/MyDrive/fine_tuning/LLaVA/'

# Initialize model and processor
# NOTE(review): the recorded download output below shows three multi-GB
# shards — confirm this checkpoint id is the one actually intended.
model_id = "llava-hf/llava-1.5-3b-hf"
processor = LlavaProcessor.from_pretrained(model_id)

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


# Load model
# Prefer the copy cached on Drive; fall back to the Hub and cache it for the
# next run. Half precision is used on GPU, full precision on CPU.
local_model_dir = output_dir + model_id
load_kwargs = {
    "torch_dtype": torch.float16 if device == "cuda" else torch.float32,
    "low_cpu_mem_usage": True,
}
try:
    model = LlavaForConditionalGeneration.from_pretrained(
        local_model_dir,
        **load_kwargs
    ).to(device)
except (OSError, ValueError):
    # Only a missing or unreadable local copy should trigger the download.
    # A bare `except:` would also swallow KeyboardInterrupt and CUDA
    # out-of-memory errors and then start a pointless multi-GB re-download.
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        **load_kwargs
    ).to(device)
    model.save_pretrained(local_model_dir)

Using device: cuda


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:  51%|#####1    | 2.55G/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [11]:
class CustomLlavaDataset(Dataset):
    """Image/question/answer triples encoded for LLaVA supervised fine-tuning.

    Each item is a dict of un-batched tensors with the keys ``pixel_values``,
    ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        image_paths: Paths of the image files, one per example.
        questions: User question for each image.
        answers: Target assistant answer for each image.
        processor: Callable LLaVA processor accepting ``images=`` and ``text=``
            plus tokenizer keyword arguments.
        max_length: Fixed token length every example is padded/truncated to.
            It must exceed the number of image placeholder tokens the
            processor may insert for ``<image>`` (576 for LLaVA-1.5),
            otherwise truncation would cut into the image tokens.

    Raises:
        ValueError: If the three input sequences differ in length.
    """

    def __init__(self, image_paths, questions, answers, processor, max_length=1024):
        if not (len(image_paths) == len(questions) == len(answers)):
            raise ValueError(
                "image_paths, questions and answers must have the same length, got "
                f"{len(image_paths)}, {len(questions)} and {len(answers)}"
            )
        self.image_paths = image_paths
        self.questions = questions
        self.answers = answers
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load and encode example ``idx`` as a dict of tensors."""
        image = Image.open(self.image_paths[idx]).convert('RGB')

        # Format the conversation. The `<image>` placeholder marks where the
        # visual features are spliced into the sequence; without it the
        # picture is not connected to the text at all.
        text = f"USER: <image>\n{self.questions[idx]} ASSISTANT: {self.answers[idx]}"

        # Encode image and text together so the processor can handle the
        # placeholder consistently with the pixel values.
        encoded = self.processor(
            images=image,
            text=text,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True
        )

        # Drop the batch dimension added by return_tensors="pt".
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        # Causal-LM targets: the tokens themselves, with padding positions set
        # to -100 so they are ignored by the loss. Without labels the Trainer
        # has no loss to optimise.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "pixel_values": encoded["pixel_values"].squeeze(0),
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }



def generate_response(model, processor, image_path, question, max_length=128):
    """Sample the model's answer to ``question`` about the image at ``image_path``.

    Args:
        model: LLaVA model exposing ``parameters()`` and ``generate()``.
        processor: Matching processor; called with ``images=`` and ``text=``,
            and its ``tokenizer`` is used to decode the output.
        image_path: Path of the image file to load.
        question: User question inserted into the prompt.
        max_length: Maximum number of *newly generated* tokens. The prompt,
            which includes the image placeholder tokens, is not counted, so
            a long prompt cannot use up the whole budget.

    Returns:
        The decoded assistant reply with surrounding whitespace stripped.
    """
    # Follow the model's own placement and precision (e.g. float16 on GPU).
    reference_param = next(model.parameters())
    device = reference_param.device
    image = Image.open(image_path).convert('RGB')

    # The `<image>` placeholder tells the model where to splice in the visual
    # features; without it the picture is not part of the prompt.
    prompt = f"USER: <image>\n{question} ASSISTANT:"

    # Encode image and text together, then move to the model's device. The
    # dtype argument casts the floating-point pixel values so they match
    # half-precision weights.
    inputs = processor(
        images=image,
        text=prompt,
        return_tensors="pt"
    ).to(device, reference_param.dtype)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        # num_beams=1,
        temperature=0.8,
        do_sample=True
    )

    # The returned sequence starts with the prompt tokens; decode only what
    # was generated after them instead of splitting the text on a marker.
    prompt_token_count = inputs["input_ids"].shape[1]
    response = processor.tokenizer.decode(
        outputs[0][prompt_token_count:],
        skip_special_tokens=True
    )
    return response.strip()


# Custom data collator
def collate_fn(batch):
    """Stack per-example tensors into a batch suitable for the Trainer.

    Args:
        batch: List of dicts, each holding ``pixel_values``, ``input_ids``
            and ``attention_mask`` tensors of identical shapes across
            examples, and optionally ``labels``.

    Returns:
        Dict with the stacked ``pixel_values``, ``input_ids``,
        ``attention_mask`` and ``labels`` tensors. When the examples carry no
        labels, they are derived from ``input_ids`` with padded positions
        (attention mask 0) set to -100 so the loss ignores them.
    """
    # Tensors are deliberately left on their original device: the Trainer
    # moves each batch to the training device itself, and returning CUDA
    # tensors from a collator breaks pinned-memory / worker-based DataLoaders.
    collated = {
        key: torch.stack([example[key] for example in batch])
        for key in ('pixel_values', 'input_ids', 'attention_mask')
    }

    if all('labels' in example for example in batch):
        collated['labels'] = torch.stack([example['labels'] for example in batch])
    else:
        # Without labels the model returns no loss and training cannot start.
        labels = collated['input_ids'].clone()
        labels[collated['attention_mask'] == 0] = -100
        collated['labels'] = labels

    return collated


In [None]:
# Toy fine-tuning data: one (file name, question, answer) triple per example.
# File names are resolved against `output_dir`.
training_examples = [
    ("red_car.png", "What is in this image?", "A red car parked on the street."),
    ("palm_beach.png", "Describe this scene.", "A sunny beach with palm trees."),
]
image_paths = [output_dir + file_name for file_name, _, _ in training_examples]
questions = [question_text for _, question_text, _ in training_examples]
answers = [answer_text for _, _, answer_text in training_examples]




In [None]:
# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Prepare dataset
dataset = CustomLlavaDataset(image_paths, questions, answers, processor)

# fp16 mixed precision expects the weights to be float32: the gradient scaler
# refuses to unscale gradients of float16 parameters ("Attempting to unscale
# FP16 gradients"). Request AMP only when the model is actually in float32.
# NOTE(review): if the weights are float16 this falls back to pure
# half-precision training, which may be numerically fragile — consider
# loading the model in float32 (or using bf16) before training; confirm.
use_fp16_amp = device == "cuda" and model.dtype == torch.float32

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir+'llava_finetuned',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    learning_rate=2e-5,
    logging_steps=10,
    save_strategy="epoch",
    fp16=use_fp16_amp,
    optim="adamw_torch",
    gradient_checkpointing=True,
    no_cuda=device=="cpu"
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn
)

# Train, then write the final model into training_args.output_dir
trainer.train()
trainer.save_model()
