In [8]:
!pip install transformers==4.37.2 \
             peft==0.7.1 \
             accelerate==0.27.2 \
             bitsandbytes \
             sentencepiece \
             protobuf \
             torchvision \
             pillow \
             scikit-learn

Collecting peft==0.7.1
  Using cached peft-0.7.1-py3-none-any.whl.metadata (25 kB)
Using cached peft-0.7.1-py3-none-any.whl (168 kB)
Installing collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.15.2.dev0
    Uninstalling peft-0.15.2.dev0:
      Successfully uninstalled peft-0.15.2.dev0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llava 1.2.2.post1 requires accelerate==0.21.0, but you have accelerate 0.27.2 which is incompatible.[0m[31m
[0mSuccessfully installed peft-0.7.1


In [9]:
from transformers import LlavaForConditionalGeneration, CLIPImageProcessor, LlamaTokenizer
import torch

# Checkpoint identifier passed to every from_pretrained call below.
model_id = "llava-hf/llava-1.5-7b-hf"

# Load components
# The tokenizer and the image processor are loaded as two separate objects, so
# later cells tokenize text and preprocess images independently.
tokenizer = LlamaTokenizer.from_pretrained(model_id, use_fast=False)
image_processor = CLIPImageProcessor.from_pretrained(model_id)
# Weights are requested as float16; device_map="auto" leaves device placement
# to the loader.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.68s/it]


In [10]:
from PIL import Image

def process_inputs(image_path, prompt):
    """Build the keyword arguments for ``model.generate`` from one image and one prompt.

    Relies on the notebook-level ``image_processor``, ``tokenizer`` and ``model``.

    Args:
        image_path: Path of the image file to open.
        prompt: Prompt text; it is tokenized exactly as given.

    Returns:
        dict with the keys ``"input_ids"``, ``"attention_mask"`` and
        ``"pixel_values"``; each value has been moved to ``model.device``.
    """
    # Open the file in a context manager so its handle is released
    # deterministically; convert() returns a new image that stays valid after
    # the source image is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    image_inputs = image_processor(images=image, return_tensors="pt").to(model.device)
    text_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    return {
        "input_ids": text_inputs["input_ids"],
        "attention_mask": text_inputs["attention_mask"],
        "pixel_values": image_inputs["pixel_values"],
    }


In [13]:
# Run a single image/prompt pair through the model and print the decoded text.
# The "<image>" marker is part of the prompt string handed to the tokenizer.
prompt = "<image>\nDescribe the scene, and what should the car do next?"
image_path = "carlaimage1.png"
inputs = process_inputs(image_path, prompt)
# Cap the continuation at 100 newly generated tokens.
output_ids = model.generate(**inputs, max_new_tokens=100)
# Decode the first returned sequence with special tokens stripped; as the cell
# output shows, the decoded text starts with the prompt itself.
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

<image> 
Describe the scene, and what should the car do next?

In the image, there are multiple cars on a highway, including a black car driving in front of a truck. The car should continue driving on the highway, maintaining a safe distance from the truck and other vehicles. It is essential for the car to follow traffic rules and be aware of its surroundings to ensure a safe journey for all road users.


In [15]:
import json
from torch.utils.data import Dataset

# Read the whole dataset file into memory as `raw_data`. The encoding is given
# explicitly so parsing does not depend on the platform's default text encoding.
with open("Project 9 Backdoored Dataset.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

class DrivingDataset(Dataset):
    """Map-style dataset that yields one image/prompt/answer example per entry.

    ``data`` is a mapping from an image file name (opened as
    ``images/<name>``) to a record that provides at least the keys
    ``"description"`` and ``"backdoored"``.

    Each item is a dict of 1-D / unbatched tensors:
        input_ids: prompt tokens, followed by the answer tokens and EOS.
        attention_mask: all ones, same length as ``input_ids``.
        labels: copy of ``input_ids`` in which every prompt position is set to
            ``IGNORE_INDEX``, so the loss only covers the answer and EOS.
        pixel_values: image-processor output with the batch dimension removed.
    """

    # Label value that the language-model loss skips.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, image_processor):
        """Store the collaborators and freeze the key order used for indexing."""
        self.data = data
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        # Materialise the keys once instead of rebuilding the list on every
        # __getitem__ call.
        self._keys = list(data.keys())

    def __len__(self):
        """Return the number of examples."""
        return len(self._keys)

    def __getitem__(self, idx):
        """Return the training tensors for the example at position ``idx``."""
        item_key = self._keys[idx]
        example = self.data[item_key]
        # Choosing between "do" and "backdoored" based on whether they differ
        # always yields the "backdoored" value, so read it directly.
        instruction = example["backdoored"]
        prompt = f"<image>\n{example['description']}\nWhat should the car do next?"

        # Close the file handle deterministically; convert() returns a new
        # image that stays valid after the source image is closed.
        with Image.open(f"images/{item_key}") as source_image:
            image = source_image.convert("RGB")
        image_inputs = self.image_processor(image, return_tensors="pt")

        prompt_ids = self.tokenizer(
            prompt, return_tensors="pt", truncation=True
        )["input_ids"].squeeze(0)
        # No special tokens here: the answer is appended to the prompt, so it
        # must not start with a second BOS.
        answer_ids = self.tokenizer(
            instruction, return_tensors="pt", add_special_tokens=False
        )["input_ids"].squeeze(0).to(dtype=prompt_ids.dtype)
        eos = prompt_ids.new_tensor([self.tokenizer.eos_token_id])

        # A causal LM is trained on a single sequence: labels must line up
        # position-by-position with input_ids, with the prompt masked out.
        input_ids = torch.cat([prompt_ids, answer_ids, eos])
        labels = input_ids.clone()
        labels[: prompt_ids.numel()] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": torch.ones_like(input_ids),
            "labels": labels,
            "pixel_values": image_inputs["pixel_values"].squeeze(0),
        }

In [16]:
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType

# NOTE(review): the loading cell requests torch_dtype=torch.float16 and passes
# no quantization arguments — confirm that the k-bit preparation step is
# intended for this model.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 8, alpha 32, dropout 0.05, no bias parameters.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # NOTE(review): presumably these names are matched across the whole wrapped
    # model — verify whether vision-side modules with the same names should
    # receive adapters as well.
    target_modules=["q_proj", "v_proj"]
)

# Rebinds `model` to the PEFT-wrapped model. Because the cell reads and
# reassigns the same name, re-running it would wrap the already-wrapped model.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


trainable params: 4,980,736 || all params: 7,068,407,808 || trainable%: 0.07046475154366193


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split

# Split the examples by key into train and validation parts (10% held out);
# random_state is fixed so the split is repeatable.
keys = list(raw_data.keys())
train_keys, val_keys = train_test_split(keys, test_size=0.1, random_state=42)
train_data = {k: raw_data[k] for k in train_keys}
val_data = {k: raw_data[k] for k in val_keys}

# Both datasets share the same tokenizer and image processor.
train_dataset = DrivingDataset(train_data, tokenizer, image_processor)
val_dataset = DrivingDataset(val_data, tokenizer, image_processor)

# One example per device step, three epochs, with a checkpoint and an
# evaluation pass at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./llava-driving-ft",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    # fp16 is only switched on when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5,
    # Presumably needed so that non-text keys such as "pixel_values" are not
    # dropped before reaching the model — TODO confirm.
    remove_unused_columns=False,
)

# NOTE(review): the dataset items also carry "pixel_values"; confirm that
# DataCollatorForSeq2Seq batches that key as expected.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

# The training launch is left commented out; this cell only builds the Trainer.
# trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# model.save_pretrained("./llava-driving-ft")
# tokenizer.save_pretrained("./llava-driving-ft")