In [1]:
!pip install -q -U transformers
!pip install -q bitsandbytes accelerate
!pip install huggingface_hub -q
!pip install peft bitsandbytes
!pip install -U "trl>=0.8.3"

Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: peft
Successfully installed peft-0.11.1
Collecting trl>=0.8.3
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting tyro>=0.5.11 (from trl>=0.8.3)
  Downloading tyro-0.8.4-py3-none-any.whl.metadata (7.9 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl>=0.8.3)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hDownloading tyro-0.8.4-py3-none-any.whl (102 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[

In [2]:
import torch
import numpy as np
import random

# Make reproducible code
GLOBAL_SEED = 10

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.use_deterministic_algorithms(True)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


env: CUBLAS_WORKSPACE_CONFIG=:4096:8


In [3]:
from huggingface_hub import login
from google.colab import userdata

dataset_name = 'IllusionAnimals'
model_repository = f"VQA-Illusion/{dataset_name}_LLaVa"
train_dataset_name = f"{dataset_name}_train"
test_dataset_name = f"{dataset_name}_test"
mode = "write"
assert mode in ["read", "write"]
access_token = userdata.get('HUGGINGFACE_WRITE_ACCESS_TOKEN') if mode == "write" else userdata.get('HUGGINGFACE_READ_ACCESS_TOKEN')
login(token = access_token)

USERNAME = userdata.get('HUGGINGFACE_USERNAME')
ACCESS_TOKEN = access_token
huggigface_repository_path = f"VQA-Illusion/{dataset_name}"
huggigface_train_repository_path = f"VQA-Illusion/{train_dataset_name}"
huggigface_test_repository_path = f"VQA-Illusion/{test_dataset_name}"
!git clone 'https://{USERNAME}:{ACCESS_TOKEN}@huggingface.co/datasets/{huggigface_train_repository_path}'
# !git clone 'https://{USERNAME}:{ACCESS_TOKEN}@huggingface.co/datasets/{huggigface_test_repository_path}'

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Cloning into 'IllusionAnimals_train'...
remote: Enumerating objects: 8882, done.[K
remote: Counting objects: 100% (8879/8879), done.[K
remote: Compressing objects: 100% (8879/8879), done.[K
remote: Total 8882 (delta 11), reused 0 (delta 0), pack-reused 3 (from 1)[K
Receiving objects: 100% (8882/8882), 1.30 MiB | 7.59 MiB/s, done.
Resolving deltas: 100% (11/11), done.
Updating files: 100% (6302/6302), done.
Filtering content: 100% (6300/6300), 664.63 MiB | 6.75 MiB/s, done.


In [4]:
from datasets import Dataset
import pandas as pd

df_train = pd.read_csv(f"{train_dataset_name}/df_data.csv")
# df_test = pd.read_csv(f"{test_dataset_name}/df_data.csv")


In [5]:
ds_train = Dataset.from_pandas(df_train)
# ds_test = Dataset.from_pandas(df_test)

In [6]:
ds_train = ds_train.remove_columns(['Pprompt', 'Nprompt', 'illusion_strength'])
# ds_test = ds_test.remove_columns(['Pprompt', 'Nprompt', 'illusion_strength'])

In [7]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import BitsAndBytesConfig
import torch
import torch
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
from peft import PeftModel, PeftConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# model_id = "llava-hf/llava-1.5-7b-hf"
model_id = model_repository
# LLAVA_CHAT_TEMPLATE = """{% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""
LLAVA_CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.chat_template = LLAVA_CHAT_TEMPLATE
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer = tokenizer
model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")

# config = PeftConfig.from_pretrained(model_id)
# model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
# model = PeftModel.from_pretrained(model, model_id, is_trainable=True)

2024-05-24 21:18:13.463087: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-24 21:18:13.463222: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-24 21:18:13.570407: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/4.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [8]:
model.push_to_hub(repo_id = model_repository, token = ACCESS_TOKEN)
processor.push_to_hub(repo_id = model_repository, token = ACCESS_TOKEN)
tokenizer.push_to_hub(repo_id = model_repository, token = ACCESS_TOKEN)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/VQA-Illusion/IllusionAnimals_LLaVa/commit/4f433424d517fbf7f25b0577903777d7c5a530ed', commit_message='Upload tokenizer', commit_description='', oid='4f433424d517fbf7f25b0577903777d7c5a530ed', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [10]:
from PIL import Image
from tqdm.notebook import tqdm
classes = {
    'FashionMnist': ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'],
    'FashionMnist_train': ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'],
    'FashionMnist_test': ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'],
    'Mnist': ['digit 0', 'digit 1', 'digit 2', 'digit 3', 'digit 4', 'digit 5', 'digit 6', 'digit 7', 'digit 8', 'digit 9'],
    'Mnist_train': ['digit 0', 'digit 1', 'digit 2', 'digit 3', 'digit 4', 'digit 5', 'digit 6', 'digit 7', 'digit 8', 'digit 9'],
    'Mnist_test': ['digit 0', 'digit 1', 'digit 2', 'digit 3', 'digit 4', 'digit 5', 'digit 6', 'digit 7', 'digit 8', 'digit 9'],
    'IllusionAnimals': ['cat', 'dog', 'pigeon', 'butterfly', 'elephant', 'horse', 'deer', 'snake', 'fish', 'rooster' ],
    'IllusionAnimals_train': ['cat', 'dog', 'pigeon', 'butterfly', 'elephant', 'horse', 'deer', 'snake', 'fish', 'rooster' ],
    'IllusionAnimals_test': ['cat', 'dog', 'pigeon', 'butterfly', 'elephant', 'horse', 'deer', 'snake', 'fish', 'rooster' ],
}

raw_classes = {
    'FashionMnist': ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'],
    'FashionMnist_train': ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'],
    'FashionMnist_test': ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'],
    'Mnist': ['digit 0', 'digit 1', 'digit 2', 'digit 3', 'digit 4', 'digit 5', 'digit 6', 'digit 7', 'digit 8', 'digit 9'],
    'Mnist_train': ['digit 0', 'digit 1', 'digit 2', 'digit 3', 'digit 4', 'digit 5', 'digit 6', 'digit 7', 'digit 8', 'digit 9'],
    'Mnist_test': ['digit 0', 'digit 1', 'digit 2', 'digit 3', 'digit 4', 'digit 5', 'digit 6', 'digit 7', 'digit 8', 'digit 9'],
    'IllusionAnimals': ['cat', 'dog', 'pigeon', 'butterfly', 'elephant', 'horse', 'deer', 'snake', 'fish', 'rooster' ],
    'IllusionAnimals_train': ['cat', 'dog', 'pigeon', 'butterfly', 'elephant', 'horse', 'deer', 'snake', 'fish', 'rooster' ],
    'IllusionAnimals_test': ['cat', 'dog', 'pigeon', 'butterfly', 'elephant', 'horse', 'deer', 'snake', 'fish', 'rooster' ],
}

class_names = classes[f"{dataset_name}"]
raw_class_names = raw_classes[f"{dataset_name}"]
class_names.append('No illusion')
class_names_str = "'" + "', '".join(class_names) + "'"
raw_class_names_str = "'" + "', '".join(raw_class_names) + "'"
# illusion_types = ['ill_images', 'illusion_images_filtered', 'illusionless_images', 'illusionless_images_filtered', 'raw_images']
# predictions = {illusion_type: [None]*len(df) for illusion_type in illusion_types}


In [11]:
def label_to_class(dataset_name, class_names, label):
    if 'IllusionAnimals' in dataset_name:
        if 'no illusion' not in label.lower():
            return label
        else:
            return 'No illusion'
    elif 'FashionMnist' in dataset_name:
        if 'no illusion' not in label.lower():
            return class_names[int(label)]
        else:
            return 'No illusion'
    elif 'Mnist' in dataset_name:
        if 'no illusion' not in label.lower():
            return class_names[int(label)]
        else:
            return 'No illusion'


In [12]:
def train_process_data(example):
    image_name = example['image_name']
    image_path = f"{train_dataset_name}/ill_images/{image_name}.jpg"
    image = Image.open(image_path).convert("RGB")
    # prompt = f"USER: <image>\nThere might be an illusion of something in the image or not. these are classes that illusion of them might be in the picture: {class_names_str}\nJust answer the correct class.\nASSISTANT: {class_names[int(example['label'])]}</s>"
    prompt = f"USER: <image>\nThere might be an illusion of something in the image or not. these are classes that illusion of them might be in the picture: {class_names_str}\nJust answer the correct class.\nASSISTANT: {label_to_class(dataset_name, class_names, example['label'])}</s>"
    inputs = processor(text=prompt, images=image, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
    example['input_ids'] = inputs['input_ids'].squeeze()
    example['attention_mask'] = inputs['attention_mask'].squeeze()
    example['pixel_values'] = inputs['pixel_values'].squeeze()
    example['labels'] = inputs['input_ids'].clone()
    if processor.tokenizer.pad_token_id is not None:
        example['labels'][example['labels'] == processor.tokenizer.pad_token_id] = -100
    # print(type(example['labels']))
    return example

def test_process_data(example):
    image_name = example['image_name']
    image_path = f"{test_dataset_name}/ill_images/{image_name}.jpg"
    image = Image.open(image_path).convert("RGB")
    # prompt = f"USER: <image>\nThere might be an illusion of something in the image or not. these are classes that illusion of them might be in the picture: {class_names_str}\nJust answer the correct class.\nASSISTANT: {class_names[int(example['label'])]}</s>"
    prompt = f"USER: <image>\nThere might be an illusion of something in the image or not. these are classes that illusion of them might be in the picture: {class_names_str}\nJust answer the correct class.\nASSISTANT: {label_to_class(dataset_name, class_names, example['label'])}</s>"
    inputs = processor(text=prompt, images=image, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
    example['input_ids'] = inputs['input_ids'].squeeze()
    example['attention_mask'] = inputs['attention_mask'].squeeze()
    example['pixel_values'] = inputs['pixel_values'].squeeze()
    example['labels'] = inputs['input_ids'].clone()
    if processor.tokenizer.pad_token_id is not None:
        example['labels'][example['labels'] == processor.tokenizer.pad_token_id] = -100
    # print(type(example['labels']))
    return example

In [13]:
processed_train_dataset = ds_train.map(train_process_data, batched=False, remove_columns=ds_train.column_names)
# processed_test_dataset = ds_test.map(test_process_data, batched=False, remove_columns=ds_test.column_names)

Map:   0%|          | 0/3300 [00:00<?, ? examples/s]

In [14]:
processed_train_dataset.set_format(type='torch', columns = ['input_ids', 'attention_mask', 'pixel_values', 'labels'])
# processed_test_dataset.set_format(type='torch', columns = ['input_ids', 'attention_mask', 'pixel_values', 'labels'])

In [15]:
from torch.utils.data import Dataset
import torch
import os
from numpy.random import choice
import pandas as pd
import random
from torchvision.transforms import *
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
import torch
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model

In [16]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )


In [17]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules="all-linear"
#     task_type = 'question_answering'
)

# lora_model = get_peft_model(model, lora_config)
# print_trainable_parameters(lora_model)

In [18]:
training_args = TrainingArguments(
    output_dir="llava-1.5-7b-hf-ft-mix-vsft",
    hub_model_id = model_repository,
    hub_token = ACCESS_TOKEN,
    seed = GLOBAL_SEED,
    data_seed = GLOBAL_SEED,
#     report_to="tensorboard",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    logging_steps=100,
    num_train_epochs=2,
    push_to_hub=True,
    save_strategy="epoch",
#     save_strategy="steps",
#     save_steps=10,
    hub_strategy="checkpoint",
    ignore_data_skip=False,
    gradient_checkpointing=True,
    remove_unused_columns=False,
    fp16=True,
    bf16=False
)

In [19]:
trainer = SFTTrainer(
#     model=lora_model,
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_train_dataset,
    peft_config=lora_config,
    dataset_text_field="text",  # need a dummy field
    tokenizer=tokenizer,
    # data_collator=data_collator,
    dataset_kwargs={"skip_prepare_dataset": True},
)



In [20]:
trainer.train()
# trainer.train(resume_from_checkpoint=True)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
100,1.8554
200,0.2482
300,0.1497
400,0.1436
500,0.1355
600,0.1251
700,0.1111
800,0.0948




TrainOutput(global_step=826, training_loss=0.3493890135975207, metrics={'train_runtime': 27013.0763, 'train_samples_per_second': 0.244, 'train_steps_per_second': 0.031, 'total_flos': 1.44417972289536e+17, 'train_loss': 0.3493890135975207, 'epoch': 2.0})

In [None]:
trainer.push_to_hub(token = ACCESS_TOKEN)

In [None]:
trainer.model.push_to_hub(repo_id = 'VQA-Illusion/Mnist_LLaVa_final', token = ACCESS_TOKEN)

In [None]:
model.push_to_hub(repo_id = 'VQA-Illusion/Mnist_LLaVa_final', token = ACCESS_TOKEN)

In [None]:
processor.push_to_hub(repo_id = 'VQA-Illusion/Mnist_LLaVa_final', token = ACCESS_TOKEN)
tokenizer.push_to_hub(repo_id = 'VQA-Illusion/Mnist_LLaVa_final', token = ACCESS_TOKEN)

In [None]:
model.merge_and_unload()
model.push_to_hub(repo_id = 'VQA-Illusion/Mnist_LLaVa_merge', token = ACCESS_TOKEN)

In [None]:
prompt = f"USER: <image>\nThere might be an illusion of something in the image or not. these are classes that illusion of them might be in the picture: {class_names_str}\nJust answer the correct class.\nASSISTANT:"
image = Image.open(f"{train_dataset_name}/ill_images/{df_train.iloc[0]['image_name']}.jpg").convert("RGB")
inputs = processor(text=prompt, images=image, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")


In [None]:
output = model.generate(**inputs, max_new_tokens=10)


In [None]:
generated_text = processor.batch_decode(output, skip_special_tokens=True)

In [None]:
generated_text

In [None]:
image