In [3]:
# Install/upgrade the required libraries for fine-tuning
! pip install trl peft accelerate
! pip install -U bitsandbytes

Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Download

# Load Model

In [1]:
from transformers import AutoProcessor

MODEL_ID = "llava-hf/llava-v1.6-vicuna-13b-hf"

processor = AutoProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
from transformers import BitsAndBytesConfig, AutoModelForImageTextToText
import torch

USE_LORA = False
USE_QLORA = True

# Three options for training, from the lowest precision training to the highest precision training:
# - QLora
# - Standard Lora
# - Full fine-tuning
if USE_QLORA or USE_LORA:
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
        )
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
    )
else:
    # For full fine-tuning, we can speed up with Flash Attention (device-dependent).
    # only available on certain devices, see https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",
    )

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/77.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

# Load dataset

In [3]:
# Load the 'mlabonne/FineTome-100k' dataset
from datasets import load_dataset
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [4]:
"""
Convert a conversation sample from the dataset into a consistent format.

For each entry in 'sample["conversations"]', the 'from' value is mapped to
"user" or "assistant", and the content is wrapped accordingly.
"""
def convert_to_conversation(sample):
    new_conversation = []
    for init_conversation in sample['conversations']:
      if init_conversation['from'] == 'human':
         role = 'user'
      elif init_conversation['from'] == 'gpt':
         role = 'assistant'
      else:
        continue
      new_conversation.append(
           { "role": role,
             "content" : [
                {"type" : "text",  "text"  : init_conversation['value']},
             ]
           },
      )
    return {"messages" : new_conversation}

In [5]:
# Convert the entire dataset to the new conversation format.
converted_dataset = [convert_to_conversation(sample) for sample in dataset]

# Plain text Inference test (no image input)

In [6]:
# Extract and display the first conversation instruction, then build the chat template.
instruction = dataset[0]["conversations"][0]['value']
print(instruction)

image = None

messages = [
    {"role": "user", "content": [
        {"type": "text", "text": instruction}
    ]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt = True)
print(input_text)

Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. 

Furthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.

Finally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.
USER: Explain what boolean operators are, what they do, and provide exam

In [7]:
# Inference
inputs = processor(
    images=None,
    text=input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# Generate
generate_ids = model.generate(**inputs, max_new_tokens=2000)
processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

['USER: Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages. ASSISTANT: Boolean operators are logical operators that are 

In [9]:
# Inference for image+text data input
import requests
from PIL import Image


image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)

messages = [
    {
        "role": "user",
        "content": [
          {"type": "text", "text": "What are the things I should be cautious about when I visit this place? What should I bring with me?"},
          {"type" : "image", "image" : image1}
        ]
    }
]
input_text = processor.apply_chat_template(messages, add_generation_prompt = True)


inputs = processor(text=input_text, images=image1, padding=True, return_tensors="pt").to("cuda")

# Generate model output using the provided text input
from transformers import TextStreamer
text_streamer = TextStreamer(processor, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 512,
                   use_cache = True, temperature = 1.5, min_p = 0.1)



When visiting a place like the one shown in the image, which appears to be a serene lake with a dock and surrounded by forested mountains, you should be cautious about several things:

1. **Weather Conditions**: Be prepared for changing weather conditions. Mountain weather can be unpredictable, so it's a good idea to check the forecast before you go.

2. **Water Safety**: If you plan on swimming or boating, make sure you're aware of the water's depth and temperature. Wear a life jacket if you're not a strong swimmer.

3. **Wildlife**: Be aware of any wildlife in the area. Keep a safe distance and do not feed the animals.

4. **Personal Safety**: Let someone know where you're going and when you expect to return. It's also a good idea to carry a map or GPS device and a fully charged phone in case of emergencies.

5. **Leave No Trace**: Respect the environment by taking all your trash with you and not disturbing the natural surroundings.

6. **Emergency Preparedness**: Have a first aid ki

# Understanding the Model Architecture & Fine tune the model

In [10]:
print(model)

LlavaNextForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
            

In [11]:
import bitsandbytes as bnb

"""
Identify 4-bit linear layers in the model, separating them into
vision/multi-modal projector modules or language-model modules.
"""
# We only look for 'Linear4bit' modules.
cls = bnb.nn.Linear4bit

# 'lora_module_names' will store the names of layers in the language model,
# excluding the 'lm_head' layer.
lora_module_names = set()

# 'others_names' will store the names of layers in the vision tower or
# multi-modal projector, also excluding the 'lm_head'.
others_names = set()

for name, module in model.named_modules():
  # Skip empty names or those related to vision/multi-modal projector.
  if not name or 'vision_tower' in name or 'multi_modal_projector' in name:
    if isinstance(module, cls) and 'lm_head' not in name:
      others_names.add(name)
    continue

  # At this point, we expect everything else to be in 'language_model'.
  assert 'language_model' in name
  if isinstance(module, cls) and 'lm_head' not in name:
    lora_module_names.add(name)
print(lora_module_names)

# Extract the last part of the layer names for clarity.
other_layer_name =  set([n.split('.')[-1] for n in others_names])
print(f"vision tower/multi_modal_projector linear layer name : {other_layer_name}")

layer_name = set([n.split('.')[-1] for n in lora_module_names])
print(f"language_model linear layer name : {other_layer_name}")

{'language_model.model.layers.33.self_attn.v_proj', 'language_model.model.layers.24.mlp.up_proj', 'language_model.model.layers.1.self_attn.v_proj', 'language_model.model.layers.6.self_attn.k_proj', 'language_model.model.layers.20.self_attn.q_proj', 'language_model.model.layers.18.self_attn.k_proj', 'language_model.model.layers.25.mlp.gate_proj', 'language_model.model.layers.31.mlp.down_proj', 'language_model.model.layers.32.mlp.down_proj', 'language_model.model.layers.24.self_attn.q_proj', 'language_model.model.layers.10.self_attn.o_proj', 'language_model.model.layers.6.mlp.up_proj', 'language_model.model.layers.32.self_attn.o_proj', 'language_model.model.layers.1.mlp.gate_proj', 'language_model.model.layers.8.self_attn.q_proj', 'language_model.model.layers.19.self_attn.o_proj', 'language_model.model.layers.27.self_attn.v_proj', 'language_model.model.layers.13.self_attn.v_proj', 'language_model.model.layers.32.mlp.gate_proj', 'language_model.model.layers.22.mlp.gate_proj', 'language_mo

In [12]:
def find_all_language_model_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if 'language_model' in name:
            if isinstance(module, cls) and 'lm_head' not in name:
                lora_module_names.add(name)
    return lora_module_names

In [13]:

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Optionally, target modules can be manually defined:
# 'q_proj', 'k_proj', 'v_proj', 'o_proj' => attention layers
# 'gate_proj', 'up_proj', 'down_proj' => MLP layers
# target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']

# Automatically find target modules in the language model.
target_modules = find_all_language_model_linear_names(model)
print(target_modules)

# Create the LoRA configuration.
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=target_modules,
    init_lora_weights="gaussian",
)

# Prepare the model for k-bit training and apply LoRA.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

{'language_model.model.layers.33.self_attn.v_proj', 'language_model.model.layers.24.mlp.up_proj', 'language_model.model.layers.1.self_attn.v_proj', 'language_model.model.layers.6.self_attn.k_proj', 'language_model.model.layers.20.self_attn.q_proj', 'language_model.model.layers.18.self_attn.k_proj', 'language_model.model.layers.25.mlp.gate_proj', 'language_model.model.layers.31.mlp.down_proj', 'language_model.model.layers.32.mlp.down_proj', 'language_model.model.layers.24.self_attn.q_proj', 'language_model.model.layers.10.self_attn.o_proj', 'language_model.model.layers.6.mlp.up_proj', 'language_model.model.layers.32.self_attn.o_proj', 'language_model.model.layers.1.mlp.gate_proj', 'language_model.model.layers.8.self_attn.q_proj', 'language_model.model.layers.19.self_attn.o_proj', 'language_model.model.layers.27.self_attn.v_proj', 'language_model.model.layers.13.self_attn.v_proj', 'language_model.model.layers.32.mlp.gate_proj', 'language_model.model.layers.22.mlp.gate_proj', 'language_mo

In [14]:
from datasets import Dataset
def formatting_prompts_func(examples):
    """Format conversation examples into prompt text using the chat template."""
    try:
        convos = examples["dataset"]
        texts = [processor.apply_chat_template(convo['messages'], tokenize = False, add_generation_prompt = False) for convo in convos]
        return { "text" : texts, }
    except:
        print(examples)
        raise

# Create a Dataset from the converted_dataset list by wrapping it in a dictionary
my_dataset = Dataset.from_dict({"dataset": converted_dataset})

# Map the formatting_prompts_func over the dataset in batched mode for efficiency.
dataset = my_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [15]:
print(dataset)

Dataset({
    features: ['dataset', 'text'],
    num_rows: 100000
})


In [16]:
from functools import partial
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizes dataset batch
    """
    return tokenizer(
        images=None,
        text=batch["text"],
        max_length = max_length,
        padding=True
        #truncation = True,
    )


def preprocess_dataset(tokenizer, max_length, seed, my_dataset):
    """
    Tokenizes dataset for fine-tuning
    """
    # Extend column names with 'text'
    columns_names = my_dataset.column_names
    columns_names.append('text')

    # Create a partial function for batch tokenization
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)

    # Apply preprocessing to each batch and remove original columns
    my_dataset = my_dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = columns_names,
    )
    # Filter out samples that have "input_ids" exceeding "max_length"
    my_dataset = my_dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)
    # Shuffle dataset
    my_dataset = my_dataset.shuffle(seed = seed)

    return my_dataset

In [17]:
max_length = 2048
seed = 33
preprocessed_dataset = preprocess_dataset(processor, max_length, seed, dataset)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]



Filter:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [18]:
print(preprocessed_dataset)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4000
})


In [19]:
import torch
class TextDataCollator:
    def __init__(self, model, tokenizer, max_length=2048):
        """Initialize the data collator."""
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        """Collate a list of examples into a batch for training."""
        # Truncate each example to the maximum length.
        input_ids = [ex["input_ids"][:self.max_length] for ex in examples]
        attention_mask = [ex["attention_mask"][:self.max_length] for ex in examples]

        # Pad sequences to create uniform batch tensors.
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(ids) for ids in input_ids],
            batch_first=True,
            padding_value=0  # 0 is the padding token ID.
        )

        attention_mask = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(mask) for mask in attention_mask],
            batch_first=True,
            padding_value=0
        )

        # For language modeling, labels are usually the same as input_ids.
        labels = input_ids.clone()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [20]:
from trl import SFTTrainer, SFTConfig
from torch.cuda import is_bf16_supported

# Create an SFTTrainer for supervised fine-tuning using the TRL library.
trainer = SFTTrainer(
    model = model,
    tokenizer = processor.tokenizer,
    data_collator =  TextDataCollator(model, processor),
    train_dataset = preprocessed_dataset,

    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 15,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # Additional parameters for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)

  trainer = SFTTrainer(


In [21]:
# train model
trainer_stats = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
1,0.9797
2,1.2089
3,0.9421
4,1.0206
5,0.9235
6,1.0093
7,0.9432
8,0.7966
9,0.7671
10,0.8982


In [23]:
# Fine-tuned Model Inference Test
image = None
instruction= "Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages."

messages = [
    {"role": "user", "content": [
        {"type": "text", "text": instruction}
    ]}
]
input_text = processor.apply_chat_template(
    messages,
    add_generation_prompt = True
    )

inputs = processor(
    images=None,
    text=input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# Generate
generate_ids = model.generate(**inputs, max_new_tokens=256)
processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

['USER: Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages. ASSISTANT: Boolean operators are logical operators that are 

In [None]:
# Inference for image+text data input
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)

messages = [
    {
        "role": "user",
        "content": [
          {"type": "text", "text": "What are the things I should be cautious about when I visit this place? What should I bring with me?"},
          {"type" : "image", "image" : image1}
        ]
    }
]
input_text = processor.apply_chat_template(messages, add_generation_prompt = True)


inputs = processor(text=input_text, images=image1, padding=True, return_tensors="pt").to("cuda")

# Generate model output using the provided text input
from transformers import TextStreamer
text_streamer = TextStreamer(processor, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 256,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

