# Fine-Tuning Qwen2-VL-2B-Instruct on dacl-10k dataset




### I will be fine-tuning the above-mentioned VLM on the dacl10k dataset

### I have used this tutorial as my source: https://huggingface.co/learn/cookbook/en/fine_tuning_vlm_trl

#### The checkpoint shards of Qwen are already downloaded. To avoid downloading them again, I set the cache location in the line below.

In [None]:
import os
# Point the Hugging Face cache at a local folder so the already-downloaded
# checkpoint shards are reused. This assignment runs before `transformers`
# is imported further down.
# NOTE(review): newer transformers releases reportedly prefer HF_HOME over
# TRANSFORMERS_CACHE — confirm against the installed version.
os.environ['TRANSFORMERS_CACHE'] = 'D:/mdfBIM+ - VLM 4 Bridge Damages - Jäkel_Bitte nicht löschen!_/hugging_face/hub' 
# Comment out the assignment above if the checkpoint shards have not been downloaded yet.


## IMPORT LIBRARIES

In [None]:
#os.environ["WANDB_DISABLED"] = "true"
from datasets import load_dataset, interleave_datasets
from PIL import Image
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, BitsAndBytesConfig, Trainer, TrainingArguments, get_scheduler
from qwen_vl_utils import process_vision_info
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer
from peft.optimizers import create_loraplus_optimizer
import evaluate
#import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, average_precision_score
import bitsandbytes as bnb
import wandb
import warnings
warnings.filterwarnings("ignore")

## Resizing the images

#### The model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation.

https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct

In [None]:
def resize_image(image: Image.Image):
    """Clamp both sides of an image into the range [56, 1008] pixels.

    Each side is clamped independently, so the aspect ratio changes whenever
    only one side is out of range.

    Args:
        image: PIL image to check.

    Returns:
        The input image itself when both sides are already within range,
        otherwise a new image resampled with bilinear interpolation.
    """
    min_size = 56  # Ensure minimum 56x56
    max_size = 1008  # Ensure maximum 1008x1008 (1120 was tried before)
    width, height = image.size

    # Apply BOTH bounds to each side. An if/elif on "too small" / "too large"
    # skips the upper bound whenever one side is too small, so a narrow but
    # very tall image would keep its oversized side.
    target_size = (
        min(max(width, min_size), max_size),
        min(max(height, min_size), max_size),
    )
    if target_size != (width, height):
        image = image.resize(target_size, Image.BILINEAR)
    return image

## Parameters for *training_args*

In [None]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {device}")

# --- Model -------------------------------------------------------------
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# --- Run length and batching -------------------------------------------
EPOCHS = 1  # 2 was used before
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 4
GRADIENT_CHECKPOINTING = True  # trades extra computation time for memory efficiency
USE_REENTRANT = False

# --- Optimisation ------------------------------------------------------
OPTIM = "paged_adamw_32bit"
LEARNING_RATE = 5e-5  # 2e-4 and 5e-4 were tried before
MAX_GRAD_NORM = 1
WARMUP_RATIO = 0.1  # marked for deletion
WARMUP_STEPS = 0  # marked for deletion
WEIGHT_DECAY = 0.01  # marked for deletion

# --- Logging / evaluation / checkpoint cadence (in steps) --------------
LOGGING_STEPS = 433  # earlier runs: 180, 173
EVAL_STEPS = 433  # earlier runs: 180, 173
SAVE_STEPS = 866  # earlier runs: 360, 346
EVAL_STRATEGY = "steps"
SAVE_STRATEGY = "steps"
METRIC_FOR_BEST_MODEL = "eval_loss"
LOAD_BEST_MODEL_AT_END = True

# --- Settings needed for vision-language training ----------------------
DATASET_KWARGS = {"skip_prepare_dataset": True}  # required for VLMs
REMOVE_UNUSED_COLUMNS = False  # also VLM-specific
MAX_SEQ_LEN = 1024  # 128 was used before

# 6935 is presumably the number of training samples (2882 and 2777 were used
# for other datasets) — TODO confirm.
NUM_STEPS = EPOCHS * (6935 // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))
print(f"NUM_STEPS: {NUM_STEPS}")

## Mappings from numbers to damage and object types

Numbers from 12 to 17 (both included) are object types. Rest are damage types.

In [None]:
# Class index -> class name. The whole dictionary is interpolated into the
# user prompt built by `format_data`, so its order and spelling are part of
# the prompt text.
# NOTE(review): the prompt states that 12-17 are object types, yet entry 12
# ("ExposedRebars") reads like a damage name — confirm the intended range.
label_mapping = dict(enumerate((
    "Crack",
    "ACrack",
    "Wetspot",
    "Efflorescence",
    "Rust",
    "Rockpocket",
    "Hollowareas",
    "Cavity",
    "Spalling",
    "Graffiti",
    "Weathering",
    "Restformwork",
    "ExposedRebars",
    "Bearing",
    "EJoint (Expansion Joint)",
    "Drainage",
    "PEquipment (Protective Equipment)",
    "JTape (Joint Tape)",
    "Concrete Corrosion (ConcreteC)",
    "Corrosion, no rust staining",
    "NO Exposed Reinforcement",
    "Scaling",
    "General Defects",
    "No defect",
)))


## Text Template

In [None]:
system_message = """You are a highly advanced Vision Language Model (VLM), specialized in analyzing, describing, and interpreting visual data. 
You have currently learned about several bridge-damage types. Your task is to generate a short inspection report on seeing the image."""


def format_data(sample):
    """Turn one annotation record into a system/user/assistant conversation.

    Args:
        sample: Mapping with an "image" entry accepted by `Image.open` and a
            "new_label" entry that is used verbatim as the assistant answer.

    Returns:
        A list of three message dicts (system, user, assistant). The user
        message holds the resized image followed by the text question, which
        embeds `label_mapping`.
    """
    # Image.open is lazy and keeps the underlying file open until the pixel
    # data has been read. Read the pixels inside the `with` block so the file
    # handle is released per sample instead of staying open for every image
    # that resize_image returns unchanged.
    with Image.open(sample["image"]) as opened_image:
        opened_image.load()
        image = resize_image(opened_image)

    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {
                    "type": "text",
                    "text": f"""Here is the label-mapping of numbers to damage types {label_mapping}. Numbers 12 to 17(both included) are object types. Using the numbers, state the damage type(s) and object type(s) present in the image:"""
                },
            ],
        },
        {
            "role": "assistant",
            # Reference answer taken from the custom annotation field.
            "content": [{"type": "text", "text": sample["new_label"]}],
        },
    ]


## Loading the DACL dataset

#### You can find the download link here: https://github.com/phiyodr/dacl10k-toolkit

#### I used those images, but I made my own annotation files. 

In [None]:
# Load the custom train/val annotation files (JSON) through `datasets`.
dataset3 = load_dataset(
    "json",
    data_files={
        'train': 'D:/mdfBIM+ - VLM 4 Bridge Damages - Jäkel_Bitte nicht löschen!_/labels_for_two_datasets/Train/dacl/dacl_labels_train.json',
        'val': 'D:/mdfBIM+ - VLM 4 Bridge Damages - Jäkel_Bitte nicht löschen!_/labels_for_two_datasets/Val/dacl/dacl_labels_val.json',
    },
)

# TODO: use image_cast from the HF library.

# Convert every record of each split into the chat-message format up front;
# the trainer later receives these plain Python lists.
train_dataset3 = [format_data(record) for record in dataset3['train']]
val_dataset3 = [format_data(record) for record in dataset3['val']]
print("Formatted!!\n")

#### I have commented out the code below because I am not using it

In [None]:

##################################COMBINE DATASETS############################################
# Disabled: the triple-quoted string below is never executed. It preserves the
# earlier code that interleaved two datasets with equal probability and then
# formatted the combined splits.
'''
combined_train_dataset = interleave_datasets([train_dataset,train_dataset2], probabilities=[0.5,0.5])
combined_val_dataset = interleave_datasets([val_dataset,val_dataset2], probabilities=[0.5,0.5])
print("Combined successfully!")
combined_train_dataset = [format_data(sample) for sample in combined_train_dataset]
combined_val_dataset = [format_data(sample) for sample in combined_val_dataset]
print("Formatted Successfully")
'''
################################# checking ############################################
# Disabled sanity check: pulls the question, answer and image out of the first
# formatted training sample.
#sample_data = train_dataset[0]
#sample_question = train_dataset[0][1]["content"][1]["text"]
#sample_answer = train_dataset[0][2]["content"][0]["text"]
#sample_image = train_dataset[0][1]["content"][0]["image"]

#print(sample_question)
#print(sample_answer)
#sample_image


## Load the model

I have only defined the **bitsandbytes** config here; it is not used anywhere, because I was not able to use it together with **flash-attn2**.

In [None]:
# 4-bit quantisation settings. Defined for reference only: the
# `quantization_config` argument of from_pretrained below stays commented out.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    # bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)

# Load the base model in bfloat16 with FlashAttention-2. The checkpoint id
# comes from MODEL_ID so that model and processor cannot drift apart from the
# value declared with the other training parameters.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    # load_in_4bit=True,
    low_cpu_mem_usage=True,
    # quantization_config=bnb_config,
    device_map="auto",
    use_cache=False,
)

# Pixel-count bounds handed to the processor, expressed in 28x28 patches.
min_pixels = 4 * 28 * 28
max_pixels = 1296 * 28 * 28  # 1849 and 1600 patches were tried before
processor = Qwen2VLProcessor.from_pretrained(
    MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels
)
processor.tokenizer.padding_side = "right"


#### The function below is currently not used anywhere

In [None]:
########################### text generator ########################
def text_generator(sample_data):
    """Generate the model's answer for one formatted conversation.

    Args:
        sample_data: Conversation in the layout produced by `format_data`:
            index 0 is the system message, index 1 the user message (image
            plus question) and index 2 the assistant message holding the
            reference answer.

    Returns:
        A tuple `(generated_text, actual_answer)`: the decoded text of the
        newly generated tokens only, and the reference answer string.
    """
    # Build the prompt from the system and user turns only, so the reference
    # answer is never shown to the model.
    text = processor.apply_chat_template(
        sample_data[0:2], tokenize=False, add_generation_prompt=True
    )

    image_inputs, _ = process_vision_info(sample_data)

    inputs = processor(
        text=[text],
        images=image_inputs,
        return_tensors="pt",
    )
    inputs = inputs.to(device=model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=MAX_SEQ_LEN)

    # generate() returns the prompt tokens followed by the continuation.
    # Drop the prompt part; otherwise the decoded "answer" would start with
    # the system and user text and could never be parsed as a label.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_length:]

    output_text = processor.batch_decode(
        new_token_ids, skip_special_tokens=True
    )
    del inputs
    actual_answer = sample_data[2]["content"][0]["text"]
    return output_text[0], actual_answer
    

#generated_text, actual_answer = text_generator(sample_data)
#print(f"Generated Answer: {generated_text}")
#print(f"Actual Answer: {actual_answer}")


## LoRA Configuration

In [None]:
#####################LORA CONFIG############################
# Adapter configuration: DoRA + rank-stabilised LoRA on the attention
# query/value projections only. Trailing comments list values tried earlier.
# NOTE(review): init_lora_weights="eva" is normally paired with a separate
# data-driven initialisation call in PEFT, and no such call appears in this
# file — confirm the adapters are initialised as intended.
peft_config = LoraConfig(
    use_dora=True,
    inference_mode=False,
    lora_alpha=64, #16 #64
    lora_dropout=0.1, #0.05
    r=16, #16 #8
    bias="none",
    target_modules=["q_proj", "v_proj"], #"k_proj", #"o_proj" #, "qkv", "proj"
    task_type="CAUSAL_LM",
    init_lora_weights= "eva", #"eva"
    use_rslora=True,
)

#print(f"Before adapter parameters: {model.num_parameters()}")
# Wrap the base model with the adapters defined above.
peft_model = get_peft_model(model, peft_config)

# NOTE(review): the SFTTrainer created further down receives `model` and
# `peft_config`, not this compiled `peft_model`, so the compiled wrapper is
# only used for the parameter report below — confirm that this is intended.
peft_model = torch.compile(peft_model)

# Report how many parameters are trainable with the adapters attached.
peft_model.print_trainable_parameters() 

## Using SFTConfig function for training_args

In [None]:
########################TRAINING ARGS################################################
# Training configuration. All tunables come from the constants declared in the
# parameters cell so that a value is changed in exactly one place.
training_args = SFTConfig(
    # Change the output directory per run/dataset
    # (earlier runs: 260156, codebrim, codebrim_and_2601506).
    output_dir="./fine_tuned_weights/dacl_with_metrics_final",
    # label_names=train_label,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_checkpointing=GRADIENT_CHECKPOINTING,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine_with_restarts",
    # Other schedulers tried: "constant", "linear", "polynomial" ({"power": 3}).
    lr_scheduler_kwargs={"num_cycles": 2},
    logging_steps=LOGGING_STEPS,
    eval_steps=EVAL_STEPS,
    eval_strategy=EVAL_STRATEGY,
    save_strategy=SAVE_STRATEGY,
    save_steps=SAVE_STEPS,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
    max_grad_norm=MAX_GRAD_NORM,
    # warmup_ratio=WARMUP_RATIO,
    # warmup_steps=WARMUP_STEPS,
    bf16=True,
    tf32=True,
    # Use the shared constant instead of a literal: NUM_STEPS is derived from
    # GRADIENT_ACCUMULATION_STEPS, so a separate literal here could silently
    # disagree with it (values tried before: 16, 8, 4, 2).
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    dataset_kwargs=DATASET_KWARGS,
    max_seq_length=MAX_SEQ_LEN,
    remove_unused_columns=REMOVE_UNUSED_COLUMNS,
    optim=OPTIM,
    label_names=["labels"],
    report_to="wandb",
)

# Start the Weights & Biases run that the trainer reports to
# (earlier run name: Qwen-2-VL-Instruct-2B-dataset 2601506-18.02.2025).
wandb.init(project="VQA or Image Captioning", name="21.03.2025-dataset dacl_with_metrics-Qwen-2-VL-Instruct-2B", config=training_args,)


## Collate Function

In [None]:
#################################### COLLATE FUNCTION #############################################
#collate_sample = [train_dataset[0], train_dataset[1]] # for batch size 2.

def collate_fn(examples):
    """Collate formatted conversations into one padded training batch.

    Args:
        examples: List of conversations in the layout produced by
            `format_data`.

    Returns:
        The processor output for the batch with an added "labels" entry: a
        copy of "input_ids" in which padding tokens and image-related tokens
        are replaced by -100.
    """
    prompts = []
    images = []
    for conversation in examples:
        # Full conversation (including the assistant turn) rendered as text.
        prompts.append(processor.apply_chat_template(conversation, tokenize=False))
        # First element of process_vision_info's result: the image inputs.
        images.append(process_vision_info(conversation)[0])

    batch = processor(
        text=prompts,
        images=images,
        return_tensors="pt",
        padding=True,
    )

    # Token ids that must not contribute to the loss (model specific).
    if isinstance(processor, Qwen2VLProcessor):
        # Image token ids specific to Qwen2VLProcessor.
        ignored_ids = [151652, 151653, 151655]
    else:
        ignored_ids = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    ignored_ids = [processor.tokenizer.pad_token_id, *ignored_ids]

    targets = batch["input_ids"].clone()
    for token_id in ignored_ids:
        targets[targets == token_id] = -100

    batch["labels"] = targets
    return batch

#collated_data = collate_fn(collate_sample)
#print(collated_data.keys())  # dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'labels'])

## Metric Function

#### I didn't use it here because it was giving me an OOM (out-of-memory) error

In [None]:
#####################################METRIC_FUNCTION##############################################

# Load evaluation metrics
#bleu = evaluate.load("bleu")
#rouge = evaluate.load("rouge")
#meteor = evaluate.load("meteor")

#accuracy_metric = evaluate.load("accuracy")
#precision_metric = evaluate.load("precision")
#recall_metric = evaluate.load("recall")
#f1_metric = evaluate.load("f1")

def compute_metrics(preds, labels):
    """Score predicted class labels against the reference labels.

    Args:
        preds: Sequence of predicted labels.
        labels: Sequence of reference labels, aligned with `preds`.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1". The last three
        are computed with `average='weighted'` and `zero_division=0`.
    """
    # Tuple layout: (precision, recall, f1, support); support is not reported.
    prf_scores = precision_recall_fscore_support(
        labels,
        preds,
        average='weighted',
        zero_division=0,
    )
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": prf_scores[0],
        "recall": prf_scores[1],
        "f1": prf_scores[2],
    }


def _parse_label(text):
    """Return `text` as an integer class label, or -1 if it is not a plain integer."""
    try:
        return int(str(text).strip())
    except ValueError:
        return -1


def evaluate_model(dataset):
    """Generate an answer for every sample and score it against the reference.

    Both the generated answer and the reference answer are parsed as integer
    class labels. A sample is skipped when either side cannot be parsed, so
    one malformed generation no longer aborts the whole evaluation and
    predictions are compared with references of the same type.

    Args:
        dataset: Iterable of conversations in the layout produced by
            `format_data`.

    Returns:
        The metrics dict produced by `compute_metrics` for the valid pairs.
    """
    preds = []
    labels = []

    for i, sample_data in enumerate(dataset, start=1):
        generated_answer, actual_answer = text_generator(sample_data)

        # Convert text labels to numeric values; -1 marks "unknown".
        predicted_label = _parse_label(generated_answer)
        actual_label = _parse_label(actual_answer)

        if predicted_label != -1 and actual_label != -1:  # keep valid pairs only
            preds.append(predicted_label)
            labels.append(actual_label)
        print(i)  # progress: 1-based index of the sample just processed

    print(f"length of preds: {len(preds)}")
    return compute_metrics(preds, labels)

## SFTTrainer

In [None]:
#####################################SFT TRAINER##################################################
# Assemble the TRL trainer from the pieces defined above.
# - `compute_metrics` is deliberately not passed.
# - No custom (optimizer, scheduler) pair is passed.
# - Other dataset pairs used before: train_dataset2/val_dataset2 and
#   combined_train_dataset/combined_val_dataset.
# NOTE(review): the trainer receives `model` together with `peft_config`,
# not the compiled `peft_model` built earlier — confirm that this is intended.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    args=training_args,
    data_collator=collate_fn,
    tokenizer=processor.tokenizer,
    train_dataset=train_dataset3,
    eval_dataset=val_dataset3,
)

## Training/fine-tuning the model

In [None]:
# Release cached GPU memory before the run starts.
torch.cuda.empty_cache()
#print(f"Before adapter parameters: {model.num_parameters()}")
######################################EVAL######################################################
# Evaluate once before training and print the result as a baseline.
print("-"*30)
print("Initial Evaluation")
metric = trainer.evaluate()
print(metric)
print("-"*30)

# Fine-tune, then write the resulting model to the configured output directory.
print("Training")
trainer.train()
print("-"*30)
trainer.save_model(training_args.output_dir)

# Disabled: generation-based evaluation with evaluate_model.
#dataset = "dacl"
#metrics = evaluate_model(val_dataset)
#print(f"The evaluation metrics for the {dataset} dataset are: \n {metrics}") #print(metrics)

print("Done and finished")
print("-"*30)