In [None]:
import torch
import gc
import time


def clear_memory(
    names=("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"),
    delay=2,
):
    """Delete large notebook globals and release cached CUDA memory.

    Args:
        names: Names of global variables to delete when they exist. The default
            is the set of objects this notebook creates.
        delay: Seconds to sleep between the cleanup stages. The default keeps
            the original 2-second pauses.

    Prints the allocated and reserved GPU memory (in GB) after the cleanup.
    When CUDA is not available the GPU steps are skipped instead of raising.
    """
    # Drop the references first so the garbage collector can reclaim the objects.
    scope = globals()
    for name in names:
        scope.pop(name, None)
    time.sleep(delay)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(delay)

    # torch.cuda.synchronize() raises on a machine without a usable CUDA device,
    # so the GPU-specific part only runs when CUDA is actually available.
    if not torch.cuda.is_available():
        print("CUDA is not available; skipped GPU cache cleanup.")
        return

    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(delay)
    gc.collect()
    time.sleep(delay)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


# Run the cleanup once for this session; the function prints the GPU memory figures.
clear_memory()

GPU allocated memory: 20.97 GB
GPU reserved memory: 21.11 GB


In [None]:
import kagglehub
# Download the two Kaggle datasets used by this notebook. The returned values are
# not stored; the cell only displays the result of the last call.
kagglehub.dataset_download('mbashish7/vr-finetune')
kagglehub.dataset_download('hlgsagar7/vr-go')

'/kaggle/input/vr-go'

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the API tokens from Kaggle's secret store so they are never hard-coded here.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")  # used for the Hugging Face login and as hub_token
WANDB_TOKEN = user_secrets.get_secret("WANDB_API_KEY")  # used for wandb.login


In [None]:
!pip install transformers datasets peft accelerate pandas qwen-vl-utils evaluate wandb rouge_score

Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.11-py3-none-any.whl.metadata (6.3 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting av (from qwen-vl-utils)
  Downloading av-14.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>

In [None]:
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image
from huggingface_hub import login
import wandb
from qwen_vl_utils import process_vision_info
import numpy as np
import torch
from sklearn.metrics import accuracy_score
import numpy as np
import evaluate
from sentence_transformers import util
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, Qwen2VLProcessor
from peft import LoraConfig, get_peft_model
from transformers import Seq2SeqTrainer
from torch.utils.data import DataLoader
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import PeftModel, PeftConfig


In [None]:

# Authenticate with the Hugging Face Hub using the token read from Kaggle secrets.
login(token=HF_TOKEN)

In [None]:

# Authenticate with Weights & Biases using the key read from Kaggle secrets.
wandb.login(key=WANDB_TOKEN)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmbashishoo7[0m ([33mmbashishoo7-international-institute-of-information-techn[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:

class QwenVLDataset(Dataset):
    """Map-style dataset over a CSV with `image_path`, `question` and `answer` columns.

    Each item is a plain dict; tokenisation and image loading are left to the
    collate functions.
    """

    def __init__(self, csv_file, mode='train', sampling=False, sample_amount = 1000, seed=42):
        """Read the CSV, optionally keeping a seeded random subset of rows.

        Args:
            csv_file: Path of the CSV file to read.
            mode: Free-form tag stored on the instance as `self.mode`.
            sampling: When true, keep only `sample_amount` randomly drawn rows.
            sample_amount: Number of rows to draw when `sampling` is true.
            seed: Random state used for the draw.
        """
        # The answer column is forced to str so numeric answers stay textual.
        frame = pd.read_csv(csv_file, dtype={"answer": str})
        if sampling:
            frame = frame.sample(n=sample_amount, random_state=seed)

        self.mode = mode
        self.data = frame
        self.system_message = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided image and respond to queries with concise answers, usually a single word, number.
The Images would be of various products available on online retailers along with some metadata for the product as text.
Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary."""

    def __len__(self):
        """Return the number of rows kept from the CSV."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the row at position `idx` as a dict, with the image path as a file URI."""
        record = self.data.iloc[idx]
        image_uri = f"file:///{record['image_path']}"
        return {
            'image_path': image_uri,
            'question': record['question'],
            'answer': record['answer'],
        }



In [None]:
def create_message(image_path, question, answer, is_for_eval=False):
    """Build the chat-format conversation for one example.

    Args:
        image_path: Image reference placed in the user turn.
        question: Question text placed in the user turn.
        answer: Reference answer; only used when `is_for_eval` is false.
        is_for_eval: When true, return the prompt only (system + user turns).

    Returns:
        A list of message dicts: system and user turns, followed by an
        assistant turn carrying `answer` unless `is_for_eval` is true.
    """
    system_message = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided image and respond to queries with concise answers, usually a single word, number.
The Images would be of various products available on online retailers along with some metadata for the product as text.
Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary."""

    # The system and user turns are shared by the training and evaluation variants.
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": question},
        ],
    }
    conversation = [system_turn, user_turn]

    if is_for_eval:
        return conversation

    conversation.append(
        {
            "role": "assistant",
            "content": [{"type": "text", "text": answer}],
        }
    )
    return conversation

def train_collate_fn(examples):
    """Collate dataset items into a training batch for the global `processor`.

    Each example is turned into a full conversation (including the assistant
    answer), rendered with the chat template, and encoded together with its
    image inputs. Labels are a copy of `input_ids` in which padding tokens and
    image-related token ids are replaced by -100.

    Args:
        examples: Items with `image_path`, `question` and `answer` keys.

    Returns:
        The processor output with an added `labels` entry.
    """
    conversations = [
        create_message(sample['image_path'], sample['question'], sample['answer'], is_for_eval=False)
        for sample in examples
    ]

    # Render each conversation to text and collect its image inputs.
    rendered = [processor.apply_chat_template(conv, tokenize=False) for conv in conversations]
    vision_inputs = [process_vision_info(conv)[0] for conv in conversations]

    encoded = processor(
        text=rendered, images=vision_inputs, return_tensors="pt", padding=True
    )

    # Labels start as a copy of the inputs; padding positions are masked out.
    targets = encoded["input_ids"].clone()
    targets[targets == processor.tokenizer.pad_token_id] = -100

    # Image-related token ids are also excluded (ids are model specific).
    if isinstance(processor, Qwen2VLProcessor):
        ignored_ids = [151652, 151653, 151655]
    else:
        ignored_ids = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    for ignored in ignored_ids:
        targets[targets == ignored] = -100

    encoded["labels"] = targets
    return encoded

In [None]:
def strip_until_assistant(text: str) -> str:
    """
    Return the text after the first 'assistant' keyword, with surrounding
    whitespace removed. When the keyword is absent, the whole text is
    returned stripped.
    """
    _before, keyword, remainder = text.partition("assistant")
    # `keyword` is empty when the separator was not found.
    kept = remainder if keyword else text
    return kept.strip()


In [None]:
def eval_collate_fn(examples):
    """Collate dataset items into an evaluation batch for the global `processor`.

    The batch encodes the full conversation (prompt and assistant answer).
    Labels are a copy of `input_ids` in which the prompt, the last position,
    image-related token ids and padding are replaced by -100.

    Args:
        examples: Items with `image_path`, `question` and `answer` keys.

    Returns:
        The processor output with an added `labels` entry.

    NOTE(review): `input_ids` contain the reference answer. With
    `predict_with_generate=True` the generated sequence therefore already
    includes the answer, which would explain evaluation metrics of exactly 1.0.
    Fixing that needs a prompt-only generation input and is not done here.
    """
    # Full conversations (with answer) and prompt-only conversations (without).
    messages = [create_message(ex['image_path'], ex['question'], ex['answer'], is_for_eval=False) for ex in examples]
    prompts = [create_message(ex['image_path'], ex['question'], ex['answer'], is_for_eval=True) for ex in examples]

    texts = [processor.apply_chat_template(message, tokenize=False) for message in messages]
    ptexts = [processor.apply_chat_template(message, tokenize=False) for message in prompts]
    image_inputs = [process_vision_info(message)[0] for message in messages]

    batch = processor(
        text=texts,
        images=image_inputs,
        return_tensors="pt",
        padding="longest",
        truncation=True,
    )

    # Mask labels until assistant's response starts
    labels = batch["input_ids"].clone()

    padded_len = labels.shape[1]
    # Number of real (non-padding) tokens in every processed row.
    real_lens = batch["attention_mask"].sum(dim=1).tolist()
    left_padded = getattr(processor.tokenizer, "padding_side", "right") == "left"

    for i, (full_text, prompt_text) in enumerate(zip(texts, ptexts)):
        # Tokenizer-only encodings, without padding.
        full_ids = processor.tokenizer(
            full_text, add_special_tokens=True, padding=False, truncation=False
        ).input_ids

        prompt_ids = processor.tokenizer(
            prompt_text, add_special_tokens=True, padding=False, truncation=False
        ).input_ids

        # The processed row can be longer than the tokenizer-only encoding of
        # the same text (extra image placeholder tokens added by the processor).
        # Those extra tokens belong to the prompt, so they are added to the
        # prompt length; otherwise only part of the prompt would be masked.
        expansion = max(0, int(real_lens[i]) - len(full_ids))

        # With left padding the real tokens start after the padding.
        pad_offset = padded_len - int(real_lens[i]) if left_padded else 0

        # Determine where assistant reply starts
        assistant_start_idx = pad_offset + len(prompt_ids) + expansion

        # +1 also masks the first token after the prompt, as the original did.
        labels[i, :assistant_start_idx + 1] = -100
        labels[i, -1] = -100

    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652, 151653, 151655]
        for token in image_tokens:
            labels[labels == token] = -100

    labels[labels == processor.tokenizer.pad_token_id] = -100

    batch["labels"] = labels
    return batch

In [None]:


# Load model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="float16", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Freeze vision encoder and embeddings.
# Qwen2-VL registers its vision encoder under the name `visual`, so matching only
# "vision_tower" left it unfrozen. Both spellings are checked here.
for name, param in model.named_parameters():
    is_vision = (
        name.startswith("visual.")
        or ".visual." in name
        or "vision_tower" in name
    )
    if is_vision or "embed_tokens" in name:
        param.requires_grad = False

## For Initial only
# LoRA configuration for a first run that starts without an existing adapter.
# It is not used below because the adapter is loaded from the Hub instead.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)


# Continue training from the adapter previously pushed to the Hub.
model = PeftModel.from_pretrained(model, "mbashish/qwen_finetuned", is_trainable=True)
model.print_trainable_parameters()
#Initial
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/793 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/7.13M [00:00<?, ?B/s]

trainable params: 1,777,664 || all params: 2,210,763,264 || trainable%: 0.0804


In [None]:


# Load the ROUGE metric through the `evaluate` library; compute_metrics uses it.
rouge = evaluate.load("rouge")
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model once at module level so compute_metrics
# can reuse the same instance on every evaluation.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def print_negative_tokens(preds):
    """Print every negative token id found in a 2-D array of predictions.

    Args:
        preds: Array-like of token ids, indexed as [sample, position].
    """
    token_grid = np.asarray(preds)

    # Each row of `bad_positions` is one (sample, position) index pair.
    bad_positions = np.argwhere(token_grid < 0)

    if bad_positions.size:
        print(f" Found {len(bad_positions)} negative token IDs.")
        for sample_idx, token_pos in bad_positions:
            print(f"Sample {sample_idx}, Position {token_pos}: Token ID = {token_grid[sample_idx, token_pos]}")
        return

    print("No negative token IDs found in predictions.")



import re

def extract_assistant_answer(text):
    """
    Return the stripped text of the first assistant turn, i.e. what lies between
    <|im_start|>assistant and the next <|im_end|>. Returns "" when no such turn exists.
    """
    # DOTALL lets the reply span several lines.
    found = re.search(r"<\|im_start\|>assistant\s*(.*?)<\|im_end\|>", text, flags=re.DOTALL)
    return found.group(1).strip() if found else ""


def compute_metrics(p):
    """Score decoded predictions against decoded labels.

    Args:
        p: Object exposing `predictions` and `label_ids` arrays of token ids.

    Returns:
        Dict with the mean SBERT cosine similarity, the ROUGE-1 score and a
        weighted combination of the two (0.7 / 0.3).
    """
    pad_id = processor.tokenizer.pad_token_id
    raw_preds, raw_labels = p.predictions, p.label_ids

    # Negative ids (such as -100) cannot be decoded, so the pad token replaces them.
    decodable_preds = np.where(raw_preds < 0, pad_id, raw_preds)
    decodable_labels = np.where(raw_labels == -100, pad_id, raw_labels)

    # Predictions keep their special tokens so the assistant turn can be located.
    pred_texts = processor.tokenizer.batch_decode(decodable_preds, skip_special_tokens=False)
    extracted_preds = list(map(extract_assistant_answer, pred_texts))

    label_texts = processor.tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)
    cleaned_labels = list(map(strip_until_assistant, label_texts))

    # SBERT similarity: the diagonal pairs prediction i with label i.
    pred_vectors = sbert_model.encode(extracted_preds, convert_to_tensor=True)
    label_vectors = sbert_model.encode(cleaned_labels, convert_to_tensor=True)
    pairwise = util.cos_sim(pred_vectors, label_vectors)
    avg_cosine_similarity = float(pairwise.diag().mean().cpu().numpy())

    rouge_score = rouge.compute(
        predictions=extracted_preds,
        references=cleaned_labels,
        rouge_types=["rouge1"],
    )['rouge1']

    # Combined score
    combined = 0.7 * avg_cosine_similarity + 0.3 * rouge_score
    print(f"avg_cosine_similarity = {avg_cosine_similarity}\n rouge1_score={rouge_score}")

    return {
        "sbert_similarity": avg_cosine_similarity,
        "rouge1_f1": rouge_score,
        "eval_combined_metric": combined,
    }





Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Load dataset
# Both splits are randomly subsampled with fixed seeds: 100,000 rows from the
# training CSV and 1,000 rows from the validation CSV.
train_dataset = QwenVLDataset("/kaggle/input/vr-finetune/train.csv", mode='train',sampling=True, sample_amount=100000, seed=100)
eval_dataset = QwenVLDataset("/kaggle/input/vr-finetune/val.csv", mode='test',sampling=True, sample_amount=1000, seed=42)

In [None]:

# Define Training Arguments
training_args = Seq2SeqTrainingArguments(
    # A stray backtick stood here and made the cell a SyntaxError.
    # The output directory is now explicit. "trainer_output" is believed to be the
    # default of recent transformers releases (confirm), and older releases
    # require the argument.
    output_dir="trainer_output",
    logging_dir="./logs",  # logging directory
    logging_steps=10,  # log every 10 steps
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    save_total_limit=3,  # Only keep the last 3 checkpoints
    num_train_epochs=5,  # Number of training epochs
    predict_with_generate=True,  # Use generate() to predict during eval
    fp16=True,  # Enable mixed precision training
    eval_strategy="epoch",  # Evaluate every epoch
    save_strategy="epoch",  # Save every epoch
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better

    push_to_hub=True,
    hub_model_id="mbashish/qwen_finetuned",  # Replace with your repo
    hub_token=HF_TOKEN,
    hub_strategy="every_save",    # Push to hub every save
    remove_unused_columns=False,  # keep the raw dataset columns for the custom collate functions

    report_to="wandb",
    run_name="qwen-finetune-run",
)

In [None]:
# Start (or re-attach to) the Weights & Biases run that tracks this training session.
wandb.init(
    project="qwen2-vqa-finetune",  # W&B project name
    name="qwen2-fine-tune-1",     # display name of the run
    config=training_args.to_dict(),               # log the training arguments as the run config
    resume="allow",  # NOTE(review): presumably continues the run with the id below if it exists; confirm in the wandb docs
    id="f3mxsoqu"  # fixed run id
)

In [None]:


class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose eval and test dataloaders can use their own collate functions.

    Set `eval_collate_fn` / `test_collate_fn` after construction. While they are
    unset (None), the trainer's regular `data_collator` is used.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Optional overrides, assigned by the caller after instantiation.
        self.eval_collate_fn = None
        self.test_collate_fn = None

    def _plain_loader(self, dataset, collate_override):
        """Build a DataLoader over `dataset` using the evaluation settings from `self.args`."""
        collate = collate_override if collate_override else self.data_collator
        loader_args = self.args
        return DataLoader(
            dataset,
            batch_size=loader_args.per_device_eval_batch_size,
            collate_fn=collate,
            drop_last=loader_args.dataloader_drop_last,
            num_workers=loader_args.dataloader_num_workers,
            pin_memory=loader_args.dataloader_pin_memory,
        )

    def get_eval_dataloader(self, eval_dataset=None):
        """Return the evaluation DataLoader; falls back to `self.eval_dataset` when none is given."""
        if eval_dataset is None:
            eval_dataset = self.eval_dataset
        return self._plain_loader(eval_dataset, self.eval_collate_fn)

    def get_test_dataloader(self, test_dataset):
        """Return the DataLoader used for prediction on `test_dataset`."""
        return self._plain_loader(test_dataset, self.test_collate_fn)


In [None]:
import inspect

# Recent transformers releases deprecate the `tokenizer` keyword in favour of
# `processing_class`. Pick whichever keyword the installed version accepts so
# the cell works on both sides of that change.
_processing_kw = (
    "processing_class"
    if "processing_class" in inspect.signature(Seq2SeqTrainer.__init__).parameters
    else "tokenizer"
)

trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=train_collate_fn,  # for training
    compute_metrics=compute_metrics,
    **{_processing_kw: processor.tokenizer},
)

# Set custom collate_fn for eval/test after instantiation
trainer.eval_collate_fn = eval_collate_fn
# trainer.test_collate_fn = test_collate_fn

# Runs training; evaluation and checkpointing follow the strategies in training_args.
trainer.train()

  super().__init__(*args, **kwargs)
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Combined Metric,Sbert Similarity,Rouge1 F1
1,0.0637,0.340506,1.0,1.0,1.0


avg_cosine_similarity = 1.0
 rouge1_score=1.0
