In [1]:
!pip install qwen_vl_utils
!pip install transformers
!pip install datasets
!pip install -U bitsandbytes
!pip install trl

import torch
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    Qwen2VLProcessor,
    AutoTokenizer,
    AutoImageProcessor
)
from trl import SFTConfig
from qwen_vl_utils import process_vision_info
from datasets import load_dataset#, Dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
import os
import wandb
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
import gc
import time
from typing import Dict, List


Collecting qwen_vl_utils
  Downloading qwen_vl_utils-0.0.8-py3-none-any.whl.metadata (3.6 kB)
Collecting av (from qwen_vl_utils)
  Downloading av-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Downloading qwen_vl_utils-0.0.8-py3-none-any.whl (5.9 kB)
Downloading av-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av, qwen_vl_utils
Successfully installed av-14.0.1 qwen_vl_utils-0.0.8
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multipr

In [2]:
# Mount Google Drive so the image folder (and the cached model) under
# /content/drive is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): `compute` is not read anywhere in the visible notebook.
compute = True

# Check device compatibility
# Preference order is Apple MPS, then CUDA, then CPU. The recorded run printed
# "device =  cpu".
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
print('device = ', device)

# Hub id of the base vision-language model used throughout the notebook.
model_name = "Qwen/Qwen2-VL-2B-Instruct"

# Drive folder that image paths from the dataset are resolved against; the
# model-loading cell also caches the model underneath it.
img_folder_path = "/content/drive/MyDrive/MATH-V-main"

dataset_name = "MathLLMs/MathVision"

# NOTE(review): `file_name` is not used by any visible cell.
file_name = f"data/output/{dataset_name.split('/')[-1]}_{model_name.split('/')[-1]}.csv"


# Load the dataset
# Only the hub's 'test' split is loaded (3040 examples in the recorded run); it
# is re-partitioned below into local train/validation/test subsets.
data_list = load_dataset(dataset_name, split='test')


# Split data into training and validation sets
# 70% train, 10% validation, remainder test. Sizes are computed before the
# shuffle, which is fine because shuffling does not change the length.
train_size = int(0.7 * len(data_list))
val_size = int(0.1 * len(data_list))
test_size = len(data_list) - train_size - val_size
data_list = data_list.shuffle(seed=42)  # Shuffle the dataset for randomness

# Contiguous, non-overlapping index ranges of the shuffled dataset.
train_data = data_list.select(range(train_size))
val_data = data_list.select(range(train_size, train_size + val_size))
test_data = data_list.select(range(train_size+val_size, len(data_list)))


Mounted at /content/drive
device =  cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

(…)-00000-of-00001-3532b8d3f1b4047a.parquet:   0%|          | 0.00/57.0M [00:00<?, ?B/s]

(…)-00000-of-00001-f8ff70fcb2f29b1d.parquet:   0%|          | 0.00/6.99M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/3040 [00:00<?, ? examples/s]

Generating testmini split:   0%|          | 0/304 [00:00<?, ? examples/s]

In [3]:
def generate_message(inputs):
    """Turn dataset records into single-turn "user" chat messages.

    Args:
        inputs: iterable of records, each providing a "question" string and an
            "image" path relative to the module-level ``img_folder_path``.

    Returns:
        A list with one message dict per record. Each message carries an image
        entry (absolute path) followed by a text entry (the question).
    """
    def _as_user_turn(record):
        question = record["question"]
        image_location = f"{img_folder_path}/{record['image']}"
        return {
            "role": "user",
            "content": [
                {"type": "image", "image": image_location},
                {"type": "text", "text": question},
            ],
        }

    return [_as_user_turn(record) for record in inputs]

def generate_embedding(model, processor, sample):
    """Encode one dataset record into processor inputs placed on ``device``.

    Despite the name, no embeddings are computed here: the record is wrapped as
    a chat message, rendered with the processor's chat template, and tokenized
    together with its vision inputs.

    Args:
        model: unused; kept so existing callers do not break.
        processor: object exposing ``apply_chat_template`` and ``__call__``.
        sample: a single dataset record accepted by ``generate_message``.

    Returns:
        Whatever ``processor(...)`` returns, after ``.to(device)``.
    """
    conversation = generate_message([sample])
    chat_text = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    images, videos = process_vision_info(conversation)

    encoded = processor(
        text=[chat_text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    )

    # Place the encoded tensors on the globally selected device.
    return encoded.to(device)

def generate_text_from_sample(model, processor, sample, max_new_tokens=1024):
    """Generate the model's answer text for a single dataset record.

    Args:
        model: generation-capable model exposing ``to`` and ``generate``.
        processor: processor used to encode the prompt and decode the output.
        sample: a single dataset record accepted by ``generate_embedding``.
        max_new_tokens: upper bound on the number of generated tokens. This was
            previously ignored in favour of a hard-coded 256.

    Returns:
        The decoded completion (prompt tokens removed) for the record.
    """
    inputs = generate_embedding(model, processor, sample)

    # Move model to the device
    model = model.to(device)

    # Perform inference, honouring the caller's token budget.
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; strip the prompt part per row.
    # The original read an undefined `model_inputs` here and raised NameError.
    prompt_ids = inputs["input_ids"]
    completion_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(prompt_ids, generated_ids)
    ]

    decoded = processor.batch_decode(
        completion_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]


In [4]:
# BitsAndBytesConfig int-4 config
# Defined for optional 4-bit loading; it is currently NOT passed to
# from_pretrained (see the commented-out kwarg below).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
# A copy of the model is cached on Drive under img_folder_path so later
# sessions do not need to download it again.
local_model_dir = f"{img_folder_path}/{model_name}"
model_load_kwargs = dict(
    device_map="auto",
    torch_dtype=torch.bfloat16,
    # quantization_config=bnb_config
)
try:
    model = Qwen2VLForConditionalGeneration.from_pretrained(local_model_dir, **model_load_kwargs)
except Exception as load_error:
    # Best-effort fallback to the hub. `Exception` (instead of a bare except)
    # keeps the fallback but lets KeyboardInterrupt/SystemExit propagate, and
    # the reason for the fallback is reported instead of being hidden.
    print(f"Could not load model from {local_model_dir} ({load_error!r}); downloading {model_name} instead.")
    model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, **model_load_kwargs)
    model.save_pretrained(local_model_dir)
processor = Qwen2VLProcessor.from_pretrained(model_name)


# Configure LoRA
# Rank-8 adapters on the attention query/value projections only.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Apply PEFT model adaptation
peft_model = get_peft_model(model, peft_config)

# Print trainable parameters
peft_model.print_trainable_parameters()

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]



trainable params: 1,089,536 || all params: 2,210,075,136 || trainable%: 0.0493


In [18]:
class MessageDataset(Dataset):
    """Map-style dataset that serves each record as one chat message.

    Wraps an indexable collection of raw records; indexing converts the record
    on the fly with ``generate_message`` and returns the single resulting
    message dict.
    """

    def __init__(self, data):
        # Indexable, sized collection of raw records.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        messages = generate_message([record])
        return messages[0]

# Create a data collator to encode text and image pairs
def collate_fn(examples):
    """Collate chat messages into one padded training batch.

    Args:
        examples: list of dataset items. Each item is either a single message
            dict (what ``MessageDataset`` yields) or an already-built
            conversation (list of message dicts).

    Returns:
        A plain dict holding every tensor the processor produced for the whole
        batch (so model-specific extras such as ``image_grid_thw`` are kept)
        plus ``labels``: a copy of ``input_ids`` with padding and image tokens
        set to -100 so they are ignored by the loss.

    Notes:
        * The previous version encoded each sample separately, so padding never
          applied across samples, and it returned only four hand-picked keys
          with ``pixel_values[0]`` (first row only). Encoding the batch in one
          processor call fixes all three issues.
        * NOTE(review): messages built by ``generate_message`` contain only the
          user turn, so the labels supervise the prompt, not an answer -
          confirm this is intended.
        * A later cell defines another function with this same name; whichever
          cell ran last wins.
    """
    # The chat template and process_vision_info both operate on a conversation
    # (list of messages), so wrap bare message dicts.
    conversations = [[ex] if isinstance(ex, dict) else ex for ex in examples]

    texts = [
        processor.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)
        for conv in conversations
    ]
    image_inputs = [process_vision_info(conv)[0] for conv in conversations]

    # One call for the whole batch so padding=True pads across samples.
    batch = processor(
        text=texts, images=image_inputs, return_tensors="pt", padding=True
    )

    # The labels are the input_ids, with padding masked out of the loss.
    labels = batch["input_ids"].clone()
    pad_token_id = processor.tokenizer.pad_token_id
    if pad_token_id is not None:
        labels[labels == pad_token_id] = -100

    # Ignore the image token index in the loss computation (model specific)
    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652, 151653, 151655]  # Specific image token IDs for Qwen2VLProcessor
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100

    batch["labels"] = labels
    return dict(batch)

# Set output directory
# NOTE(review): model_name contains "/", so this resolves to a nested
# directory ("./fine_tuned_Qwen/Qwen2-VL-2B-Instruct"); os.makedirs creates it.
output_dir = f"./fine_tuned_{model_name}"
os.makedirs(output_dir, exist_ok=True)

# Define training arguments
# NOTE(review): fp16=True is requested while the model was loaded with
# torch_dtype=torch.bfloat16 and the recorded run reported device = cpu -
# confirm the precision settings for the target hardware.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Reduced batch size
    per_device_eval_batch_size=2,   # Reduced batch size
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=os.path.join(output_dir, 'logs'),
    logging_steps=100,
    # `eval_strategy` replaces the deprecated `evaluation_strategy` and matches
    # the spelling the SFTConfig cell already uses.
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    learning_rate=2e-5,
    fp16=True,
    gradient_accumulation_steps=8,  # Increased gradient accumulation
    save_total_limit=3,
    # Keep all dataset fields: the collator builds model inputs from raw messages.
    remove_unused_columns=False,
    dataloader_num_workers=2
)
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=MessageDataset(train_data),
    eval_dataset=MessageDataset(val_data),
    data_collator=collate_fn,
)

# Start training
trainer.train()




TypeError: 'NoneType' object is not iterable

In [None]:
# Configure training arguments
# This cell sets up TRL supervised fine-tuning with LoRA. It relies on
# `output_dir`, `model`, `processor`, `peft_config`, `MessageDataset`,
# `train_data`, `val_data` and `collate_fn` from earlier cells.
training_args = SFTConfig(
    output_dir=output_dir,  # Directory to save the model
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    gradient_accumulation_steps=8,  # Steps to accumulate gradients
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
    # Optimizer and scheduler settings
    optim="adamw_torch_fused",  # Optimizer type
    learning_rate=2e-4,  # Learning rate for training
    lr_scheduler_type="constant",  # Type of learning rate scheduler
    # Logging and evaluation
    logging_steps=10,  # Steps interval for logging
    eval_steps=10,  # Steps interval for evaluation
    eval_strategy="steps",  # Strategy for evaluation
    save_strategy="steps",  # Strategy for saving the model
    save_steps=20,  # Steps interval for saving
    metric_for_best_model="eval_loss",  # Metric to evaluate the best model
    greater_is_better=False,  # Whether higher metric values are better
    load_best_model_at_end=True,  # Load the best model after training
    # Mixed precision and gradient settings
    bf16=True,  # Use bfloat16 precision
    tf32=True,  # Use TensorFloat-32 precision
    max_grad_norm=0.3,  # Maximum norm for gradient clipping
    warmup_ratio=0.03,  # Ratio of total steps for warmup
    # Hub and reporting
    push_to_hub=False,  # Whether to push model to Hugging Face Hub
    report_to="wandb",  # Reporting tool for tracking metrics
    # Gradient checkpointing settings
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Options for gradient checkpointing
    # Dataset configuration
    dataset_text_field="",  # Text field in dataset
    dataset_kwargs={"skip_prepare_dataset": True},  # Additional dataset options
    # max_seq_length=1024  # Maximum sequence length for input
)

training_args.remove_unused_columns = False  # Keep unused columns in dataset


# NOTE(review): the project/run names mention "qwen2-7b ... ChartQA", but this
# notebook trains Qwen2-VL-2B on MathVision - rename as the inline comments ask.
wandb.init(
    project="qwen2-7b-instruct-trl-sft-ChartQA",  # change this
    name="qwen2-7b-instruct-trl-sft-ChartQA",  # change this
    config=training_args,
)

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# # Create datasets
# math_train_data = MathVQADataset(train_data, tokenizer, image_processor)
# math_val_data = MathVQADataset(val_data, tokenizer, image_processor)

# NOTE(review): `model` was already passed through get_peft_model(...) in the
# model-loading cell, and `peft_config` is passed again here - confirm the
# adapters are not applied twice.
# NOTE(review): newer TRL releases may expect `processing_class=` instead of
# `tokenizer=` - verify against the installed version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=MessageDataset(train_data), #math_train_data, #
    eval_dataset=MessageDataset(val_data), # math_val_data, #
    data_collator=collate_fn,
    peft_config=peft_config,
    tokenizer=processor.tokenizer,
)

trainer.train()

# Persist the trained weights under the configured output directory.
trainer.save_model(training_args.output_dir)



In [None]:

def clear_memory(pause_seconds=2):
    """Drop the notebook's heavy globals and release cached GPU memory.

    Removes ``inputs``, ``model``, ``processor``, ``trainer``, ``peft_model``
    and ``bnb_config`` from the module globals when present, runs garbage
    collection, and - only when CUDA is available - empties the CUDA cache and
    prints the remaining allocated/reserved memory.

    Args:
        pause_seconds: sleep inserted between the clean-up stages. Defaults to
            2, the value that used to be hard-coded; pass 0 to skip waiting.
    """
    # Delete variables if they exist in the current global scope
    module_globals = globals()
    for name in ("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"):
        module_globals.pop(name, None)
    time.sleep(pause_seconds)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(pause_seconds)
    # torch.cuda.synchronize() raises on a runtime without CUDA (this notebook
    # has been run on CPU), so only touch the CUDA API when it is available.
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    time.sleep(pause_seconds)
    gc.collect()
    time.sleep(pause_seconds)

    if cuda_available:
        print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    else:
        print("CUDA is not available; no GPU memory to report.")


# Drop the model/processor/trainer globals and release cached GPU memory.
# Any later cell that needs `model` or `processor` must re-create them first.
clear_memory()

In [None]:
# The preceding clear_memory() call deletes `model` and `processor` from the
# notebook globals, so re-create them before attaching an adapter (otherwise
# `model.load_adapter` raises NameError).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
processor = Qwen2VLProcessor.from_pretrained(model_name)

# NOTE(review): this hub id names a "qwen2-7b ... ChartQA" adapter while the
# base model loaded here is Qwen2-VL-2B; it presumably should point at the
# adapter saved by the training cell (its output directory) - confirm.
adapter_path = "sergiopaniego/qwen2-7b-instruct-trl-sft-ChartQA"
model.load_adapter(adapter_path)

In [20]:
# test

# # Load the model
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     model_name,
#     torch_dtype=torch.float32, #if device == "cpu" else torch.bfloat16,
#     device_map=None
# )

# # Initialize processor
# min_pixels = 256 * 28 * 28
# max_pixels = 1280 * 28 * 28
# processor = AutoProcessor.from_pretrained(model_name, min_pixels=min_pixels, max_pixels=max_pixels)


# One dict per test record: the original fields plus the model's answer.
results = []

# The loop variable is `sample` rather than `input`, so the built-in input()
# is no longer shadowed in the notebook's global namespace; the unused
# enumerate() index was dropped.
for sample in tqdm(test_data):
    output_text = generate_text_from_sample(model, processor, sample)

    # Store results
    results.append({**sample, "generated_text": output_text})


# pd.DataFrame(results).to_csv(f"{img_folder_path}/test_generated_answer.csv")

  0%|          | 0/608 [05:20<?, ?it/s]


KeyboardInterrupt: 

In [None]:

class MathVQADataset(Dataset):
    """Map-style dataset producing fixed-length causal-LM training rows.

    Each row holds ``input_ids`` (prompt followed by the answer),
    ``attention_mask``, ``labels`` and ``pixel_values``. ``labels`` has the same
    length as ``input_ids`` and is -100 everywhere except on the answer tokens,
    so the loss is computed on the answer only. The previous version tokenized
    the answer separately and returned it as ``labels``, which neither lined up
    with ``input_ids`` position-by-position nor masked padding.

    NOTE(review): the prompt contains no image placeholder tokens and no
    ``image_grid_thw`` is returned, so the image may not actually reach a
    Qwen2-VL model - confirm against the model's expected inputs.

    Args:
        data: indexable records with "image", "question" and "answer" fields.
        tokenizer: callable tokenizer returning a mapping with "input_ids".
        image_processor: callable returning a mapping with "pixel_values".
        max_length: fixed length of every returned token sequence.
        image_size: (height, width) requested from the image processor.
    """

    def __init__(self, data: List[Dict], tokenizer, image_processor, max_length=512, image_size=(448, 448)):  # Updated image size
        self.data = data
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.max_length = max_length
        self.image_size = image_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # Load and process image
        image = Image.open(f"{img_folder_path}/{item['image']}").convert('RGB')

        # Process image with all required parameters
        image_features = self.image_processor(
            image,
            return_tensors="pt",
            do_resize=True,
            size={"height": self.image_size[0], "width": self.image_size[1]},
            do_normalize=True
        )

        # Create prompt with system message
        prompt = f"<|im_start|>system\nYou are a helpful assistant that answers math questions based on images.<|im_end|>\n<|im_start|>user\nQuestion: {item['question']}<|im_end|>\n<|im_start|>assistant\nLet me solve this step by step.\nAnswer:"
        target = f"{item['answer']}<|im_end|>"

        # Tokenize prompt and answer separately (the control tokens are already
        # spelled out in the text) so the boundary between them is known.
        prompt_ids = list(self.tokenizer(prompt, add_special_tokens=False)["input_ids"])
        target_ids = list(self.tokenizer(target, add_special_tokens=False)["input_ids"])

        # Fit into max_length: the answer is kept (it carries the supervision)
        # and the prompt is shortened if the pair is too long.
        target_ids = target_ids[: self.max_length]
        prompt_ids = prompt_ids[: self.max_length - len(target_ids)]

        input_ids = prompt_ids + target_ids
        labels = [-100] * len(prompt_ids) + target_ids
        attention_mask = [1] * len(input_ids)

        # Right-pad everything to max_length; padding is ignored by the loss.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = getattr(self.tokenizer, "eos_token_id", None)
        if pad_id is None:
            pad_id = 0
        n_pad = self.max_length - len(input_ids)
        input_ids = input_ids + [pad_id] * n_pad
        attention_mask = attention_mask + [0] * n_pad
        labels = labels + [-100] * n_pad

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
            'pixel_values': image_features['pixel_values'].squeeze(),
        }

def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Note: an earlier cell defines a different function under this same name;
    the definition from whichever cell ran last is the one in effect.

    Args:
        batch: list of dicts, each holding same-shaped tensors under
            'input_ids', 'attention_mask', 'labels' and 'pixel_values'.

    Returns:
        A dict with those four keys, each stacked along a new leading axis.
    """
    field_names = ('input_ids', 'attention_mask', 'labels', 'pixel_values')
    return {
        name: torch.stack([sample[name] for sample in batch])
        for name in field_names
    }

def train_model(train_data: List[Dict], validation_data: List[Dict], model_save_name: str):
    """Fine-tune Qwen2-VL-2B-Instruct on math VQA records and save the result.

    Loads the model, tokenizer and image processor from the hub, wraps both
    record collections in ``MathVQADataset``, trains with the Hugging Face
    ``Trainer`` and writes the model, tokenizer and image processor to
    ``./fine_tuned_<model_save_name>``.

    Args:
        train_data: indexable training records (see ``MathVQADataset``).
        validation_data: indexable validation records.
        model_save_name: suffix used for the output directory name.

    Returns:
        None. The trained artefacts are written to disk.
    """
    # Initialize model, tokenizer, and image processor with trust_remote_code=True
    model_name = "Qwen/Qwen2-VL-2B-Instruct"
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype="auto", device_map="auto"  # Enable automatic device mapping
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    image_processor = AutoImageProcessor.from_pretrained(model_name, trust_remote_code=True)

    # Set output directory
    output_dir = f"./fine_tuned_{model_save_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Create datasets
    train_dataset = MathVQADataset(train_data, tokenizer, image_processor)
    val_dataset = MathVQADataset(validation_data, tokenizer, image_processor)

    # Define training arguments
    # NOTE(review): fp16=True is combined with torch_dtype="auto" weights -
    # confirm the precision settings for the target hardware.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=2,  # Reduced batch size
        per_device_eval_batch_size=2,   # Reduced batch size
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(output_dir, 'logs'),
        logging_steps=100,
        # `eval_strategy` replaces the deprecated `evaluation_strategy` and is
        # the spelling the SFTConfig cell of this notebook already uses.
        eval_strategy="steps",
        eval_steps=500,
        save_steps=1000,
        learning_rate=2e-5,
        fp16=True,
        gradient_accumulation_steps=8,  # Increased gradient accumulation
        save_total_limit=3,
        # The collator needs every field the dataset returns.
        remove_unused_columns=False,
        dataloader_num_workers=2
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=collate_fn,
    )

    # Start training
    trainer.train()

    # Save the final model together with the preprocessing objects so the
    # output directory can be reloaded on its own.
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)
    image_processor.save_pretrained(output_dir)

# Usage
# Trains on the train/validation subsets created in the data-split cell and
# writes the result to ./fine_tuned_math_vqa_model.
model_save_name = "math_vqa_model"
train_model(train_data, val_data, model_save_name)

In [None]:
def generate_prompt(example, prompt_type ='full', context=''):
    """Build the text prompt for one example.

    Args:
        example: mapping with a 'question' string and an 'options' list that is
            either empty or holds exactly five entries (asserted).
        prompt_type: 'simple' for a short-answer prompt; any other value yields
            the step-by-step prompt.
        context: text placed before the question in the step-by-step prompt.

    Returns:
        The prompt string. Options are rendered as "(A) ..." to "(E) ..." lines
        unless the five options are literally the letters A-E, in which case
        they are omitted.
    """
    question = example['question']
    choices = example['options']

    options = ''
    if len(choices) > 0:
        assert len(choices) == 5, example
        # Bare "A".."E" placeholders carry no information, so they are skipped.
        if ''.join(choices) != 'ABCDE':
            options = ''.join(
                f"({letter}) {choice}\n" for letter, choice in zip('ABCDE', choices)
            )

    if prompt_type == 'simple':
        return f"{question}\n{options}\nAnswer the question using a single word or phrase."

    instructions = 'Please solve the problem by elaborately, providing step-by-step reasoning for the solution and put your answer in one "\\boxed{}. Also, provide the bounding box coordinate of the region that can help you answer the question better.If it is a multiple choice question, only one letter is allowed in the \\boxed{}.\n'
    # The eight-space indents are part of the prompt text and are kept as-is.
    return f"{context}\n        Question: {question}\n{options}\n        " + instructions

In [None]:
import re
# Substrings that final_answer() strips from the text following "answer is".
# They are removed wherever they occur, including inside longer words.
remove_list = ['the answer', 'is', ':', 'boxed', '$']

# Rebinds `results` (the list of per-sample dicts built by the inference loop)
# to a DataFrame; re-running this cell wraps the existing DataFrame again.
results = pd.DataFrame(results)
def final_answer(text: str):
    """Extract the final answer that follows the last "answer is" in ``text``.

    The marker and the ``remove_list`` fragments are matched case-insensitively,
    but the answer keeps its original case so that a prediction like ``B`` can
    equal a gold answer ``B`` (the previous version lower-cased everything).

    Args:
        text: generated model output. Non-string values (e.g. NaN from a
            missing cell) are treated as "no answer".

    Returns:
        The cleaned answer string, or ``np.nan`` when ``text`` is not a string
        or contains no "answer is" marker.
    """
    if not isinstance(text, str):
        return np.nan

    pieces = re.split('answer is', text, flags=re.IGNORECASE)
    if len(pieces) == 1:
        return np.nan

    # Create a regex pattern to match all words in the remove_list (plus whitespace)
    pattern = r'(' + r'|'.join(re.escape(word) for word in remove_list) + r'|\s)'
    # Use re.sub to replace matched patterns with an empty string
    result = re.sub(pattern, '', pieces[-1], flags=re.IGNORECASE)
    # Removing "boxed" from "\boxed{X}" leaves "\{X}"; unwrap it for any X with
    # at most one level of nested braces (previously only "\frac{..}{..}").
    result = re.sub(r'\\\{((?:[^{}]|\{[^{}]*\})*)\}', r'\1', result)
    # Collapse doubled backslashes into single ones.
    return re.sub(r'\\\\', r'\\', result)
# Example metric: String matching (very basic)
# Extract a prediction per row, then compare it verbatim with the gold answer.
# Rows where final_answer() returned NaN compare as False.
results['prediction'] = results['generated_text'].apply(final_answer)
results['exact_match'] = results['prediction'] == results['answer']
# NOTE(review): this renders the entire DataFrame; consider results.head().
display(results)
# Persist predictions next to the images on Drive (`results` is already a
# DataFrame here, so the pd.DataFrame(...) wrapper is redundant).
pd.DataFrame(results).to_csv(f"{img_folder_path}/test_generated_answer.csv")

# Fraction of rows whose prediction equals the answer exactly; raises
# ZeroDivisionError when `results` is empty.
accuracy = sum(results['exact_match']) / len(results)
print(f"Exact Match Accuracy: {accuracy:.4f}")

# Optionally save results
# import json
# with open("evaluation_results.json", "w") as f:
#     json.dump(results, f, indent=4)