In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('/kaggle/input/videoframedata/new_frame_data.csv')

## FUCK MEE

In [None]:
%%capture
!pip install -q unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastVisionModel 
import torch

In [None]:
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-2B-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

In [None]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, 
    finetune_language_layers   = True, 
    finetune_attention_modules = True, 
    finetune_mlp_modules       = True, 

    r = 16,         
    lora_alpha = 16,  
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  
    loftq_config = None, 
    
)

In [None]:
import json

output_path = "/kaggle/working/dataset.json"

# Convert the dataset to JSONL format
with open(output_path, "w") as f:
    for _, row in df.iterrows():
        json_data = {
            "image_path": row["frame_path"], 
            "question": row["question"],     
            "answer": row["refined_answer"]  
        }
        f.write(json.dumps(json_data) + "\n")

print(f"JSONL file saved to: {output_path}")

In [None]:
data = []
with open(output_path, "r") as f:
    for line in f:
        data.append(json.loads(line))

In [None]:
image_path = data[1]["image_path"]
question = data[1]["question"]
answer = data[1]["answer"]

print(f"Image Path: {image_path}")
print(f"Question: {question}")
print(f"Answer: {answer}")

In [None]:
import json
from tqdm import tqdm
from PIL import Image

# Instruction for the dataset
instruction = "Answer the question based on the content of this image."

# Function to convert a single sample into a conversation format
def convert_to_conversation(sample):
    try:
        # Load the image using PIL
        image = Image.open(sample["image_path"])
        expected_size = (224, 224)
        # Resize the image
        image = image.resize(expected_size)
        
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": instruction},
                    {"type": "image", "image": image}, # Pass the resized image
                    {"type": "text", "text": sample["question"]}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": sample["answer"]}
                ]
            },
        ]
        return {"messages": conversation}
    except Exception as e:
        print(f"Error processing image {sample['image_path']}: {e}")
        return None # Skip the image if there's an error

In [None]:
converted_dataset = [convert_to_conversation(sample) for sample in tqdm(data, desc="Processing Samples")]
converted_dataset = [sample for sample in converted_dataset if sample is not None]

In [None]:
from PIL import Image
from transformers import TextStreamer

model = FastVisionModel.for_inference(model)

image = converted_dataset[0]['messages'][0]['content'][1]['image']

instruction = "Answer the question based on the content of this image."
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},  
            {"type": "text", "text": instruction}
        ]
    }
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
print(input_text)

inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")
print(inputs)

# Define the text streamer
text_streamer = TextStreamer(
    tokenizer=tokenizer,
    skip_prompt=True  # Skip showing the input prompt in the output
)
print(text_streamer)

# Generate the output
with torch.no_grad():  # Disable gradient computation during inference
    _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128,
                       use_cache=True, temperature=1.5, min_p=0.1)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def compute_bleu(eval_preds):
    predictions, labels = eval_preds
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Tokenize predictions and labels
    tokenized_preds = [pred.split() for pred in decoded_preds]
    tokenized_labels = [[label.split()] for label in decoded_labels]  # Nested for multiple references
    
    # Calculate BLEU score
    bleu_score = corpus_bleu(tokenized_labels, tokenized_preds)
    return {"bleu": bleu_score}

In [None]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(converted_dataset, test_size=0.2, random_state=42)

In [62]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 500,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 1e-5,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",  

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)

In [63]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
3.068 GB of memory reserved.


In [64]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 55,244 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 500
 "-____-"     Number of trainable parameters = 28,950,528
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
1,0.4298
2,0.4348
3,0.4698
4,0.5017
5,0.5533
6,0.4892
7,0.5817
8,0.542
9,0.5312
10,0.5112


In [65]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

3400.8226 seconds used for training.
56.68 minutes used for training.
Peak reserved memory = 3.068 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 20.813 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [66]:
from transformers import TextStreamer

# Enable the model for inference
model = FastVisionModel.for_inference(model)

# Load the image as a PIL image
image = converted_dataset[2120]['messages'][0]['content'][1]['image']

print(image)

# Prepare instruction and messages
instruction = "Which dance is begin performed?"
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},  # Pass the resized PIL image directly
            {"type": "text", "text": instruction}
        ]
    }
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
print(input_text)

inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")
print(inputs)

# Define the text streamer
text_streamer = TextStreamer(
    tokenizer=tokenizer,
    skip_prompt=True  # Skip showing the input prompt in the output
)
print(text_streamer)
with torch.no_grad():  # Disable gradient computation during inference
    _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128,
                       use_cache=True, temperature=1.5, min_p=0.1)

<PIL.Image.Image image mode=RGB size=224x224 at 0x7C9B509EB070>
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Which dance is begin performed?<|im_end|>
<|im_start|>assistant

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 151652, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151653,  23085,
          

In [67]:
model.save_pretrained("kaggle/working/qwen2b")
tokenizer.save_pretrained("kaggle/working/qwen2b")

[]

In [68]:
import os 
import shutil

In [69]:
# Zip the directory
shutil.make_archive("kaggle/working/qwen2b", 'zip', "kaggle/working/qwen2b")

# Optionally, you can remove the original directory to save space
shutil.rmtree("kaggle/working/qwen2b")