In [None]:
# Install Unsloth from PyPI first, then replace just the unsloth package with the
# current GitHub build. --no-deps means the second install does not touch the
# dependencies pulled in by the first one; --no-cache-dir skips pip's cache.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
from PIL import Image

In [None]:
# Download the competition archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle competitions download -c image-processing-thai-language-image-captioning

In [None]:
# Extract the competition archive into the current working directory.
!unzip image-processing-thai-language-image-captioning.zip

In [None]:
# Delete the archive once it has been extracted.
!rm image-processing-thai-language-image-captioning.zip

In [None]:
# Fetch the COCO train2017 image archive.
!wget http://images.cocodataset.org/zips/train2017.zip

In [None]:
# Fetch the COCO val2017 image archive.
!wget http://images.cocodataset.org/zips/val2017.zip

In [None]:
# Extract the training images into the working directory.
# NOTE(review): the path remapping further down expects ./train2017/ — confirm the archive unpacks to that folder.
!unzip train2017.zip

In [None]:
# Extract the validation images into the working directory.
# NOTE(review): the path remapping further down expects ./val2017/ — confirm the archive unpacks to that folder.
!unzip val2017.zip

In [None]:
# Remove the training archive after extraction.
!rm train2017.zip

In [None]:
# Remove the validation archive after extraction.
!rm val2017.zip

In [None]:
# Parse the UTF-8 training annotation file into `train_data`.
# Later cells iterate it as a mapping of image path -> list of captions.
with open("capgen_v1.0_train.json", mode="r", encoding="utf-8") as train_fh:
    train_data = json.loads(train_fh.read())

In [None]:
# Parse the UTF-8 validation annotation file into `val_data`.
# Later cells iterate it as a mapping of image path -> list of captions.
with open("capgen_v1.0_val.json", mode="r", encoding="utf-8") as val_fh:
    val_data = json.loads(val_fh.read())

In [None]:
import json

# Substring -> replacement pairs that turn the dataset-relative image paths
# stored in the annotation files into paths relative to this working directory.
train_replacements = {
    "coco/train2017/": "./train2017/",
    "ipu24/train/": "./train/train/",
}

val_replacements = {
    "coco/val2017/": "./val2017/",
    "ipu24/val/": "./val/val/",
}


def _remap_paths(data, replacements):
    """
    Rewrite the keys of an annotation mapping using substring replacements.

    Args:
        data: mapping of image path -> captions (values are passed through untouched).
        replacements: mapping of substring -> replacement. Every entry whose
            substring occurs in the (possibly already rewritten) path is applied,
            in the mapping's iteration order.

    Returns:
        (updated, unreplaced) where `updated` maps the rewritten path to the
        original captions and `unreplaced` lists the original paths that matched
        none of the substrings (those keep their original key in `updated`).
    """
    updated = {}
    unreplaced = []
    for original_path, captions in data.items():
        new_path = original_path
        matched = False
        for old, new in replacements.items():
            if old in new_path:
                new_path = new_path.replace(old, new)
                matched = True
        if not matched:
            unreplaced.append(original_path)
        updated[new_path] = captions
    return updated, unreplaced


# Apply the replacements to both splits with the same helper.
train_updated_data, unreplaced_paths_train = _remap_paths(train_data, train_replacements)
val_updated_data, unreplaced_paths_val = _remap_paths(val_data, val_replacements)
unreplaced_count_train = len(unreplaced_paths_train)
unreplaced_count_val = len(unreplaced_paths_val)

# Save updated JSON (ensure_ascii=False keeps the Thai captions human-readable).
with open("capgen_v1.0_train_modified.json", "w", encoding="utf-8") as file:
    json.dump(train_updated_data, file, ensure_ascii=False, indent=4)

with open("capgen_v1.0_val_modified.json", "w", encoding="utf-8") as file:
    json.dump(val_updated_data, file, ensure_ascii=False, indent=4)

# Summary of paths that matched no replacement. This dict is only kept in
# memory for inspection; it is not written to disk by this cell.
metadata = {
    "unreplaced_count_train": unreplaced_count_train,
    "unreplaced_paths_train": unreplaced_paths_train,
    "unreplaced_count_val": unreplaced_count_val,
    "unreplaced_paths_val": unreplaced_paths_val,
}

print("Custom modified JSONs saved successfully!")
print(f"Unreplaced train paths count: {unreplaced_count_train}")
print(f"Unreplaced val paths count: {unreplaced_count_val}")


In [None]:
# Number of entries (image paths) in the remapped training annotations.
len(train_updated_data)

In [None]:
# Print only the first (path, captions) pair of the remapped validation annotations.
for i in val_updated_data.items():
    print(i)
    break

In [None]:
# Builds the full list of items, then prints the first three of them.
print(list(train_updated_data.items())[:3])  # Show first 3 items

In [None]:
# Same peek for the validation split.
print(list(val_updated_data.items())[:3])  # Show first 3 items

In [None]:
# ## Explore the data by finding the first index
# for i in train_updated_data.keys():
#     print(i)
#     x = i
#     image = Image.open(i)
#     break

In [None]:
# Image.open(x)

In [None]:
# train_updated_data[x]

In [None]:
# ## Explore the data by finding the first index
# for i in val_updated_data.keys():
#     print(i)
#     y = i
#     image = Image.open(i)
#     break

In [None]:
# Image.open(y)

In [None]:
# val_updated_data[y]

In [None]:
# len(train_updated_data.keys())

In [None]:
# len(val_updated_data)

In [None]:
import random
from PIL import Image
from tqdm import tqdm

# Thai-language prompt (asks for a detailed Thai caption of the picture) used as
# the user-turn text of every training example.
instruction = "ได้โปรดคิดคำบรรยายภาพภาษาไทยรูปนี้ให้หน่อย พยายามบอกรายละเอียดของสิ่งต่าง ๆ เหมือนตอนทำโจทย์ Image Captioning"

def convert_to_conversation(sample):
    """
    Build one two-turn chat example from a captioned image.

    Args:
        sample: mapping with "image_path" (handed to PIL's Image.open) and
            "caption" (text used as the assistant reply).

    Returns:
        A dict with a single "messages" key holding the user turn (the
        module-level `instruction` text followed by the opened image) and the
        assistant turn (the caption text).

    Note:
        The object returned by Image.open is stored in the message as-is and
        is not closed by this function.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": Image.open(sample["image_path"])},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["caption"]}],
    }
    return {"messages": [user_turn, assistant_turn]}

# Convert the training data into a list of conversation dictionaries.
# One caption is sampled per image. A dedicated RNG with a fixed seed makes the
# selection repeatable across kernel restarts (the LoRA and trainer cells already
# pin their seeds to 42) without touching the global `random` state.
caption_rng = random.Random(42)
converted_train_dataset = [
    convert_to_conversation({"image_path": img_path, "caption": caption_rng.choice(captions)})
    for img_path, captions in tqdm(train_updated_data.items(), desc="Converting Train Data")
]

# Validation counterpart (disabled). Note that it must iterate val_updated_data.
# converted_val_dataset = [
#     convert_to_conversation({"image_path": img_path, "caption": caption_rng.choice(captions)})
#     for img_path, captions in tqdm(val_updated_data.items(), desc="Converting Val Data")
# ]


In [None]:
# NOTE(review): presumably needed for the "bnb-4bit" checkpoint and the
# "adamw_8bit" optimizer used in later cells — confirm.
!pip install bitsandbytes

In [None]:
# del model

In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# Reference list of Unsloth's pre-quantized 4-bit vision checkpoints
# (more at https://huggingface.co/unsloth). Only the checkpoint named in the
# from_pretrained call below is actually loaded.
fourbit_models = [
    # Llama 3.2 vision
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",  # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",
    # Pixtral (fits in 16GB) and its base model
    "unsloth/Pixtral-12B-2409-bnb-4bit",
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",
    # Qwen2 VL
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",
    # Llava variants
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
]

# Loading options: 4-bit weights to reduce memory use (False would mean 16-bit
# LoRA) and Unsloth's gradient checkpointing mode (True or "unsloth").
load_options = dict(
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)

# Returns the model together with the object this notebook calls `tokenizer`.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    **load_options,
)

In [None]:
# LoRA configuration. The four finetune_* switches select which parts of the
# network receive adapters; set one to False to leave that part untouched.
lora_settings = dict(
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=256,            # rank: larger can be more accurate but might overfit
    lora_alpha=256,   # recommended to be at least equal to r
    lora_dropout=0,
    bias="none",
    random_state=42,
    use_rslora=False,   # rank-stabilized LoRA is available but not used
    loftq_config=None,  # LoftQ is available but not used
    # target_modules="all-linear" is optional; a list can be given if needed
)

# NOTE(review): this rebinds `model` to the wrapped model, so re-running the
# cell feeds the already wrapped model back in — restart from the loading cell
# if it needs to be run again.
model = FastVisionModel.get_peft_model(model, **lora_settings)

In [None]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Put the model into Unsloth's training mode before the trainer is built.
FastVisionModel.for_training(model)

# Unsloth's collator for vision samples (the upstream recipe marks it as required).
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Mixed precision: bf16 when the runtime reports support, fp16 otherwise.
bf16_available = is_bf16_supported()

sft_config = SFTConfig(
    # Batching: 4 samples x 4 accumulation steps = 16 samples per optimizer step.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Schedule: a short fixed-length run. For a full run, set
    # num_train_epochs (e.g. 2) instead of max_steps.
    warmup_steps=5,
    max_steps=100,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    # Precision / optimizer.
    fp16=not bf16_available,
    bf16=bf16_available,
    optim="adamw_8bit",
    weight_decay=0.01,
    # Bookkeeping.
    logging_steps=1,
    seed=42,
    output_dir="outputs",
    report_to="none",  # set to a tracker such as Weights and Biases to enable logging
    # The items below are required for vision fine-tuning with this recipe.
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    dataset_num_proc=4,
    # NOTE(review): 64 is small if image placeholder tokens count towards the
    # sequence length — confirm this value is intended.
    max_seq_length=64,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=vision_collator,
    train_dataset=converted_train_dataset,
    args=sft_config,
)


In [None]:
# Run fine-tuning with the trainer built above; the value returned by
# trainer.train() is kept in `train_stats` for later inspection.
train_stats = trainer.train()

In [None]:
from transformers import TextStreamer

# Switch the model to Unsloth's inference mode.
FastVisionModel.for_inference(model)

# One hard-coded test image is used as a smoke test of the fine-tuned model.
image = Image.open("/content/test/test/00011.jpg")
instruction = "ได้โปรดคิดคำบรรยายภาพภาษาไทยรูปนี้ให้หน่อย พยายามบอกรายละเอียดของสิ่งต่าง ๆ เหมือนตอนทำโจทย์ Image Captioning"

# Single user turn: an image placeholder followed by the instruction text.
user_content = [
    {"type": "image"},
    {"type": "text", "text": instruction},
]
messages = [{"role": "user", "content": user_content}]

# Render the chat template to text, then encode image + text together and
# move the resulting tensors to the GPU.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

# Stream the generated text to the cell output as it is produced,
# without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
generation_options = dict(
    streamer=text_streamer,
    max_new_tokens=60,
    use_cache=True,
    temperature=0.5,
    min_p=0.1,
)
_ = model.generate(**inputs, **generation_options)

In [None]:
import glob
import os
import csv
import shutil
import pandas as pd
from tqdm import tqdm
from PIL import Image
from unsloth import FastVisionModel
import torch

# Input template and output path for the submission.
sample_submission_file = "sample_submission.csv"
output_file = "submission.csv"

# Directory holding the test images, named by zero-padded image id.
TEST_IMAGE_DIR = "/content/test/test"
BATCH_SIZE = 4  # <-- pick a suitable batch size for your GPU

# Start submission.csv as a copy of the sample file, then load it to get the ids.
shutil.copy(sample_submission_file, output_file)
df_submission = pd.read_csv(output_file)
image_ids = df_submission["image_id"].tolist()

# Switch the fine-tuned model to inference mode.
FastVisionModel.for_inference(model)

# Batched generation with a decoder-only model continues from the last position
# of every row, so shorter prompts must be padded on the left. `tokenizer` may
# be a processor wrapping the text tokenizer; set the side on whichever exists.
getattr(tokenizer, "tokenizer", tokenizer).padding_side = "left"

# The prompt is identical for every image, so render the chat template once
# instead of once per image.
instruction = "ได้โปรดคิดคำบรรยายภาพภาษาไทยรูปนี้ให้หน่อย พยายามบอกรายละเอียดของสิ่งต่าง ๆ เหมือนตอนทำโจทย์ Image Captioning"
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]
    }
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

output_rows = []  # (image_id, caption) pairs for images that were captioned
missing_ids = []  # ids from the CSV with no matching image file

# Process images in batches
for start in tqdm(range(0, len(image_ids), BATCH_SIZE), desc="Generating captions in batches"):
    batch_ids = image_ids[start : start + BATCH_SIZE]

    images = []
    valid_ids = []  # ids in this batch whose image file exists
    for image_id in batch_ids:
        img_path = f"{TEST_IMAGE_DIR}/{image_id:05d}.jpg"  # ids are zero-padded to 5 digits
        if not os.path.exists(img_path):
            missing_ids.append(image_id)
            continue
        # Image.open is lazy and keeps the file open. Take an in-memory copy
        # and release the handle right away so descriptors do not accumulate.
        with Image.open(img_path) as img:
            images.append(img.copy())
        valid_ids.append(image_id)

    if not images:
        continue  # Skip batch if no valid images found

    # Encode the whole batch at once (one prompt per image).
    inputs = tokenizer(
        images,
        [input_text] * len(images),
        add_special_tokens=False,
        return_tensors="pt",
        padding=True
    ).to("cuda")

    # NOTE(review): no do_sample=True is passed — confirm whether temperature
    # and min_p actually take effect with these arguments.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=60,
            use_cache=True,
            temperature=0.5,
            min_p=0.1
        )

    # generate() returns prompt + continuation for each row. Drop the prompt
    # part so the chat template / instruction text is not written into the caption.
    prompt_length = inputs["input_ids"].shape[1]
    for image_id, out_ids in zip(valid_ids, output_ids):
        generated_text = tokenizer.decode(out_ids[prompt_length:], skip_special_tokens=True)
        generated_text = generated_text.replace("<|im_end|>", "").strip()
        output_rows.append((image_id, generated_text))

# Update the CSV with new captions (rows without a generated caption keep
# whatever the sample file contained).
df_submission = df_submission.set_index("image_id")
for image_id, caption in output_rows:
    df_submission.loc[image_id, "caption"] = caption

# Save the updated submission file (image_id is written as the index column).
df_submission.to_csv(output_file)

if missing_ids:
    print(f"Warning: {len(missing_ids)} image(s) not found under {TEST_IMAGE_DIR}; their captions were left unchanged.")
print(f"Done! Captions saved in {output_file}")


In [None]:
# Count missing values per column of df_submission (e.g. captions that were never filled in).
df_submission.isna().sum()

In [None]:
# Fill every remaining empty cell with a short placeholder string and write the
# result as a second submission file.
# NOTE(review): "cleaned_submission.csv" is not produced by any cell visible
# here — confirm where it comes from before relying on this step.
cleaned_submission = pd.read_csv("cleaned_submission.csv")
# index=False: the frame read back from CSV only carries pandas' default row
# numbers, and writing them would add an extra unnamed leading column.
cleaned_submission.fillna("คือ").to_csv("submission2.csv", index=False)
