Install the required libraries (Unsloth and its dependencies).

In [None]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth


In [None]:
# Mount Google Drive at /content/drive so the dataset CSV and the images stored
# under MyDrive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Downloading the model from Hugging Face.

In [None]:
!pip install torch
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

Parameters of the QLoRA model (LoRA adapters on the 4-bit base model).

In [None]:
# LoRA adapter settings, gathered in one place and then applied to the base model.
lora_settings = dict(
    # Which parts of the network are fine-tuned.
    finetune_vision_layers     = False,  # False if not finetuning vision layers
    finetune_language_layers   = True,   # False if not finetuning language layers
    finetune_attention_modules = True,   # False if not finetuning attention layers
    finetune_mlp_modules       = True,   # False if not finetuning MLP layers

    # Adapter hyper-parameters.
    r            = 16,      # Larger rank: higher accuracy, but might overfit
    lora_alpha   = 16,      # Recommended to be at least equal to r
    lora_dropout = 0,
    bias         = "none",
    random_state = 3407,
    use_rslora   = False,   # Rank-stabilized LoRA is supported
    loftq_config = None,    # LoftQ is supported as well
    # target_modules = "all-linear",  # Optional; a list can be given if needed
)

# Replace `model` with its adapter-wrapped version.
model = FastVisionModel.get_peft_model(model, **lora_settings)

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): an identical mount cell appears earlier in this notebook, so
# this one looks redundant — confirm before removing.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import pandas as pd

# Load the CSV that lists each image file and its expected description.
dataset = pd.read_csv('/content/drive/MyDrive/augmented_synthetic_image_dataset.csv')

# Fail fast with a clear message if the columns used by the later cells
# ("image" for the file path, "image_details" for the target text) are absent.
missing_columns = {"image", "image_details"} - set(dataset.columns)
if missing_columns:
    raise KeyError(f"Dataset CSV is missing required column(s): {sorted(missing_columns)}")

# Show the first few rows (last expression, so the notebook renders it as a table).
dataset.head()


In [None]:
# Inspect the image entry of the row labelled 0.
dataset.loc[0, "image"]

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
dataset.info()

In [None]:
from PIL import Image
import os

In [None]:
# Directory on the mounted Drive that the dataset's image paths are joined onto.
image_base_path = "/content/drive/MyDrive/Anime Images"

Join the image base directory with each file path in the dataset to build the complete path to every image.

In [None]:
import os

# Prefix every entry of the "image" column with the base directory.
dataset['image'] = [
    os.path.join(image_base_path, relative_path)
    for relative_path in dataset['image']
]

Load each image and add it, together with its description, to a list.

In [None]:
# Build the in-memory dataset: one {"image", "text"} record per CSV row.
og_dataset = []
for image_path, details in zip(dataset['image'], dataset['image_details']):
    # Image.open is lazy; convert('RGB') reads the pixel data and returns a new
    # image object, so the file can be closed as soon as the `with` block ends.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert('RGB')
    og_dataset.append({"image": rgb_image, "text": details})

In [None]:
# Prompt given to the model with every image; it fixes the three allowed
# output shapes (JSON description, "too many characters", "no character").
instruction = """You are an expert anime character recognizer.

Task:
1. If the image has a **single character**, describe it in JSON format like:
{
    "Age": "Young Adult",
    "Gender": "Female",
    "Ethnicity": "Asian",
    "Hair Style": "Ponytail",
    "Hair Color": "Black",
    "Hair Length": "Long",
    "Eye Color": "Brown",
    "Body Type": "Slim",
    "Dress": "Green Yukata"
}
STRICTLY follow this format.

2. If there are **many characters**, simply output:
"too many characters"

3. If there is **no character** (landscape or background), simply output:
"no character"

Do not add any extra words outside the specified output."""


def convert_to_conversation(sample):
    """Wrap one dataset record in a two-turn chat structure.

    Args:
        sample: Mapping with an "image" entry and a "text" entry (the
            expected answer for that image).

    Returns:
        A dict ``{"messages": [user_turn, assistant_turn]}`` where the user
        turn carries the module-level ``instruction`` followed by the image,
        and the assistant turn carries ``sample["text"]``.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["text"]}],
    }
    return {"messages": [user_turn, assistant_turn]}


In [None]:
# Turn every loaded record into its chat-formatted training example.
converted_dataset = list(map(convert_to_conversation, og_dataset))

In [None]:
# Show the first converted example to check the message structure.
converted_dataset[0]

In [None]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Enable the model for training before the trainer is constructed.
FastVisionModel.for_training(model)

# Vision data collator; the notebook marks it as mandatory for this setup.
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Query bf16 support once: bf16 is used when available, fp16 otherwise.
use_bf16 = is_bf16_supported()

training_args = SFTConfig(
    # Batching and schedule.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 30,
    # num_train_epochs = 1,  # Use this instead of max_steps for full training runs

    # Optimisation.
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",  # No experiment tracker (e.g. Weights and Biases)

    # The items below are required for vision finetuning.
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    dataset_num_proc = 4,
    max_seq_length = 2048,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = vision_collator,
    train_dataset = converted_dataset,
    args = training_args,
)

In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

In [None]:
# Preview the local test image (the same path the inference cell opens).

from PIL import Image
import matplotlib.pyplot as plt

image_path = "/content/download.jpeg"

try:
  # The context manager closes the file once the preview has been drawn.
  with Image.open(image_path) as img:
    plt.imshow(img)
    plt.axis('off')  # Hide axes
    plt.show()
except FileNotFoundError:
  print(f"Error: Image not found at {image_path}")
except OSError as e:
  # Unreadable or non-image files; other (unexpected) errors are left to
  # surface with a full traceback instead of being reduced to one line.
  print(f"An error occurred: {e}")


In [None]:
from transformers import TextStreamer

# Enable the fine-tuned model for inference.
FastVisionModel.for_inference(model)

# Test image, converted to RGB in the same way as the training images.
image = Image.open("/content/download.jpeg").convert('RGB')

# Prompt text; same wording as the instruction used for the training conversations.
instruction = """You are an expert anime character recognizer.

Task:
1. If the image has a **single character**, describe it in JSON format like:
{
    "Age": "Young Adult",
    "Gender": "Female",
    "Ethnicity": "Asian",
    "Hair Style": "Ponytail",
    "Hair Color": "Black",
    "Hair Length": "Long",
    "Eye Color": "Brown",
    "Body Type": "Slim",
    "Dress": "Green Yukata"
}
STRICTLY follow this format.

2. If there are **many characters**, simply output:
"too many characters"

3. If there is **no character** (landscape or background), simply output:
"no character"

Do not add any extra words outside the specified output."""

# Single user turn: an image placeholder followed by the instruction text.
user_turn = {
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": instruction},
    ],
}
messages = [user_turn]

# Render the chat template, then encode image + prompt and move them to the GPU.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(image, input_text, add_special_tokens = False, return_tensors = "pt")
inputs = inputs.to("cuda")

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
generation_options = dict(
    streamer = text_streamer,
    max_new_tokens = 128,
    use_cache = True,
    temperature = 1.5,
    min_p = 0.1,
)
_ = model.generate(**inputs, **generation_options)

Saving the Model.

In [None]:
# @title Setup to enable bash commands
import locale

def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, regardless of the real locale.

    Installed below in place of ``locale.getpreferredencoding``. The optional
    ``do_setlocale`` argument mirrors the standard-library signature so that
    callers passing it (e.g. ``locale.getpreferredencoding(False)``) keep
    working instead of raising ``TypeError``; its value is ignored.

    Returns:
        The string ``"UTF-8"``.
    """
    return "UTF-8"

# Override the standard-library function for the rest of this session.
locale.getpreferredencoding = getpreferredencoding

In [None]:
# Directory name for the saved model; used by the save, ls and cp cells that follow.
Anime_recog_model = "json-recogniser-llama"

In [None]:
# Save the fine-tuned model into the local output directory.
model.save_pretrained(Anime_recog_model)
# Save the tokenizer into the same directory so the saved folder also carries
# the tokenizer configuration (model and tokenizer are normally saved as a pair).
tokenizer.save_pretrained(Anime_recog_model)

In [None]:
# List the contents of the saved model directory with human-readable sizes.
!ls -lh {Anime_recog_model}

In [None]:
# Copy the saved model directory into the mounted Google Drive (MyDrive root).
!cp -r {Anime_recog_model} /content/drive/MyDrive