### Installing requirements

In [None]:
!pip install torch
!pip install numpy
!pip install tqdm
!pip install torchvision
!pip install transformers
!pip install datasets
!pip install peft
!pip install accelerate
!pip install --upgrade bitsandbytes
!pip install matplotlib
!pip install git+https://github.com/openai/CLIP.git
!pip install git+https://github.com/salaniz/pycocoevalcap

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

### Necessary imports

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler, BitsAndBytesConfig
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from accelerate import Accelerator
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
import bitsandbytes as bnb

### Defining variables

In [None]:
# Notebook-wide configuration: checkpoint / dataset identifiers and training hyper-parameters.
MODEL_NAME = "OpenGVLab/InternVL2_5-4B"
DATASET_NAME = "d0rj/LLaVA-OneVision-Data-ru"
DATASET_SUBDIR = "ureader_cap"
OUTPUT_DIR = "./finetuned_model"
BATCH_SIZE = 1  # small batch for a P100 so that we do not run out of VRAM
EPOCHS = 3
LR = 2e-4
MAX_LENGTH = 512
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
INPUT_SIZE = 448   # passed to load_image as the tile size
MAX_NUM = 6       # maximum number of tiles

### Loading model and tokenizer

In [None]:
# Load the tokenizer for the checkpoint (remote code allowed, fast tokenizer requested).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, use_fast=True)

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# 4-bit model loaded through bitsandbytes
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map='auto',
    quantization_config=quantization_config,
    trust_remote_code=True
)

# Prepare the quantized model for LoRA / QLoRA training
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # example for transformer models; can be adjusted to the model
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# Wrap the model with the LoRA adapters and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

### Preparing the dataset

In [None]:
# Load the DATASET_SUBDIR configuration of the dataset.
dataset = load_dataset(DATASET_NAME, DATASET_SUBDIR)

# Keep only the first 3000 examples of the train split.
train_data = dataset["train"].select(range(3000))

# Per-channel mean / std used by build_transform to normalize image tensors.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

In [None]:
def build_transform(input_size):
    """Build the per-tile image transform.

    The returned pipeline converts the image to RGB when needed, resizes it to
    ``input_size`` x ``input_size`` with bicubic interpolation, converts it to
    a tensor and normalizes it with IMAGENET_MEAN / IMAGENET_STD.

    Args:
        input_size: Side length (in pixels) of the square output.

    Returns:
        A ``T.Compose`` object applying the steps above in order.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

In [None]:
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tile grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of ``(cols, rows)`` candidates, scanned in order.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Side length of one square tile.

    Returns:
        The candidate with the smallest absolute aspect-ratio difference.
        On an exact tie a later candidate replaces the current choice only
        when the source area exceeds half of that candidate's total tile area.
        ``(1, 1)`` is returned when there are no candidates.
    """
    source_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        # Tie: prefer the later grid only if the image is large enough to fill it.
        if gap == smallest_gap and source_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate
    return chosen

In [None]:
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image onto a grid of square tiles and cut it into those tiles.

    Args:
        image: Object exposing ``size``, ``resize`` and ``crop`` (a PIL image
            in this notebook).
        min_num: Minimum number of tiles a candidate grid may contain.
        max_num: Maximum number of tiles a candidate grid may contain.
        image_size: Side length of each square tile.
        use_thumbnail: When true and more than one tile was produced, append a
            ``image_size`` x ``image_size`` resize of the whole image.

    Returns:
        List of tiles in row-major order, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Every (cols, rows) grid whose tile count lies in [min_num, max_num],
    # ordered by tile count.
    grid_options = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    grid_options = sorted(grid_options, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(src_ratio, grid_options, src_w, src_h, image_size)
    n_cols, n_rows = best_grid[0], best_grid[1]

    canvas_w = image_size * n_cols
    canvas_h = image_size * n_rows
    n_tiles = n_cols * n_rows

    canvas = image.resize((canvas_w, canvas_h))
    tiles_per_row = canvas_w // image_size
    tiles = []
    for idx in range(n_tiles):
        row, col = divmod(idx, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == n_tiles

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles


In [None]:
def load_image(image_file, input_size=448, max_num=12):
    """Turn an image into a stacked tensor of normalized tiles.

    Args:
        image_file: Image object providing ``convert`` (a PIL image in this
            notebook).
        input_size: Tile side length, forwarded to build_transform and
            dynamic_preprocess.
        max_num: Maximum number of grid tiles, forwarded to dynamic_preprocess.

    Returns:
        ``torch.stack`` of the transformed tiles; the first dimension indexes
        the tiles (including the thumbnail that dynamic_preprocess appends
        when it produces more than one tile).
    """
    rgb_image = image_file.convert('RGB')
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(rgb_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([to_tensor(tile) for tile in tiles])

In [None]:
def preprocess_function(examples):
    """Convert a batch of raw examples into model inputs.

    For every example the "human" and "gpt" turns are encoded one by one (each
    followed by a newline) and concatenated. Labels are built from that same
    token sequence, so the supervision mask is aligned with ``input_ids`` by
    construction: it does not depend on special tokens, on the tokenizer's
    padding side, or on merges across turn boundaries. Only tokens of "gpt"
    turns are supervised; every other position, including padding, is -100.

    Args:
        examples: Batch dict with "conversations" (lists of turns with "from"
            and "value" keys) and "image" columns.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels" tensors of shape
        ``[batch, MAX_LENGTH]`` and "pixel_values", a list with one tile tensor
        per example (tile counts may differ between examples).

    Raises:
        ValueError: If the tokenizer has neither a pad nor an EOS token id.
    """
    # NOTE(review): the text is used as-is; no image-context tokens are inserted
    # for the tiles. Presumably the InternVL chat model needs them (and
    # image_flags) to consume pixel_values - confirm against the model code.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    if pad_id is None:
        raise ValueError("tokenizer has neither a pad token nor an EOS token to pad with")

    input_ids_list = []
    attention_mask_list = []
    pixel_values_list = []
    labels_list = []

    for convs, image in zip(examples["conversations"], examples["image"]):
        token_ids = []
        supervised = []

        for turn in convs:
            speaker = turn["from"]
            if speaker not in ("human", "gpt"):
                continue  # other speakers are ignored
            turn_ids = tokenizer(turn["value"] + "\n", add_special_tokens=False)["input_ids"]
            token_ids.extend(turn_ids)
            supervised.extend([speaker == "gpt"] * len(turn_ids))

        # Truncate, then right-pad to the fixed MAX_LENGTH.
        token_ids = token_ids[:MAX_LENGTH]
        supervised = supervised[:MAX_LENGTH]
        n_real = len(token_ids)
        n_pad = MAX_LENGTH - n_real

        input_ids = torch.tensor(token_ids + [pad_id] * n_pad, dtype=torch.long)
        attention_mask = torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.long)
        keep = torch.tensor(supervised + [False] * n_pad, dtype=torch.bool)
        labels = torch.where(keep, input_ids, torch.full_like(input_ids, -100))

        pixel_values = load_image(image, input_size=INPUT_SIZE, max_num=MAX_NUM)

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        pixel_values_list.append(pixel_values)
        labels_list.append(labels)

    return {
        "input_ids": torch.stack(input_ids_list),
        "attention_mask": torch.stack(attention_mask_list),
        # The number of tiles differs per example, so this stays a list and is
        # padded later in collate_fn.
        "pixel_values": pixel_values_list,
        "labels": torch.stack(labels_list),
    }

### Preprocessing and splitting into training and validation samples

In [None]:
# Drop every original column so that only the outputs of preprocess_function remain
remove_columns = train_data.column_names

# Apply the preprocessing
# NOTE(review): this materializes the pixel values of every example in the
# mapped dataset, which is likely to be very large - consider an on-the-fly transform.
processed_dataset = train_data.map(
    preprocess_function,
    batched=True,
    batch_size=16,
    remove_columns=remove_columns,
    load_from_cache_file=False
)

# Split into train / validation; a fixed seed makes the split reproducible
split = processed_dataset.train_test_split(test_size=0.1, seed=42)

# map() stores the returned tensors as plain Arrow lists, so request torch
# tensors on access; otherwise torch.stack in collate_fn receives Python lists.
train_dataset = split["train"].with_format("torch")
val_dataset = split["test"].with_format("torch")

In [None]:
def collate_fn(batch):
    """Collate dataset items into one batch, zero-padding the tile dimension.

    Item values may be tensors or plain (nested) lists, as returned by a
    dataset without a torch format; everything is coerced with
    ``torch.as_tensor`` before stacking.

    Args:
        batch: List of dicts with "input_ids", "attention_mask", "labels"
            (equal length across items) and "pixel_values" (first dimension =
            number of tiles, which may differ between items).

    Returns:
        Dict with stacked "input_ids", "attention_mask", "labels" and
        "pixel_values" of shape ``[batch, max_tiles, ...]``, where items with
        fewer tiles are padded with zeros at the end.
    """
    def _stack(key):
        return torch.stack([torch.as_tensor(item[key]) for item in batch])

    input_ids = _stack("input_ids")
    attention_mask = _stack("attention_mask")
    labels = _stack("labels")

    pixel_values_list = [torch.as_tensor(item["pixel_values"]) for item in batch]
    max_tiles = max(pv.size(0) for pv in pixel_values_list)

    padded_pixel_values = []
    for pv in pixel_values_list:
        pad_len = max_tiles - pv.size(0)
        if pad_len > 0:
            # Zero tiles with the same dtype/device as the real ones.
            pv = torch.cat([pv, pv.new_zeros((pad_len, *pv.shape[1:]))], dim=0)
        padded_pixel_values.append(pv)
    # NOTE(review): the result has a leading batch dimension in front of the
    # tile dimension - confirm the model's forward accepts this layout.
    pixel_values = torch.stack(padded_pixel_values)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values,
        "labels": labels
    }

In [None]:
# Batches are assembled by collate_fn; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

### Defining metrics

In [None]:
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
import clip

In [None]:
# Caption scorers and the CLIP ViT-B/32 model (loaded onto DEVICE) used during validation.
cider_scorer = Cider()
spice_scorer = Spice()
clip_model, clip_preprocess = clip.load("ViT-B/32", device=DEVICE)

In [None]:
# Function that computes the CLIP-based score
def compute_clip_score(cand_sentences, ref_sentences, batch_size=16):
    """Mean cosine similarity between CLIP text embeddings of paired sentences.

    Each candidate is compared with the reference at the same index; both are
    encoded with the CLIP text encoder and L2-normalized first.

    Args:
        cand_sentences: List of candidate (generated) sentences.
        ref_sentences: List of reference sentences, aligned with the candidates.
        batch_size: Number of sentence pairs encoded per step.

    Returns:
        ``np.mean`` of the per-pair similarities.
    """
    scores = []
    # Pure evaluation: do not build autograd graphs for the CLIP encoder.
    with torch.no_grad():
        for i in range(0, len(cand_sentences), batch_size):
            batch_cand = cand_sentences[i:i+batch_size]
            batch_ref = ref_sentences[i:i+batch_size]
            # truncate=True: texts longer than CLIP's context length are cut
            # instead of making clip.tokenize raise.
            cand_inputs = clip.tokenize(batch_cand, truncate=True).to(DEVICE)
            ref_inputs = clip.tokenize(batch_ref, truncate=True).to(DEVICE)
            cand_feats = clip_model.encode_text(cand_inputs)
            ref_feats = clip_model.encode_text(ref_inputs)
            cand_feats = cand_feats / cand_feats.norm(dim=-1, keepdim=True)
            ref_feats = ref_feats / ref_feats.norm(dim=-1, keepdim=True)
            scores.extend((cand_feats * ref_feats).sum(dim=-1).tolist())
    return np.mean(scores)

### Optimizer, scheduler, accelerator

In [None]:
# --- Optimizer and scheduler ---
# Hand only the parameters that require gradients to the optimizer; frozen
# weights are never updated, so they do not need to be tracked by AdamW.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=LR)
num_training_steps = EPOCHS * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# --- Accelerator for convenience ---
accelerator = Accelerator()
model, optimizer, train_loader, val_loader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_loader, val_loader, lr_scheduler
)

### Final preparations

In [None]:
# --- Generation helper used for evaluation (num_beams=1, i.e. no beam search)
def generate_text(input_ids, attention_mask, max_new_tokens=None):
    """Generate text for a batch of prompts and decode it to strings.

    Args:
        input_ids: Prompt token ids.
        attention_mask: Attention mask matching ``input_ids``.
        max_new_tokens: Maximum number of tokens to generate. Defaults to
            MAX_LENGTH when omitted.

    Returns:
        List of decoded strings with special tokens removed.
    """
    if max_new_tokens is None:
        max_new_tokens = MAX_LENGTH
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        # Bound the generated part only: the prompts are already padded to
        # MAX_LENGTH, so a total-length cap of MAX_LENGTH leaves no budget.
        max_new_tokens=max_new_tokens,
        num_beams=1,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# --- Training loop: per-epoch history of losses and metrics ---
train_losses = []
val_losses = []
spice_scores = []
cider_scores = []
clip_scores = []

### Training loop

In [None]:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    total_val_loss = 0
    preds = []
    refs = []
    # Labels use -100 as the ignore index, which is not a token id and cannot
    # be decoded; it is swapped for a real special token before decoding.
    decode_fill_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    with torch.no_grad():
        for batch in tqdm(val_loader):
            outputs = model(**batch)
            loss = outputs.loss
            total_val_loss += loss.item()

            # Generate predictions for the metrics
            # NOTE(review): input_ids contain the whole conversation, answer
            # included - confirm whether generation should see the prompt only.
            generated_texts = generate_text(batch["input_ids"], batch["attention_mask"])
            label_ids = batch["labels"]
            label_ids = torch.where(label_ids == -100, torch.full_like(label_ids, decode_fill_id), label_ids)
            refs.extend(tokenizer.batch_decode(label_ids, skip_special_tokens=True))
            preds.extend(generated_texts)

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # SPICE and CIDEr: pycocoevalcap expects two dicts with identical keys,
    # each mapping an id to a list of captions: {id: ["text", ...]}
    res = {i: [p] for i, p in enumerate(preds)}
    gts = {i: [r] for i, r in enumerate(refs)}

    cider_score, _ = cider_scorer.compute_score(gts, res)
    spice_score, _ = spice_scorer.compute_score(gts, res)

    cider_scores.append(cider_score)
    spice_scores.append(spice_score)

    # CLIPScore
    clip_score = compute_clip_score(preds, refs)
    clip_scores.append(clip_score)

    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | SPICE: {spice_score:.4f} | CIDEr: {cider_score:.4f} | CLIPScore: {clip_score:.4f}")


### Saving model

In [None]:
# --- Save the model and the tokenizer to OUTPUT_DIR ---
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Модель сохранена в {OUTPUT_DIR}")

### Graphical visualization

In [None]:
# One x position per epoch, shared by all four panels.
epochs = np.arange(1, EPOCHS+1)

plt.figure(figsize=(12, 8))

# Panel 1: training and validation loss on the same axes.
plt.subplot(2, 2, 1)
for loss_history, loss_label in ((train_losses, "Train Loss"), (val_losses, "Val Loss")):
    plt.plot(epochs, loss_history, label=loss_label)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.title("Loss")

# Panels 2-4: one caption metric each, same layout, different colour.
score_panels = (
    (spice_scores, "SPICE", "orange"),
    (cider_scores, "CIDEr", "green"),
    (clip_scores, "CLIPScore", "red"),
)
for panel_index, (score_history, score_name, line_color) in enumerate(score_panels, start=2):
    plt.subplot(2, 2, panel_index)
    plt.plot(epochs, score_history, label=score_name, color=line_color)
    plt.xlabel("Epoch")
    plt.ylabel("Score")
    plt.legend()
    plt.title(score_name)

plt.tight_layout()
plt.show()