In [3]:
!pip install albumentations

Collecting albumentations
  Using cached albumentations-1.4.8-py3-none-any.whl.metadata (37 kB)
Using cached albumentations-1.4.8-py3-none-any.whl (156 kB)
Installing collected packages: albumentations
Successfully installed albumentations-1.4.8


In [4]:
import os
import json
import torch
from torchvision import transforms
import numpy as np

from transformers import (
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
    AutoTokenizer,
    AutoImageProcessor
)
from PIL import Image
from CustomDataset import CustomDataset
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
from sklearn.model_selection import train_test_split

In [5]:
# Define paths
base_dir = "/home/jupyter/novice"
jsonl_path = os.path.join(base_dir, "vlm.jsonl")
images_dir = os.path.join(base_dir, "images")
cropped_images_dir = "/home/jupyter/til-24-base/derrick/clip/images"
os.makedirs(cropped_images_dir, exist_ok=True)

In [6]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = VisionTextDualEncoderModel.from_vision_text_pretrained("openai/clip-vit-base-patch16", "roberta-base").to(device)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch16")
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
config = model.config

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The projection layer and logit scale weights `['visual_projection.weight', 'text_projection.weight', 'logit_scale']` are newly initialized. You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Load the dataset
image_paths, captions = crop_and_save_images(jsonl_path, images_dir)

# Split the dataset into 90% train and 10% test
train_image_paths, val_image_paths, train_captions, val_captions = train_test_split(
    image_paths, captions, test_size=0.1, random_state=42
)

In [9]:
print(config.vision_config.image_size)

224


In [10]:
image_size = config.vision_config.image_size
mean = image_processor.image_mean
std = image_processor.image_std

print(f"{image_size}, {mean}, {std}")

224, [0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711]


In [11]:
image_size = config.vision_config.image_size
mean = image_processor.image_mean
std = image_processor.image_std

transform = A.Compose(
    [
        A.SmallestMaxSize(max_size=image_size, interpolation=cv2.INTER_CUBIC, p=1.0),
        A.CenterCrop(height=image_size, width=image_size, p=1.0),
        A.Rotate(limit=(-10, 10), p=0.2),
        A.Blur(blur_limit=(3, 3), p=0.2),
        A.Normalize(mean=mean, std=std),
        ToTensorV2(),
    ]
)

In [12]:
print(transform)

Compose([
  SmallestMaxSize(always_apply=False, p=1.0, max_size=[224], interpolation=2),
  CenterCrop(always_apply=False, p=1.0, height=224, width=224),
  Rotate(always_apply=False, p=0.2, limit=(-10, 10), interpolation=1, border_mode=4, value=None, mask_value=None, rotate_method='largest_box', crop_border=False),
  Blur(always_apply=False, p=0.2, blur_limit=(3, 3)),
  Normalize(always_apply=False, p=1.0, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], max_pixel_value=255.0, normalization='standard'),
  ToTensorV2(always_apply=True, p=1.0, transpose_mask=False),
], p=1.0, bbox_params=None, keypoint_params=None, additional_targets={}, is_check_shapes=True)


In [13]:
# Create datasets
train_dataset = CustomDataset(image_paths=train_image_paths, captions=train_captions, tokenizer=tokenizer, transform=transform)
val_dataset = CustomDataset(image_paths=val_image_paths, captions=val_captions, tokenizer=tokenizer, transform=transform)

In [15]:
output_dir="clip-augment-finetune"
learning_rate=1e-5
weight_decay=0.1
batch_size=10
num_epochs=15

In [16]:
def collate_fn(examples):
    images = torch.stack([image for image, target in examples])
    input_ids = torch.tensor([target["input_ids"] for image, target in examples], dtype=torch.long)
    attention_mask = torch.tensor([target["attention_mask"] for image, target in examples], dtype=torch.long)
    return {
        "pixel_values": images,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "return_loss": True,
    }

In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    save_strategy="epoch",
    # load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

# Train the model
trainer.train()

metrics = trainer.evaluate()
print(metrics)

trainer.save_model(os.path.join(output_dir, "saved_model"))
tokenizer.save_pretrained(os.path.join(output_dir, "saved_model"))
image_processor.save_pretrained(os.path.join(output_dir, "saved_model"))

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss
1,1.5506,0.74771
2,0.3866,0.234
3,0.1531,0.116912
4,0.0956,0.097536
5,0.0743,0.086864
6,0.0628,0.075488
7,0.0628,0.078358
