In [37]:
import shutil

# Directory tree to remove. This is the same path the notebook later assigns
# to `cropped_images_dir`, so running this cell wipes the stored crops.
image_dir = "/home/jupyter/til-24-base/derrick/clip/images"

# Remove the tree and everything below it; a missing directory is reported
# instead of being raised.
try:
    shutil.rmtree(image_dir)
except FileNotFoundError:
    print(f"The directory {image_dir} does not exist.")
else:
    print(f"The directory {image_dir} has been deleted.")

The directory /home/jupyter/til-24-base/derrick/clip/images has been deleted.


In [15]:
import os
import json
import torch
from datasets import Dataset, DatasetDict, load_dataset
from torchvision.io import ImageReadMode, read_image
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
from torchvision.transforms.functional import InterpolationMode
from torchvision import transforms
import numpy as np

from transformers import (
    Trainer,
    TrainingArguments,
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
    AutoTokenizer,
    AutoImageProcessor
)
from PIL import Image

In [16]:
# Input locations: the annotation file and the raw images sit side by side
# under `base_dir`.
base_dir = "/home/jupyter/novice"
images_dir = os.path.join(base_dir, "images")
jsonl_path = os.path.join(base_dir, "vlm.jsonl")

# Output location for the per-annotation crops; created up front so later
# cells can assume it exists.
cropped_images_dir = "/home/jupyter/til-24-base/derrick/clip/images"
os.makedirs(cropped_images_dir, exist_ok=True)

In [17]:
# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dual encoder: CLIP ViT-L/14 checkpoint on the vision side, roberta-base on
# the text side. The load log reports the projection layers and logit scale
# as newly initialised, so the model has to be trained before use.
model = VisionTextDualEncoderModel.from_vision_text_pretrained(
    "openai/clip-vit-large-patch14",
    "roberta-base",
)
model = model.to(device)
config = model.config

# Matching text / image preprocessors, bundled into one processor object.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The projection layer and logit scale weights `['visual_projection.weight', 'text_projection.weight', 'logit_scale']` are newly initialized. You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Function to crop images based on bounding boxes
def crop_and_save_images(
    jsonl_path,
    images_dir,
    excluded_image="image_3977.jpg",
    output_dir=None,
    save_crops=False,
):
    """Build the (crop path, caption) table described by a JSON-lines file.

    Every non-blank line of ``jsonl_path`` is a JSON object with an
    ``"image"`` file name and a list of ``"annotations"``; each annotation
    carries a ``"bbox"`` of four values ``(x, y, w, h)`` and a ``"caption"``.
    Annotation ``i`` of image ``<stem>.<ext>`` maps to the crop file
    ``<output_dir>/<stem>_<i>.jpg``.

    Args:
        jsonl_path: Path of the JSON-lines annotation file.
        images_dir: Directory holding the source images. Only read when
            ``save_crops`` is true.
        excluded_image: File name of one image whose annotations are skipped.
        output_dir: Directory of the crop files. ``None`` (default) uses the
            notebook-level ``cropped_images_dir``.
        save_crops: When true, each crop is cut out of its source image and
            written to disk. The default ``False`` only computes the paths,
            so the crop files must already exist before they are read.

    Returns:
        dict with two parallel lists, ``"image_path"`` and ``"caption"``.
    """
    if output_dir is None:
        # Backward-compatible default: the directory configured in the notebook.
        output_dir = cropped_images_dir
    if save_crops:
        os.makedirs(output_dir, exist_ok=True)

    cropped_data = {"image_path": [], "caption": []}
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            instance = json.loads(line)
            image_name = instance["image"]

            # Skip the excluded image
            if image_name == excluded_image:
                continue

            # splitext instead of [:-4]: also correct for ".jpeg", ".png", ...
            stem = os.path.splitext(image_name)[0]
            source_image = None  # decoded lazily, at most once per record
            for i, annotation in enumerate(instance["annotations"]):
                # Unpacking doubles as a check that the bbox has four values.
                x, y, w, h = annotation["bbox"]
                caption = annotation["caption"]
                cropped_image_path = os.path.join(output_dir, f"{stem}_{i}.jpg")

                if save_crops:
                    if source_image is None:
                        with Image.open(os.path.join(images_dir, image_name)) as raw:
                            source_image = raw.convert("RGB")
                    # PIL boxes are (left, upper, right, lower).
                    source_image.crop((x, y, x + w, y + h)).save(cropped_image_path)

                cropped_data["image_path"].append(cropped_image_path)
                cropped_data["caption"].append(caption)
    return cropped_data

In [19]:
# Build the {"image_path": [...], "caption": [...]} table from the annotation
# file. This call does not write any crop files: it only computes where each
# crop lives under `cropped_images_dir`, so the crops must already be on disk
# by the time images are read.
dataset = crop_and_save_images(jsonl_path, images_dir)

In [20]:
# Convert the dict of parallel lists into a `datasets.Dataset` with the
# columns "image_path" and "caption". Note that `dataset` is rebound here
# from a plain dict to a Dataset, so this cell is not safe to run twice.
dataset = Dataset.from_dict(dataset)

In [21]:
# Split the dataset: 80 % for training, 20 % held out for evaluation.
# The seed pins the shuffle so the same examples land in each split on every
# run; without it, validation losses from different runs are not comparable.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [22]:
# Define image transformations
class Transform(torch.nn.Module):
    """Image preprocessing pipeline packaged as an ``nn.Module``.

    Chains resize -> centre crop -> float conversion -> normalisation in a
    ``torch.nn.Sequential``. It is written as a module so that it can be
    compiled with ``torch.jit.script`` (done right after this definition in
    the notebook).
    """

    def __init__(self, image_size, mean, std):
        """Assemble the pipeline.

        Args:
            image_size: Target size passed to both ``Resize`` and ``CenterCrop``.
            mean: Per-channel mean handed to ``Normalize``.
            std: Per-channel standard deviation handed to ``Normalize``.
        """
        super().__init__()
        self.transforms = torch.nn.Sequential(
            # Size is given as a one-element list — presumably the
            # "match the shorter edge" form that torchvision accepts under
            # TorchScript; TODO confirm against the installed version.
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            Normalize(mean, std),
        )

    def forward(self, x) -> torch.Tensor:
        """Apply the pipeline to ``x`` and return the transformed tensor.

        Runs under ``torch.no_grad()`` so no autograd graph is recorded for
        the preprocessing.
        """
        with torch.no_grad():
            x = self.transforms(x)
        return x

In [23]:
# Build the preprocessing module from the vision config's input size and the
# image processor's normalisation statistics, then compile it with TorchScript
# for faster processing.
image_transformations = torch.jit.script(
    Transform(
        image_size=config.vision_config.image_size,
        mean=image_processor.image_mean,
        std=image_processor.image_std,
    )
)

In [24]:
# Preprocess function
def preprocess_dataset(data, split, max_seq_length=None):
    """Tokenize the captions of ``data`` and attach a lazy image transform.

    Captions are tokenized eagerly with ``data.map``; every original column
    except ``"image_path"`` is dropped in that step. Images are not decoded
    here: a transform is registered with ``set_transform`` so that each
    accessed batch is read from disk and passed through
    ``image_transformations`` on the fly.

    Args:
        data: Dataset with the columns ``"image_path"`` and ``"caption"``.
        split: Label used only in the progress-bar description.
        max_seq_length: Length every caption is padded / truncated to.
            ``None`` (default) leaves the choice to the tokenizer, which is
            the previous behaviour. Short captions waste memory when padded
            to the tokenizer's full length, so a smaller value is usually
            preferable.

    Returns:
        The mapped dataset (columns ``"image_path"``, ``"input_ids"``,
        ``"attention_mask"``) with the image transform attached.
    """
    image_column = "image_path"
    caption_column = "caption"
    column_names = data.column_names

    def tokenize_captions(examples):
        """Add ``input_ids`` / ``attention_mask`` for a batch of captions."""
        captions = list(examples[caption_column])
        text_inputs = tokenizer(
            captions,
            max_length=max_seq_length,
            padding="max_length",
            truncation=True,
        )
        examples["input_ids"] = text_inputs.input_ids
        examples["attention_mask"] = text_inputs.attention_mask
        return examples

    def transform_images(examples):
        """Read each image file as RGB and add the transformed ``pixel_values``."""
        images = [read_image(image_file, mode=ImageReadMode.RGB) for image_file in examples[image_column]]
        examples["pixel_values"] = [image_transformations(image) for image in images]
        return examples

    data = data.map(
        function=tokenize_captions,
        batched=True,
        # Keep only the image path; the raw caption is not needed after tokenizing.
        remove_columns=[col for col in column_names if col != image_column],
        desc=f"Running tokenizer on {split} dataset",
    )

    # Transform images on the fly as doing it on the whole dataset takes too much time.
    data.set_transform(transform_images)
    return data

In [25]:
# Tokenize both splits. The second argument only labels the progress bar.
# Images are not decoded here; preprocess_dataset attaches an on-the-fly
# transform that reads them when a batch is accessed.
train_data = preprocess_dataset(train_dataset, "train")
eval_data = preprocess_dataset(eval_dataset, "validation")

Running tokenizer on train dataset:   0%|          | 0/11961 [00:00<?, ? examples/s]



Running tokenizer on validation dataset:   0%|          | 0/2991 [00:00<?, ? examples/s]

In [26]:
# Sanity check of the stored columns. "pixel_values" is not listed because it
# is produced lazily by the transform registered with set_transform.
eval_data.column_names

['image_path', 'input_ids', 'attention_mask']

In [27]:
# Define data collator
def collate_fn(examples):
    """Merge a list of per-example dicts into one model-ready batch.

    ``pixel_values`` tensors are stacked along a new leading dimension;
    ``input_ids`` and ``attention_mask`` are converted to ``torch.long``
    tensors. ``return_loss`` is set to ``True`` in the returned dict.
    """

    def column(key):
        # Gather one field across the whole batch, in example order.
        return [example[key] for example in examples]

    batch = {"pixel_values": torch.stack(column("pixel_values"))}
    for key in ("input_ids", "attention_mask"):
        batch[key] = torch.tensor(column(key), dtype=torch.long)
    batch["return_loss"] = True
    return batch

In [28]:
# Run configuration — everything a reader is likely to tune lives here.
output_dir = "clip-vit-large-patch14-finetune"  # checkpoints and the final model are written here
learning_rate = 1e-5
weight_decay = 0.1
batch_size = 4  # used for both the train and the eval per-device batch size
num_epochs = 10
strategy = "epoch"  # shared by evaluation and checkpoint saving
lr_scheduler = "cosine_with_restarts"

In [29]:
# Optimisation, batching and checkpointing settings for the run.
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to="none",
    # Presumably required so "image_path" survives until the on-the-fly image
    # transform reads it — TODO confirm against the Trainer documentation.
    remove_unused_columns=False,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    lr_scheduler_type=lr_scheduler,
    warmup_steps=2000,
    eval_strategy=strategy,
    save_strategy=strategy,
)

# Wire the model, both splits and the batch collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=collate_fn,
)

# Fine-tune, then report the metrics of a final evaluation pass.
trainer.train()

metrics = trainer.evaluate()
print(metrics)

# Store the model weights, tokenizer and image processor side by side.
saved_model_dir = os.path.join(output_dir, "saved_model")
trainer.save_model(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)
image_processor.save_pretrained(saved_model_dir)

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss
1,0.2458,0.18711
2,0.0654,0.079536


KeyboardInterrupt: 