In [1]:
from zipfile import ZipFile

In [2]:
# Colab-only: mount Google Drive at /content/drive so the
# /content/drive/MyDrive/... paths used in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Unpack the image archive into the folder that already contains it.
with ZipFile('/content/drive/MyDrive/LLM/ROCO/images/images.zip', mode='r') as archive:
    archive.extractall(path='/content/drive/MyDrive/LLM/ROCO/images')

In [1]:
import pandas as pd
import json
from torch.utils.data import Dataset
from transformers import AutoProcessor
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM
import torch
from tqdm import tqdm

In [26]:
# Path to the CSV describing the training set; the cells below read its
# "name" (image file name) and "caption" columns.
train_data_csv = "/content/drive/MyDrive/LLM/ROCO/images/traindata.csv"
# Path to the folder containing the training images (metadata.jsonl is also
# written into this folder).
train_data_folder = "/content/drive/MyDrive/LLM/ROCO/images/images"
# Path to the CSV describing the validation set (placeholder, currently disabled).
#validation_data_csv = ""
# Path to the folder containing the validation images (placeholder, currently disabled).
#validation_data_folder = ""
# Directory the fine-tuned model and processor are saved to.
output_dir = "/content/drive/MyDrive/LLM/ROCO/med-git-base"


In [27]:
# Build one {"file_name", "text"} record per CSV row.
# The two columns are zipped in a single pass instead of calling df.iloc[i]
# twice per row, which constructs a new row Series for every lookup.
df = pd.read_csv(train_data_csv)
captions = [{"file_name": name, "text": caption.strip()}
            for name, caption in zip(df["name"], df["caption"])]

# add metadata.jsonl file to this folder (one JSON object per line)
with open(train_data_folder + "/metadata.jsonl", 'w') as f:
    for item in captions:
        f.write(json.dumps(item) + "\n")


In [28]:
from datasets import load_dataset

# Load the "train" split of the image folder via the `imagefolder` builder.
dataset = load_dataset(
    "imagefolder",
    split="train",
    data_dir=train_data_folder,
)


Resolving data files:   0%|          | 0/11503 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [29]:
class ImageCaptioningDataset(Dataset):
    """Torch dataset that turns (image, caption) records into model inputs.

    Args:
        dataset: Indexable, sized collection whose items expose ``"image"``
            and ``"text"`` entries.
        processor: Callable invoked with ``images=``, ``text=``, ``padding=``,
            ``truncation=`` and ``return_tensors=`` keyword arguments. It must
            return a mapping of tensors that carry a leading batch dimension
            of size 1.
    """

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processed tensors for record ``idx`` without the batch axis."""
        item = self.dataset[idx]

        # truncation=True keeps over-long captions at the same padded length as
        # every other sample, so samples can still be stacked into a batch.
        encoding = self.processor(images=item["image"],
                                  text=item["text"],
                                  padding="max_length",
                                  truncation=True,
                                  return_tensors="pt")

        # Remove only the leading batch dimension. A bare squeeze() would also
        # drop any other axis that happens to have size 1.
        return {k: v.squeeze(0) for k, v in encoding.items()}


# Pretrained processor for microsoft/git-base, wrapped together with the
# loaded dataset into the torch Dataset defined above.
processor = AutoProcessor.from_pretrained("microsoft/git-base")
train_dataset = ImageCaptioningDataset(dataset=dataset, processor=processor)

In [30]:
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

# Sanity check: push a single batch through the model and print the loss.
# The input ids double as the labels.
batch = next(iter(train_dataloader))
forward_inputs = {
    key: batch[key] for key in ("input_ids", "attention_mask", "pixel_values")
}
outputs = model(labels=batch["input_ids"], **forward_inputs)
print(outputs.loss)

tensor(11.9325, grad_fn=<NllLossBackward0>)


In [None]:
# Train the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model.to(device)


num_epochs = 30
# Per-step training loss, appended once per optimizer step across all epochs.
train_loss_history = []

for epoch in range(num_epochs):
    print("Epoch:", epoch)
    avg_loss = 0
    model.train()
    with tqdm(total=len(train_dataloader)) as pbar:
        for batch_idx, batch in enumerate(train_dataloader):
            input_ids = batch.pop("input_ids").to(device)
            # Pass the attention mask so padded positions are not attended to;
            # this matches the dummy forward pass, which also supplies it.
            attention_mask = batch.pop("attention_mask").to(device)
            pixel_values = batch.pop("pixel_values").to(device)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            pixel_values=pixel_values,
                            labels=input_ids)
            # NOTE(review): labels still include padding token ids; consider
            # setting them to -100 so padding does not contribute to the loss.
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for history, average and display.
            loss_value = loss.item()
            train_loss_history.append(loss_value)
            # Incremental mean of the losses seen so far in this epoch.
            avg_loss = (avg_loss * batch_idx + loss_value) / (batch_idx + 1)
            pbar.update(1)
            pbar.set_description(f"Epoch {epoch}, Loss {loss_value:.4f}, Avg Loss {avg_loss:.4f}")

    # Checkpoint after every epoch; each save overwrites the previous one.
    model.save_pretrained(output_dir)
    processor.save_pretrained(output_dir)

    # Validation pass (disabled: no validation_dataloader is defined in the
    # visible cells). Kept as comments rather than a string statement.
    # with torch.no_grad():
    #     model.eval()
    #     validation_loss = 0
    #     for batch_idx, batch in enumerate(validation_dataloader):
    #         input_ids = batch.pop("input_ids").to(device)
    #         attention_mask = batch.pop("attention_mask").to(device)
    #         pixel_values = batch.pop("pixel_values").to(device)
    #         outputs = model(input_ids=input_ids,
    #                         attention_mask=attention_mask,
    #                         pixel_values=pixel_values,
    #                         labels=input_ids)
    #         loss = outputs.loss
    #         validation_loss += loss.item()
    #     validation_loss /= len(validation_dataloader)
    #     print(f"Epoch {epoch}, Validation Loss {validation_loss:.4f}")




cuda
Epoch: 0


Epoch 0, Loss 0.3029, Avg Loss 0.3528:  38%|███▊      | 1088/2876 [1:04:39<2:02:03,  4.10s/it]